5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 /* this is a chain of data about sub patterns we are processing that
110 need to be handled separately/specially in study_chunk. Its so
111 we can simulate recursion without losing state. */
113 typedef struct scan_frame {
114 regnode *last_regnode; /* last node to process in this frame */
115 regnode *next_regnode; /* next node to process when last is reached */
116 U32 prev_recursed_depth;
117 I32 stopparen; /* what stopparen do we use */
118 U32 is_top_frame; /* what flags do we use? */
120 struct scan_frame *this_prev_frame; /* this previous frame */
121 struct scan_frame *prev_frame; /* previous frame */
122 struct scan_frame *next_frame; /* next frame */
125 /* Certain characters are output as a sequence with the first being a
127 #define isBACKSLASHED_PUNCT(c) \
128 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
131 struct RExC_state_t {
132 U32 flags; /* RXf_* are we folding, multilining? */
133 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
134 char *precomp; /* uncompiled string. */
135 REGEXP *rx_sv; /* The SV that is the regexp. */
136 regexp *rx; /* perl core regexp structure */
137 regexp_internal *rxi; /* internal data for regexp object
139 char *start; /* Start of input for compile */
140 char *end; /* End of input for compile */
141 char *parse; /* Input-scan pointer. */
142 SSize_t whilem_seen; /* number of WHILEM in this expr */
143 regnode *emit_start; /* Start of emitted-code area */
144 regnode *emit_bound; /* First regnode outside of the
146 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
147 implies compiling, so don't emit */
148 regnode_ssc emit_dummy; /* placeholder for emit to point to;
149 large enough for the largest
150 non-EXACTish node, so can use it as
152 I32 naughty; /* How bad is this pattern? */
153 I32 sawback; /* Did we see \1, ...? */
155 SSize_t size; /* Code size. */
156 I32 npar; /* Capture buffer count, (OPEN) plus
157 one. ("par" 0 is the whole
159 I32 nestroot; /* root parens we are in - used by
163 regnode **open_parens; /* pointers to open parens */
164 regnode **close_parens; /* pointers to close parens */
165 regnode *opend; /* END node in program */
166 I32 utf8; /* whether the pattern is utf8 or not */
167 I32 orig_utf8; /* whether the pattern was originally in utf8 */
168 /* XXX use this for future optimisation of case
169 * where pattern must be upgraded to utf8. */
170 I32 uni_semantics; /* If a d charset modifier should use unicode
171 rules, even if the pattern is not in
173 HV *paren_names; /* Paren names */
175 regnode **recurse; /* Recurse regops */
176 I32 recurse_count; /* Number of recurse regops */
177 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
179 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
183 I32 override_recoding;
184 I32 in_multi_char_class;
185 struct reg_code_block *code_blocks; /* positions of literal (?{})
187 int num_code_blocks; /* size of code_blocks[] */
188 int code_index; /* next code_blocks[] slot */
189 SSize_t maxlen; /* mininum possible number of chars in string to match */
190 scan_frame *frame_head;
191 scan_frame *frame_last;
194 #ifdef ADD_TO_REGEXEC
195 char *starttry; /* -Dr: where regtry was called. */
196 #define RExC_starttry (pRExC_state->starttry)
198 SV *runtime_code_qr; /* qr with the runtime code blocks */
200 const char *lastparse;
202 AV *paren_name_list; /* idx -> name */
203 U32 study_chunk_recursed_count;
206 #define RExC_lastparse (pRExC_state->lastparse)
207 #define RExC_lastnum (pRExC_state->lastnum)
208 #define RExC_paren_name_list (pRExC_state->paren_name_list)
209 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
210 #define RExC_mysv (pRExC_state->mysv1)
211 #define RExC_mysv1 (pRExC_state->mysv1)
212 #define RExC_mysv2 (pRExC_state->mysv2)
217 #define RExC_flags (pRExC_state->flags)
218 #define RExC_pm_flags (pRExC_state->pm_flags)
219 #define RExC_precomp (pRExC_state->precomp)
220 #define RExC_rx_sv (pRExC_state->rx_sv)
221 #define RExC_rx (pRExC_state->rx)
222 #define RExC_rxi (pRExC_state->rxi)
223 #define RExC_start (pRExC_state->start)
224 #define RExC_end (pRExC_state->end)
225 #define RExC_parse (pRExC_state->parse)
226 #define RExC_whilem_seen (pRExC_state->whilem_seen)
227 #ifdef RE_TRACK_PATTERN_OFFSETS
228 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
231 #define RExC_emit (pRExC_state->emit)
232 #define RExC_emit_dummy (pRExC_state->emit_dummy)
233 #define RExC_emit_start (pRExC_state->emit_start)
234 #define RExC_emit_bound (pRExC_state->emit_bound)
235 #define RExC_sawback (pRExC_state->sawback)
236 #define RExC_seen (pRExC_state->seen)
237 #define RExC_size (pRExC_state->size)
238 #define RExC_maxlen (pRExC_state->maxlen)
239 #define RExC_npar (pRExC_state->npar)
240 #define RExC_nestroot (pRExC_state->nestroot)
241 #define RExC_extralen (pRExC_state->extralen)
242 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
243 #define RExC_utf8 (pRExC_state->utf8)
244 #define RExC_uni_semantics (pRExC_state->uni_semantics)
245 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
246 #define RExC_open_parens (pRExC_state->open_parens)
247 #define RExC_close_parens (pRExC_state->close_parens)
248 #define RExC_opend (pRExC_state->opend)
249 #define RExC_paren_names (pRExC_state->paren_names)
250 #define RExC_recurse (pRExC_state->recurse)
251 #define RExC_recurse_count (pRExC_state->recurse_count)
252 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
253 #define RExC_study_chunk_recursed_bytes \
254 (pRExC_state->study_chunk_recursed_bytes)
255 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
256 #define RExC_contains_locale (pRExC_state->contains_locale)
257 #define RExC_contains_i (pRExC_state->contains_i)
258 #define RExC_override_recoding (pRExC_state->override_recoding)
259 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
260 #define RExC_frame_head (pRExC_state->frame_head)
261 #define RExC_frame_last (pRExC_state->frame_last)
262 #define RExC_frame_count (pRExC_state->frame_count)
263 #define RExC_strict (pRExC_state->strict)
265 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
266 * a flag to disable back-off on the fixed/floating substrings - if it's
267 * a high complexity pattern we assume the benefit of avoiding a full match
268 * is worth the cost of checking for the substrings even if they rarely help.
270 #define RExC_naughty (pRExC_state->naughty)
271 #define TOO_NAUGHTY (10)
272 #define MARK_NAUGHTY(add) \
273 if (RExC_naughty < TOO_NAUGHTY) \
274 RExC_naughty += (add)
275 #define MARK_NAUGHTY_EXP(exp, add) \
276 if (RExC_naughty < TOO_NAUGHTY) \
277 RExC_naughty += RExC_naughty / (exp) + (add)
279 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
280 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
281 ((*s) == '{' && regcurly(s)))
284 * Flags to be passed up and down.
286 #define WORST 0 /* Worst case. */
287 #define HASWIDTH 0x01 /* Known to match non-null strings. */
289 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
290 * character. (There needs to be a case: in the switch statement in regexec.c
291 * for any node marked SIMPLE.) Note that this is not the same thing as
294 #define SPSTART 0x04 /* Starts with * or + */
295 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
296 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
297 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
299 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
301 /* whether trie related optimizations are enabled */
302 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
303 #define TRIE_STUDY_OPT
304 #define FULL_TRIE_STUDY
310 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
311 #define PBITVAL(paren) (1 << ((paren) & 7))
312 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
313 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
314 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
316 #define REQUIRE_UTF8 STMT_START { \
318 *flagp = RESTART_UTF8; \
323 /* This converts the named class defined in regcomp.h to its equivalent class
324 * number defined in handy.h. */
325 #define namedclass_to_classnum(class) ((int) ((class) / 2))
326 #define classnum_to_namedclass(classnum) ((classnum) * 2)
328 #define _invlist_union_complement_2nd(a, b, output) \
329 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
330 #define _invlist_intersection_complement_2nd(a, b, output) \
331 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
333 /* About scan_data_t.
335 During optimisation we recurse through the regexp program performing
336 various inplace (keyhole style) optimisations. In addition study_chunk
337 and scan_commit populate this data structure with information about
338 what strings MUST appear in the pattern. We look for the longest
339 string that must appear at a fixed location, and we look for the
340 longest string that may appear at a floating location. So for instance
345 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
346 strings (because they follow a .* construct). study_chunk will identify
347 both FOO and BAR as being the longest fixed and floating strings respectively.
349 The strings can be composites, for instance
353 will result in a composite fixed substring 'foo'.
355 For each string some basic information is maintained:
357 - offset or min_offset
358 This is the position the string must appear at, or not before.
359 It also implicitly (when combined with minlenp) tells us how many
360 characters must match before the string we are searching for.
361 Likewise when combined with minlenp and the length of the string it
362 tells us how many characters must appear after the string we have
366 Only used for floating strings. This is the rightmost point that
367 the string can appear at. If set to SSize_t_MAX it indicates that the
368 string can occur infinitely far to the right.
371 A pointer to the minimum number of characters of the pattern that the
372 string was found inside. This is important as in the case of positive
373 lookahead or positive lookbehind we can have multiple patterns
378 The minimum length of the pattern overall is 3, the minimum length
379 of the lookahead part is 3, but the minimum length of the part that
380 will actually match is 1. So 'FOO's minimum length is 3, but the
381 minimum length for the F is 1. This is important as the minimum length
382 is used to determine offsets in front of and behind the string being
383 looked for. Since strings can be composites this is the length of the
384 pattern at the time it was committed with a scan_commit. Note that
385 the length is calculated by study_chunk, so that the minimum lengths
386 are not known until the full pattern has been compiled, thus the
387 pointer to the value.
391 In the case of lookbehind the string being searched for can be
392 offset past the start point of the final matching string.
393 If this value was just blithely removed from the min_offset it would
394 invalidate some of the calculations for how many chars must match
395 before or after (as they are derived from min_offset and minlen and
396 the length of the string being searched for).
397 When the final pattern is compiled and the data is moved from the
398 scan_data_t structure into the regexp structure the information
399 about lookbehind is factored in, with the information that would
400 have been lost precalculated in the end_shift field for the
403 The fields pos_min and pos_delta are used to store the minimum offset
404 and the delta to the maximum offset at the current point in the pattern.
408 typedef struct scan_data_t {
409 /*I32 len_min; unused */
410 /*I32 len_delta; unused */
414 SSize_t last_end; /* min value, <0 unless valid. */
415 SSize_t last_start_min;
416 SSize_t last_start_max;
417 SV **longest; /* Either &l_fixed, or &l_float. */
418 SV *longest_fixed; /* longest fixed string found in pattern */
419 SSize_t offset_fixed; /* offset where it starts */
420 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
421 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
422 SV *longest_float; /* longest floating string found in pattern */
423 SSize_t offset_float_min; /* earliest point in string it can appear */
424 SSize_t offset_float_max; /* latest point in string it can appear */
425 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
426 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
429 SSize_t *last_closep;
430 regnode_ssc *start_class;
434 * Forward declarations for pregcomp()'s friends.
437 static const scan_data_t zero_scan_data =
438 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
440 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
441 #define SF_BEFORE_SEOL 0x0001
442 #define SF_BEFORE_MEOL 0x0002
443 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
444 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
446 #define SF_FIX_SHIFT_EOL (+2)
447 #define SF_FL_SHIFT_EOL (+4)
449 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
450 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
452 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
453 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
454 #define SF_IS_INF 0x0040
455 #define SF_HAS_PAR 0x0080
456 #define SF_IN_PAR 0x0100
457 #define SF_HAS_EVAL 0x0200
458 #define SCF_DO_SUBSTR 0x0400
459 #define SCF_DO_STCLASS_AND 0x0800
460 #define SCF_DO_STCLASS_OR 0x1000
461 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
462 #define SCF_WHILEM_VISITED_POS 0x2000
464 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
465 #define SCF_SEEN_ACCEPT 0x8000
466 #define SCF_TRIE_DOING_RESTUDY 0x10000
467 #define SCF_IN_DEFINE 0x20000
472 #define UTF cBOOL(RExC_utf8)
474 /* The enums for all these are ordered so things work out correctly */
475 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
476 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
477 == REGEX_DEPENDS_CHARSET)
478 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
479 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
480 >= REGEX_UNICODE_CHARSET)
481 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
482 == REGEX_ASCII_RESTRICTED_CHARSET)
483 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
484 >= REGEX_ASCII_RESTRICTED_CHARSET)
485 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
486 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
488 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
490 /* For programs that want to be strictly Unicode compatible by dying if any
491 * attempt is made to match a non-Unicode code point against a Unicode
493 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
495 #define OOB_NAMEDCLASS -1
497 /* There is no code point that is out-of-bounds, so this is problematic. But
498 * its only current use is to initialize a variable that is always set before
500 #define OOB_UNICODE 0xDEADBEEF
502 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
503 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
506 /* length of regex to show in messages that don't mark a position within */
507 #define RegexLengthToShowInErrorMessages 127
510 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
511 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
512 * op/pragma/warn/regcomp.
514 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
515 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
517 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
518 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
520 #define REPORT_LOCATION_ARGS(offset) \
521 UTF8fARG(UTF, offset, RExC_precomp), \
522 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
525 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
526 * arg. Show regex, up to a maximum length. If it's too long, chop and add
529 #define _FAIL(code) STMT_START { \
530 const char *ellipses = ""; \
531 IV len = RExC_end - RExC_precomp; \
534 SAVEFREESV(RExC_rx_sv); \
535 if (len > RegexLengthToShowInErrorMessages) { \
536 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
537 len = RegexLengthToShowInErrorMessages - 10; \
543 #define FAIL(msg) _FAIL( \
544 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
545 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
547 #define FAIL2(msg,arg) _FAIL( \
548 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
549 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
552 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
554 #define Simple_vFAIL(m) STMT_START { \
556 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
557 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
558 m, REPORT_LOCATION_ARGS(offset)); \
562 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
564 #define vFAIL(m) STMT_START { \
566 SAVEFREESV(RExC_rx_sv); \
571 * Like Simple_vFAIL(), but accepts two arguments.
573 #define Simple_vFAIL2(m,a1) STMT_START { \
574 const IV offset = RExC_parse - RExC_precomp; \
575 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
576 REPORT_LOCATION_ARGS(offset)); \
580 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
582 #define vFAIL2(m,a1) STMT_START { \
584 SAVEFREESV(RExC_rx_sv); \
585 Simple_vFAIL2(m, a1); \
590 * Like Simple_vFAIL(), but accepts three arguments.
592 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
593 const IV offset = RExC_parse - RExC_precomp; \
594 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
595 REPORT_LOCATION_ARGS(offset)); \
599 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
601 #define vFAIL3(m,a1,a2) STMT_START { \
603 SAVEFREESV(RExC_rx_sv); \
604 Simple_vFAIL3(m, a1, a2); \
608 * Like Simple_vFAIL(), but accepts four arguments.
610 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
611 const IV offset = RExC_parse - RExC_precomp; \
612 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
613 REPORT_LOCATION_ARGS(offset)); \
616 #define vFAIL4(m,a1,a2,a3) STMT_START { \
618 SAVEFREESV(RExC_rx_sv); \
619 Simple_vFAIL4(m, a1, a2, a3); \
622 /* A specialized version of vFAIL2 that works with UTF8f */
623 #define vFAIL2utf8f(m, a1) STMT_START { \
624 const IV offset = RExC_parse - RExC_precomp; \
626 SAVEFREESV(RExC_rx_sv); \
627 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
628 REPORT_LOCATION_ARGS(offset)); \
631 /* These have asserts in them because of [perl #122671] Many warnings in
632 * regcomp.c can occur twice. If they get output in pass1 and later in that
633 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
634 * would get output again. So they should be output in pass2, and these
635 * asserts make sure new warnings follow that paradigm. */
637 /* m is not necessarily a "literal string", in this macro */
638 #define reg_warn_non_literal_string(loc, m) STMT_START { \
639 const IV offset = loc - RExC_precomp; \
640 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
641 m, REPORT_LOCATION_ARGS(offset)); \
644 #define ckWARNreg(loc,m) STMT_START { \
645 const IV offset = loc - RExC_precomp; \
646 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
647 REPORT_LOCATION_ARGS(offset)); \
650 #define vWARN(loc, m) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
653 REPORT_LOCATION_ARGS(offset)); \
656 #define vWARN_dep(loc, m) STMT_START { \
657 const IV offset = loc - RExC_precomp; \
658 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
659 REPORT_LOCATION_ARGS(offset)); \
662 #define ckWARNdep(loc,m) STMT_START { \
663 const IV offset = loc - RExC_precomp; \
664 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
666 REPORT_LOCATION_ARGS(offset)); \
669 #define ckWARNregdep(loc,m) STMT_START { \
670 const IV offset = loc - RExC_precomp; \
671 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
673 REPORT_LOCATION_ARGS(offset)); \
676 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
677 const IV offset = loc - RExC_precomp; \
678 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
680 a1, REPORT_LOCATION_ARGS(offset)); \
683 #define ckWARN2reg(loc, m, a1) STMT_START { \
684 const IV offset = loc - RExC_precomp; \
685 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
686 a1, REPORT_LOCATION_ARGS(offset)); \
689 #define vWARN3(loc, m, a1, a2) STMT_START { \
690 const IV offset = loc - RExC_precomp; \
691 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
692 a1, a2, REPORT_LOCATION_ARGS(offset)); \
695 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
696 const IV offset = loc - RExC_precomp; \
697 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
698 a1, a2, REPORT_LOCATION_ARGS(offset)); \
701 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
702 const IV offset = loc - RExC_precomp; \
703 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
704 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
707 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
708 const IV offset = loc - RExC_precomp; \
709 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
710 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
713 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
714 const IV offset = loc - RExC_precomp; \
715 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
716 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
719 /* Macros for recording node offsets. 20001227 mjd@plover.com
720 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
721 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
722 * Element 0 holds the number n.
723 * Position is 1 indexed.
725 #ifndef RE_TRACK_PATTERN_OFFSETS
726 #define Set_Node_Offset_To_R(node,byte)
727 #define Set_Node_Offset(node,byte)
728 #define Set_Cur_Node_Offset
729 #define Set_Node_Length_To_R(node,len)
730 #define Set_Node_Length(node,len)
731 #define Set_Node_Cur_Length(node,start)
732 #define Node_Offset(n)
733 #define Node_Length(n)
734 #define Set_Node_Offset_Length(node,offset,len)
735 #define ProgLen(ri) ri->u.proglen
736 #define SetProgLen(ri,x) ri->u.proglen = x
738 #define ProgLen(ri) ri->u.offsets[0]
739 #define SetProgLen(ri,x) ri->u.offsets[0] = x
740 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
742 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
743 __LINE__, (int)(node), (int)(byte))); \
745 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
748 RExC_offsets[2*(node)-1] = (byte); \
753 #define Set_Node_Offset(node,byte) \
754 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
755 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
757 #define Set_Node_Length_To_R(node,len) STMT_START { \
759 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
760 __LINE__, (int)(node), (int)(len))); \
762 Perl_croak(aTHX_ "value of node is %d in Length macro", \
765 RExC_offsets[2*(node)] = (len); \
770 #define Set_Node_Length(node,len) \
771 Set_Node_Length_To_R((node)-RExC_emit_start, len)
772 #define Set_Node_Cur_Length(node, start) \
773 Set_Node_Length(node, RExC_parse - start)
775 /* Get offsets and lengths */
776 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
777 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
779 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
780 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
781 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
785 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
786 #define EXPERIMENTAL_INPLACESCAN
787 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
789 #define DEBUG_RExC_seen() \
790 DEBUG_OPTIMISE_MORE_r({ \
791 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
793 if (RExC_seen & REG_ZERO_LEN_SEEN) \
794 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
796 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
797 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
799 if (RExC_seen & REG_GPOS_SEEN) \
800 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
802 if (RExC_seen & REG_CANY_SEEN) \
803 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
805 if (RExC_seen & REG_RECURSE_SEEN) \
806 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
808 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
809 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
811 if (RExC_seen & REG_VERBARG_SEEN) \
812 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
814 if (RExC_seen & REG_CUTGROUP_SEEN) \
815 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
817 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
818 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
820 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
821 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
823 if (RExC_seen & REG_GOSTART_SEEN) \
824 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
826 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
827 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
829 PerlIO_printf(Perl_debug_log,"\n"); \
832 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
833 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
835 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
837 PerlIO_printf(Perl_debug_log, "%s", open_str); \
838 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
839 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
840 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
841 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
842 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
843 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
844 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
845 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
846 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
847 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
848 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
849 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
850 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
851 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
852 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
853 PerlIO_printf(Perl_debug_log, "%s", close_str); \
857 #define DEBUG_STUDYDATA(str,data,depth) \
858 DEBUG_OPTIMISE_MORE_r(if(data){ \
859 PerlIO_printf(Perl_debug_log, \
860 "%*s" str "Pos:%"IVdf"/%"IVdf \
862 (int)(depth)*2, "", \
863 (IV)((data)->pos_min), \
864 (IV)((data)->pos_delta), \
865 (UV)((data)->flags) \
867 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
868 PerlIO_printf(Perl_debug_log, \
869 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
870 (IV)((data)->whilem_c), \
871 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
872 is_inf ? "INF " : "" \
874 if ((data)->last_found) \
875 PerlIO_printf(Perl_debug_log, \
876 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
877 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
878 SvPVX_const((data)->last_found), \
879 (IV)((data)->last_end), \
880 (IV)((data)->last_start_min), \
881 (IV)((data)->last_start_max), \
882 ((data)->longest && \
883 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
884 SvPVX_const((data)->longest_fixed), \
885 (IV)((data)->offset_fixed), \
886 ((data)->longest && \
887 (data)->longest==&((data)->longest_float)) ? "*" : "", \
888 SvPVX_const((data)->longest_float), \
889 (IV)((data)->offset_float_min), \
890 (IV)((data)->offset_float_max) \
892 PerlIO_printf(Perl_debug_log,"\n"); \
895 /* is c a control character for which we have a mnemonic? */
896 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
899 S_cntrl_to_mnemonic(const U8 c)
901 /* Returns the mnemonic string that represents character 'c', if one
902 * exists; NULL otherwise. The only ones that exist for the purposes of
903 * this routine are a few control characters */
906 case '\a': return "\\a";
907 case '\b': return "\\b";
908 case ESC_NATIVE: return "\\e";
909 case '\f': return "\\f";
910 case '\n': return "\\n";
911 case '\r': return "\\r";
912 case '\t': return "\\t";
918 /* Mark that we cannot extend a found fixed substring at this point.
919 Update the longest found anchored substring and the longest found
920 floating substrings if needed. */
923 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
924 SSize_t *minlenp, int is_inf)
926 const STRLEN l = CHR_SVLEN(data->last_found);
927 const STRLEN old_l = CHR_SVLEN(*data->longest);
928 GET_RE_DEBUG_FLAGS_DECL;
930 PERL_ARGS_ASSERT_SCAN_COMMIT;
932 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
933 SvSetMagicSV(*data->longest, data->last_found);
934 if (*data->longest == data->longest_fixed) {
935 data->offset_fixed = l ? data->last_start_min : data->pos_min;
936 if (data->flags & SF_BEFORE_EOL)
938 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
940 data->flags &= ~SF_FIX_BEFORE_EOL;
941 data->minlen_fixed=minlenp;
942 data->lookbehind_fixed=0;
944 else { /* *data->longest == data->longest_float */
945 data->offset_float_min = l ? data->last_start_min : data->pos_min;
946 data->offset_float_max = (l
947 ? data->last_start_max
948 : (data->pos_delta > SSize_t_MAX - data->pos_min
950 : data->pos_min + data->pos_delta));
952 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
953 data->offset_float_max = SSize_t_MAX;
954 if (data->flags & SF_BEFORE_EOL)
956 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
958 data->flags &= ~SF_FL_BEFORE_EOL;
959 data->minlen_float=minlenp;
960 data->lookbehind_float=0;
963 SvCUR_set(data->last_found, 0);
965 SV * const sv = data->last_found;
966 if (SvUTF8(sv) && SvMAGICAL(sv)) {
967 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
973 data->flags &= ~SF_BEFORE_EOL;
974 DEBUG_STUDYDATA("commit: ",data,0);
977 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
978 * list that describes which code points it matches */
981 S_ssc_anything(pTHX_ regnode_ssc *ssc)
983 /* Set the SSC 'ssc' to match an empty string or any code point */
985 PERL_ARGS_ASSERT_SSC_ANYTHING;
987 assert(is_ANYOF_SYNTHETIC(ssc));
989 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
990 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
991 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
995 S_ssc_is_anything(const regnode_ssc *ssc)
997 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
998 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
999 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1000 * in any way, so there's no point in using it */
1005 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1007 assert(is_ANYOF_SYNTHETIC(ssc));
1009 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1013 /* See if the list consists solely of the range 0 - Infinity */
1014 invlist_iterinit(ssc->invlist);
1015 ret = invlist_iternext(ssc->invlist, &start, &end)
1019 invlist_iterfinish(ssc->invlist);
1025 /* If e.g., both \w and \W are set, matches everything */
1026 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1028 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1029 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1039 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1041 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1042 * string, any code point, or any posix class under locale */
1044 PERL_ARGS_ASSERT_SSC_INIT;
1046 Zero(ssc, 1, regnode_ssc);
1047 set_ANYOF_SYNTHETIC(ssc);
1048 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1051 /* If any portion of the regex is to operate under locale rules that aren't
1052 * fully known at compile time, initialization includes it. The reason
1053 * this isn't done for all regexes is that the optimizer was written under
1054 * the assumption that locale was all-or-nothing. Given the complexity and
1055 * lack of documentation in the optimizer, and that there are inadequate
1056 * test cases for locale, many parts of it may not work properly, it is
1057 * safest to avoid locale unless necessary. */
1058 if (RExC_contains_locale) {
1059 ANYOF_POSIXL_SETALL(ssc);
1062 ANYOF_POSIXL_ZERO(ssc);
1067 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1068 const regnode_ssc *ssc)
1070 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1071 * to the list of code points matched, and locale posix classes; hence does
1072 * not check its flags) */
1077 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1079 assert(is_ANYOF_SYNTHETIC(ssc));
1081 invlist_iterinit(ssc->invlist);
1082 ret = invlist_iternext(ssc->invlist, &start, &end)
1086 invlist_iterfinish(ssc->invlist);
1092 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1100 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1101 const regnode_charclass* const node)
1103 /* Returns a mortal inversion list defining which code points are matched
1104 * by 'node', which is of type ANYOF. Handles complementing the result if
1105 * appropriate. If some code points aren't knowable at this time, the
1106 * returned list must, and will, contain every code point that is a
1109 SV* invlist = sv_2mortal(_new_invlist(0));
1110 SV* only_utf8_locale_invlist = NULL;
1112 const U32 n = ARG(node);
1113 bool new_node_has_latin1 = FALSE;
1115 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1117 /* Look at the data structure created by S_set_ANYOF_arg() */
1118 if (n != ANYOF_ONLY_HAS_BITMAP) {
1119 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1120 AV * const av = MUTABLE_AV(SvRV(rv));
1121 SV **const ary = AvARRAY(av);
1122 assert(RExC_rxi->data->what[n] == 's');
1124 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1125 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1127 else if (ary[0] && ary[0] != &PL_sv_undef) {
1129 /* Here, no compile-time swash, and there are things that won't be
1130 * known until runtime -- we have to assume it could be anything */
1131 return _add_range_to_invlist(invlist, 0, UV_MAX);
1133 else if (ary[3] && ary[3] != &PL_sv_undef) {
1135 /* Here no compile-time swash, and no run-time only data. Use the
1136 * node's inversion list */
1137 invlist = sv_2mortal(invlist_clone(ary[3]));
1140 /* Get the code points valid only under UTF-8 locales */
1141 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1142 && ary[2] && ary[2] != &PL_sv_undef)
1144 only_utf8_locale_invlist = ary[2];
1148 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1149 * code points, and an inversion list for the others, but if there are code
1150 * points that should match only conditionally on the target string being
1151 * UTF-8, those are placed in the inversion list, and not the bitmap.
1152 * Since there are circumstances under which they could match, they are
1153 * included in the SSC. But if the ANYOF node is to be inverted, we have
1154 * to exclude them here, so that when we invert below, the end result
1155 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1156 * have to do this here before we add the unconditionally matched code
1158 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1159 _invlist_intersection_complement_2nd(invlist,
1164 /* Add in the points from the bit map */
1165 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1166 if (ANYOF_BITMAP_TEST(node, i)) {
1167 invlist = add_cp_to_invlist(invlist, i);
1168 new_node_has_latin1 = TRUE;
1172 /* If this can match all upper Latin1 code points, have to add them
1174 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1175 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1178 /* Similarly for these */
1179 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1180 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1183 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1184 _invlist_invert(invlist);
1186 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1188 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1189 * locale. We can skip this if there are no 0-255 at all. */
1190 _invlist_union(invlist, PL_Latin1, &invlist);
1193 /* Similarly add the UTF-8 locale possible matches. These have to be
1194 * deferred until after the non-UTF-8 locale ones are taken care of just
1195 * above, or it leads to wrong results under ANYOF_INVERT */
1196 if (only_utf8_locale_invlist) {
1197 _invlist_union_maybe_complement_2nd(invlist,
1198 only_utf8_locale_invlist,
1199 ANYOF_FLAGS(node) & ANYOF_INVERT,
1206 /* These two functions currently do the exact same thing */
1207 #define ssc_init_zero ssc_init
1209 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1210 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1212 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1213 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1214 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1217 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1218 const regnode_charclass *and_with)
1220 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1221 * another SSC or a regular ANYOF class. Can create false positives. */
1226 PERL_ARGS_ASSERT_SSC_AND;
1228 assert(is_ANYOF_SYNTHETIC(ssc));
1230 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1231 * the code point inversion list and just the relevant flags */
1232 if (is_ANYOF_SYNTHETIC(and_with)) {
1233 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1234 anded_flags = ANYOF_FLAGS(and_with);
1236 /* XXX This is a kludge around what appears to be deficiencies in the
1237 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1238 * there are paths through the optimizer where it doesn't get weeded
1239 * out when it should. And if we don't make some extra provision for
1240 * it like the code just below, it doesn't get added when it should.
1241 * This solution is to add it only when AND'ing, which is here, and
1242 * only when what is being AND'ed is the pristine, original node
1243 * matching anything. Thus it is like adding it to ssc_anything() but
1244 * only when the result is to be AND'ed. Probably the same solution
1245 * could be adopted for the same problem we have with /l matching,
1246 * which is solved differently in S_ssc_init(), and that would lead to
1247 * fewer false positives than that solution has. But if this solution
1248 * creates bugs, the consequences are only that a warning isn't raised
1249 * that should be; while the consequences for having /l bugs is
1250 * incorrect matches */
1251 if (ssc_is_anything((regnode_ssc *)and_with)) {
1252 anded_flags |= ANYOF_WARN_SUPER;
1256 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1257 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1260 ANYOF_FLAGS(ssc) &= anded_flags;
1262 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1263 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1264 * 'and_with' may be inverted. When not inverted, we have the situation of
1266 * (C1 | P1) & (C2 | P2)
1267 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1268 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1269 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1270 * <= ((C1 & C2) | P1 | P2)
1271 * Alternatively, the last few steps could be:
1272 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1273 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1274 * <= (C1 | C2 | (P1 & P2))
1275 * We favor the second approach if either P1 or P2 is non-empty. This is
1276 * because these components are a barrier to doing optimizations, as what
1277 * they match cannot be known until the moment of matching as they are
1278 * dependent on the current locale, 'AND"ing them likely will reduce or
1280 * But we can do better if we know that C1,P1 are in their initial state (a
1281 * frequent occurrence), each matching everything:
1282 * (<everything>) & (C2 | P2) = C2 | P2
1283 * Similarly, if C2,P2 are in their initial state (again a frequent
1284 * occurrence), the result is a no-op
1285 * (C1 | P1) & (<everything>) = C1 | P1
1288 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1289 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1290 * <= (C1 & ~C2) | (P1 & ~P2)
1293 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1294 && ! is_ANYOF_SYNTHETIC(and_with))
1298 ssc_intersection(ssc,
1300 FALSE /* Has already been inverted */
1303 /* If either P1 or P2 is empty, the intersection will be also; can skip
1305 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1306 ANYOF_POSIXL_ZERO(ssc);
1308 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1310 /* Note that the Posix class component P from 'and_with' actually
1312 * P = Pa | Pb | ... | Pn
1313 * where each component is one posix class, such as in [\w\s].
1315 * ~P = ~(Pa | Pb | ... | Pn)
1316 * = ~Pa & ~Pb & ... & ~Pn
1317 * <= ~Pa | ~Pb | ... | ~Pn
1318 * The last is something we can easily calculate, but unfortunately
1319 * is likely to have many false positives. We could do better
1320 * in some (but certainly not all) instances if two classes in
1321 * P have known relationships. For example
1322 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1324 * :lower: & :print: = :lower:
1325 * And similarly for classes that must be disjoint. For example,
1326 * since \s and \w can have no elements in common based on rules in
1327 * the POSIX standard,
1328 * \w & ^\S = nothing
1329 * Unfortunately, some vendor locales do not meet the Posix
1330 * standard, in particular almost everything by Microsoft.
1331 * The loop below just changes e.g., \w into \W and vice versa */
1333 regnode_charclass_posixl temp;
1334 int add = 1; /* To calculate the index of the complement */
1336 ANYOF_POSIXL_ZERO(&temp);
1337 for (i = 0; i < ANYOF_MAX; i++) {
1339 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1340 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1342 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1343 ANYOF_POSIXL_SET(&temp, i + add);
1345 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1347 ANYOF_POSIXL_AND(&temp, ssc);
1349 } /* else ssc already has no posixes */
1350 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1351 in its initial state */
1352 else if (! is_ANYOF_SYNTHETIC(and_with)
1353 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1355 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1356 * copy it over 'ssc' */
1357 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1358 if (is_ANYOF_SYNTHETIC(and_with)) {
1359 StructCopy(and_with, ssc, regnode_ssc);
1362 ssc->invlist = anded_cp_list;
1363 ANYOF_POSIXL_ZERO(ssc);
1364 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1365 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1369 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1370 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1372 /* One or the other of P1, P2 is non-empty. */
1373 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1374 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1376 ssc_union(ssc, anded_cp_list, FALSE);
1378 else { /* P1 = P2 = empty */
1379 ssc_intersection(ssc, anded_cp_list, FALSE);
1385 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1386 const regnode_charclass *or_with)
1388 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1389 * another SSC or a regular ANYOF class. Can create false positives if
1390 * 'or_with' is to be inverted. */
1395 PERL_ARGS_ASSERT_SSC_OR;
1397 assert(is_ANYOF_SYNTHETIC(ssc));
1399 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1400 * the code point inversion list and just the relevant flags */
1401 if (is_ANYOF_SYNTHETIC(or_with)) {
1402 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1403 ored_flags = ANYOF_FLAGS(or_with);
1406 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1407 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1410 ANYOF_FLAGS(ssc) |= ored_flags;
1412 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1413 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1414 * 'or_with' may be inverted. When not inverted, we have the simple
1415 * situation of computing:
1416 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1417 * If P1|P2 yields a situation with both a class and its complement are
1418 * set, like having both \w and \W, this matches all code points, and we
1419 * can delete these from the P component of the ssc going forward. XXX We
1420 * might be able to delete all the P components, but I (khw) am not certain
1421 * about this, and it is better to be safe.
1424 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1425 * <= (C1 | P1) | ~C2
1426 * <= (C1 | ~C2) | P1
1427 * (which results in actually simpler code than the non-inverted case)
1430 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1431 && ! is_ANYOF_SYNTHETIC(or_with))
1433 /* We ignore P2, leaving P1 going forward */
1434 } /* else Not inverted */
1435 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1436 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1437 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1439 for (i = 0; i < ANYOF_MAX; i += 2) {
1440 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1442 ssc_match_all_cp(ssc);
1443 ANYOF_POSIXL_CLEAR(ssc, i);
1444 ANYOF_POSIXL_CLEAR(ssc, i+1);
1452 FALSE /* Already has been inverted */
1456 PERL_STATIC_INLINE void
1457 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1459 PERL_ARGS_ASSERT_SSC_UNION;
1461 assert(is_ANYOF_SYNTHETIC(ssc));
1463 _invlist_union_maybe_complement_2nd(ssc->invlist,
1469 PERL_STATIC_INLINE void
1470 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1472 const bool invert2nd)
1474 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1476 assert(is_ANYOF_SYNTHETIC(ssc));
1478 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1484 PERL_STATIC_INLINE void
1485 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1487 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1489 assert(is_ANYOF_SYNTHETIC(ssc));
1491 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1494 PERL_STATIC_INLINE void
1495 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1497 /* AND just the single code point 'cp' into the SSC 'ssc' */
1499 SV* cp_list = _new_invlist(2);
1501 PERL_ARGS_ASSERT_SSC_CP_AND;
1503 assert(is_ANYOF_SYNTHETIC(ssc));
1505 cp_list = add_cp_to_invlist(cp_list, cp);
1506 ssc_intersection(ssc, cp_list,
1507 FALSE /* Not inverted */
1509 SvREFCNT_dec_NN(cp_list);
1512 PERL_STATIC_INLINE void
1513 S_ssc_clear_locale(regnode_ssc *ssc)
1515 /* Set the SSC 'ssc' to not match any locale things */
1516 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1518 assert(is_ANYOF_SYNTHETIC(ssc));
1520 ANYOF_POSIXL_ZERO(ssc);
1521 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1524 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1527 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1529 /* The synthetic start class is used to hopefully quickly winnow down
1530 * places where a pattern could start a match in the target string. If it
1531 * doesn't really narrow things down that much, there isn't much point to
1532 * having the overhead of using it. This function uses some very crude
1533 * heuristics to decide if to use the ssc or not.
1535 * It returns TRUE if 'ssc' rules out more than half what it considers to
1536 * be the "likely" possible matches, but of course it doesn't know what the
1537 * actual things being matched are going to be; these are only guesses
1539 * For /l matches, it assumes that the only likely matches are going to be
1540 * in the 0-255 range, uniformly distributed, so half of that is 127
1541 * For /a and /d matches, it assumes that the likely matches will be just
1542 * the ASCII range, so half of that is 63
1543 * For /u and there isn't anything matching above the Latin1 range, it
1544 * assumes that that is the only range likely to be matched, and uses
1545 * half that as the cut-off: 127. If anything matches above Latin1,
1546 * it assumes that all of Unicode could match (uniformly), except for
1547 * non-Unicode code points and things in the General Category "Other"
1548 * (unassigned, private use, surrogates, controls and formats). This
1549 * is a much large number. */
1551 const U32 max_match = (LOC)
1555 : (invlist_highest(ssc->invlist) < 256)
1557 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1558 U32 count = 0; /* Running total of number of code points matched by
1560 UV start, end; /* Start and end points of current range in inversion
1563 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1565 invlist_iterinit(ssc->invlist);
1566 while (invlist_iternext(ssc->invlist, &start, &end)) {
1568 /* /u is the only thing that we expect to match above 255; so if not /u
1569 * and even if there are matches above 255, ignore them. This catches
1570 * things like \d under /d which does match the digits above 255, but
1571 * since the pattern is /d, it is not likely to be expecting them */
1572 if (! UNI_SEMANTICS) {
1576 end = MIN(end, 255);
1578 count += end - start + 1;
1579 if (count > max_match) {
1580 invlist_iterfinish(ssc->invlist);
1590 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1592 /* The inversion list in the SSC is marked mortal; now we need a more
1593 * permanent copy, which is stored the same way that is done in a regular
1594 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1597 SV* invlist = invlist_clone(ssc->invlist);
1599 PERL_ARGS_ASSERT_SSC_FINALIZE;
1601 assert(is_ANYOF_SYNTHETIC(ssc));
1603 /* The code in this file assumes that all but these flags aren't relevant
1604 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1605 * by the time we reach here */
1606 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1608 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1610 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1611 NULL, NULL, NULL, FALSE);
1613 /* Make sure is clone-safe */
1614 ssc->invlist = NULL;
1616 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1617 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1620 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1623 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1624 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1625 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1626 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1627 ? (TRIE_LIST_CUR( idx ) - 1) \
1633 dump_trie(trie,widecharmap,revcharmap)
1634 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1635 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1637 These routines dump out a trie in a somewhat readable format.
1638 The _interim_ variants are used for debugging the interim
1639 tables that are used to generate the final compressed
1640 representation which is what dump_trie expects.
1642 Part of the reason for their existence is to provide a form
1643 of documentation as to how the different representations function.
1648 Dumps the final compressed table form of the trie to Perl_debug_log.
1649 Used for debugging make_trie().
1653 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1654 AV *revcharmap, U32 depth)
1657 SV *sv=sv_newmortal();
1658 int colwidth= widecharmap ? 6 : 4;
1660 GET_RE_DEBUG_FLAGS_DECL;
1662 PERL_ARGS_ASSERT_DUMP_TRIE;
1664 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1665 (int)depth * 2 + 2,"",
1666 "Match","Base","Ofs" );
1668 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1669 SV ** const tmp = av_fetch( revcharmap, state, 0);
1671 PerlIO_printf( Perl_debug_log, "%*s",
1673 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1674 PL_colors[0], PL_colors[1],
1675 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1676 PERL_PV_ESCAPE_FIRSTCHAR
1681 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1682 (int)depth * 2 + 2,"");
1684 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1685 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1686 PerlIO_printf( Perl_debug_log, "\n");
1688 for( state = 1 ; state < trie->statecount ; state++ ) {
1689 const U32 base = trie->states[ state ].trans.base;
1691 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1692 (int)depth * 2 + 2,"", (UV)state);
1694 if ( trie->states[ state ].wordnum ) {
1695 PerlIO_printf( Perl_debug_log, " W%4X",
1696 trie->states[ state ].wordnum );
1698 PerlIO_printf( Perl_debug_log, "%6s", "" );
1701 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1706 while( ( base + ofs < trie->uniquecharcount ) ||
1707 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1708 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1712 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1714 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1715 if ( ( base + ofs >= trie->uniquecharcount )
1716 && ( base + ofs - trie->uniquecharcount
1718 && trie->trans[ base + ofs
1719 - trie->uniquecharcount ].check == state )
1721 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1723 (UV)trie->trans[ base + ofs
1724 - trie->uniquecharcount ].next );
1726 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1730 PerlIO_printf( Perl_debug_log, "]");
1733 PerlIO_printf( Perl_debug_log, "\n" );
1735 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1737 for (word=1; word <= trie->wordcount; word++) {
1738 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1739 (int)word, (int)(trie->wordinfo[word].prev),
1740 (int)(trie->wordinfo[word].len));
1742 PerlIO_printf(Perl_debug_log, "\n" );
1745 Dumps a fully constructed but uncompressed trie in list form.
1746 List tries normally only are used for construction when the number of
1747 possible chars (trie->uniquecharcount) is very high.
1748 Used for debugging make_trie().
1751 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1752 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1756 SV *sv=sv_newmortal();
1757 int colwidth= widecharmap ? 6 : 4;
1758 GET_RE_DEBUG_FLAGS_DECL;
1760 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1762 /* print out the table precompression. */
1763 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1764 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1765 "------:-----+-----------------\n" );
1767 for( state=1 ; state < next_alloc ; state ++ ) {
1770 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1771 (int)depth * 2 + 2,"", (UV)state );
1772 if ( ! trie->states[ state ].wordnum ) {
1773 PerlIO_printf( Perl_debug_log, "%5s| ","");
1775 PerlIO_printf( Perl_debug_log, "W%4x| ",
1776 trie->states[ state ].wordnum
1779 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1780 SV ** const tmp = av_fetch( revcharmap,
1781 TRIE_LIST_ITEM(state,charid).forid, 0);
1783 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1785 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1787 PL_colors[0], PL_colors[1],
1788 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1789 | PERL_PV_ESCAPE_FIRSTCHAR
1791 TRIE_LIST_ITEM(state,charid).forid,
1792 (UV)TRIE_LIST_ITEM(state,charid).newstate
1795 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1796 (int)((depth * 2) + 14), "");
1799 PerlIO_printf( Perl_debug_log, "\n");
1804 Dumps a fully constructed but uncompressed trie in table form.
1805 This is the normal DFA style state transition table, with a few
1806 twists to facilitate compression later.
1807 Used for debugging make_trie().
1810 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1811 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1816 SV *sv=sv_newmortal();
1817 int colwidth= widecharmap ? 6 : 4;
1818 GET_RE_DEBUG_FLAGS_DECL;
1820 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1823 print out the table precompression so that we can do a visual check
1824 that they are identical.
1827 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1829 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1830 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1832 PerlIO_printf( Perl_debug_log, "%*s",
1834 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1835 PL_colors[0], PL_colors[1],
1836 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1837 PERL_PV_ESCAPE_FIRSTCHAR
1843 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1845 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1846 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1849 PerlIO_printf( Perl_debug_log, "\n" );
1851 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1853 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1854 (int)depth * 2 + 2,"",
1855 (UV)TRIE_NODENUM( state ) );
1857 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1858 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1860 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1862 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1864 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1865 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1866 (UV)trie->trans[ state ].check );
1868 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1869 (UV)trie->trans[ state ].check,
1870 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1878 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1879 startbranch: the first branch in the whole branch sequence
1880 first : start branch of sequence of branch-exact nodes.
1881 May be the same as startbranch
1882 last : Thing following the last branch.
1883 May be the same as tail.
1884 tail : item following the branch sequence
1885 count : words in the sequence
1886 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1887 depth : indent depth
1889 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1891 A trie is an N'ary tree where the branches are determined by digital
1892 decomposition of the key. IE, at the root node you look up the 1st character and
1893 follow that branch repeat until you find the end of the branches. Nodes can be
1894 marked as "accepting" meaning they represent a complete word. Eg:
1898 would convert into the following structure. Numbers represent states, letters
1899 following numbers represent valid transitions on the letter from that state, if
1900 the number is in square brackets it represents an accepting state, otherwise it
1901 will be in parenthesis.
1903 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1907 (1) +-i->(6)-+-s->[7]
1909 +-s->(3)-+-h->(4)-+-e->[5]
1911 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1913 This shows that when matching against the string 'hers' we will begin at state 1
1914 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1915 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1916 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1917 single traverse. We store a mapping from accepting to state to which word was
1918 matched, and then when we have multiple possibilities we try to complete the
1919 rest of the regex in the order in which they occured in the alternation.
1921 The only prior NFA like behaviour that would be changed by the TRIE support is
1922 the silent ignoring of duplicate alternations which are of the form:
1924 / (DUPE|DUPE) X? (?{ ... }) Y /x
1926 Thus EVAL blocks following a trie may be called a different number of times with
1927 and without the optimisation. With the optimisations dupes will be silently
1928 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1929 the following demonstrates:
1931 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1933 which prints out 'word' three times, but
1935 'words'=~/(word|word|word)(?{ print $1 })S/
1937 which doesnt print it out at all. This is due to other optimisations kicking in.
1939 Example of what happens on a structural level:
1941 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1943 1: CURLYM[1] {1,32767}(18)
1954 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1955 and should turn into:
1957 1: CURLYM[1] {1,32767}(18)
1959 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1967 Cases where tail != last would be like /(?foo|bar)baz/:
1977 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1978 and would end up looking like:
1981 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1988 d = uvchr_to_utf8_flags(d, uv, 0);
1990 is the recommended Unicode-aware way of saying
1995 #define TRIE_STORE_REVCHAR(val) \
1998 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1999 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2000 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2001 SvCUR_set(zlopp, kapow - flrbbbbb); \
2004 av_push(revcharmap, zlopp); \
2006 char ooooff = (char)val; \
2007 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2011 /* This gets the next character from the input, folding it if not already
2013 #define TRIE_READ_CHAR STMT_START { \
2016 /* if it is UTF then it is either already folded, or does not need \
2018 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2020 else if (folder == PL_fold_latin1) { \
2021 /* This folder implies Unicode rules, which in the range expressible \
2022 * by not UTF is the lower case, with the two exceptions, one of \
2023 * which should have been taken care of before calling this */ \
2024 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2025 uvc = toLOWER_L1(*uc); \
2026 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2029 /* raw data, will be folded later if needed */ \
2037 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2038 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2039 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2040 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2042 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2043 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2044 TRIE_LIST_CUR( state )++; \
2047 #define TRIE_LIST_NEW(state) STMT_START { \
2048 Newxz( trie->states[ state ].trans.list, \
2049 4, reg_trie_trans_le ); \
2050 TRIE_LIST_CUR( state ) = 1; \
2051 TRIE_LIST_LEN( state ) = 4; \
2054 #define TRIE_HANDLE_WORD(state) STMT_START { \
2055 U16 dupe= trie->states[ state ].wordnum; \
2056 regnode * const noper_next = regnext( noper ); \
2059 /* store the word for dumping */ \
2061 if (OP(noper) != NOTHING) \
2062 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2064 tmp = newSVpvn_utf8( "", 0, UTF ); \
2065 av_push( trie_words, tmp ); \
2069 trie->wordinfo[curword].prev = 0; \
2070 trie->wordinfo[curword].len = wordlen; \
2071 trie->wordinfo[curword].accept = state; \
2073 if ( noper_next < tail ) { \
2075 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2077 trie->jump[curword] = (U16)(noper_next - convert); \
2079 jumper = noper_next; \
2081 nextbranch= regnext(cur); \
2085 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2086 /* chain, so that when the bits of chain are later */\
2087 /* linked together, the dups appear in the chain */\
2088 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2089 trie->wordinfo[dupe].prev = curword; \
2091 /* we haven't inserted this word yet. */ \
2092 trie->states[ state ].wordnum = curword; \
2097 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2098 ( ( base + charid >= ucharcount \
2099 && base + charid < ubound \
2100 && state == trie->trans[ base - ucharcount + charid ].check \
2101 && trie->trans[ base - ucharcount + charid ].next ) \
2102 ? trie->trans[ base - ucharcount + charid ].next \
2103 : ( state==1 ? special : 0 ) \
2107 #define MADE_JUMP_TRIE 2
2108 #define MADE_EXACT_TRIE 4
2111 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2112 regnode *first, regnode *last, regnode *tail,
2113 U32 word_count, U32 flags, U32 depth)
2115 /* first pass, loop through and scan words */
2116 reg_trie_data *trie;
2117 HV *widecharmap = NULL;
2118 AV *revcharmap = newAV();
2124 regnode *jumper = NULL;
2125 regnode *nextbranch = NULL;
2126 regnode *convert = NULL;
2127 U32 *prev_states; /* temp array mapping each state to previous one */
2128 /* we just use folder as a flag in utf8 */
2129 const U8 * folder = NULL;
2132 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2133 AV *trie_words = NULL;
2134 /* along with revcharmap, this only used during construction but both are
2135 * useful during debugging so we store them in the struct when debugging.
2138 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2139 STRLEN trie_charcount=0;
2141 SV *re_trie_maxbuff;
2142 GET_RE_DEBUG_FLAGS_DECL;
2144 PERL_ARGS_ASSERT_MAKE_TRIE;
2146 PERL_UNUSED_ARG(depth);
2150 case EXACT: case EXACTL: break;
2154 case EXACTFLU8: folder = PL_fold_latin1; break;
2155 case EXACTF: folder = PL_fold; break;
2156 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2159 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2161 trie->startstate = 1;
2162 trie->wordcount = word_count;
2163 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2164 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2165 if (flags == EXACT || flags == EXACTL)
2166 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2167 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2168 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2171 trie_words = newAV();
2174 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2175 assert(re_trie_maxbuff);
2176 if (!SvIOK(re_trie_maxbuff)) {
2177 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2179 DEBUG_TRIE_COMPILE_r({
2180 PerlIO_printf( Perl_debug_log,
2181 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2182 (int)depth * 2 + 2, "",
2183 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2184 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2187 /* Find the node we are going to overwrite */
2188 if ( first == startbranch && OP( last ) != BRANCH ) {
2189 /* whole branch chain */
2192 /* branch sub-chain */
2193 convert = NEXTOPER( first );
2196 /* -- First loop and Setup --
2198 We first traverse the branches and scan each word to determine if it
2199 contains widechars, and how many unique chars there are, this is
2200 important as we have to build a table with at least as many columns as we
2203 We use an array of integers to represent the character codes 0..255
2204 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2205 the native representation of the character value as the key and IV's for
2208 *TODO* If we keep track of how many times each character is used we can
2209 remap the columns so that the table compression later on is more
2210 efficient in terms of memory by ensuring the most common value is in the
2211 middle and the least common are on the outside. IMO this would be better
2212 than a most to least common mapping as theres a decent chance the most
2213 common letter will share a node with the least common, meaning the node
2214 will not be compressible. With a middle is most common approach the worst
2215 case is when we have the least common nodes twice.
2219 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2220 regnode *noper = NEXTOPER( cur );
2221 const U8 *uc = (U8*)STRING( noper );
2222 const U8 *e = uc + STR_LEN( noper );
2224 U32 wordlen = 0; /* required init */
2225 STRLEN minchars = 0;
2226 STRLEN maxchars = 0;
2227 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2230 if (OP(noper) == NOTHING) {
2231 regnode *noper_next= regnext(noper);
2232 if (noper_next != tail && OP(noper_next) == flags) {
2234 uc= (U8*)STRING(noper);
2235 e= uc + STR_LEN(noper);
2236 trie->minlen= STR_LEN(noper);
2243 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2244 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2245 regardless of encoding */
2246 if (OP( noper ) == EXACTFU_SS) {
2247 /* false positives are ok, so just set this */
2248 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2251 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2253 TRIE_CHARCOUNT(trie)++;
2256 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2257 * is in effect. Under /i, this character can match itself, or
2258 * anything that folds to it. If not under /i, it can match just
2259 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2260 * all fold to k, and all are single characters. But some folds
2261 * expand to more than one character, so for example LATIN SMALL
2262 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2263 * the string beginning at 'uc' is 'ffi', it could be matched by
2264 * three characters, or just by the one ligature character. (It
2265 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2266 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2267 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2268 * match.) The trie needs to know the minimum and maximum number
2269 * of characters that could match so that it can use size alone to
2270 * quickly reject many match attempts. The max is simple: it is
2271 * the number of folded characters in this branch (since a fold is
2272 * never shorter than what folds to it. */
2276 /* And the min is equal to the max if not under /i (indicated by
2277 * 'folder' being NULL), or there are no multi-character folds. If
2278 * there is a multi-character fold, the min is incremented just
2279 * once, for the character that folds to the sequence. Each
2280 * character in the sequence needs to be added to the list below of
2281 * characters in the trie, but we count only the first towards the
2282 * min number of characters needed. This is done through the
2283 * variable 'foldlen', which is returned by the macros that look
2284 * for these sequences as the number of bytes the sequence
2285 * occupies. Each time through the loop, we decrement 'foldlen' by
2286 * how many bytes the current char occupies. Only when it reaches
2287 * 0 do we increment 'minchars' or look for another multi-character
2289 if (folder == NULL) {
2292 else if (foldlen > 0) {
2293 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2298 /* See if *uc is the beginning of a multi-character fold. If
2299 * so, we decrement the length remaining to look at, to account
2300 * for the current character this iteration. (We can use 'uc'
2301 * instead of the fold returned by TRIE_READ_CHAR because for
2302 * non-UTF, the latin1_safe macro is smart enough to account
2303 * for all the unfolded characters, and because for UTF, the
2304 * string will already have been folded earlier in the
2305 * compilation process */
2307 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2308 foldlen -= UTF8SKIP(uc);
2311 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2316 /* The current character (and any potential folds) should be added
2317 * to the possible matching characters for this position in this
2321 U8 folded= folder[ (U8) uvc ];
2322 if ( !trie->charmap[ folded ] ) {
2323 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2324 TRIE_STORE_REVCHAR( folded );
2327 if ( !trie->charmap[ uvc ] ) {
2328 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2329 TRIE_STORE_REVCHAR( uvc );
2332 /* store the codepoint in the bitmap, and its folded
2334 TRIE_BITMAP_SET(trie, uvc);
2336 /* store the folded codepoint */
2337 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2340 /* store first byte of utf8 representation of
2341 variant codepoints */
2342 if (! UVCHR_IS_INVARIANT(uvc)) {
2343 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2346 set_bit = 0; /* We've done our bit :-) */
2350 /* XXX We could come up with the list of code points that fold
2351 * to this using PL_utf8_foldclosures, except not for
2352 * multi-char folds, as there may be multiple combinations
2353 * there that could work, which needs to wait until runtime to
2354 * resolve (The comment about LIGATURE FFI above is such an
2359 widecharmap = newHV();
2361 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2364 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2366 if ( !SvTRUE( *svpp ) ) {
2367 sv_setiv( *svpp, ++trie->uniquecharcount );
2368 TRIE_STORE_REVCHAR(uvc);
2371 } /* end loop through characters in this branch of the trie */
2373 /* We take the min and max for this branch and combine to find the min
2374 * and max for all branches processed so far */
2375 if( cur == first ) {
2376 trie->minlen = minchars;
2377 trie->maxlen = maxchars;
2378 } else if (minchars < trie->minlen) {
2379 trie->minlen = minchars;
2380 } else if (maxchars > trie->maxlen) {
2381 trie->maxlen = maxchars;
2383 } /* end first pass */
2384 DEBUG_TRIE_COMPILE_r(
2385 PerlIO_printf( Perl_debug_log,
2386 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2387 (int)depth * 2 + 2,"",
2388 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2390 (int)trie->minlen, (int)trie->maxlen )
2394 We now know what we are dealing with in terms of unique chars and
2395 string sizes so we can calculate how much memory a naive
2396 representation using a flat table will take. If it's over a reasonable
2397 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2398 conservative but potentially much slower representation using an array
2401 At the end we convert both representations into the same compressed
2402 form that will be used in regexec.c for matching with. The latter
2403 is a form that cannot be used to construct with but has memory
2404 properties similar to the list form and access properties similar
2405 to the table form making it both suitable for fast searches and
2406 small enough that its feasable to store for the duration of a program.
2408 See the comment in the code where the compressed table is produced
2409 inplace from the flat tabe representation for an explanation of how
2410 the compression works.
2415 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2419 > SvIV(re_trie_maxbuff) )
2422 Second Pass -- Array Of Lists Representation
2424 Each state will be represented by a list of charid:state records
2425 (reg_trie_trans_le) the first such element holds the CUR and LEN
2426 points of the allocated array. (See defines above).
2428 We build the initial structure using the lists, and then convert
2429 it into the compressed table form which allows faster lookups
2430 (but cant be modified once converted).
2433 STRLEN transcount = 1;
2435 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2436 "%*sCompiling trie using list compiler\n",
2437 (int)depth * 2 + 2, ""));
2439 trie->states = (reg_trie_state *)
2440 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2441 sizeof(reg_trie_state) );
2445 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2447 regnode *noper = NEXTOPER( cur );
2448 U8 *uc = (U8*)STRING( noper );
2449 const U8 *e = uc + STR_LEN( noper );
2450 U32 state = 1; /* required init */
2451 U16 charid = 0; /* sanity init */
2452 U32 wordlen = 0; /* required init */
2454 if (OP(noper) == NOTHING) {
2455 regnode *noper_next= regnext(noper);
2456 if (noper_next != tail && OP(noper_next) == flags) {
2458 uc= (U8*)STRING(noper);
2459 e= uc + STR_LEN(noper);
2463 if (OP(noper) != NOTHING) {
2464 for ( ; uc < e ; uc += len ) {
2469 charid = trie->charmap[ uvc ];
2471 SV** const svpp = hv_fetch( widecharmap,
2478 charid=(U16)SvIV( *svpp );
2481 /* charid is now 0 if we dont know the char read, or
2482 * nonzero if we do */
2489 if ( !trie->states[ state ].trans.list ) {
2490 TRIE_LIST_NEW( state );
2493 check <= TRIE_LIST_USED( state );
2496 if ( TRIE_LIST_ITEM( state, check ).forid
2499 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2504 newstate = next_alloc++;
2505 prev_states[newstate] = state;
2506 TRIE_LIST_PUSH( state, charid, newstate );
2511 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2515 TRIE_HANDLE_WORD(state);
2517 } /* end second pass */
2519 /* next alloc is the NEXT state to be allocated */
2520 trie->statecount = next_alloc;
2521 trie->states = (reg_trie_state *)
2522 PerlMemShared_realloc( trie->states,
2524 * sizeof(reg_trie_state) );
2526 /* and now dump it out before we compress it */
2527 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2528 revcharmap, next_alloc,
2532 trie->trans = (reg_trie_trans *)
2533 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2540 for( state=1 ; state < next_alloc ; state ++ ) {
2544 DEBUG_TRIE_COMPILE_MORE_r(
2545 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2549 if (trie->states[state].trans.list) {
2550 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2554 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2555 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2556 if ( forid < minid ) {
2558 } else if ( forid > maxid ) {
2562 if ( transcount < tp + maxid - minid + 1) {
2564 trie->trans = (reg_trie_trans *)
2565 PerlMemShared_realloc( trie->trans,
2567 * sizeof(reg_trie_trans) );
2568 Zero( trie->trans + (transcount / 2),
2572 base = trie->uniquecharcount + tp - minid;
2573 if ( maxid == minid ) {
2575 for ( ; zp < tp ; zp++ ) {
2576 if ( ! trie->trans[ zp ].next ) {
2577 base = trie->uniquecharcount + zp - minid;
2578 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2580 trie->trans[ zp ].check = state;
2586 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2588 trie->trans[ tp ].check = state;
2593 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2594 const U32 tid = base
2595 - trie->uniquecharcount
2596 + TRIE_LIST_ITEM( state, idx ).forid;
2597 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2599 trie->trans[ tid ].check = state;
2601 tp += ( maxid - minid + 1 );
2603 Safefree(trie->states[ state ].trans.list);
2606 DEBUG_TRIE_COMPILE_MORE_r(
2607 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2610 trie->states[ state ].trans.base=base;
2612 trie->lasttrans = tp + 1;
2616 Second Pass -- Flat Table Representation.
2618 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2619 each. We know that we will need Charcount+1 trans at most to store
2620 the data (one row per char at worst case) So we preallocate both
2621 structures assuming worst case.
2623 We then construct the trie using only the .next slots of the entry
2626 We use the .check field of the first entry of the node temporarily
2627 to make compression both faster and easier by keeping track of how
2628 many non zero fields are in the node.
2630 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2633 There are two terms at use here: state as a TRIE_NODEIDX() which is
2634 a number representing the first entry of the node, and state as a
2635 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2636 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2637 if there are 2 entrys per node. eg:
2645 The table is internally in the right hand, idx form. However as we
2646 also have to deal with the states array which is indexed by nodenum
2647 we have to use TRIE_NODENUM() to convert.
2650 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2651 "%*sCompiling trie using table compiler\n",
2652 (int)depth * 2 + 2, ""));
2654 trie->trans = (reg_trie_trans *)
2655 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2656 * trie->uniquecharcount + 1,
2657 sizeof(reg_trie_trans) );
2658 trie->states = (reg_trie_state *)
2659 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2660 sizeof(reg_trie_state) );
2661 next_alloc = trie->uniquecharcount + 1;
2664 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2666 regnode *noper = NEXTOPER( cur );
2667 const U8 *uc = (U8*)STRING( noper );
2668 const U8 *e = uc + STR_LEN( noper );
2670 U32 state = 1; /* required init */
2672 U16 charid = 0; /* sanity init */
2673 U32 accept_state = 0; /* sanity init */
2675 U32 wordlen = 0; /* required init */
2677 if (OP(noper) == NOTHING) {
2678 regnode *noper_next= regnext(noper);
2679 if (noper_next != tail && OP(noper_next) == flags) {
2681 uc= (U8*)STRING(noper);
2682 e= uc + STR_LEN(noper);
2686 if ( OP(noper) != NOTHING ) {
2687 for ( ; uc < e ; uc += len ) {
2692 charid = trie->charmap[ uvc ];
2694 SV* const * const svpp = hv_fetch( widecharmap,
2698 charid = svpp ? (U16)SvIV(*svpp) : 0;
2702 if ( !trie->trans[ state + charid ].next ) {
2703 trie->trans[ state + charid ].next = next_alloc;
2704 trie->trans[ state ].check++;
2705 prev_states[TRIE_NODENUM(next_alloc)]
2706 = TRIE_NODENUM(state);
2707 next_alloc += trie->uniquecharcount;
2709 state = trie->trans[ state + charid ].next;
2711 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2713 /* charid is now 0 if we dont know the char read, or
2714 * nonzero if we do */
2717 accept_state = TRIE_NODENUM( state );
2718 TRIE_HANDLE_WORD(accept_state);
2720 } /* end second pass */
2722 /* and now dump it out before we compress it */
2723 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2725 next_alloc, depth+1));
2729 * Inplace compress the table.*
2731 For sparse data sets the table constructed by the trie algorithm will
2732 be mostly 0/FAIL transitions or to put it another way mostly empty.
2733 (Note that leaf nodes will not contain any transitions.)
2735 This algorithm compresses the tables by eliminating most such
2736 transitions, at the cost of a modest bit of extra work during lookup:
2738 - Each states[] entry contains a .base field which indicates the
2739 index in the state[] array wheres its transition data is stored.
2741 - If .base is 0 there are no valid transitions from that node.
2743 - If .base is nonzero then charid is added to it to find an entry in
2746 -If trans[states[state].base+charid].check!=state then the
2747 transition is taken to be a 0/Fail transition. Thus if there are fail
2748 transitions at the front of the node then the .base offset will point
2749 somewhere inside the previous nodes data (or maybe even into a node
2750 even earlier), but the .check field determines if the transition is
2754 The following process inplace converts the table to the compressed
2755 table: We first do not compress the root node 1,and mark all its
2756 .check pointers as 1 and set its .base pointer as 1 as well. This
2757 allows us to do a DFA construction from the compressed table later,
2758 and ensures that any .base pointers we calculate later are greater
2761 - We set 'pos' to indicate the first entry of the second node.
2763 - We then iterate over the columns of the node, finding the first and
2764 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2765 and set the .check pointers accordingly, and advance pos
2766 appropriately and repreat for the next node. Note that when we copy
2767 the next pointers we have to convert them from the original
2768 NODEIDX form to NODENUM form as the former is not valid post
2771 - If a node has no transitions used we mark its base as 0 and do not
2772 advance the pos pointer.
2774 - If a node only has one transition we use a second pointer into the
2775 structure to fill in allocated fail transitions from other states.
2776 This pointer is independent of the main pointer and scans forward
2777 looking for null transitions that are allocated to a state. When it
2778 finds one it writes the single transition into the "hole". If the
2779 pointer doesnt find one the single transition is appended as normal.
2781 - Once compressed we can Renew/realloc the structures to release the
2784 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2785 specifically Fig 3.47 and the associated pseudocode.
2789 const U32 laststate = TRIE_NODENUM( next_alloc );
2792 trie->statecount = laststate;
2794 for ( state = 1 ; state < laststate ; state++ ) {
2796 const U32 stateidx = TRIE_NODEIDX( state );
2797 const U32 o_used = trie->trans[ stateidx ].check;
2798 U32 used = trie->trans[ stateidx ].check;
2799 trie->trans[ stateidx ].check = 0;
2802 used && charid < trie->uniquecharcount;
2805 if ( flag || trie->trans[ stateidx + charid ].next ) {
2806 if ( trie->trans[ stateidx + charid ].next ) {
2808 for ( ; zp < pos ; zp++ ) {
2809 if ( ! trie->trans[ zp ].next ) {
2813 trie->states[ state ].trans.base
2815 + trie->uniquecharcount
2817 trie->trans[ zp ].next
2818 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2820 trie->trans[ zp ].check = state;
2821 if ( ++zp > pos ) pos = zp;
2828 trie->states[ state ].trans.base
2829 = pos + trie->uniquecharcount - charid ;
2831 trie->trans[ pos ].next
2832 = SAFE_TRIE_NODENUM(
2833 trie->trans[ stateidx + charid ].next );
2834 trie->trans[ pos ].check = state;
2839 trie->lasttrans = pos + 1;
2840 trie->states = (reg_trie_state *)
2841 PerlMemShared_realloc( trie->states, laststate
2842 * sizeof(reg_trie_state) );
2843 DEBUG_TRIE_COMPILE_MORE_r(
2844 PerlIO_printf( Perl_debug_log,
2845 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2846 (int)depth * 2 + 2,"",
2847 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2851 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2854 } /* end table compress */
2856 DEBUG_TRIE_COMPILE_MORE_r(
2857 PerlIO_printf(Perl_debug_log,
2858 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2859 (int)depth * 2 + 2, "",
2860 (UV)trie->statecount,
2861 (UV)trie->lasttrans)
2863 /* resize the trans array to remove unused space */
2864 trie->trans = (reg_trie_trans *)
2865 PerlMemShared_realloc( trie->trans, trie->lasttrans
2866 * sizeof(reg_trie_trans) );
2868 { /* Modify the program and insert the new TRIE node */
2869 U8 nodetype =(U8)(flags & 0xFF);
2873 regnode *optimize = NULL;
2874 #ifdef RE_TRACK_PATTERN_OFFSETS
2877 U32 mjd_nodelen = 0;
2878 #endif /* RE_TRACK_PATTERN_OFFSETS */
2879 #endif /* DEBUGGING */
2881 This means we convert either the first branch or the first Exact,
2882 depending on whether the thing following (in 'last') is a branch
2883 or not and whther first is the startbranch (ie is it a sub part of
2884 the alternation or is it the whole thing.)
2885 Assuming its a sub part we convert the EXACT otherwise we convert
2886 the whole branch sequence, including the first.
2888 /* Find the node we are going to overwrite */
2889 if ( first != startbranch || OP( last ) == BRANCH ) {
2890 /* branch sub-chain */
2891 NEXT_OFF( first ) = (U16)(last - first);
2892 #ifdef RE_TRACK_PATTERN_OFFSETS
2894 mjd_offset= Node_Offset((convert));
2895 mjd_nodelen= Node_Length((convert));
2898 /* whole branch chain */
2900 #ifdef RE_TRACK_PATTERN_OFFSETS
2903 const regnode *nop = NEXTOPER( convert );
2904 mjd_offset= Node_Offset((nop));
2905 mjd_nodelen= Node_Length((nop));
2909 PerlIO_printf(Perl_debug_log,
2910 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2911 (int)depth * 2 + 2, "",
2912 (UV)mjd_offset, (UV)mjd_nodelen)
2915 /* But first we check to see if there is a common prefix we can
2916 split out as an EXACT and put in front of the TRIE node. */
2917 trie->startstate= 1;
2918 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2920 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2924 const U32 base = trie->states[ state ].trans.base;
2926 if ( trie->states[state].wordnum )
2929 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2930 if ( ( base + ofs >= trie->uniquecharcount ) &&
2931 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2932 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2934 if ( ++count > 1 ) {
2935 SV **tmp = av_fetch( revcharmap, ofs, 0);
2936 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2937 if ( state == 1 ) break;
2939 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2941 PerlIO_printf(Perl_debug_log,
2942 "%*sNew Start State=%"UVuf" Class: [",
2943 (int)depth * 2 + 2, "",
2946 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2947 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2949 TRIE_BITMAP_SET(trie,*ch);
2951 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2953 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2957 TRIE_BITMAP_SET(trie,*ch);
2959 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2960 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2966 SV **tmp = av_fetch( revcharmap, idx, 0);
2968 char *ch = SvPV( *tmp, len );
2970 SV *sv=sv_newmortal();
2971 PerlIO_printf( Perl_debug_log,
2972 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2973 (int)depth * 2 + 2, "",
2975 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2976 PL_colors[0], PL_colors[1],
2977 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2978 PERL_PV_ESCAPE_FIRSTCHAR
2983 OP( convert ) = nodetype;
2984 str=STRING(convert);
2987 STR_LEN(convert) += len;
2993 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2998 trie->prefixlen = (state-1);
3000 regnode *n = convert+NODE_SZ_STR(convert);
3001 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3002 trie->startstate = state;
3003 trie->minlen -= (state - 1);
3004 trie->maxlen -= (state - 1);
3006 /* At least the UNICOS C compiler choked on this
3007 * being argument to DEBUG_r(), so let's just have
3010 #ifdef PERL_EXT_RE_BUILD
3016 regnode *fix = convert;
3017 U32 word = trie->wordcount;
3019 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3020 while( ++fix < n ) {
3021 Set_Node_Offset_Length(fix, 0, 0);
3024 SV ** const tmp = av_fetch( trie_words, word, 0 );
3026 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3027 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3029 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3037 NEXT_OFF(convert) = (U16)(tail - convert);
3038 DEBUG_r(optimize= n);
3044 if ( trie->maxlen ) {
3045 NEXT_OFF( convert ) = (U16)(tail - convert);
3046 ARG_SET( convert, data_slot );
3047 /* Store the offset to the first unabsorbed branch in
3048 jump[0], which is otherwise unused by the jump logic.
3049 We use this when dumping a trie and during optimisation. */
3051 trie->jump[0] = (U16)(nextbranch - convert);
3053 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3054 * and there is a bitmap
3055 * and the first "jump target" node we found leaves enough room
3056 * then convert the TRIE node into a TRIEC node, with the bitmap
3057 * embedded inline in the opcode - this is hypothetically faster.
3059 if ( !trie->states[trie->startstate].wordnum
3061 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3063 OP( convert ) = TRIEC;
3064 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3065 PerlMemShared_free(trie->bitmap);
3068 OP( convert ) = TRIE;
3070 /* store the type in the flags */
3071 convert->flags = nodetype;
3075 + regarglen[ OP( convert ) ];
3077 /* XXX We really should free up the resource in trie now,
3078 as we won't use them - (which resources?) dmq */
3080 /* needed for dumping*/
3081 DEBUG_r(if (optimize) {
3082 regnode *opt = convert;
3084 while ( ++opt < optimize) {
3085 Set_Node_Offset_Length(opt,0,0);
3088 Try to clean up some of the debris left after the
3091 while( optimize < jumper ) {
3092 mjd_nodelen += Node_Length((optimize));
3093 OP( optimize ) = OPTIMIZED;
3094 Set_Node_Offset_Length(optimize,0,0);
3097 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3099 } /* end node insert */
3101 /* Finish populating the prev field of the wordinfo array. Walk back
3102 * from each accept state until we find another accept state, and if
3103 * so, point the first word's .prev field at the second word. If the
3104 * second already has a .prev field set, stop now. This will be the
3105 * case either if we've already processed that word's accept state,
3106 * or that state had multiple words, and the overspill words were
3107 * already linked up earlier.
3114 for (word=1; word <= trie->wordcount; word++) {
3116 if (trie->wordinfo[word].prev)
3118 state = trie->wordinfo[word].accept;
3120 state = prev_states[state];
3123 prev = trie->states[state].wordnum;
3127 trie->wordinfo[word].prev = prev;
3129 Safefree(prev_states);
3133 /* and now dump out the compressed format */
3134 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3136 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3138 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3139 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3141 SvREFCNT_dec_NN(revcharmap);
3145 : trie->startstate>1
3151 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3153 /* The Trie is constructed and compressed now so we can build a fail array if
3156 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3158 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3162 We find the fail state for each state in the trie, this state is the longest
3163 proper suffix of the current state's 'word' that is also a proper prefix of
3164 another word in our trie. State 1 represents the word '' and is thus the
3165 default fail state. This allows the DFA not to have to restart after its
3166 tried and failed a word at a given point, it simply continues as though it
3167 had been matching the other word in the first place.
3169 'abcdgu'=~/abcdefg|cdgu/
3170 When we get to 'd' we are still matching the first word, we would encounter
3171 'g' which would fail, which would bring us to the state representing 'd' in
3172 the second word where we would try 'g' and succeed, proceeding to match
3175 /* add a fail transition */
3176 const U32 trie_offset = ARG(source);
3177 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3179 const U32 ucharcount = trie->uniquecharcount;
3180 const U32 numstates = trie->statecount;
3181 const U32 ubound = trie->lasttrans + ucharcount;
3185 U32 base = trie->states[ 1 ].trans.base;
3188 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3190 GET_RE_DEBUG_FLAGS_DECL;
3192 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3193 PERL_UNUSED_CONTEXT;
3195 PERL_UNUSED_ARG(depth);
3198 if ( OP(source) == TRIE ) {
3199 struct regnode_1 *op = (struct regnode_1 *)
3200 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3201 StructCopy(source,op,struct regnode_1);
3202 stclass = (regnode *)op;
3204 struct regnode_charclass *op = (struct regnode_charclass *)
3205 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3206 StructCopy(source,op,struct regnode_charclass);
3207 stclass = (regnode *)op;
3209 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3211 ARG_SET( stclass, data_slot );
3212 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3213 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3214 aho->trie=trie_offset;
3215 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3216 Copy( trie->states, aho->states, numstates, reg_trie_state );
3217 Newxz( q, numstates, U32);
3218 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3221 /* initialize fail[0..1] to be 1 so that we always have
3222 a valid final fail state */
3223 fail[ 0 ] = fail[ 1 ] = 1;
3225 for ( charid = 0; charid < ucharcount ; charid++ ) {
3226 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3228 q[ q_write ] = newstate;
3229 /* set to point at the root */
3230 fail[ q[ q_write++ ] ]=1;
3233 while ( q_read < q_write) {
3234 const U32 cur = q[ q_read++ % numstates ];
3235 base = trie->states[ cur ].trans.base;
3237 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3238 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3240 U32 fail_state = cur;
3243 fail_state = fail[ fail_state ];
3244 fail_base = aho->states[ fail_state ].trans.base;
3245 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3247 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3248 fail[ ch_state ] = fail_state;
3249 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3251 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3253 q[ q_write++ % numstates] = ch_state;
3257 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3258 when we fail in state 1, this allows us to use the
3259 charclass scan to find a valid start char. This is based on the principle
3260 that theres a good chance the string being searched contains lots of stuff
3261 that cant be a start char.
3263 fail[ 0 ] = fail[ 1 ] = 0;
3264 DEBUG_TRIE_COMPILE_r({
3265 PerlIO_printf(Perl_debug_log,
3266 "%*sStclass Failtable (%"UVuf" states): 0",
3267 (int)(depth * 2), "", (UV)numstates
3269 for( q_read=1; q_read<numstates; q_read++ ) {
3270 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3272 PerlIO_printf(Perl_debug_log, "\n");
3275 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3280 #define DEBUG_PEEP(str,scan,depth) \
3281 DEBUG_OPTIMISE_r({if (scan){ \
3282 regnode *Next = regnext(scan); \
3283 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3284 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3285 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3286 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3287 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3288 PerlIO_printf(Perl_debug_log, "\n"); \
3291 /* The below joins as many adjacent EXACTish nodes as possible into a single
3292 * one. The regop may be changed if the node(s) contain certain sequences that
3293 * require special handling. The joining is only done if:
3294 * 1) there is room in the current conglomerated node to entirely contain the
3296 * 2) they are the exact same node type
3298 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3299 * these get optimized out
3301 * If a node is to match under /i (folded), the number of characters it matches
3302 * can be different than its character length if it contains a multi-character
3303 * fold. *min_subtract is set to the total delta number of characters of the
3306 * And *unfolded_multi_char is set to indicate whether or not the node contains
3307 * an unfolded multi-char fold. This happens when whether the fold is valid or
3308 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3309 * SMALL LETTER SHARP S, as only if the target string being matched against
3310 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3311 * folding rules depend on the locale in force at runtime. (Multi-char folds
3312 * whose components are all above the Latin1 range are not run-time locale
3313 * dependent, and have already been folded by the time this function is
3316 * This is as good a place as any to discuss the design of handling these
3317 * multi-character fold sequences. It's been wrong in Perl for a very long
3318 * time. There are three code points in Unicode whose multi-character folds
3319 * were long ago discovered to mess things up. The previous designs for
3320 * dealing with these involved assigning a special node for them. This
3321 * approach doesn't always work, as evidenced by this example:
3322 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3323 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3324 * would match just the \xDF, it won't be able to handle the case where a
3325 * successful match would have to cross the node's boundary. The new approach
3326 * that hopefully generally solves the problem generates an EXACTFU_SS node
3327 * that is "sss" in this case.
3329 * It turns out that there are problems with all multi-character folds, and not
3330 * just these three. Now the code is general, for all such cases. The
3331 * approach taken is:
3332 * 1) This routine examines each EXACTFish node that could contain multi-
3333 * character folded sequences. Since a single character can fold into
3334 * such a sequence, the minimum match length for this node is less than
3335 * the number of characters in the node. This routine returns in
3336 * *min_subtract how many characters to subtract from the the actual
3337 * length of the string to get a real minimum match length; it is 0 if
3338 * there are no multi-char foldeds. This delta is used by the caller to
3339 * adjust the min length of the match, and the delta between min and max,
3340 * so that the optimizer doesn't reject these possibilities based on size
3342 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3343 * is used for an EXACTFU node that contains at least one "ss" sequence in
3344 * it. For non-UTF-8 patterns and strings, this is the only case where
3345 * there is a possible fold length change. That means that a regular
3346 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3347 * with length changes, and so can be processed faster. regexec.c takes
3348 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3349 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3350 * known until runtime). This saves effort in regex matching. However,
3351 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3352 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3353 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3354 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3355 * possibilities for the non-UTF8 patterns are quite simple, except for
3356 * the sharp s. All the ones that don't involve a UTF-8 target string are
3357 * members of a fold-pair, and arrays are set up for all of them so that
3358 * the other member of the pair can be found quickly. Code elsewhere in
3359 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3360 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3361 * described in the next item.
3362 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3363 * validity of the fold won't be known until runtime, and so must remain
3364 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3365 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3366 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3367 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3368 * The reason this is a problem is that the optimizer part of regexec.c
3369 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3370 * that a character in the pattern corresponds to at most a single
3371 * character in the target string. (And I do mean character, and not byte
3372 * here, unlike other parts of the documentation that have never been
3373 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3374 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3375 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3376 * nodes, violate the assumption, and they are the only instances where it
3377 * is violated. I'm reluctant to try to change the assumption, as the
3378 * code involved is impenetrable to me (khw), so instead the code here
3379 * punts. This routine examines EXACTFL nodes, and (when the pattern
3380 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3381 * boolean indicating whether or not the node contains such a fold. When
3382 * it is true, the caller sets a flag that later causes the optimizer in
3383 * this file to not set values for the floating and fixed string lengths,
3384 * and thus avoids the optimizer code in regexec.c that makes the invalid
3385 * assumption. Thus, there is no optimization based on string lengths for
3386 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3387 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3388 * assumption is wrong only in these cases is that all other non-UTF-8
3389 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3390 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3391 * EXACTF nodes because we don't know at compile time if it actually
3392 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3393 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3394 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3395 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3396 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3397 * string would require the pattern to be forced into UTF-8, the overhead
3398 * of which we want to avoid. Similarly the unfolded multi-char folds in
3399 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3402 * Similarly, the code that generates tries doesn't currently handle
3403 * not-already-folded multi-char folds, and it looks like a pain to change
3404 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3405 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3406 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3407 * using /iaa matching will be doing so almost entirely with ASCII
3408 * strings, so this should rarely be encountered in practice */
3410 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3411 if (PL_regkind[OP(scan)] == EXACT) \
3412 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3415 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3416 UV *min_subtract, bool *unfolded_multi_char,
3417 U32 flags,regnode *val, U32 depth)
3419 /* Merge several consecutive EXACTish nodes into one. */
3420 regnode *n = regnext(scan);
3422 regnode *next = scan + NODE_SZ_STR(scan);
3426 regnode *stop = scan;
3427 GET_RE_DEBUG_FLAGS_DECL;
3429 PERL_UNUSED_ARG(depth);
3432 PERL_ARGS_ASSERT_JOIN_EXACT;
3433 #ifndef EXPERIMENTAL_INPLACESCAN
3434 PERL_UNUSED_ARG(flags);
3435 PERL_UNUSED_ARG(val);
3437 DEBUG_PEEP("join",scan,depth);
3439 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3440 * EXACT ones that are mergeable to the current one. */
3442 && (PL_regkind[OP(n)] == NOTHING
3443 || (stringok && OP(n) == OP(scan)))
3445 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3448 if (OP(n) == TAIL || n > next)
3450 if (PL_regkind[OP(n)] == NOTHING) {
3451 DEBUG_PEEP("skip:",n,depth);
3452 NEXT_OFF(scan) += NEXT_OFF(n);
3453 next = n + NODE_STEP_REGNODE;
3460 else if (stringok) {
3461 const unsigned int oldl = STR_LEN(scan);
3462 regnode * const nnext = regnext(n);
3464 /* XXX I (khw) kind of doubt that this works on platforms (should
3465 * Perl ever run on one) where U8_MAX is above 255 because of lots
3466 * of other assumptions */
3467 /* Don't join if the sum can't fit into a single node */
3468 if (oldl + STR_LEN(n) > U8_MAX)
3471 DEBUG_PEEP("merg",n,depth);
3474 NEXT_OFF(scan) += NEXT_OFF(n);
3475 STR_LEN(scan) += STR_LEN(n);
3476 next = n + NODE_SZ_STR(n);
3477 /* Now we can overwrite *n : */
3478 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3486 #ifdef EXPERIMENTAL_INPLACESCAN
3487 if (flags && !NEXT_OFF(n)) {
3488 DEBUG_PEEP("atch", val, depth);
3489 if (reg_off_by_arg[OP(n)]) {
3490 ARG_SET(n, val - n);
3493 NEXT_OFF(n) = val - n;
3501 *unfolded_multi_char = FALSE;
3503 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3504 * can now analyze for sequences of problematic code points. (Prior to
3505 * this final joining, sequences could have been split over boundaries, and
3506 * hence missed). The sequences only happen in folding, hence for any
3507 * non-EXACT EXACTish node */
3508 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3509 U8* s0 = (U8*) STRING(scan);
3511 U8* s_end = s0 + STR_LEN(scan);
3513 int total_count_delta = 0; /* Total delta number of characters that
3514 multi-char folds expand to */
3516 /* One pass is made over the node's string looking for all the
3517 * possibilities. To avoid some tests in the loop, there are two main
3518 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3523 if (OP(scan) == EXACTFL) {
3526 /* An EXACTFL node would already have been changed to another
3527 * node type unless there is at least one character in it that
3528 * is problematic; likely a character whose fold definition
3529 * won't be known until runtime, and so has yet to be folded.
3530 * For all but the UTF-8 locale, folds are 1-1 in length, but
3531 * to handle the UTF-8 case, we need to create a temporary
3532 * folded copy using UTF-8 locale rules in order to analyze it.
3533 * This is because our macros that look to see if a sequence is
3534 * a multi-char fold assume everything is folded (otherwise the
3535 * tests in those macros would be too complicated and slow).
3536 * Note that here, the non-problematic folds will have already
3537 * been done, so we can just copy such characters. We actually
3538 * don't completely fold the EXACTFL string. We skip the
3539 * unfolded multi-char folds, as that would just create work
3540 * below to figure out the size they already are */
3542 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3545 STRLEN s_len = UTF8SKIP(s);
3546 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3547 Copy(s, d, s_len, U8);
3550 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3551 *unfolded_multi_char = TRUE;
3552 Copy(s, d, s_len, U8);
3555 else if (isASCII(*s)) {
3556 *(d++) = toFOLD(*s);
3560 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3566 /* Point the remainder of the routine to look at our temporary
3570 } /* End of creating folded copy of EXACTFL string */
3572 /* Examine the string for a multi-character fold sequence. UTF-8
3573 * patterns have all characters pre-folded by the time this code is
3575 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3576 length sequence we are looking for is 2 */
3578 int count = 0; /* How many characters in a multi-char fold */
3579 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3580 if (! len) { /* Not a multi-char fold: get next char */
3585 /* Nodes with 'ss' require special handling, except for
3586 * EXACTFA-ish for which there is no multi-char fold to this */
3587 if (len == 2 && *s == 's' && *(s+1) == 's'
3588 && OP(scan) != EXACTFA
3589 && OP(scan) != EXACTFA_NO_TRIE)
3592 if (OP(scan) != EXACTFL) {
3593 OP(scan) = EXACTFU_SS;
3597 else { /* Here is a generic multi-char fold. */
3598 U8* multi_end = s + len;
3600 /* Count how many characters are in it. In the case of
3601 * /aa, no folds which contain ASCII code points are
3602 * allowed, so check for those, and skip if found. */
3603 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3604 count = utf8_length(s, multi_end);
3608 while (s < multi_end) {
3611 goto next_iteration;
3621 /* The delta is how long the sequence is minus 1 (1 is how long
3622 * the character that folds to the sequence is) */
3623 total_count_delta += count - 1;
3627 /* We created a temporary folded copy of the string in EXACTFL
3628 * nodes. Therefore we need to be sure it doesn't go below zero,
3629 * as the real string could be shorter */
3630 if (OP(scan) == EXACTFL) {
3631 int total_chars = utf8_length((U8*) STRING(scan),
3632 (U8*) STRING(scan) + STR_LEN(scan));
3633 if (total_count_delta > total_chars) {
3634 total_count_delta = total_chars;
3638 *min_subtract += total_count_delta;
3641 else if (OP(scan) == EXACTFA) {
3643 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3644 * fold to the ASCII range (and there are no existing ones in the
3645 * upper latin1 range). But, as outlined in the comments preceding
3646 * this function, we need to flag any occurrences of the sharp s.
3647 * This character forbids trie formation (because of added
3650 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3651 OP(scan) = EXACTFA_NO_TRIE;
3652 *unfolded_multi_char = TRUE;
3661 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3662 * folds that are all Latin1. As explained in the comments
3663 * preceding this function, we look also for the sharp s in EXACTF
3664 * and EXACTFL nodes; it can be in the final position. Otherwise
3665 * we can stop looking 1 byte earlier because have to find at least
3666 * two characters for a multi-fold */
3667 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3672 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3673 if (! len) { /* Not a multi-char fold. */
3674 if (*s == LATIN_SMALL_LETTER_SHARP_S
3675 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3677 *unfolded_multi_char = TRUE;
3684 && isALPHA_FOLD_EQ(*s, 's')
3685 && isALPHA_FOLD_EQ(*(s+1), 's'))
3688 /* EXACTF nodes need to know that the minimum length
3689 * changed so that a sharp s in the string can match this
3690 * ss in the pattern, but they remain EXACTF nodes, as they
3691 * won't match this unless the target string is is UTF-8,
3692 * which we don't know until runtime. EXACTFL nodes can't
3693 * transform into EXACTFU nodes */
3694 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3695 OP(scan) = EXACTFU_SS;
3699 *min_subtract += len - 1;
3706 /* Allow dumping but overwriting the collection of skipped
3707 * ops and/or strings with fake optimized ops */
3708 n = scan + NODE_SZ_STR(scan);
3716 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3720 /* REx optimizer. Converts nodes into quicker variants "in place".
3721 Finds fixed substrings. */
3723 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3724 to the position after last scanned or to NULL. */
3726 #define INIT_AND_WITHP \
3727 assert(!and_withp); \
3728 Newx(and_withp,1, regnode_ssc); \
3729 SAVEFREEPV(and_withp)
3733 S_unwind_scan_frames(pTHX_ const void *p)
3735 scan_frame *f= (scan_frame *)p;
3737 scan_frame *n= f->next_frame;
3745 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3746 SSize_t *minlenp, SSize_t *deltap,
3751 regnode_ssc *and_withp,
3752 U32 flags, U32 depth)
3753 /* scanp: Start here (read-write). */
3754 /* deltap: Write maxlen-minlen here. */
3755 /* last: Stop before this one. */
3756 /* data: string data about the pattern */
3757 /* stopparen: treat close N as END */
3758 /* recursed: which subroutines have we recursed into */
3759 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3761 /* There must be at least this number of characters to match */
3764 regnode *scan = *scanp, *next;
3766 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3767 int is_inf_internal = 0; /* The studied chunk is infinite */
3768 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3769 scan_data_t data_fake;
3770 SV *re_trie_maxbuff = NULL;
3771 regnode *first_non_open = scan;
3772 SSize_t stopmin = SSize_t_MAX;
3773 scan_frame *frame = NULL;
3774 GET_RE_DEBUG_FLAGS_DECL;
3776 PERL_ARGS_ASSERT_STUDY_CHUNK;
3780 while (first_non_open && OP(first_non_open) == OPEN)
3781 first_non_open=regnext(first_non_open);
3787 RExC_study_chunk_recursed_count++;
3789 DEBUG_OPTIMISE_MORE_r(
3791 PerlIO_printf(Perl_debug_log,
3792 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3793 (int)(depth*2), "", (long)stopparen,
3794 (unsigned long)RExC_study_chunk_recursed_count,
3795 (unsigned long)depth, (unsigned long)recursed_depth,
3798 if (recursed_depth) {
3801 for ( j = 0 ; j < recursed_depth ; j++ ) {
3802 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3804 PAREN_TEST(RExC_study_chunk_recursed +
3805 ( j * RExC_study_chunk_recursed_bytes), i )
3808 !PAREN_TEST(RExC_study_chunk_recursed +
3809 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3812 PerlIO_printf(Perl_debug_log," %d",(int)i);
3816 if ( j + 1 < recursed_depth ) {
3817 PerlIO_printf(Perl_debug_log, ",");
3821 PerlIO_printf(Perl_debug_log,"\n");
3824 while ( scan && OP(scan) != END && scan < last ){
3825 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3826 node length to get a real minimum (because
3827 the folded version may be shorter) */
3828 bool unfolded_multi_char = FALSE;
3829 /* Peephole optimizer: */
3830 DEBUG_STUDYDATA("Peep:", data, depth);
3831 DEBUG_PEEP("Peep", scan, depth);
3834 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3835 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3836 * by a different invocation of reg() -- Yves
3838 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3840 /* Follow the next-chain of the current node and optimize
3841 away all the NOTHINGs from it. */
3842 if (OP(scan) != CURLYX) {
3843 const int max = (reg_off_by_arg[OP(scan)]
3845 /* I32 may be smaller than U16 on CRAYs! */
3846 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3847 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3851 /* Skip NOTHING and LONGJMP. */
3852 while ((n = regnext(n))
3853 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3854 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3855 && off + noff < max)
3857 if (reg_off_by_arg[OP(scan)])
3860 NEXT_OFF(scan) = off;
3863 /* The principal pseudo-switch. Cannot be a switch, since we
3864 look into several different things. */
3865 if ( OP(scan) == DEFINEP ) {
3867 SSize_t deltanext = 0;
3868 SSize_t fake_last_close = 0;
3869 I32 f = SCF_IN_DEFINE;
3871 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3872 scan = regnext(scan);
3873 assert( OP(scan) == IFTHEN );
3874 DEBUG_PEEP("expect IFTHEN", scan, depth);
3876 data_fake.last_closep= &fake_last_close;
3878 next = regnext(scan);
3879 scan = NEXTOPER(NEXTOPER(scan));
3880 DEBUG_PEEP("scan", scan, depth);
3881 DEBUG_PEEP("next", next, depth);
3883 /* we suppose the run is continuous, last=next...
3884 * NOTE we dont use the return here! */
3885 (void)study_chunk(pRExC_state, &scan, &minlen,
3886 &deltanext, next, &data_fake, stopparen,
3887 recursed_depth, NULL, f, depth+1);
3892 OP(scan) == BRANCH ||
3893 OP(scan) == BRANCHJ ||
3896 next = regnext(scan);
3899 /* The op(next)==code check below is to see if we
3900 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3901 * IFTHEN is special as it might not appear in pairs.
3902 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3903 * we dont handle it cleanly. */
3904 if (OP(next) == code || code == IFTHEN) {
3905 /* NOTE - There is similar code to this block below for
3906 * handling TRIE nodes on a re-study. If you change stuff here
3907 * check there too. */
3908 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3910 regnode * const startbranch=scan;
3912 if (flags & SCF_DO_SUBSTR) {
3913 /* Cannot merge strings after this. */
3914 scan_commit(pRExC_state, data, minlenp, is_inf);
3917 if (flags & SCF_DO_STCLASS)
3918 ssc_init_zero(pRExC_state, &accum);
3920 while (OP(scan) == code) {
3921 SSize_t deltanext, minnext, fake;
3923 regnode_ssc this_class;
3925 DEBUG_PEEP("Branch", scan, depth);
3928 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3930 data_fake.whilem_c = data->whilem_c;
3931 data_fake.last_closep = data->last_closep;
3934 data_fake.last_closep = &fake;
3936 data_fake.pos_delta = delta;
3937 next = regnext(scan);
3939 scan = NEXTOPER(scan); /* everything */
3940 if (code != BRANCH) /* everything but BRANCH */
3941 scan = NEXTOPER(scan);
3943 if (flags & SCF_DO_STCLASS) {
3944 ssc_init(pRExC_state, &this_class);
3945 data_fake.start_class = &this_class;
3946 f = SCF_DO_STCLASS_AND;
3948 if (flags & SCF_WHILEM_VISITED_POS)
3949 f |= SCF_WHILEM_VISITED_POS;
3951 /* we suppose the run is continuous, last=next...*/
3952 minnext = study_chunk(pRExC_state, &scan, minlenp,
3953 &deltanext, next, &data_fake, stopparen,
3954 recursed_depth, NULL, f,depth+1);
3958 if (deltanext == SSize_t_MAX) {
3959 is_inf = is_inf_internal = 1;
3961 } else if (max1 < minnext + deltanext)
3962 max1 = minnext + deltanext;
3964 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3966 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3967 if ( stopmin > minnext)
3968 stopmin = min + min1;
3969 flags &= ~SCF_DO_SUBSTR;
3971 data->flags |= SCF_SEEN_ACCEPT;
3974 if (data_fake.flags & SF_HAS_EVAL)
3975 data->flags |= SF_HAS_EVAL;
3976 data->whilem_c = data_fake.whilem_c;
3978 if (flags & SCF_DO_STCLASS)
3979 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3981 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3983 if (flags & SCF_DO_SUBSTR) {
3984 data->pos_min += min1;
3985 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3986 data->pos_delta = SSize_t_MAX;
3988 data->pos_delta += max1 - min1;
3989 if (max1 != min1 || is_inf)
3990 data->longest = &(data->longest_float);
3993 if (delta == SSize_t_MAX
3994 || SSize_t_MAX - delta - (max1 - min1) < 0)
3995 delta = SSize_t_MAX;
3997 delta += max1 - min1;
3998 if (flags & SCF_DO_STCLASS_OR) {
3999 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4001 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4002 flags &= ~SCF_DO_STCLASS;
4005 else if (flags & SCF_DO_STCLASS_AND) {
4007 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4008 flags &= ~SCF_DO_STCLASS;
4011 /* Switch to OR mode: cache the old value of
4012 * data->start_class */
4014 StructCopy(data->start_class, and_withp, regnode_ssc);
4015 flags &= ~SCF_DO_STCLASS_AND;
4016 StructCopy(&accum, data->start_class, regnode_ssc);
4017 flags |= SCF_DO_STCLASS_OR;
4021 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4022 OP( startbranch ) == BRANCH )
4026 Assuming this was/is a branch we are dealing with: 'scan'
4027 now points at the item that follows the branch sequence,
4028 whatever it is. We now start at the beginning of the
4029 sequence and look for subsequences of
4035 which would be constructed from a pattern like
4038 If we can find such a subsequence we need to turn the first
4039 element into a trie and then add the subsequent branch exact
4040 strings to the trie.
4044 1. patterns where the whole set of branches can be
4047 2. patterns where only a subset can be converted.
4049 In case 1 we can replace the whole set with a single regop
4050 for the trie. In case 2 we need to keep the start and end
4053 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4054 becomes BRANCH TRIE; BRANCH X;
4056 There is an additional case, that being where there is a
4057 common prefix, which gets split out into an EXACT like node
4058 preceding the TRIE node.
4060 If x(1..n)==tail then we can do a simple trie, if not we make
4061 a "jump" trie, such that when we match the appropriate word
4062 we "jump" to the appropriate tail node. Essentially we turn
4063 a nested if into a case structure of sorts.
4068 if (!re_trie_maxbuff) {
4069 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4070 if (!SvIOK(re_trie_maxbuff))
4071 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4073 if ( SvIV(re_trie_maxbuff)>=0 ) {
4075 regnode *first = (regnode *)NULL;
4076 regnode *last = (regnode *)NULL;
4077 regnode *tail = scan;
4081 /* var tail is used because there may be a TAIL
4082 regop in the way. Ie, the exacts will point to the
4083 thing following the TAIL, but the last branch will
4084 point at the TAIL. So we advance tail. If we
4085 have nested (?:) we may have to move through several
4089 while ( OP( tail ) == TAIL ) {
4090 /* this is the TAIL generated by (?:) */
4091 tail = regnext( tail );
4095 DEBUG_TRIE_COMPILE_r({
4096 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4097 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4098 (int)depth * 2 + 2, "",
4099 "Looking for TRIE'able sequences. Tail node is: ",
4100 SvPV_nolen_const( RExC_mysv )
4106 Step through the branches
4107 cur represents each branch,
4108 noper is the first thing to be matched as part
4110 noper_next is the regnext() of that node.
4112 We normally handle a case like this
4113 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4114 support building with NOJUMPTRIE, which restricts
4115 the trie logic to structures like /FOO|BAR/.
4117 If noper is a trieable nodetype then the branch is
4118 a possible optimization target. If we are building
4119 under NOJUMPTRIE then we require that noper_next is
4120 the same as scan (our current position in the regex
4123 Once we have two or more consecutive such branches
4124 we can create a trie of the EXACT's contents and
4125 stitch it in place into the program.
4127 If the sequence represents all of the branches in
4128 the alternation we replace the entire thing with a
4131 Otherwise when it is a subsequence we need to
4132 stitch it in place and replace only the relevant
4133 branches. This means the first branch has to remain
4134 as it is used by the alternation logic, and its
4135 next pointer, and needs to be repointed at the item
4136 on the branch chain following the last branch we
4137 have optimized away.
4139 This could be either a BRANCH, in which case the
4140 subsequence is internal, or it could be the item
4141 following the branch sequence in which case the
4142 subsequence is at the end (which does not
4143 necessarily mean the first node is the start of the
4146 TRIE_TYPE(X) is a define which maps the optype to a
4150 ----------------+-----------
4154 EXACTFU_SS | EXACTFU
4157 EXACTFLU8 | EXACTFLU8
4161 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4163 : ( EXACT == (X) ) \
4165 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4167 : ( EXACTFA == (X) ) \
4169 : ( EXACTL == (X) ) \
4171 : ( EXACTFLU8 == (X) ) \
4175 /* dont use tail as the end marker for this traverse */
4176 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4177 regnode * const noper = NEXTOPER( cur );
4178 U8 noper_type = OP( noper );
4179 U8 noper_trietype = TRIE_TYPE( noper_type );
4180 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4181 regnode * const noper_next = regnext( noper );
4182 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4183 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4186 DEBUG_TRIE_COMPILE_r({
4187 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4188 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4189 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4191 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4192 PerlIO_printf( Perl_debug_log, " -> %s",
4193 SvPV_nolen_const(RExC_mysv));
4196 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4197 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4198 SvPV_nolen_const(RExC_mysv));
4200 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4201 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4202 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4206 /* Is noper a trieable nodetype that can be merged
4207 * with the current trie (if there is one)? */
4211 ( noper_trietype == NOTHING)
4212 || ( trietype == NOTHING )
4213 || ( trietype == noper_trietype )
4216 && noper_next == tail
4220 /* Handle mergable triable node Either we are
4221 * the first node in a new trieable sequence,
4222 * in which case we do some bookkeeping,
4223 * otherwise we update the end pointer. */
4226 if ( noper_trietype == NOTHING ) {
4227 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4228 regnode * const noper_next = regnext( noper );
4229 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4230 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4233 if ( noper_next_trietype ) {
4234 trietype = noper_next_trietype;
4235 } else if (noper_next_type) {
4236 /* a NOTHING regop is 1 regop wide.
4237 * We need at least two for a trie
4238 * so we can't merge this in */
4242 trietype = noper_trietype;
4245 if ( trietype == NOTHING )
4246 trietype = noper_trietype;
4251 } /* end handle mergable triable node */
4253 /* handle unmergable node -
4254 * noper may either be a triable node which can
4255 * not be tried together with the current trie,
4256 * or a non triable node */
4258 /* If last is set and trietype is not
4259 * NOTHING then we have found at least two
4260 * triable branch sequences in a row of a
4261 * similar trietype so we can turn them
4262 * into a trie. If/when we allow NOTHING to
4263 * start a trie sequence this condition
4264 * will be required, and it isn't expensive
4265 * so we leave it in for now. */
4266 if ( trietype && trietype != NOTHING )
4267 make_trie( pRExC_state,
4268 startbranch, first, cur, tail,
4269 count, trietype, depth+1 );
4270 last = NULL; /* note: we clear/update
4271 first, trietype etc below,
4272 so we dont do it here */
4276 && noper_next == tail
4279 /* noper is triable, so we can start a new
4283 trietype = noper_trietype;
4285 /* if we already saw a first but the
4286 * current node is not triable then we have
4287 * to reset the first information. */
4292 } /* end handle unmergable node */
4293 } /* loop over branches */
4294 DEBUG_TRIE_COMPILE_r({
4295 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4296 PerlIO_printf( Perl_debug_log,
4297 "%*s- %s (%d) <SCAN FINISHED>\n",
4299 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4302 if ( last && trietype ) {
4303 if ( trietype != NOTHING ) {
4304 /* the last branch of the sequence was part of
4305 * a trie, so we have to construct it here
4306 * outside of the loop */
4307 made= make_trie( pRExC_state, startbranch,
4308 first, scan, tail, count,
4309 trietype, depth+1 );
4310 #ifdef TRIE_STUDY_OPT
4311 if ( ((made == MADE_EXACT_TRIE &&
4312 startbranch == first)
4313 || ( first_non_open == first )) &&
4315 flags |= SCF_TRIE_RESTUDY;
4316 if ( startbranch == first
4319 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4324 /* at this point we know whatever we have is a
4325 * NOTHING sequence/branch AND if 'startbranch'
4326 * is 'first' then we can turn the whole thing
4329 if ( startbranch == first ) {
4331 /* the entire thing is a NOTHING sequence,
4332 * something like this: (?:|) So we can
4333 * turn it into a plain NOTHING op. */
4334 DEBUG_TRIE_COMPILE_r({
4335 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4336 PerlIO_printf( Perl_debug_log,
4337 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4338 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4341 OP(startbranch)= NOTHING;
4342 NEXT_OFF(startbranch)= tail - startbranch;
4343 for ( opt= startbranch + 1; opt < tail ; opt++ )
4347 } /* end if ( last) */
4348 } /* TRIE_MAXBUF is non zero */
4353 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4354 scan = NEXTOPER(NEXTOPER(scan));
4355 } else /* single branch is optimized. */
4356 scan = NEXTOPER(scan);
4358 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4360 regnode *start = NULL;
4361 regnode *end = NULL;
4362 U32 my_recursed_depth= recursed_depth;
4365 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4366 /* Do setup, note this code has side effects beyond
4367 * the rest of this block. Specifically setting
4368 * RExC_recurse[] must happen at least once during
4370 if (OP(scan) == GOSUB) {
4372 RExC_recurse[ARG2L(scan)] = scan;
4373 start = RExC_open_parens[paren-1];
4374 end = RExC_close_parens[paren-1];
4376 start = RExC_rxi->program + 1;
4379 /* NOTE we MUST always execute the above code, even
4380 * if we do nothing with a GOSUB/GOSTART */
4382 ( flags & SCF_IN_DEFINE )
4385 (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4387 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4390 /* no need to do anything here if we are in a define. */
4391 /* or we are after some kind of infinite construct
4392 * so we can skip recursing into this item.
4393 * Since it is infinite we will not change the maxlen
4394 * or delta, and if we miss something that might raise
4395 * the minlen it will merely pessimise a little.
4397 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4398 * might result in a minlen of 1 and not of 4,
4399 * but this doesn't make us mismatch, just try a bit
4400 * harder than we should.
4402 scan= regnext(scan);
4409 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4411 /* it is quite possible that there are more efficient ways
4412 * to do this. We maintain a bitmap per level of recursion
4413 * of which patterns we have entered so we can detect if a
4414 * pattern creates a possible infinite loop. When we
4415 * recurse down a level we copy the previous levels bitmap
4416 * down. When we are at recursion level 0 we zero the top
4417 * level bitmap. It would be nice to implement a different
4418 * more efficient way of doing this. In particular the top
4419 * level bitmap may be unnecessary.
4421 if (!recursed_depth) {
4422 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4424 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4425 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4426 RExC_study_chunk_recursed_bytes, U8);
4428 /* we havent recursed into this paren yet, so recurse into it */
4429 DEBUG_STUDYDATA("set:", data,depth);
4430 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4431 my_recursed_depth= recursed_depth + 1;
4433 DEBUG_STUDYDATA("inf:", data,depth);
4434 /* some form of infinite recursion, assume infinite length
4436 if (flags & SCF_DO_SUBSTR) {
4437 scan_commit(pRExC_state, data, minlenp, is_inf);
4438 data->longest = &(data->longest_float);
4440 is_inf = is_inf_internal = 1;
4441 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4442 ssc_anything(data->start_class);
4443 flags &= ~SCF_DO_STCLASS;
4445 start= NULL; /* reset start so we dont recurse later on. */
4450 end = regnext(scan);
4453 scan_frame *newframe;
4455 if (!RExC_frame_last) {
4456 Newxz(newframe, 1, scan_frame);
4457 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4458 RExC_frame_head= newframe;
4460 } else if (!RExC_frame_last->next_frame) {
4461 Newxz(newframe,1,scan_frame);
4462 RExC_frame_last->next_frame= newframe;
4463 newframe->prev_frame= RExC_frame_last;
4466 newframe= RExC_frame_last->next_frame;
4468 RExC_frame_last= newframe;
4470 newframe->next_regnode = regnext(scan);
4471 newframe->last_regnode = last;
4472 newframe->stopparen = stopparen;
4473 newframe->prev_recursed_depth = recursed_depth;
4474 newframe->this_prev_frame= frame;
4476 DEBUG_STUDYDATA("frame-new:",data,depth);
4477 DEBUG_PEEP("fnew", scan, depth);
4484 recursed_depth= my_recursed_depth;
4489 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4490 SSize_t l = STR_LEN(scan);
4493 const U8 * const s = (U8*)STRING(scan);
4494 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4495 l = utf8_length(s, s + l);
4497 uc = *((U8*)STRING(scan));
4500 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4501 /* The code below prefers earlier match for fixed
4502 offset, later match for variable offset. */
4503 if (data->last_end == -1) { /* Update the start info. */
4504 data->last_start_min = data->pos_min;
4505 data->last_start_max = is_inf
4506 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4508 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4510 SvUTF8_on(data->last_found);
4512 SV * const sv = data->last_found;
4513 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4514 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4515 if (mg && mg->mg_len >= 0)
4516 mg->mg_len += utf8_length((U8*)STRING(scan),
4517 (U8*)STRING(scan)+STR_LEN(scan));
4519 data->last_end = data->pos_min + l;
4520 data->pos_min += l; /* As in the first entry. */
4521 data->flags &= ~SF_BEFORE_EOL;
4524 /* ANDing the code point leaves at most it, and not in locale, and
4525 * can't match null string */
4526 if (flags & SCF_DO_STCLASS_AND) {
4527 ssc_cp_and(data->start_class, uc);
4528 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4529 ssc_clear_locale(data->start_class);
4531 else if (flags & SCF_DO_STCLASS_OR) {
4532 ssc_add_cp(data->start_class, uc);
4533 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4535 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4536 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4538 flags &= ~SCF_DO_STCLASS;
4540 else if (PL_regkind[OP(scan)] == EXACT) {
4541 /* But OP != EXACT!, so is EXACTFish */
4542 SSize_t l = STR_LEN(scan);
4543 const U8 * s = (U8*)STRING(scan);
4545 /* Search for fixed substrings supports EXACT only. */
4546 if (flags & SCF_DO_SUBSTR) {
4548 scan_commit(pRExC_state, data, minlenp, is_inf);
4551 l = utf8_length(s, s + l);
4553 if (unfolded_multi_char) {
4554 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4556 min += l - min_subtract;
4558 delta += min_subtract;
4559 if (flags & SCF_DO_SUBSTR) {
4560 data->pos_min += l - min_subtract;
4561 if (data->pos_min < 0) {
4564 data->pos_delta += min_subtract;
4566 data->longest = &(data->longest_float);
4570 if (flags & SCF_DO_STCLASS) {
4571 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4573 assert(EXACTF_invlist);
4574 if (flags & SCF_DO_STCLASS_AND) {
4575 if (OP(scan) != EXACTFL)
4576 ssc_clear_locale(data->start_class);
4577 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4578 ANYOF_POSIXL_ZERO(data->start_class);
4579 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4581 else { /* SCF_DO_STCLASS_OR */
4582 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4583 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4585 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4586 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4588 flags &= ~SCF_DO_STCLASS;
4589 SvREFCNT_dec(EXACTF_invlist);
4592 else if (REGNODE_VARIES(OP(scan))) {
4593 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4594 I32 fl = 0, f = flags;
4595 regnode * const oscan = scan;
4596 regnode_ssc this_class;
4597 regnode_ssc *oclass = NULL;
4598 I32 next_is_eval = 0;
4600 switch (PL_regkind[OP(scan)]) {
4601 case WHILEM: /* End of (?:...)* . */
4602 scan = NEXTOPER(scan);
4605 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4606 next = NEXTOPER(scan);
4607 if (OP(next) == EXACT
4608 || OP(next) == EXACTL
4609 || (flags & SCF_DO_STCLASS))
4612 maxcount = REG_INFTY;
4613 next = regnext(scan);
4614 scan = NEXTOPER(scan);
4618 if (flags & SCF_DO_SUBSTR)
4623 if (flags & SCF_DO_STCLASS) {
4625 maxcount = REG_INFTY;
4626 next = regnext(scan);
4627 scan = NEXTOPER(scan);
4630 if (flags & SCF_DO_SUBSTR) {
4631 scan_commit(pRExC_state, data, minlenp, is_inf);
4632 /* Cannot extend fixed substrings */
4633 data->longest = &(data->longest_float);
4635 is_inf = is_inf_internal = 1;
4636 scan = regnext(scan);
4637 goto optimize_curly_tail;
4639 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4640 && (scan->flags == stopparen))
4645 mincount = ARG1(scan);
4646 maxcount = ARG2(scan);
4648 next = regnext(scan);
4649 if (OP(scan) == CURLYX) {
4650 I32 lp = (data ? *(data->last_closep) : 0);
4651 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4653 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4654 next_is_eval = (OP(scan) == EVAL);
4656 if (flags & SCF_DO_SUBSTR) {
4658 scan_commit(pRExC_state, data, minlenp, is_inf);
4659 /* Cannot extend fixed substrings */
4660 pos_before = data->pos_min;
4664 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4666 data->flags |= SF_IS_INF;
4668 if (flags & SCF_DO_STCLASS) {
4669 ssc_init(pRExC_state, &this_class);
4670 oclass = data->start_class;
4671 data->start_class = &this_class;
4672 f |= SCF_DO_STCLASS_AND;
4673 f &= ~SCF_DO_STCLASS_OR;
4675 /* Exclude from super-linear cache processing any {n,m}
4676 regops for which the combination of input pos and regex
4677 pos is not enough information to determine if a match
4680 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4681 regex pos at the \s*, the prospects for a match depend not
4682 only on the input position but also on how many (bar\s*)
4683 repeats into the {4,8} we are. */
4684 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4685 f &= ~SCF_WHILEM_VISITED_POS;
4687 /* This will finish on WHILEM, setting scan, or on NULL: */
4688 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4689 last, data, stopparen, recursed_depth, NULL,
4691 ? (f & ~SCF_DO_SUBSTR)
4695 if (flags & SCF_DO_STCLASS)
4696 data->start_class = oclass;
4697 if (mincount == 0 || minnext == 0) {
4698 if (flags & SCF_DO_STCLASS_OR) {
4699 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4701 else if (flags & SCF_DO_STCLASS_AND) {
4702 /* Switch to OR mode: cache the old value of
4703 * data->start_class */
4705 StructCopy(data->start_class, and_withp, regnode_ssc);
4706 flags &= ~SCF_DO_STCLASS_AND;
4707 StructCopy(&this_class, data->start_class, regnode_ssc);
4708 flags |= SCF_DO_STCLASS_OR;
4709 ANYOF_FLAGS(data->start_class)
4710 |= SSC_MATCHES_EMPTY_STRING;
4712 } else { /* Non-zero len */
4713 if (flags & SCF_DO_STCLASS_OR) {
4714 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4715 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4717 else if (flags & SCF_DO_STCLASS_AND)
4718 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4719 flags &= ~SCF_DO_STCLASS;
4721 if (!scan) /* It was not CURLYX, but CURLY. */
4723 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4724 /* ? quantifier ok, except for (?{ ... }) */
4725 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4726 && (minnext == 0) && (deltanext == 0)
4727 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4728 && maxcount <= REG_INFTY/3) /* Complement check for big
4731 /* Fatal warnings may leak the regexp without this: */
4732 SAVEFREESV(RExC_rx_sv);
4733 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4734 "Quantifier unexpected on zero-length expression "
4735 "in regex m/%"UTF8f"/",
4736 UTF8fARG(UTF, RExC_end - RExC_precomp,
4738 (void)ReREFCNT_inc(RExC_rx_sv);
4741 min += minnext * mincount;
4742 is_inf_internal |= deltanext == SSize_t_MAX
4743 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4744 is_inf |= is_inf_internal;
4746 delta = SSize_t_MAX;
4748 delta += (minnext + deltanext) * maxcount
4749 - minnext * mincount;
4751 /* Try powerful optimization CURLYX => CURLYN. */
4752 if ( OP(oscan) == CURLYX && data
4753 && data->flags & SF_IN_PAR
4754 && !(data->flags & SF_HAS_EVAL)
4755 && !deltanext && minnext == 1 ) {
4756 /* Try to optimize to CURLYN. */
4757 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4758 regnode * const nxt1 = nxt;
4765 if (!REGNODE_SIMPLE(OP(nxt))
4766 && !(PL_regkind[OP(nxt)] == EXACT
4767 && STR_LEN(nxt) == 1))
4773 if (OP(nxt) != CLOSE)
4775 if (RExC_open_parens) {
4776 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4777 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4779 /* Now we know that nxt2 is the only contents: */
4780 oscan->flags = (U8)ARG(nxt);
4782 OP(nxt1) = NOTHING; /* was OPEN. */
4785 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4786 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4787 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4788 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4789 OP(nxt + 1) = OPTIMIZED; /* was count. */
4790 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4795 /* Try optimization CURLYX => CURLYM. */
4796 if ( OP(oscan) == CURLYX && data
4797 && !(data->flags & SF_HAS_PAR)
4798 && !(data->flags & SF_HAS_EVAL)
4799 && !deltanext /* atom is fixed width */
4800 && minnext != 0 /* CURLYM can't handle zero width */
4802 /* Nor characters whose fold at run-time may be
4803 * multi-character */
4804 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4806 /* XXXX How to optimize if data == 0? */
4807 /* Optimize to a simpler form. */
4808 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4812 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4813 && (OP(nxt2) != WHILEM))
4815 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4816 /* Need to optimize away parenths. */
4817 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4818 /* Set the parenth number. */
4819 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4821 oscan->flags = (U8)ARG(nxt);
4822 if (RExC_open_parens) {
4823 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4824 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4826 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4827 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4830 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4831 OP(nxt + 1) = OPTIMIZED; /* was count. */
4832 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4833 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4836 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4837 regnode *nnxt = regnext(nxt1);
4839 if (reg_off_by_arg[OP(nxt1)])
4840 ARG_SET(nxt1, nxt2 - nxt1);
4841 else if (nxt2 - nxt1 < U16_MAX)
4842 NEXT_OFF(nxt1) = nxt2 - nxt1;
4844 OP(nxt) = NOTHING; /* Cannot beautify */
4849 /* Optimize again: */
4850 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4851 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4856 else if ((OP(oscan) == CURLYX)
4857 && (flags & SCF_WHILEM_VISITED_POS)
4858 /* See the comment on a similar expression above.
4859 However, this time it's not a subexpression
4860 we care about, but the expression itself. */
4861 && (maxcount == REG_INFTY)
4862 && data && ++data->whilem_c < 16) {
4863 /* This stays as CURLYX, we can put the count/of pair. */
4864 /* Find WHILEM (as in regexec.c) */
4865 regnode *nxt = oscan + NEXT_OFF(oscan);
4867 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4869 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4870 | (RExC_whilem_seen << 4)); /* On WHILEM */
4872 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4874 if (flags & SCF_DO_SUBSTR) {
4875 SV *last_str = NULL;
4876 STRLEN last_chrs = 0;
4877 int counted = mincount != 0;
4879 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4881 SSize_t b = pos_before >= data->last_start_min
4882 ? pos_before : data->last_start_min;
4884 const char * const s = SvPV_const(data->last_found, l);
4885 SSize_t old = b - data->last_start_min;
4888 old = utf8_hop((U8*)s, old) - (U8*)s;
4890 /* Get the added string: */
4891 last_str = newSVpvn_utf8(s + old, l, UTF);
4892 last_chrs = UTF ? utf8_length((U8*)(s + old),
4893 (U8*)(s + old + l)) : l;
4894 if (deltanext == 0 && pos_before == b) {
4895 /* What was added is a constant string */
4898 SvGROW(last_str, (mincount * l) + 1);
4899 repeatcpy(SvPVX(last_str) + l,
4900 SvPVX_const(last_str), l,
4902 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4903 /* Add additional parts. */
4904 SvCUR_set(data->last_found,
4905 SvCUR(data->last_found) - l);
4906 sv_catsv(data->last_found, last_str);
4908 SV * sv = data->last_found;
4910 SvUTF8(sv) && SvMAGICAL(sv) ?
4911 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4912 if (mg && mg->mg_len >= 0)
4913 mg->mg_len += last_chrs * (mincount-1);
4915 last_chrs *= mincount;
4916 data->last_end += l * (mincount - 1);
4919 /* start offset must point into the last copy */
4920 data->last_start_min += minnext * (mincount - 1);
4921 data->last_start_max =
4924 : data->last_start_max +
4925 (maxcount - 1) * (minnext + data->pos_delta);
4928 /* It is counted once already... */
4929 data->pos_min += minnext * (mincount - counted);
4931 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4932 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4933 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4934 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4936 if (deltanext != SSize_t_MAX)
4937 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4938 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4939 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4941 if (deltanext == SSize_t_MAX
4942 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4943 data->pos_delta = SSize_t_MAX;
4945 data->pos_delta += - counted * deltanext +
4946 (minnext + deltanext) * maxcount - minnext * mincount;
4947 if (mincount != maxcount) {
4948 /* Cannot extend fixed substrings found inside
4950 scan_commit(pRExC_state, data, minlenp, is_inf);
4951 if (mincount && last_str) {
4952 SV * const sv = data->last_found;
4953 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4954 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4958 sv_setsv(sv, last_str);
4959 data->last_end = data->pos_min;
4960 data->last_start_min = data->pos_min - last_chrs;
4961 data->last_start_max = is_inf
4963 : data->pos_min + data->pos_delta - last_chrs;
4965 data->longest = &(data->longest_float);
4967 SvREFCNT_dec(last_str);
4969 if (data && (fl & SF_HAS_EVAL))
4970 data->flags |= SF_HAS_EVAL;
4971 optimize_curly_tail:
4972 if (OP(oscan) != CURLYX) {
4973 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4975 NEXT_OFF(oscan) += NEXT_OFF(next);
4981 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4986 if (flags & SCF_DO_SUBSTR) {
4987 /* Cannot expect anything... */
4988 scan_commit(pRExC_state, data, minlenp, is_inf);
4989 data->longest = &(data->longest_float);
4991 is_inf = is_inf_internal = 1;
4992 if (flags & SCF_DO_STCLASS_OR) {
4993 if (OP(scan) == CLUMP) {
4994 /* Actually is any start char, but very few code points
4995 * aren't start characters */
4996 ssc_match_all_cp(data->start_class);
4999 ssc_anything(data->start_class);
5002 flags &= ~SCF_DO_STCLASS;
5006 else if (OP(scan) == LNBREAK) {
5007 if (flags & SCF_DO_STCLASS) {
5008 if (flags & SCF_DO_STCLASS_AND) {
5009 ssc_intersection(data->start_class,
5010 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5011 ssc_clear_locale(data->start_class);
5012 ANYOF_FLAGS(data->start_class)
5013 &= ~SSC_MATCHES_EMPTY_STRING;
5015 else if (flags & SCF_DO_STCLASS_OR) {
5016 ssc_union(data->start_class,
5017 PL_XPosix_ptrs[_CC_VERTSPACE],
5019 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5021 /* See commit msg for
5022 * 749e076fceedeb708a624933726e7989f2302f6a */
5023 ANYOF_FLAGS(data->start_class)
5024 &= ~SSC_MATCHES_EMPTY_STRING;
5026 flags &= ~SCF_DO_STCLASS;
5029 if (delta != SSize_t_MAX)
5030 delta++; /* Because of the 2 char string cr-lf */
5031 if (flags & SCF_DO_SUBSTR) {
5032 /* Cannot expect anything... */
5033 scan_commit(pRExC_state, data, minlenp, is_inf);
5035 data->pos_delta += 1;
5036 data->longest = &(data->longest_float);
5039 else if (REGNODE_SIMPLE(OP(scan))) {
5041 if (flags & SCF_DO_SUBSTR) {
5042 scan_commit(pRExC_state, data, minlenp, is_inf);
5046 if (flags & SCF_DO_STCLASS) {
5048 SV* my_invlist = NULL;
5051 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5052 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5054 /* Some of the logic below assumes that switching
5055 locale on will only add false positives. */
5060 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5065 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5066 ssc_match_all_cp(data->start_class);
5071 SV* REG_ANY_invlist = _new_invlist(2);
5072 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5074 if (flags & SCF_DO_STCLASS_OR) {
5075 ssc_union(data->start_class,
5077 TRUE /* TRUE => invert, hence all but \n
5081 else if (flags & SCF_DO_STCLASS_AND) {
5082 ssc_intersection(data->start_class,
5084 TRUE /* TRUE => invert */
5086 ssc_clear_locale(data->start_class);
5088 SvREFCNT_dec_NN(REG_ANY_invlist);
5094 if (flags & SCF_DO_STCLASS_AND)
5095 ssc_and(pRExC_state, data->start_class,
5096 (regnode_charclass *) scan);
5098 ssc_or(pRExC_state, data->start_class,
5099 (regnode_charclass *) scan);
5107 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5108 if (flags & SCF_DO_STCLASS_AND) {
5109 bool was_there = cBOOL(
5110 ANYOF_POSIXL_TEST(data->start_class,
5112 ANYOF_POSIXL_ZERO(data->start_class);
5113 if (was_there) { /* Do an AND */
5114 ANYOF_POSIXL_SET(data->start_class, namedclass);
5116 /* No individual code points can now match */
5117 data->start_class->invlist
5118 = sv_2mortal(_new_invlist(0));
5121 int complement = namedclass + ((invert) ? -1 : 1);
5123 assert(flags & SCF_DO_STCLASS_OR);
5125 /* If the complement of this class was already there,
5126 * the result is that they match all code points,
5127 * (\d + \D == everything). Remove the classes from
5128 * future consideration. Locale is not relevant in
5130 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5131 ssc_match_all_cp(data->start_class);
5132 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5133 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5135 else { /* The usual case; just add this class to the
5137 ANYOF_POSIXL_SET(data->start_class, namedclass);
5142 case NPOSIXA: /* For these, we always know the exact set of
5147 if (FLAGS(scan) == _CC_ASCII) {
5148 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5151 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5152 PL_XPosix_ptrs[_CC_ASCII],
5163 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5165 /* NPOSIXD matches all upper Latin1 code points unless the
5166 * target string being matched is UTF-8, which is
5167 * unknowable until match time. Since we are going to
5168 * invert, we want to get rid of all of them so that the
5169 * inversion will match all */
5170 if (OP(scan) == NPOSIXD) {
5171 _invlist_subtract(my_invlist, PL_UpperLatin1,
5177 if (flags & SCF_DO_STCLASS_AND) {
5178 ssc_intersection(data->start_class, my_invlist, invert);
5179 ssc_clear_locale(data->start_class);
5182 assert(flags & SCF_DO_STCLASS_OR);
5183 ssc_union(data->start_class, my_invlist, invert);
5185 SvREFCNT_dec(my_invlist);
5187 if (flags & SCF_DO_STCLASS_OR)
5188 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5189 flags &= ~SCF_DO_STCLASS;
5192 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5193 data->flags |= (OP(scan) == MEOL
5196 scan_commit(pRExC_state, data, minlenp, is_inf);
5199 else if ( PL_regkind[OP(scan)] == BRANCHJ
5200 /* Lookbehind, or need to calculate parens/evals/stclass: */
5201 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5202 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5204 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5205 || OP(scan) == UNLESSM )
5207 /* Negative Lookahead/lookbehind
5208 In this case we can't do fixed string optimisation.
5211 SSize_t deltanext, minnext, fake = 0;
5216 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5218 data_fake.whilem_c = data->whilem_c;
5219 data_fake.last_closep = data->last_closep;
5222 data_fake.last_closep = &fake;
5223 data_fake.pos_delta = delta;
5224 if ( flags & SCF_DO_STCLASS && !scan->flags
5225 && OP(scan) == IFMATCH ) { /* Lookahead */
5226 ssc_init(pRExC_state, &intrnl);
5227 data_fake.start_class = &intrnl;
5228 f |= SCF_DO_STCLASS_AND;
5230 if (flags & SCF_WHILEM_VISITED_POS)
5231 f |= SCF_WHILEM_VISITED_POS;
5232 next = regnext(scan);
5233 nscan = NEXTOPER(NEXTOPER(scan));
5234 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5235 last, &data_fake, stopparen,
5236 recursed_depth, NULL, f, depth+1);
5239 FAIL("Variable length lookbehind not implemented");
5241 else if (minnext > (I32)U8_MAX) {
5242 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5245 scan->flags = (U8)minnext;
5248 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5250 if (data_fake.flags & SF_HAS_EVAL)
5251 data->flags |= SF_HAS_EVAL;
5252 data->whilem_c = data_fake.whilem_c;
5254 if (f & SCF_DO_STCLASS_AND) {
5255 if (flags & SCF_DO_STCLASS_OR) {
5256 /* OR before, AND after: ideally we would recurse with
5257 * data_fake to get the AND applied by study of the
5258 * remainder of the pattern, and then derecurse;
5259 * *** HACK *** for now just treat as "no information".
5260 * See [perl #56690].
5262 ssc_init(pRExC_state, data->start_class);
5264 /* AND before and after: combine and continue. These
5265 * assertions are zero-length, so can match an EMPTY
5267 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5268 ANYOF_FLAGS(data->start_class)
5269 |= SSC_MATCHES_EMPTY_STRING;
5273 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5275 /* Positive Lookahead/lookbehind
5276 In this case we can do fixed string optimisation,
5277 but we must be careful about it. Note in the case of
5278 lookbehind the positions will be offset by the minimum
5279 length of the pattern, something we won't know about
5280 until after the recurse.
5282 SSize_t deltanext, fake = 0;
5286 /* We use SAVEFREEPV so that when the full compile
5287 is finished perl will clean up the allocated
5288 minlens when it's all done. This way we don't
5289 have to worry about freeing them when we know
5290 they wont be used, which would be a pain.
5293 Newx( minnextp, 1, SSize_t );
5294 SAVEFREEPV(minnextp);
5297 StructCopy(data, &data_fake, scan_data_t);
5298 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5301 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5302 data_fake.last_found=newSVsv(data->last_found);
5306 data_fake.last_closep = &fake;
5307 data_fake.flags = 0;
5308 data_fake.pos_delta = delta;
5310 data_fake.flags |= SF_IS_INF;
5311 if ( flags & SCF_DO_STCLASS && !scan->flags
5312 && OP(scan) == IFMATCH ) { /* Lookahead */
5313 ssc_init(pRExC_state, &intrnl);
5314 data_fake.start_class = &intrnl;
5315 f |= SCF_DO_STCLASS_AND;
5317 if (flags & SCF_WHILEM_VISITED_POS)
5318 f |= SCF_WHILEM_VISITED_POS;
5319 next = regnext(scan);
5320 nscan = NEXTOPER(NEXTOPER(scan));
5322 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5323 &deltanext, last, &data_fake,
5324 stopparen, recursed_depth, NULL,
5328 FAIL("Variable length lookbehind not implemented");
5330 else if (*minnextp > (I32)U8_MAX) {
5331 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5334 scan->flags = (U8)*minnextp;
5339 if (f & SCF_DO_STCLASS_AND) {
5340 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5341 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5344 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5346 if (data_fake.flags & SF_HAS_EVAL)
5347 data->flags |= SF_HAS_EVAL;
5348 data->whilem_c = data_fake.whilem_c;
5349 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5350 if (RExC_rx->minlen<*minnextp)
5351 RExC_rx->minlen=*minnextp;
5352 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5353 SvREFCNT_dec_NN(data_fake.last_found);
5355 if ( data_fake.minlen_fixed != minlenp )
5357 data->offset_fixed= data_fake.offset_fixed;
5358 data->minlen_fixed= data_fake.minlen_fixed;
5359 data->lookbehind_fixed+= scan->flags;
5361 if ( data_fake.minlen_float != minlenp )
5363 data->minlen_float= data_fake.minlen_float;
5364 data->offset_float_min=data_fake.offset_float_min;
5365 data->offset_float_max=data_fake.offset_float_max;
5366 data->lookbehind_float+= scan->flags;
5373 else if (OP(scan) == OPEN) {
5374 if (stopparen != (I32)ARG(scan))
5377 else if (OP(scan) == CLOSE) {
5378 if (stopparen == (I32)ARG(scan)) {
5381 if ((I32)ARG(scan) == is_par) {
5382 next = regnext(scan);
5384 if ( next && (OP(next) != WHILEM) && next < last)
5385 is_par = 0; /* Disable optimization */
5388 *(data->last_closep) = ARG(scan);
5390 else if (OP(scan) == EVAL) {
5392 data->flags |= SF_HAS_EVAL;
5394 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5395 if (flags & SCF_DO_SUBSTR) {
5396 scan_commit(pRExC_state, data, minlenp, is_inf);
5397 flags &= ~SCF_DO_SUBSTR;
5399 if (data && OP(scan)==ACCEPT) {
5400 data->flags |= SCF_SEEN_ACCEPT;
5405 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5407 if (flags & SCF_DO_SUBSTR) {
5408 scan_commit(pRExC_state, data, minlenp, is_inf);
5409 data->longest = &(data->longest_float);
5411 is_inf = is_inf_internal = 1;
5412 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5413 ssc_anything(data->start_class);
5414 flags &= ~SCF_DO_STCLASS;
5416 else if (OP(scan) == GPOS) {
5417 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5418 !(delta || is_inf || (data && data->pos_delta)))
5420 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5421 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5422 if (RExC_rx->gofs < (STRLEN)min)
5423 RExC_rx->gofs = min;
5425 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5429 #ifdef TRIE_STUDY_OPT
5430 #ifdef FULL_TRIE_STUDY
5431 else if (PL_regkind[OP(scan)] == TRIE) {
5432 /* NOTE - There is similar code to this block above for handling
5433 BRANCH nodes on the initial study. If you change stuff here
5435 regnode *trie_node= scan;
5436 regnode *tail= regnext(scan);
5437 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5438 SSize_t max1 = 0, min1 = SSize_t_MAX;
5441 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5442 /* Cannot merge strings after this. */
5443 scan_commit(pRExC_state, data, minlenp, is_inf);
5445 if (flags & SCF_DO_STCLASS)
5446 ssc_init_zero(pRExC_state, &accum);
5452 const regnode *nextbranch= NULL;
5455 for ( word=1 ; word <= trie->wordcount ; word++)
5457 SSize_t deltanext=0, minnext=0, f = 0, fake;
5458 regnode_ssc this_class;
5460 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5462 data_fake.whilem_c = data->whilem_c;
5463 data_fake.last_closep = data->last_closep;
5466 data_fake.last_closep = &fake;
5467 data_fake.pos_delta = delta;
5468 if (flags & SCF_DO_STCLASS) {
5469 ssc_init(pRExC_state, &this_class);
5470 data_fake.start_class = &this_class;
5471 f = SCF_DO_STCLASS_AND;
5473 if (flags & SCF_WHILEM_VISITED_POS)
5474 f |= SCF_WHILEM_VISITED_POS;
5476 if (trie->jump[word]) {
5478 nextbranch = trie_node + trie->jump[0];
5479 scan= trie_node + trie->jump[word];
5480 /* We go from the jump point to the branch that follows
5481 it. Note this means we need the vestigal unused
5482 branches even though they arent otherwise used. */
5483 minnext = study_chunk(pRExC_state, &scan, minlenp,
5484 &deltanext, (regnode *)nextbranch, &data_fake,
5485 stopparen, recursed_depth, NULL, f,depth+1);
5487 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5488 nextbranch= regnext((regnode*)nextbranch);
5490 if (min1 > (SSize_t)(minnext + trie->minlen))
5491 min1 = minnext + trie->minlen;
5492 if (deltanext == SSize_t_MAX) {
5493 is_inf = is_inf_internal = 1;
5495 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5496 max1 = minnext + deltanext + trie->maxlen;
5498 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5500 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5501 if ( stopmin > min + min1)
5502 stopmin = min + min1;
5503 flags &= ~SCF_DO_SUBSTR;
5505 data->flags |= SCF_SEEN_ACCEPT;
5508 if (data_fake.flags & SF_HAS_EVAL)
5509 data->flags |= SF_HAS_EVAL;
5510 data->whilem_c = data_fake.whilem_c;
5512 if (flags & SCF_DO_STCLASS)
5513 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5516 if (flags & SCF_DO_SUBSTR) {
5517 data->pos_min += min1;
5518 data->pos_delta += max1 - min1;
5519 if (max1 != min1 || is_inf)
5520 data->longest = &(data->longest_float);
5523 if (delta != SSize_t_MAX)
5524 delta += max1 - min1;
5525 if (flags & SCF_DO_STCLASS_OR) {
5526 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5528 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5529 flags &= ~SCF_DO_STCLASS;
5532 else if (flags & SCF_DO_STCLASS_AND) {
5534 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5535 flags &= ~SCF_DO_STCLASS;
5538 /* Switch to OR mode: cache the old value of
5539 * data->start_class */
5541 StructCopy(data->start_class, and_withp, regnode_ssc);
5542 flags &= ~SCF_DO_STCLASS_AND;
5543 StructCopy(&accum, data->start_class, regnode_ssc);
5544 flags |= SCF_DO_STCLASS_OR;
5551 else if (PL_regkind[OP(scan)] == TRIE) {
5552 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5555 min += trie->minlen;
5556 delta += (trie->maxlen - trie->minlen);
5557 flags &= ~SCF_DO_STCLASS; /* xxx */
5558 if (flags & SCF_DO_SUBSTR) {
5559 /* Cannot expect anything... */
5560 scan_commit(pRExC_state, data, minlenp, is_inf);
5561 data->pos_min += trie->minlen;
5562 data->pos_delta += (trie->maxlen - trie->minlen);
5563 if (trie->maxlen != trie->minlen)
5564 data->longest = &(data->longest_float);
5566 if (trie->jump) /* no more substrings -- for now /grr*/
5567 flags &= ~SCF_DO_SUBSTR;
5569 #endif /* old or new */
5570 #endif /* TRIE_STUDY_OPT */
5572 /* Else: zero-length, ignore. */
5573 scan = regnext(scan);
5575 /* If we are exiting a recursion we can unset its recursed bit
5576 * and allow ourselves to enter it again - no danger of an
5577 * infinite loop there.
5578 if (stopparen > -1 && recursed) {
5579 DEBUG_STUDYDATA("unset:", data,depth);
5580 PAREN_UNSET( recursed, stopparen);
5586 DEBUG_STUDYDATA("frame-end:",data,depth);
5587 DEBUG_PEEP("fend", scan, depth);
5589 /* restore previous context */
5590 last = frame->last_regnode;
5591 scan = frame->next_regnode;
5592 stopparen = frame->stopparen;
5593 recursed_depth = frame->prev_recursed_depth;
5595 RExC_frame_last = frame->prev_frame;
5596 frame = frame->this_prev_frame;
5597 goto fake_study_recurse;
5602 DEBUG_STUDYDATA("pre-fin:",data,depth);
5605 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5607 if (flags & SCF_DO_SUBSTR && is_inf)
5608 data->pos_delta = SSize_t_MAX - data->pos_min;
5609 if (is_par > (I32)U8_MAX)
5611 if (is_par && pars==1 && data) {
5612 data->flags |= SF_IN_PAR;
5613 data->flags &= ~SF_HAS_PAR;
5615 else if (pars && data) {
5616 data->flags |= SF_HAS_PAR;
5617 data->flags &= ~SF_IN_PAR;
5619 if (flags & SCF_DO_STCLASS_OR)
5620 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5621 if (flags & SCF_TRIE_RESTUDY)
5622 data->flags |= SCF_TRIE_RESTUDY;
5624 DEBUG_STUDYDATA("post-fin:",data,depth);
5627 SSize_t final_minlen= min < stopmin ? min : stopmin;
5629 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5630 if (final_minlen > SSize_t_MAX - delta)
5631 RExC_maxlen = SSize_t_MAX;
5632 else if (RExC_maxlen < final_minlen + delta)
5633 RExC_maxlen = final_minlen + delta;
5635 return final_minlen;
5641 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5643 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5645 PERL_ARGS_ASSERT_ADD_DATA;
5647 Renewc(RExC_rxi->data,
5648 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5649 char, struct reg_data);
5651 Renew(RExC_rxi->data->what, count + n, U8);
5653 Newx(RExC_rxi->data->what, n, U8);
5654 RExC_rxi->data->count = count + n;
5655 Copy(s, RExC_rxi->data->what + count, n, U8);
5659 /*XXX: todo make this not included in a non debugging perl, but appears to be
5660 * used anyway there, in 'use re' */
5661 #ifndef PERL_IN_XSUB_RE
5663 Perl_reginitcolors(pTHX)
5665 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5667 char *t = savepv(s);
5671 t = strchr(t, '\t');
5677 PL_colors[i] = t = (char *)"";
5682 PL_colors[i++] = (char *)"";
5689 #ifdef TRIE_STUDY_OPT
5690 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5693 (data.flags & SCF_TRIE_RESTUDY) \
5701 #define CHECK_RESTUDY_GOTO_butfirst
5705 * pregcomp - compile a regular expression into internal code
5707 * Decides which engine's compiler to call based on the hint currently in
5711 #ifndef PERL_IN_XSUB_RE
5713 /* return the currently in-scope regex engine (or the default if none) */
5715 regexp_engine const *
5716 Perl_current_re_engine(pTHX)
5718 if (IN_PERL_COMPILETIME) {
5719 HV * const table = GvHV(PL_hintgv);
5722 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5723 return &PL_core_reg_engine;
5724 ptr = hv_fetchs(table, "regcomp", FALSE);
5725 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5726 return &PL_core_reg_engine;
5727 return INT2PTR(regexp_engine*,SvIV(*ptr));
5731 if (!PL_curcop->cop_hints_hash)
5732 return &PL_core_reg_engine;
5733 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5734 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5735 return &PL_core_reg_engine;
5736 return INT2PTR(regexp_engine*,SvIV(ptr));
5742 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5744 regexp_engine const *eng = current_re_engine();
5745 GET_RE_DEBUG_FLAGS_DECL;
5747 PERL_ARGS_ASSERT_PREGCOMP;
5749 /* Dispatch a request to compile a regexp to correct regexp engine. */
5751 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5754 return CALLREGCOMP_ENG(eng, pattern, flags);
5758 /* public(ish) entry point for the perl core's own regex compiling code.
5759 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5760 * pattern rather than a list of OPs, and uses the internal engine rather
5761 * than the current one */
5764 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5766 SV *pat = pattern; /* defeat constness! */
5767 PERL_ARGS_ASSERT_RE_COMPILE;
5768 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5769 #ifdef PERL_IN_XSUB_RE
5772 &PL_core_reg_engine,
5774 NULL, NULL, rx_flags, 0);
5778 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5779 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5780 * point to the realloced string and length.
5782 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5786 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5787 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5789 U8 *const src = (U8*)*pat_p;
5794 GET_RE_DEBUG_FLAGS_DECL;
5796 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5797 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5799 Newx(dst, *plen_p * 2 + 1, U8);
5802 while (s < *plen_p) {
5803 append_utf8_from_native_byte(src[s], &d);
5804 if (n < num_code_blocks) {
5805 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5806 pRExC_state->code_blocks[n].start = d - dst - 1;
5807 assert(*(d - 1) == '(');
5810 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5811 pRExC_state->code_blocks[n].end = d - dst - 1;
5812 assert(*(d - 1) == ')');
5821 *pat_p = (char*) dst;
5823 RExC_orig_utf8 = RExC_utf8 = 1;
5828 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5829 * while recording any code block indices, and handling overloading,
5830 * nested qr// objects etc. If pat is null, it will allocate a new
5831 * string, or just return the first arg, if there's only one.
5833 * Returns the malloced/updated pat.
5834 * patternp and pat_count is the array of SVs to be concatted;
5835 * oplist is the optional list of ops that generated the SVs;
5836 * recompile_p is a pointer to a boolean that will be set if
5837 * the regex will need to be recompiled.
5838 * delim, if non-null is an SV that will be inserted between each element
5842 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5843 SV *pat, SV ** const patternp, int pat_count,
5844 OP *oplist, bool *recompile_p, SV *delim)
5848 bool use_delim = FALSE;
5849 bool alloced = FALSE;
5851 /* if we know we have at least two args, create an empty string,
5852 * then concatenate args to that. For no args, return an empty string */
5853 if (!pat && pat_count != 1) {
5859 for (svp = patternp; svp < patternp + pat_count; svp++) {
5862 STRLEN orig_patlen = 0;
5864 SV *msv = use_delim ? delim : *svp;
5865 if (!msv) msv = &PL_sv_undef;
5867 /* if we've got a delimiter, we go round the loop twice for each
5868 * svp slot (except the last), using the delimiter the second
5877 if (SvTYPE(msv) == SVt_PVAV) {
5878 /* we've encountered an interpolated array within
5879 * the pattern, e.g. /...@a..../. Expand the list of elements,
5880 * then recursively append elements.
5881 * The code in this block is based on S_pushav() */
5883 AV *const av = (AV*)msv;
5884 const SSize_t maxarg = AvFILL(av) + 1;
5888 assert(oplist->op_type == OP_PADAV
5889 || oplist->op_type == OP_RV2AV);
5890 oplist = OpSIBLING(oplist);
5893 if (SvRMAGICAL(av)) {
5896 Newx(array, maxarg, SV*);
5898 for (i=0; i < maxarg; i++) {
5899 SV ** const svp = av_fetch(av, i, FALSE);
5900 array[i] = svp ? *svp : &PL_sv_undef;
5904 array = AvARRAY(av);
5906 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5907 array, maxarg, NULL, recompile_p,
5909 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5915 /* we make the assumption here that each op in the list of
5916 * op_siblings maps to one SV pushed onto the stack,
5917 * except for code blocks, with have both an OP_NULL and
5919 * This allows us to match up the list of SVs against the
5920 * list of OPs to find the next code block.
5922 * Note that PUSHMARK PADSV PADSV ..
5924 * PADRANGE PADSV PADSV ..
5925 * so the alignment still works. */
5928 if (oplist->op_type == OP_NULL
5929 && (oplist->op_flags & OPf_SPECIAL))
5931 assert(n < pRExC_state->num_code_blocks);
5932 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5933 pRExC_state->code_blocks[n].block = oplist;
5934 pRExC_state->code_blocks[n].src_regex = NULL;
5937 oplist = OpSIBLING(oplist); /* skip CONST */
5940 oplist = OpSIBLING(oplist);;
5943 /* apply magic and QR overloading to arg */
5946 if (SvROK(msv) && SvAMAGIC(msv)) {
5947 SV *sv = AMG_CALLunary(msv, regexp_amg);
5951 if (SvTYPE(sv) != SVt_REGEXP)
5952 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5957 /* try concatenation overload ... */
5958 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5959 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5962 /* overloading involved: all bets are off over literal
5963 * code. Pretend we haven't seen it */
5964 pRExC_state->num_code_blocks -= n;
5968 /* ... or failing that, try "" overload */
5969 while (SvAMAGIC(msv)
5970 && (sv = AMG_CALLunary(msv, string_amg))
5974 && SvRV(msv) == SvRV(sv))
5979 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5983 /* this is a partially unrolled
5984 * sv_catsv_nomg(pat, msv);
5985 * that allows us to adjust code block indices if
5988 char *dst = SvPV_force_nomg(pat, dlen);
5990 if (SvUTF8(msv) && !SvUTF8(pat)) {
5991 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5992 sv_setpvn(pat, dst, dlen);
5995 sv_catsv_nomg(pat, msv);
6002 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6005 /* extract any code blocks within any embedded qr//'s */
6006 if (rx && SvTYPE(rx) == SVt_REGEXP
6007 && RX_ENGINE((REGEXP*)rx)->op_comp)
6010 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6011 if (ri->num_code_blocks) {
6013 /* the presence of an embedded qr// with code means
6014 * we should always recompile: the text of the
6015 * qr// may not have changed, but it may be a
6016 * different closure than last time */
6018 Renew(pRExC_state->code_blocks,
6019 pRExC_state->num_code_blocks + ri->num_code_blocks,
6020 struct reg_code_block);
6021 pRExC_state->num_code_blocks += ri->num_code_blocks;
6023 for (i=0; i < ri->num_code_blocks; i++) {
6024 struct reg_code_block *src, *dst;
6025 STRLEN offset = orig_patlen
6026 + ReANY((REGEXP *)rx)->pre_prefix;
6027 assert(n < pRExC_state->num_code_blocks);
6028 src = &ri->code_blocks[i];
6029 dst = &pRExC_state->code_blocks[n];
6030 dst->start = src->start + offset;
6031 dst->end = src->end + offset;
6032 dst->block = src->block;
6033 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6042 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6051 /* see if there are any run-time code blocks in the pattern.
6052 * False positives are allowed */
6055 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6056 char *pat, STRLEN plen)
6061 PERL_UNUSED_CONTEXT;
6063 for (s = 0; s < plen; s++) {
6064 if (n < pRExC_state->num_code_blocks
6065 && s == pRExC_state->code_blocks[n].start)
6067 s = pRExC_state->code_blocks[n].end;
6071 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6073 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6075 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6082 /* Handle run-time code blocks. We will already have compiled any direct
6083 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6084 * copy of it, but with any literal code blocks blanked out and
6085 * appropriate chars escaped; then feed it into
6087 * eval "qr'modified_pattern'"
6091 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6095 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6097 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6098 * and merge them with any code blocks of the original regexp.
6100 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6101 * instead, just save the qr and return FALSE; this tells our caller that
6102 * the original pattern needs upgrading to utf8.
6106 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6107 char *pat, STRLEN plen)
6111 GET_RE_DEBUG_FLAGS_DECL;
6113 if (pRExC_state->runtime_code_qr) {
6114 /* this is the second time we've been called; this should
6115 * only happen if the main pattern got upgraded to utf8
6116 * during compilation; re-use the qr we compiled first time
6117 * round (which should be utf8 too)
6119 qr = pRExC_state->runtime_code_qr;
6120 pRExC_state->runtime_code_qr = NULL;
6121 assert(RExC_utf8 && SvUTF8(qr));
6127 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6131 /* determine how many extra chars we need for ' and \ escaping */
6132 for (s = 0; s < plen; s++) {
6133 if (pat[s] == '\'' || pat[s] == '\\')
6137 Newx(newpat, newlen, char);
6139 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6141 for (s = 0; s < plen; s++) {
6142 if (n < pRExC_state->num_code_blocks
6143 && s == pRExC_state->code_blocks[n].start)
6145 /* blank out literal code block */
6146 assert(pat[s] == '(');
6147 while (s <= pRExC_state->code_blocks[n].end) {
6155 if (pat[s] == '\'' || pat[s] == '\\')
6160 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6164 PerlIO_printf(Perl_debug_log,
6165 "%sre-parsing pattern for runtime code:%s %s\n",
6166 PL_colors[4],PL_colors[5],newpat);
6169 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6174 PUSHSTACKi(PERLSI_REQUIRE);
6175 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6176 * parsing qr''; normally only q'' does this. It also alters
6178 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6179 SvREFCNT_dec_NN(sv);
6184 SV * const errsv = ERRSV;
6185 if (SvTRUE_NN(errsv))
6187 Safefree(pRExC_state->code_blocks);
6188 /* use croak_sv ? */
6189 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6192 assert(SvROK(qr_ref));
6194 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6195 /* the leaving below frees the tmp qr_ref.
6196 * Give qr a life of its own */
6204 if (!RExC_utf8 && SvUTF8(qr)) {
6205 /* first time through; the pattern got upgraded; save the
6206 * qr for the next time through */
6207 assert(!pRExC_state->runtime_code_qr);
6208 pRExC_state->runtime_code_qr = qr;
6213 /* extract any code blocks within the returned qr// */
6216 /* merge the main (r1) and run-time (r2) code blocks into one */
6218 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6219 struct reg_code_block *new_block, *dst;
6220 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6223 if (!r2->num_code_blocks) /* we guessed wrong */
6225 SvREFCNT_dec_NN(qr);
6230 r1->num_code_blocks + r2->num_code_blocks,
6231 struct reg_code_block);
6234 while ( i1 < r1->num_code_blocks
6235 || i2 < r2->num_code_blocks)
6237 struct reg_code_block *src;
6240 if (i1 == r1->num_code_blocks) {
6241 src = &r2->code_blocks[i2++];
6244 else if (i2 == r2->num_code_blocks)
6245 src = &r1->code_blocks[i1++];
6246 else if ( r1->code_blocks[i1].start
6247 < r2->code_blocks[i2].start)
6249 src = &r1->code_blocks[i1++];
6250 assert(src->end < r2->code_blocks[i2].start);
6253 assert( r1->code_blocks[i1].start
6254 > r2->code_blocks[i2].start);
6255 src = &r2->code_blocks[i2++];
6257 assert(src->end < r1->code_blocks[i1].start);
6260 assert(pat[src->start] == '(');
6261 assert(pat[src->end] == ')');
6262 dst->start = src->start;
6263 dst->end = src->end;
6264 dst->block = src->block;
6265 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6269 r1->num_code_blocks += r2->num_code_blocks;
6270 Safefree(r1->code_blocks);
6271 r1->code_blocks = new_block;
6274 SvREFCNT_dec_NN(qr);
6280 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6281 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6282 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6283 STRLEN longest_length, bool eol, bool meol)
6285 /* This is the common code for setting up the floating and fixed length
6286 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6287 * as to whether succeeded or not */
6292 if (! (longest_length
6293 || (eol /* Can't have SEOL and MULTI */
6294 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6296 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6297 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6302 /* copy the information about the longest from the reg_scan_data
6303 over to the program. */
6304 if (SvUTF8(sv_longest)) {
6305 *rx_utf8 = sv_longest;
6308 *rx_substr = sv_longest;
6311 /* end_shift is how many chars that must be matched that
6312 follow this item. We calculate it ahead of time as once the
6313 lookbehind offset is added in we lose the ability to correctly
6315 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6316 *rx_end_shift = ml - offset
6317 - longest_length + (SvTAIL(sv_longest) != 0)
6320 t = (eol/* Can't have SEOL and MULTI */
6321 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6322 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6328 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6329 * regular expression into internal code.
6330 * The pattern may be passed either as:
6331 * a list of SVs (patternp plus pat_count)
6332 * a list of OPs (expr)
6333 * If both are passed, the SV list is used, but the OP list indicates
6334 * which SVs are actually pre-compiled code blocks
6336 * The SVs in the list have magic and qr overloading applied to them (and
6337 * the list may be modified in-place with replacement SVs in the latter
6340 * If the pattern hasn't changed from old_re, then old_re will be
6343 * eng is the current engine. If that engine has an op_comp method, then
6344 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6345 * do the initial concatenation of arguments and pass on to the external
6348 * If is_bare_re is not null, set it to a boolean indicating whether the
6349 * arg list reduced (after overloading) to a single bare regex which has
6350 * been returned (i.e. /$qr/).
6352 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6354 * pm_flags contains the PMf_* flags, typically based on those from the
6355 * pm_flags field of the related PMOP. Currently we're only interested in
6356 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6358 * We can't allocate space until we know how big the compiled form will be,
6359 * but we can't compile it (and thus know how big it is) until we've got a
6360 * place to put the code. So we cheat: we compile it twice, once with code
6361 * generation turned off and size counting turned on, and once "for real".
6362 * This also means that we don't allocate space until we are sure that the
6363 * thing really will compile successfully, and we never have to move the
6364 * code and thus invalidate pointers into it. (Note that it has to be in
6365 * one piece because free() must be able to free it all.) [NB: not true in perl]
6367 * Beware that the optimization-preparation code in here knows about some
6368 * of the structure of the compiled regexp. [I'll say.]
6372 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6373 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6374 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6378 regexp_internal *ri;
6386 SV *code_blocksv = NULL;
6387 SV** new_patternp = patternp;
6389 /* these are all flags - maybe they should be turned
6390 * into a single int with different bit masks */
6391 I32 sawlookahead = 0;
6396 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6398 bool runtime_code = 0;
6400 RExC_state_t RExC_state;
6401 RExC_state_t * const pRExC_state = &RExC_state;
6402 #ifdef TRIE_STUDY_OPT
6404 RExC_state_t copyRExC_state;
6406 GET_RE_DEBUG_FLAGS_DECL;
6408 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6410 DEBUG_r(if (!PL_colorset) reginitcolors());
6412 #ifndef PERL_IN_XSUB_RE
6413 /* Initialize these here instead of as-needed, as is quick and avoids
6414 * having to test them each time otherwise */
6415 if (! PL_AboveLatin1) {
6416 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6417 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6418 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6419 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6420 PL_HasMultiCharFold =
6421 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6423 /* This is calculated here, because the Perl program that generates the
6424 * static global ones doesn't currently have access to
6425 * NUM_ANYOF_CODE_POINTS */
6426 PL_InBitmap = _new_invlist(2);
6427 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6428 NUM_ANYOF_CODE_POINTS - 1);
6432 pRExC_state->code_blocks = NULL;
6433 pRExC_state->num_code_blocks = 0;
6436 *is_bare_re = FALSE;
6438 if (expr && (expr->op_type == OP_LIST ||
6439 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6440 /* allocate code_blocks if needed */
6444 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6445 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6446 ncode++; /* count of DO blocks */
6448 pRExC_state->num_code_blocks = ncode;
6449 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6454 /* compile-time pattern with just OP_CONSTs and DO blocks */
6459 /* find how many CONSTs there are */
6462 if (expr->op_type == OP_CONST)
6465 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6466 if (o->op_type == OP_CONST)
6470 /* fake up an SV array */
6472 assert(!new_patternp);
6473 Newx(new_patternp, n, SV*);
6474 SAVEFREEPV(new_patternp);
6478 if (expr->op_type == OP_CONST)
6479 new_patternp[n] = cSVOPx_sv(expr);
6481 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6482 if (o->op_type == OP_CONST)
6483 new_patternp[n++] = cSVOPo_sv;
6488 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6489 "Assembling pattern from %d elements%s\n", pat_count,
6490 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6492 /* set expr to the first arg op */
6494 if (pRExC_state->num_code_blocks
6495 && expr->op_type != OP_CONST)
6497 expr = cLISTOPx(expr)->op_first;
6498 assert( expr->op_type == OP_PUSHMARK
6499 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6500 || expr->op_type == OP_PADRANGE);
6501 expr = OpSIBLING(expr);
6504 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6505 expr, &recompile, NULL);
6507 /* handle bare (possibly after overloading) regex: foo =~ $re */
6512 if (SvTYPE(re) == SVt_REGEXP) {
6516 Safefree(pRExC_state->code_blocks);
6517 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6518 "Precompiled pattern%s\n",
6519 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6525 exp = SvPV_nomg(pat, plen);
6527 if (!eng->op_comp) {
6528 if ((SvUTF8(pat) && IN_BYTES)
6529 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6531 /* make a temporary copy; either to convert to bytes,
6532 * or to avoid repeating get-magic / overloaded stringify */
6533 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6534 (IN_BYTES ? 0 : SvUTF8(pat)));
6536 Safefree(pRExC_state->code_blocks);
6537 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6540 /* ignore the utf8ness if the pattern is 0 length */
6541 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6542 RExC_uni_semantics = 0;
6543 RExC_contains_locale = 0;
6544 RExC_contains_i = 0;
6545 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6546 pRExC_state->runtime_code_qr = NULL;
6547 RExC_frame_head= NULL;
6548 RExC_frame_last= NULL;
6549 RExC_frame_count= 0;
6552 RExC_mysv1= sv_newmortal();
6553 RExC_mysv2= sv_newmortal();
6556 SV *dsv= sv_newmortal();
6557 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6558 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6559 PL_colors[4],PL_colors[5],s);
6563 /* we jump here if we upgrade the pattern to utf8 and have to
6566 if ((pm_flags & PMf_USE_RE_EVAL)
6567 /* this second condition covers the non-regex literal case,
6568 * i.e. $foo =~ '(?{})'. */
6569 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6571 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6573 /* return old regex if pattern hasn't changed */
6574 /* XXX: note in the below we have to check the flags as well as the
6577 * Things get a touch tricky as we have to compare the utf8 flag
6578 * independently from the compile flags. */
6582 && !!RX_UTF8(old_re) == !!RExC_utf8
6583 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6584 && RX_PRECOMP(old_re)
6585 && RX_PRELEN(old_re) == plen
6586 && memEQ(RX_PRECOMP(old_re), exp, plen)
6587 && !runtime_code /* with runtime code, always recompile */ )
6589 Safefree(pRExC_state->code_blocks);
6593 rx_flags = orig_rx_flags;
6595 if (rx_flags & PMf_FOLD) {
6596 RExC_contains_i = 1;
6598 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6600 /* Set to use unicode semantics if the pattern is in utf8 and has the
6601 * 'depends' charset specified, as it means unicode when utf8 */
6602 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6606 RExC_flags = rx_flags;
6607 RExC_pm_flags = pm_flags;
6610 if (TAINTING_get && TAINT_get)
6611 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6613 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6614 /* whoops, we have a non-utf8 pattern, whilst run-time code
6615 * got compiled as utf8. Try again with a utf8 pattern */
6616 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6617 pRExC_state->num_code_blocks);
6618 goto redo_first_pass;
6621 assert(!pRExC_state->runtime_code_qr);
6627 RExC_in_lookbehind = 0;
6628 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6630 RExC_override_recoding = 0;
6631 RExC_in_multi_char_class = 0;
6633 /* First pass: determine size, legality. */
6636 RExC_end = exp + plen;
6641 RExC_emit = (regnode *) &RExC_emit_dummy;
6642 RExC_whilem_seen = 0;
6643 RExC_open_parens = NULL;
6644 RExC_close_parens = NULL;
6646 RExC_paren_names = NULL;
6648 RExC_paren_name_list = NULL;
6650 RExC_recurse = NULL;
6651 RExC_study_chunk_recursed = NULL;
6652 RExC_study_chunk_recursed_bytes= 0;
6653 RExC_recurse_count = 0;
6654 pRExC_state->code_index = 0;
6657 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6659 RExC_lastparse=NULL;
6661 /* reg may croak on us, not giving us a chance to free
6662 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6663 need it to survive as long as the regexp (qr/(?{})/).
6664 We must check that code_blocksv is not already set, because we may
6665 have jumped back to restart the sizing pass. */
6666 if (pRExC_state->code_blocks && !code_blocksv) {
6667 code_blocksv = newSV_type(SVt_PV);
6668 SAVEFREESV(code_blocksv);
6669 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6670 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6672 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6673 /* It's possible to write a regexp in ascii that represents Unicode
6674 codepoints outside of the byte range, such as via \x{100}. If we
6675 detect such a sequence we have to convert the entire pattern to utf8
6676 and then recompile, as our sizing calculation will have been based
6677 on 1 byte == 1 character, but we will need to use utf8 to encode
6678 at least some part of the pattern, and therefore must convert the whole
6681 if (flags & RESTART_UTF8) {
6682 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6683 pRExC_state->num_code_blocks);
6684 goto redo_first_pass;
6686 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6689 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6692 PerlIO_printf(Perl_debug_log,
6693 "Required size %"IVdf" nodes\n"
6694 "Starting second pass (creation)\n",
6697 RExC_lastparse=NULL;
6700 /* The first pass could have found things that force Unicode semantics */
6701 if ((RExC_utf8 || RExC_uni_semantics)
6702 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6704 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6707 /* Small enough for pointer-storage convention?
6708 If extralen==0, this means that we will not need long jumps. */
6709 if (RExC_size >= 0x10000L && RExC_extralen)
6710 RExC_size += RExC_extralen;
6713 if (RExC_whilem_seen > 15)
6714 RExC_whilem_seen = 15;
6716 /* Allocate space and zero-initialize. Note, the two step process
6717 of zeroing when in debug mode, thus anything assigned has to
6718 happen after that */
6719 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6721 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6722 char, regexp_internal);
6723 if ( r == NULL || ri == NULL )
6724 FAIL("Regexp out of space");
6726 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6727 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6730 /* bulk initialize base fields with 0. */
6731 Zero(ri, sizeof(regexp_internal), char);
6734 /* non-zero initialization begins here */
6737 r->extflags = rx_flags;
6738 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6740 if (pm_flags & PMf_IS_QR) {
6741 ri->code_blocks = pRExC_state->code_blocks;
6742 ri->num_code_blocks = pRExC_state->num_code_blocks;
6747 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6748 if (pRExC_state->code_blocks[n].src_regex)
6749 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6750 SAVEFREEPV(pRExC_state->code_blocks);
6754 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6755 bool has_charset = (get_regex_charset(r->extflags)
6756 != REGEX_DEPENDS_CHARSET);
6758 /* The caret is output if there are any defaults: if not all the STD
6759 * flags are set, or if no character set specifier is needed */
6761 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6763 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6764 == REG_RUN_ON_COMMENT_SEEN);
6765 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6766 >> RXf_PMf_STD_PMMOD_SHIFT);
6767 const char *fptr = STD_PAT_MODS; /*"msixn"*/
6769 /* Allocate for the worst case, which is all the std flags are turned
6770 * on. If more precision is desired, we could do a population count of
6771 * the flags set. This could be done with a small lookup table, or by
6772 * shifting, masking and adding, or even, when available, assembly
6773 * language for a machine-language population count.
6774 * We never output a minus, as all those are defaults, so are
6775 * covered by the caret */
6776 const STRLEN wraplen = plen + has_p + has_runon
6777 + has_default /* If needs a caret */
6779 /* If needs a character set specifier */
6780 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6781 + (sizeof(STD_PAT_MODS) - 1)
6782 + (sizeof("(?:)") - 1);
6784 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6785 r->xpv_len_u.xpvlenu_pv = p;
6787 SvFLAGS(rx) |= SVf_UTF8;
6790 /* If a default, cover it using the caret */
6792 *p++= DEFAULT_PAT_MOD;
6796 const char* const name = get_regex_charset_name(r->extflags, &len);
6797 Copy(name, p, len, char);
6801 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6804 while((ch = *fptr++)) {
6812 Copy(RExC_precomp, p, plen, char);
6813 assert ((RX_WRAPPED(rx) - p) < 16);
6814 r->pre_prefix = p - RX_WRAPPED(rx);
6820 SvCUR_set(rx, p - RX_WRAPPED(rx));
6824 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6826 /* setup various meta data about recursion, this all requires
6827 * RExC_npar to be correctly set, and a bit later on we clear it */
6828 if (RExC_seen & REG_RECURSE_SEEN) {
6829 Newxz(RExC_open_parens, RExC_npar,regnode *);
6830 SAVEFREEPV(RExC_open_parens);
6831 Newxz(RExC_close_parens,RExC_npar,regnode *);
6832 SAVEFREEPV(RExC_close_parens);
6834 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6835 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6836 * So its 1 if there are no parens. */
6837 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6838 ((RExC_npar & 0x07) != 0);
6839 Newx(RExC_study_chunk_recursed,
6840 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6841 SAVEFREEPV(RExC_study_chunk_recursed);
6844 /* Useful during FAIL. */
6845 #ifdef RE_TRACK_PATTERN_OFFSETS
6846 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6847 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6848 "%s %"UVuf" bytes for offset annotations.\n",
6849 ri->u.offsets ? "Got" : "Couldn't get",
6850 (UV)((2*RExC_size+1) * sizeof(U32))));
6852 SetProgLen(ri,RExC_size);
6857 /* Second pass: emit code. */
6858 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6859 RExC_pm_flags = pm_flags;
6861 RExC_end = exp + plen;
6864 RExC_emit_start = ri->program;
6865 RExC_emit = ri->program;
6866 RExC_emit_bound = ri->program + RExC_size + 1;
6867 pRExC_state->code_index = 0;
6869 *((char*) RExC_emit++) = (char) REG_MAGIC;
6870 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6872 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6874 /* XXXX To minimize changes to RE engine we always allocate
6875 3-units-long substrs field. */
6876 Newx(r->substrs, 1, struct reg_substr_data);
6877 if (RExC_recurse_count) {
6878 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6879 SAVEFREEPV(RExC_recurse);
6883 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6885 RExC_study_chunk_recursed_count= 0;
6887 Zero(r->substrs, 1, struct reg_substr_data);
6888 if (RExC_study_chunk_recursed) {
6889 Zero(RExC_study_chunk_recursed,
6890 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6894 #ifdef TRIE_STUDY_OPT
6896 StructCopy(&zero_scan_data, &data, scan_data_t);
6897 copyRExC_state = RExC_state;
6900 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6902 RExC_state = copyRExC_state;
6903 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6904 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6906 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6907 StructCopy(&zero_scan_data, &data, scan_data_t);
6910 StructCopy(&zero_scan_data, &data, scan_data_t);
6913 /* Dig out information for optimizations. */
6914 r->extflags = RExC_flags; /* was pm_op */
6915 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6918 SvUTF8_on(rx); /* Unicode in it? */
6919 ri->regstclass = NULL;
6920 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
6921 r->intflags |= PREGf_NAUGHTY;
6922 scan = ri->program + 1; /* First BRANCH. */
6924 /* testing for BRANCH here tells us whether there is "must appear"
6925 data in the pattern. If there is then we can use it for optimisations */
6926 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6929 STRLEN longest_float_length, longest_fixed_length;
6930 regnode_ssc ch_class; /* pointed to by data */
6932 SSize_t last_close = 0; /* pointed to by data */
6933 regnode *first= scan;
6934 regnode *first_next= regnext(first);
6936 * Skip introductions and multiplicators >= 1
6937 * so that we can extract the 'meat' of the pattern that must
6938 * match in the large if() sequence following.
6939 * NOTE that EXACT is NOT covered here, as it is normally
6940 * picked up by the optimiser separately.
6942 * This is unfortunate as the optimiser isnt handling lookahead
6943 * properly currently.
6946 while ((OP(first) == OPEN && (sawopen = 1)) ||
6947 /* An OR of *one* alternative - should not happen now. */
6948 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6949 /* for now we can't handle lookbehind IFMATCH*/
6950 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6951 (OP(first) == PLUS) ||
6952 (OP(first) == MINMOD) ||
6953 /* An {n,m} with n>0 */
6954 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6955 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6958 * the only op that could be a regnode is PLUS, all the rest
6959 * will be regnode_1 or regnode_2.
6961 * (yves doesn't think this is true)
6963 if (OP(first) == PLUS)
6966 if (OP(first) == MINMOD)
6968 first += regarglen[OP(first)];
6970 first = NEXTOPER(first);
6971 first_next= regnext(first);
6974 /* Starting-point info. */
6976 DEBUG_PEEP("first:",first,0);
6977 /* Ignore EXACT as we deal with it later. */
6978 if (PL_regkind[OP(first)] == EXACT) {
6979 if (OP(first) == EXACT || OP(first) == EXACTL)
6980 NOOP; /* Empty, get anchored substr later. */
6982 ri->regstclass = first;
6985 else if (PL_regkind[OP(first)] == TRIE &&
6986 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6988 /* this can happen only on restudy */
6989 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6992 else if (REGNODE_SIMPLE(OP(first)))
6993 ri->regstclass = first;
6994 else if (PL_regkind[OP(first)] == BOUND ||
6995 PL_regkind[OP(first)] == NBOUND)
6996 ri->regstclass = first;
6997 else if (PL_regkind[OP(first)] == BOL) {
6998 r->intflags |= (OP(first) == MBOL
7001 first = NEXTOPER(first);
7004 else if (OP(first) == GPOS) {
7005 r->intflags |= PREGf_ANCH_GPOS;
7006 first = NEXTOPER(first);
7009 else if ((!sawopen || !RExC_sawback) &&
7011 (OP(first) == STAR &&
7012 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7013 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7015 /* turn .* into ^.* with an implied $*=1 */
7017 (OP(NEXTOPER(first)) == REG_ANY)
7020 r->intflags |= (type | PREGf_IMPLICIT);
7021 first = NEXTOPER(first);
7024 if (sawplus && !sawminmod && !sawlookahead
7025 && (!sawopen || !RExC_sawback)
7026 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7027 /* x+ must match at the 1st pos of run of x's */
7028 r->intflags |= PREGf_SKIP;
7030 /* Scan is after the zeroth branch, first is atomic matcher. */
7031 #ifdef TRIE_STUDY_OPT
7034 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7035 (IV)(first - scan + 1))
7039 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7040 (IV)(first - scan + 1))
7046 * If there's something expensive in the r.e., find the
7047 * longest literal string that must appear and make it the
7048 * regmust. Resolve ties in favor of later strings, since
7049 * the regstart check works with the beginning of the r.e.
7050 * and avoiding duplication strengthens checking. Not a
7051 * strong reason, but sufficient in the absence of others.
7052 * [Now we resolve ties in favor of the earlier string if
7053 * it happens that c_offset_min has been invalidated, since the
7054 * earlier string may buy us something the later one won't.]
7057 data.longest_fixed = newSVpvs("");
7058 data.longest_float = newSVpvs("");
7059 data.last_found = newSVpvs("");
7060 data.longest = &(data.longest_fixed);
7061 ENTER_with_name("study_chunk");
7062 SAVEFREESV(data.longest_fixed);
7063 SAVEFREESV(data.longest_float);
7064 SAVEFREESV(data.last_found);
7066 if (!ri->regstclass) {
7067 ssc_init(pRExC_state, &ch_class);
7068 data.start_class = &ch_class;
7069 stclass_flag = SCF_DO_STCLASS_AND;
7070 } else /* XXXX Check for BOUND? */
7072 data.last_closep = &last_close;
7075 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7076 scan + RExC_size, /* Up to end */
7078 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7079 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7083 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7086 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7087 && data.last_start_min == 0 && data.last_end > 0
7088 && !RExC_seen_zerolen
7089 && !(RExC_seen & REG_VERBARG_SEEN)
7090 && !(RExC_seen & REG_GPOS_SEEN)
7092 r->extflags |= RXf_CHECK_ALL;
7094 scan_commit(pRExC_state, &data,&minlen,0);
7096 longest_float_length = CHR_SVLEN(data.longest_float);
7098 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7099 && data.offset_fixed == data.offset_float_min
7100 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7101 && S_setup_longest (aTHX_ pRExC_state,
7105 &(r->float_end_shift),
7106 data.lookbehind_float,
7107 data.offset_float_min,
7109 longest_float_length,
7110 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7111 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7113 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7114 r->float_max_offset = data.offset_float_max;
7115 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7116 r->float_max_offset -= data.lookbehind_float;
7117 SvREFCNT_inc_simple_void_NN(data.longest_float);
7120 r->float_substr = r->float_utf8 = NULL;
7121 longest_float_length = 0;
7124 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7126 if (S_setup_longest (aTHX_ pRExC_state,
7128 &(r->anchored_utf8),
7129 &(r->anchored_substr),
7130 &(r->anchored_end_shift),
7131 data.lookbehind_fixed,
7134 longest_fixed_length,
7135 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7136 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7138 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7139 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7142 r->anchored_substr = r->anchored_utf8 = NULL;
7143 longest_fixed_length = 0;
7145 LEAVE_with_name("study_chunk");
7148 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7149 ri->regstclass = NULL;
7151 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7153 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7154 && is_ssc_worth_it(pRExC_state, data.start_class))
7156 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7158 ssc_finalize(pRExC_state, data.start_class);
7160 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7161 StructCopy(data.start_class,
7162 (regnode_ssc*)RExC_rxi->data->data[n],
7164 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7165 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7166 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7167 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7168 PerlIO_printf(Perl_debug_log,
7169 "synthetic stclass \"%s\".\n",
7170 SvPVX_const(sv));});
7171 data.start_class = NULL;
7174 /* A temporary algorithm prefers floated substr to fixed one to dig
7176 if (longest_fixed_length > longest_float_length) {
7177 r->substrs->check_ix = 0;
7178 r->check_end_shift = r->anchored_end_shift;
7179 r->check_substr = r->anchored_substr;
7180 r->check_utf8 = r->anchored_utf8;
7181 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7182 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7183 r->intflags |= PREGf_NOSCAN;
7186 r->substrs->check_ix = 1;
7187 r->check_end_shift = r->float_end_shift;
7188 r->check_substr = r->float_substr;
7189 r->check_utf8 = r->float_utf8;
7190 r->check_offset_min = r->float_min_offset;
7191 r->check_offset_max = r->float_max_offset;
7193 if ((r->check_substr || r->check_utf8) ) {
7194 r->extflags |= RXf_USE_INTUIT;
7195 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7196 r->extflags |= RXf_INTUIT_TAIL;
7198 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7200 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7201 if ( (STRLEN)minlen < longest_float_length )
7202 minlen= longest_float_length;
7203 if ( (STRLEN)minlen < longest_fixed_length )
7204 minlen= longest_fixed_length;
7208 /* Several toplevels. Best we can is to set minlen. */
7210 regnode_ssc ch_class;
7211 SSize_t last_close = 0;
7213 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7215 scan = ri->program + 1;
7216 ssc_init(pRExC_state, &ch_class);
7217 data.start_class = &ch_class;
7218 data.last_closep = &last_close;
7221 minlen = study_chunk(pRExC_state,
7222 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7223 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7224 ? SCF_TRIE_DOING_RESTUDY
7228 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7230 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7231 = r->float_substr = r->float_utf8 = NULL;
7233 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7234 && is_ssc_worth_it(pRExC_state, data.start_class))
7236 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7238 ssc_finalize(pRExC_state, data.start_class);
7240 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7241 StructCopy(data.start_class,
7242 (regnode_ssc*)RExC_rxi->data->data[n],
7244 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7245 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7246 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7247 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7248 PerlIO_printf(Perl_debug_log,
7249 "synthetic stclass \"%s\".\n",
7250 SvPVX_const(sv));});
7251 data.start_class = NULL;
7255 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7256 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7257 r->maxlen = REG_INFTY;
7260 r->maxlen = RExC_maxlen;
7263 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7264 the "real" pattern. */
7266 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7267 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7269 r->minlenret = minlen;
7270 if (r->minlen < minlen)
7273 if (RExC_seen & REG_GPOS_SEEN)
7274 r->intflags |= PREGf_GPOS_SEEN;
7275 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7276 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7278 if (pRExC_state->num_code_blocks)
7279 r->extflags |= RXf_EVAL_SEEN;
7280 if (RExC_seen & REG_CANY_SEEN)
7281 r->intflags |= PREGf_CANY_SEEN;
7282 if (RExC_seen & REG_VERBARG_SEEN)
7284 r->intflags |= PREGf_VERBARG_SEEN;
7285 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7287 if (RExC_seen & REG_CUTGROUP_SEEN)
7288 r->intflags |= PREGf_CUTGROUP_SEEN;
7289 if (pm_flags & PMf_USE_RE_EVAL)
7290 r->intflags |= PREGf_USE_RE_EVAL;
7291 if (RExC_paren_names)
7292 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7294 RXp_PAREN_NAMES(r) = NULL;
7296 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7297 * so it can be used in pp.c */
7298 if (r->intflags & PREGf_ANCH)
7299 r->extflags |= RXf_IS_ANCHORED;
7303 /* this is used to identify "special" patterns that might result
7304 * in Perl NOT calling the regex engine and instead doing the match "itself",
7305 * particularly special cases in split//. By having the regex compiler
7306 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7307 * we avoid weird issues with equivalent patterns resulting in different behavior,
7308 * AND we allow non Perl engines to get the same optimizations by the setting the
7309 * flags appropriately - Yves */
7310 regnode *first = ri->program + 1;
7312 regnode *next = NEXTOPER(first);
7315 if (PL_regkind[fop] == NOTHING && nop == END)
7316 r->extflags |= RXf_NULL;
7317 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7318 /* when fop is SBOL first->flags will be true only when it was
7319 * produced by parsing /\A/, and not when parsing /^/. This is
7320 * very important for the split code as there we want to
7321 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7322 * See rt #122761 for more details. -- Yves */
7323 r->extflags |= RXf_START_ONLY;
7324 else if (fop == PLUS
7325 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7326 && OP(regnext(first)) == END)
7327 r->extflags |= RXf_WHITE;
7328 else if ( r->extflags & RXf_SPLIT
7329 && (fop == EXACT || fop == EXACTL)
7330 && STR_LEN(first) == 1
7331 && *(STRING(first)) == ' '
7332 && OP(regnext(first)) == END )
7333 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7337 if (RExC_contains_locale) {
7338 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7342 if (RExC_paren_names) {
7343 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7344 ri->data->data[ri->name_list_idx]
7345 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7348 ri->name_list_idx = 0;
7350 if (RExC_recurse_count) {
7351 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7352 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7353 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7356 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7357 /* assume we don't need to swap parens around before we match */
7359 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7360 (unsigned long)RExC_study_chunk_recursed_count);
7364 PerlIO_printf(Perl_debug_log,"Final program:\n");
7367 #ifdef RE_TRACK_PATTERN_OFFSETS
7368 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7369 const STRLEN len = ri->u.offsets[0];
7371 GET_RE_DEBUG_FLAGS_DECL;
7372 PerlIO_printf(Perl_debug_log,
7373 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7374 for (i = 1; i <= len; i++) {
7375 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7376 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7377 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7379 PerlIO_printf(Perl_debug_log, "\n");
7384 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7385 * by setting the regexp SV to readonly-only instead. If the
7386 * pattern's been recompiled, the USEDness should remain. */
7387 if (old_re && SvREADONLY(old_re))
7395 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7398 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7400 PERL_UNUSED_ARG(value);
7402 if (flags & RXapif_FETCH) {
7403 return reg_named_buff_fetch(rx, key, flags);
7404 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7405 Perl_croak_no_modify();
7407 } else if (flags & RXapif_EXISTS) {
7408 return reg_named_buff_exists(rx, key, flags)
7411 } else if (flags & RXapif_REGNAMES) {
7412 return reg_named_buff_all(rx, flags);
7413 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7414 return reg_named_buff_scalar(rx, flags);
7416 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7422 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7425 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7426 PERL_UNUSED_ARG(lastkey);
7428 if (flags & RXapif_FIRSTKEY)
7429 return reg_named_buff_firstkey(rx, flags);
7430 else if (flags & RXapif_NEXTKEY)
7431 return reg_named_buff_nextkey(rx, flags);
7433 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7440 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7443 AV *retarray = NULL;
7445 struct regexp *const rx = ReANY(r);
7447 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7449 if (flags & RXapif_ALL)
7452 if (rx && RXp_PAREN_NAMES(rx)) {
7453 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7456 SV* sv_dat=HeVAL(he_str);
7457 I32 *nums=(I32*)SvPVX(sv_dat);
7458 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7459 if ((I32)(rx->nparens) >= nums[i]
7460 && rx->offs[nums[i]].start != -1
7461 && rx->offs[nums[i]].end != -1)
7464 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7469 ret = newSVsv(&PL_sv_undef);
7472 av_push(retarray, ret);
7475 return newRV_noinc(MUTABLE_SV(retarray));
7482 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7485 struct regexp *const rx = ReANY(r);
7487 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7489 if (rx && RXp_PAREN_NAMES(rx)) {
7490 if (flags & RXapif_ALL) {
7491 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7493 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7495 SvREFCNT_dec_NN(sv);
7507 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7509 struct regexp *const rx = ReANY(r);
7511 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7513 if ( rx && RXp_PAREN_NAMES(rx) ) {
7514 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7516 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7523 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7525 struct regexp *const rx = ReANY(r);
7526 GET_RE_DEBUG_FLAGS_DECL;
7528 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7530 if (rx && RXp_PAREN_NAMES(rx)) {
7531 HV *hv = RXp_PAREN_NAMES(rx);
7533 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7536 SV* sv_dat = HeVAL(temphe);
7537 I32 *nums = (I32*)SvPVX(sv_dat);
7538 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7539 if ((I32)(rx->lastparen) >= nums[i] &&
7540 rx->offs[nums[i]].start != -1 &&
7541 rx->offs[nums[i]].end != -1)
7547 if (parno || flags & RXapif_ALL) {
7548 return newSVhek(HeKEY_hek(temphe));
7556 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7561 struct regexp *const rx = ReANY(r);
7563 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7565 if (rx && RXp_PAREN_NAMES(rx)) {
7566 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7567 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7568 } else if (flags & RXapif_ONE) {
7569 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7570 av = MUTABLE_AV(SvRV(ret));
7571 length = av_tindex(av);
7572 SvREFCNT_dec_NN(ret);
7573 return newSViv(length + 1);
7575 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7580 return &PL_sv_undef;
7584 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7586 struct regexp *const rx = ReANY(r);
7589 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7591 if (rx && RXp_PAREN_NAMES(rx)) {
7592 HV *hv= RXp_PAREN_NAMES(rx);
7594 (void)hv_iterinit(hv);
7595 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7598 SV* sv_dat = HeVAL(temphe);
7599 I32 *nums = (I32*)SvPVX(sv_dat);
7600 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7601 if ((I32)(rx->lastparen) >= nums[i] &&
7602 rx->offs[nums[i]].start != -1 &&
7603 rx->offs[nums[i]].end != -1)
7609 if (parno || flags & RXapif_ALL) {
7610 av_push(av, newSVhek(HeKEY_hek(temphe)));
7615 return newRV_noinc(MUTABLE_SV(av));
7619 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7622 struct regexp *const rx = ReANY(r);
7628 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7630 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7631 || n == RX_BUFF_IDX_CARET_FULLMATCH
7632 || n == RX_BUFF_IDX_CARET_POSTMATCH
7635 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7637 /* on something like
7640 * the KEEPCOPY is set on the PMOP rather than the regex */
7641 if (PL_curpm && r == PM_GETRE(PL_curpm))
7642 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7651 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7652 /* no need to distinguish between them any more */
7653 n = RX_BUFF_IDX_FULLMATCH;
7655 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7656 && rx->offs[0].start != -1)
7658 /* $`, ${^PREMATCH} */
7659 i = rx->offs[0].start;
7663 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7664 && rx->offs[0].end != -1)
7666 /* $', ${^POSTMATCH} */
7667 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7668 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7671 if ( 0 <= n && n <= (I32)rx->nparens &&
7672 (s1 = rx->offs[n].start) != -1 &&
7673 (t1 = rx->offs[n].end) != -1)
7675 /* $&, ${^MATCH}, $1 ... */
7677 s = rx->subbeg + s1 - rx->suboffset;
7682 assert(s >= rx->subbeg);
7683 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7685 #ifdef NO_TAINT_SUPPORT
7686 sv_setpvn(sv, s, i);
7688 const int oldtainted = TAINT_get;
7690 sv_setpvn(sv, s, i);
7691 TAINT_set(oldtainted);
7693 if ( (rx->intflags & PREGf_CANY_SEEN)
7694 ? (RXp_MATCH_UTF8(rx)
7695 && (!i || is_utf8_string((U8*)s, i)))
7696 : (RXp_MATCH_UTF8(rx)) )
7703 if (RXp_MATCH_TAINTED(rx)) {
7704 if (SvTYPE(sv) >= SVt_PVMG) {
7705 MAGIC* const mg = SvMAGIC(sv);
7708 SvMAGIC_set(sv, mg->mg_moremagic);
7710 if ((mgt = SvMAGIC(sv))) {
7711 mg->mg_moremagic = mgt;
7712 SvMAGIC_set(sv, mg);
7723 sv_setsv(sv,&PL_sv_undef);
7729 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7730 SV const * const value)
7732 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7734 PERL_UNUSED_ARG(rx);
7735 PERL_UNUSED_ARG(paren);
7736 PERL_UNUSED_ARG(value);
7739 Perl_croak_no_modify();
7743 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7746 struct regexp *const rx = ReANY(r);
7750 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7752 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7753 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7754 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7757 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7759 /* on something like
7762 * the KEEPCOPY is set on the PMOP rather than the regex */
7763 if (PL_curpm && r == PM_GETRE(PL_curpm))
7764 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7770 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7772 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7773 case RX_BUFF_IDX_PREMATCH: /* $` */
7774 if (rx->offs[0].start != -1) {
7775 i = rx->offs[0].start;
7784 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7785 case RX_BUFF_IDX_POSTMATCH: /* $' */
7786 if (rx->offs[0].end != -1) {
7787 i = rx->sublen - rx->offs[0].end;
7789 s1 = rx->offs[0].end;
7796 default: /* $& / ${^MATCH}, $1, $2, ... */
7797 if (paren <= (I32)rx->nparens &&
7798 (s1 = rx->offs[paren].start) != -1 &&
7799 (t1 = rx->offs[paren].end) != -1)
7805 if (ckWARN(WARN_UNINITIALIZED))
7806 report_uninit((const SV *)sv);
7811 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7812 const char * const s = rx->subbeg - rx->suboffset + s1;
7817 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7824 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7826 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7827 PERL_UNUSED_ARG(rx);
7831 return newSVpvs("Regexp");
7834 /* Scans the name of a named buffer from the pattern.
7835 * If flags is REG_RSN_RETURN_NULL returns null.
7836 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7837 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7838 * to the parsed name as looked up in the RExC_paren_names hash.
7839 * If there is an error throws a vFAIL().. type exception.
7842 #define REG_RSN_RETURN_NULL 0
7843 #define REG_RSN_RETURN_NAME 1
7844 #define REG_RSN_RETURN_DATA 2
7847 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7849 char *name_start = RExC_parse;
7851 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7853 assert (RExC_parse <= RExC_end);
7854 if (RExC_parse == RExC_end) NOOP;
7855 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7856 /* skip IDFIRST by using do...while */
7859 RExC_parse += UTF8SKIP(RExC_parse);
7860 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7864 } while (isWORDCHAR(*RExC_parse));
7866 RExC_parse++; /* so the <- from the vFAIL is after the offending
7868 vFAIL("Group name must start with a non-digit word character");
7872 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7873 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7874 if ( flags == REG_RSN_RETURN_NAME)
7876 else if (flags==REG_RSN_RETURN_DATA) {
7879 if ( ! sv_name ) /* should not happen*/
7880 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7881 if (RExC_paren_names)
7882 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7884 sv_dat = HeVAL(he_str);
7886 vFAIL("Reference to nonexistent named group");
7890 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7891 (unsigned long) flags);
7893 NOT_REACHED; /* NOT REACHED */
7898 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7900 if (RExC_lastparse!=RExC_parse) { \
7901 PerlIO_printf(Perl_debug_log, "%s", \
7902 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
7903 RExC_end - RExC_parse, 16, \
7905 PERL_PV_ESCAPE_UNI_DETECT | \
7906 PERL_PV_PRETTY_ELLIPSES | \
7907 PERL_PV_PRETTY_LTGT | \
7908 PERL_PV_ESCAPE_RE | \
7909 PERL_PV_PRETTY_EXACTSIZE \
7913 PerlIO_printf(Perl_debug_log,"%16s",""); \
7916 num = RExC_size + 1; \
7918 num=REG_NODE_NUM(RExC_emit); \
7919 if (RExC_lastnum!=num) \
7920 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7922 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7923 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7924 (int)((depth*2)), "", \
7928 RExC_lastparse=RExC_parse; \
7933 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7934 DEBUG_PARSE_MSG((funcname)); \
7935 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7937 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7938 DEBUG_PARSE_MSG((funcname)); \
7939 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7942 /* This section of code defines the inversion list object and its methods. The
7943 * interfaces are highly subject to change, so as much as possible is static to
7944 * this file. An inversion list is here implemented as a malloc'd C UV array
7945 * as an SVt_INVLIST scalar.
7947 * An inversion list for Unicode is an array of code points, sorted by ordinal
7948 * number. The zeroth element is the first code point in the list. The 1th
7949 * element is the first element beyond that not in the list. In other words,
7950 * the first range is
7951 * invlist[0]..(invlist[1]-1)
7952 * The other ranges follow. Thus every element whose index is divisible by two
7953 * marks the beginning of a range that is in the list, and every element not
7954 * divisible by two marks the beginning of a range not in the list. A single
7955 * element inversion list that contains the single code point N generally
7956 * consists of two elements
7959 * (The exception is when N is the highest representable value on the
7960 * machine, in which case the list containing just it would be a single
7961 * element, itself. By extension, if the last range in the list extends to
7962 * infinity, then the first element of that range will be in the inversion list
7963 * at a position that is divisible by two, and is the final element in the
7965 * Taking the complement (inverting) an inversion list is quite simple, if the
7966 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7967 * This implementation reserves an element at the beginning of each inversion
7968 * list to always contain 0; there is an additional flag in the header which
7969 * indicates if the list begins at the 0, or is offset to begin at the next
7972 * More about inversion lists can be found in "Unicode Demystified"
7973 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7974 * More will be coming when functionality is added later.
7976 * The inversion list data structure is currently implemented as an SV pointing
7977 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7978 * array of UV whose memory management is automatically handled by the existing
7979 * facilities for SV's.
7981 * Some of the methods should always be private to the implementation, and some
7982 * should eventually be made public */
7984 /* The header definitions are in F<inline_invlist.c> */
7986 PERL_STATIC_INLINE UV*
7987 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7989 /* Returns a pointer to the first element in the inversion list's array.
7990 * This is called upon initialization of an inversion list. Where the
7991 * array begins depends on whether the list has the code point U+0000 in it
7992 * or not. The other parameter tells it whether the code that follows this
7993 * call is about to put a 0 in the inversion list or not. The first
7994 * element is either the element reserved for 0, if TRUE, or the element
7995 * after it, if FALSE */
7997 bool* offset = get_invlist_offset_addr(invlist);
7998 UV* zero_addr = (UV *) SvPVX(invlist);
8000 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8003 assert(! _invlist_len(invlist));
8007 /* 1^1 = 0; 1^0 = 1 */
8008 *offset = 1 ^ will_have_0;
8009 return zero_addr + *offset;
8012 PERL_STATIC_INLINE void
8013 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8015 /* Sets the current number of elements stored in the inversion list.
8016 * Updates SvCUR correspondingly */
8017 PERL_UNUSED_CONTEXT;
8018 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8020 assert(SvTYPE(invlist) == SVt_INVLIST);
8025 : TO_INTERNAL_SIZE(len + offset));
8026 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8029 #ifndef PERL_IN_XSUB_RE
8031 PERL_STATIC_INLINE IV*
8032 S_get_invlist_previous_index_addr(SV* invlist)
8034 /* Return the address of the IV that is reserved to hold the cached index
8036 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8038 assert(SvTYPE(invlist) == SVt_INVLIST);
8040 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8043 PERL_STATIC_INLINE IV
8044 S_invlist_previous_index(SV* const invlist)
8046 /* Returns cached index of previous search */
8048 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8050 return *get_invlist_previous_index_addr(invlist);
8053 PERL_STATIC_INLINE void
8054 S_invlist_set_previous_index(SV* const invlist, const IV index)
8056 /* Caches <index> for later retrieval */
8058 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8060 assert(index == 0 || index < (int) _invlist_len(invlist));
8062 *get_invlist_previous_index_addr(invlist) = index;
8065 PERL_STATIC_INLINE void
8066 S_invlist_trim(SV* const invlist)
8068 PERL_ARGS_ASSERT_INVLIST_TRIM;
8070 assert(SvTYPE(invlist) == SVt_INVLIST);
8072 /* Change the length of the inversion list to how many entries it currently
8074 SvPV_shrink_to_cur((SV *) invlist);
8077 PERL_STATIC_INLINE bool
8078 S_invlist_is_iterating(SV* const invlist)
8080 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8082 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8085 #endif /* ifndef PERL_IN_XSUB_RE */
8087 PERL_STATIC_INLINE UV
8088 S_invlist_max(SV* const invlist)
8090 /* Returns the maximum number of elements storable in the inversion list's
8091 * array, without having to realloc() */
8093 PERL_ARGS_ASSERT_INVLIST_MAX;
8095 assert(SvTYPE(invlist) == SVt_INVLIST);
8097 /* Assumes worst case, in which the 0 element is not counted in the
8098 * inversion list, so subtracts 1 for that */
8099 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8100 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8101 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8104 #ifndef PERL_IN_XSUB_RE
8106 Perl__new_invlist(pTHX_ IV initial_size)
8109 /* Return a pointer to a newly constructed inversion list, with enough
8110 * space to store 'initial_size' elements. If that number is negative, a
8111 * system default is used instead */
8115 if (initial_size < 0) {
8119 /* Allocate the initial space */
8120 new_list = newSV_type(SVt_INVLIST);
8122 /* First 1 is in case the zero element isn't in the list; second 1 is for
8124 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8125 invlist_set_len(new_list, 0, 0);
8127 /* Force iterinit() to be used to get iteration to work */
8128 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8130 *get_invlist_previous_index_addr(new_list) = 0;
8136 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8138 /* Return a pointer to a newly constructed inversion list, initialized to
8139 * point to <list>, which has to be in the exact correct inversion list
8140 * form, including internal fields. Thus this is a dangerous routine that
8141 * should not be used in the wrong hands. The passed in 'list' contains
8142 * several header fields at the beginning that are not part of the
8143 * inversion list body proper */
8145 const STRLEN length = (STRLEN) list[0];
8146 const UV version_id = list[1];
8147 const bool offset = cBOOL(list[2]);
8148 #define HEADER_LENGTH 3
8149 /* If any of the above changes in any way, you must change HEADER_LENGTH
8150 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8151 * perl -E 'say int(rand 2**31-1)'
8153 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8154 data structure type, so that one being
8155 passed in can be validated to be an
8156 inversion list of the correct vintage.
8159 SV* invlist = newSV_type(SVt_INVLIST);
8161 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8163 if (version_id != INVLIST_VERSION_ID) {
8164 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8167 /* The generated array passed in includes header elements that aren't part
8168 * of the list proper, so start it just after them */
8169 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8171 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8172 shouldn't touch it */
8174 *(get_invlist_offset_addr(invlist)) = offset;
8176 /* The 'length' passed to us is the physical number of elements in the
8177 * inversion list. But if there is an offset the logical number is one
8179 invlist_set_len(invlist, length - offset, offset);
8181 invlist_set_previous_index(invlist, 0);
8183 /* Initialize the iteration pointer. */
8184 invlist_iterfinish(invlist);
8186 SvREADONLY_on(invlist);
8190 #endif /* ifndef PERL_IN_XSUB_RE */
8193 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8195 /* Grow the maximum size of an inversion list */
8197 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8199 assert(SvTYPE(invlist) == SVt_INVLIST);
8201 /* Add one to account for the zero element at the beginning which may not
8202 * be counted by the calling parameters */
8203 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8207 S__append_range_to_invlist(pTHX_ SV* const invlist,
8208 const UV start, const UV end)
8210 /* Subject to change or removal. Append the range from 'start' to 'end' at
8211 * the end of the inversion list. The range must be above any existing
8215 UV max = invlist_max(invlist);
8216 UV len = _invlist_len(invlist);
8219 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8221 if (len == 0) { /* Empty lists must be initialized */
8222 offset = start != 0;
8223 array = _invlist_array_init(invlist, ! offset);
8226 /* Here, the existing list is non-empty. The current max entry in the
8227 * list is generally the first value not in the set, except when the
8228 * set extends to the end of permissible values, in which case it is
8229 * the first entry in that final set, and so this call is an attempt to
8230 * append out-of-order */
8232 UV final_element = len - 1;
8233 array = invlist_array(invlist);
8234 if (array[final_element] > start
8235 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8237 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8238 array[final_element], start,
8239 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8242 /* Here, it is a legal append. If the new range begins with the first
8243 * value not in the set, it is extending the set, so the new first
8244 * value not in the set is one greater than the newly extended range.
8246 offset = *get_invlist_offset_addr(invlist);
8247 if (array[final_element] == start) {
8248 if (end != UV_MAX) {
8249 array[final_element] = end + 1;
8252 /* But if the end is the maximum representable on the machine,
8253 * just let the range that this would extend to have no end */
8254 invlist_set_len(invlist, len - 1, offset);
8260 /* Here the new range doesn't extend any existing set. Add it */
8262 len += 2; /* Includes an element each for the start and end of range */
8264 /* If wll overflow the existing space, extend, which may cause the array to
8267 invlist_extend(invlist, len);
8269 /* Have to set len here to avoid assert failure in invlist_array() */
8270 invlist_set_len(invlist, len, offset);
8272 array = invlist_array(invlist);
8275 invlist_set_len(invlist, len, offset);
8278 /* The next item on the list starts the range, the one after that is
8279 * one past the new range. */
8280 array[len - 2] = start;
8281 if (end != UV_MAX) {
8282 array[len - 1] = end + 1;
8285 /* But if the end is the maximum representable on the machine, just let
8286 * the range have no end */
8287 invlist_set_len(invlist, len - 1, offset);
8291 #ifndef PERL_IN_XSUB_RE
8294 Perl__invlist_search(SV* const invlist, const UV cp)
8296 /* Searches the inversion list for the entry that contains the input code
8297 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8298 * return value is the index into the list's array of the range that
8303 IV high = _invlist_len(invlist);
8304 const IV highest_element = high - 1;
8307 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8309 /* If list is empty, return failure. */
8314 /* (We can't get the array unless we know the list is non-empty) */
8315 array = invlist_array(invlist);
8317 mid = invlist_previous_index(invlist);
8318 assert(mid >=0 && mid <= highest_element);
8320 /* <mid> contains the cache of the result of the previous call to this
8321 * function (0 the first time). See if this call is for the same result,
8322 * or if it is for mid-1. This is under the theory that calls to this
8323 * function will often be for related code points that are near each other.
8324 * And benchmarks show that caching gives better results. We also test
8325 * here if the code point is within the bounds of the list. These tests
8326 * replace others that would have had to be made anyway to make sure that
8327 * the array bounds were not exceeded, and these give us extra information
8328 * at the same time */
8329 if (cp >= array[mid]) {
8330 if (cp >= array[highest_element]) {
8331 return highest_element;
8334 /* Here, array[mid] <= cp < array[highest_element]. This means that
8335 * the final element is not the answer, so can exclude it; it also
8336 * means that <mid> is not the final element, so can refer to 'mid + 1'
8338 if (cp < array[mid + 1]) {
8344 else { /* cp < aray[mid] */
8345 if (cp < array[0]) { /* Fail if outside the array */
8349 if (cp >= array[mid - 1]) {
8354 /* Binary search. What we are looking for is <i> such that
8355 * array[i] <= cp < array[i+1]
8356 * The loop below converges on the i+1. Note that there may not be an
8357 * (i+1)th element in the array, and things work nonetheless */
8358 while (low < high) {
8359 mid = (low + high) / 2;
8360 assert(mid <= highest_element);
8361 if (array[mid] <= cp) { /* cp >= array[mid] */
8364 /* We could do this extra test to exit the loop early.
8365 if (cp < array[low]) {
8370 else { /* cp < array[mid] */
8377 invlist_set_previous_index(invlist, high);
8382 Perl__invlist_populate_swatch(SV* const invlist,
8383 const UV start, const UV end, U8* swatch)
8385 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8386 * but is used when the swash has an inversion list. This makes this much
8387 * faster, as it uses a binary search instead of a linear one. This is
8388 * intimately tied to that function, and perhaps should be in utf8.c,
8389 * except it is intimately tied to inversion lists as well. It assumes
8390 * that <swatch> is all 0's on input */
8393 const IV len = _invlist_len(invlist);
8397 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8399 if (len == 0) { /* Empty inversion list */
8403 array = invlist_array(invlist);
8405 /* Find which element it is */
8406 i = _invlist_search(invlist, start);
8408 /* We populate from <start> to <end> */
8409 while (current < end) {
8412 /* The inversion list gives the results for every possible code point
8413 * after the first one in the list. Only those ranges whose index is
8414 * even are ones that the inversion list matches. For the odd ones,
8415 * and if the initial code point is not in the list, we have to skip
8416 * forward to the next element */
8417 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8419 if (i >= len) { /* Finished if beyond the end of the array */
8423 if (current >= end) { /* Finished if beyond the end of what we
8425 if (LIKELY(end < UV_MAX)) {
8429 /* We get here when the upper bound is the maximum
8430 * representable on the machine, and we are looking for just
8431 * that code point. Have to special case it */
8433 goto join_end_of_list;
8436 assert(current >= start);
8438 /* The current range ends one below the next one, except don't go past
8441 upper = (i < len && array[i] < end) ? array[i] : end;
8443 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8444 * for each code point in it */
8445 for (; current < upper; current++) {
8446 const STRLEN offset = (STRLEN)(current - start);
8447 swatch[offset >> 3] |= 1 << (offset & 7);
8452 /* Quit if at the end of the list */
8455 /* But first, have to deal with the highest possible code point on
8456 * the platform. The previous code assumes that <end> is one
8457 * beyond where we want to populate, but that is impossible at the
8458 * platform's infinity, so have to handle it specially */
8459 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8461 const STRLEN offset = (STRLEN)(end - start);
8462 swatch[offset >> 3] |= 1 << (offset & 7);
8467 /* Advance to the next range, which will be for code points not in the
8476 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8477 const bool complement_b, SV** output)
8479 /* Take the union of two inversion lists and point <output> to it. *output
8480 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8481 * the reference count to that list will be decremented if not already a
8482 * temporary (mortal); otherwise *output will be made correspondingly
8483 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8484 * second list is returned. If <complement_b> is TRUE, the union is taken
8485 * of the complement (inversion) of <b> instead of b itself.
8487 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8488 * Richard Gillam, published by Addison-Wesley, and explained at some
8489 * length there. The preface says to incorporate its examples into your
8490 * code at your own risk.
8492 * The algorithm is like a merge sort.
8494 * XXX A potential performance improvement is to keep track as we go along
8495 * if only one of the inputs contributes to the result, meaning the other
8496 * is a subset of that one. In that case, we can skip the final copy and
8497 * return the larger of the input lists, but then outside code might need
8498 * to keep track of whether to free the input list or not */
8500 const UV* array_a; /* a's array */
8502 UV len_a; /* length of a's array */
8505 SV* u; /* the resulting union */
8509 UV i_a = 0; /* current index into a's array */
8513 /* running count, as explained in the algorithm source book; items are
8514 * stopped accumulating and are output when the count changes to/from 0.
8515 * The count is incremented when we start a range that's in the set, and
8516 * decremented when we start a range that's not in the set. So its range
8517 * is 0 to 2. Only when the count is zero is something not in the set.
8521 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8524 /* If either one is empty, the union is the other one */
8525 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8526 bool make_temp = FALSE; /* Should we mortalize the result? */
8530 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8536 *output = invlist_clone(b);
8538 _invlist_invert(*output);
8540 } /* else *output already = b; */
8543 sv_2mortal(*output);
8547 else if ((len_b = _invlist_len(b)) == 0) {
8548 bool make_temp = FALSE;
8550 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8555 /* The complement of an empty list is a list that has everything in it,
8556 * so the union with <a> includes everything too */
8559 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8563 *output = _new_invlist(1);
8564 _append_range_to_invlist(*output, 0, UV_MAX);
8566 else if (*output != a) {
8567 *output = invlist_clone(a);
8569 /* else *output already = a; */
8572 sv_2mortal(*output);
8577 /* Here both lists exist and are non-empty */
8578 array_a = invlist_array(a);
8579 array_b = invlist_array(b);
8581 /* If are to take the union of 'a' with the complement of b, set it
8582 * up so are looking at b's complement. */
8585 /* To complement, we invert: if the first element is 0, remove it. To
8586 * do this, we just pretend the array starts one later */
8587 if (array_b[0] == 0) {
8593 /* But if the first element is not zero, we pretend the list starts
8594 * at the 0 that is always stored immediately before the array. */
8600 /* Size the union for the worst case: that the sets are completely
8602 u = _new_invlist(len_a + len_b);
8604 /* Will contain U+0000 if either component does */
8605 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8606 || (len_b > 0 && array_b[0] == 0));
8608 /* Go through each list item by item, stopping when exhausted one of
8610 while (i_a < len_a && i_b < len_b) {
8611 UV cp; /* The element to potentially add to the union's array */
8612 bool cp_in_set; /* is it in the the input list's set or not */
8614 /* We need to take one or the other of the two inputs for the union.
8615 * Since we are merging two sorted lists, we take the smaller of the
8616 * next items. In case of a tie, we take the one that is in its set
8617 * first. If we took one not in the set first, it would decrement the
8618 * count, possibly to 0 which would cause it to be output as ending the
8619 * range, and the next time through we would take the same number, and
8620 * output it again as beginning the next range. By doing it the
8621 * opposite way, there is no possibility that the count will be
8622 * momentarily decremented to 0, and thus the two adjoining ranges will
8623 * be seamlessly merged. (In a tie and both are in the set or both not
8624 * in the set, it doesn't matter which we take first.) */
8625 if (array_a[i_a] < array_b[i_b]
8626 || (array_a[i_a] == array_b[i_b]
8627 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8629 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8633 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8634 cp = array_b[i_b++];
8637 /* Here, have chosen which of the two inputs to look at. Only output
8638 * if the running count changes to/from 0, which marks the
8639 * beginning/end of a range in that's in the set */
8642 array_u[i_u++] = cp;
8649 array_u[i_u++] = cp;
8654 /* Here, we are finished going through at least one of the lists, which
8655 * means there is something remaining in at most one. We check if the list
8656 * that hasn't been exhausted is positioned such that we are in the middle
8657 * of a range in its set or not. (i_a and i_b point to the element beyond
8658 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8659 * is potentially more to output.
8660 * There are four cases:
8661 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8662 * in the union is entirely from the non-exhausted set.
8663 * 2) Both were in their sets, count is 2. Nothing further should
8664 * be output, as everything that remains will be in the exhausted
8665 * list's set, hence in the union; decrementing to 1 but not 0 insures
8667 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8668 * Nothing further should be output because the union includes
8669 * everything from the exhausted set. Not decrementing ensures that.
8670 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8671 * decrementing to 0 insures that we look at the remainder of the
8672 * non-exhausted set */
8673 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8674 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8679 /* The final length is what we've output so far, plus what else is about to
8680 * be output. (If 'count' is non-zero, then the input list we exhausted
8681 * has everything remaining up to the machine's limit in its set, and hence
8682 * in the union, so there will be no further output. */
8685 /* At most one of the subexpressions will be non-zero */
8686 len_u += (len_a - i_a) + (len_b - i_b);
8689 /* Set result to final length, which can change the pointer to array_u, so
8691 if (len_u != _invlist_len(u)) {
8692 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8694 array_u = invlist_array(u);
8697 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8698 * the other) ended with everything above it not in its set. That means
8699 * that the remaining part of the union is precisely the same as the
8700 * non-exhausted list, so can just copy it unchanged. (If both list were
8701 * exhausted at the same time, then the operations below will be both 0.)
8704 IV copy_count; /* At most one will have a non-zero copy count */
8705 if ((copy_count = len_a - i_a) > 0) {
8706 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8708 else if ((copy_count = len_b - i_b) > 0) {
8709 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8713 /* We may be removing a reference to one of the inputs. If so, the output
8714 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8715 * count decremented) */
8716 if (a == *output || b == *output) {
8717 assert(! invlist_is_iterating(*output));
8718 if ((SvTEMP(*output))) {
8722 SvREFCNT_dec_NN(*output);
8732 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8733 const bool complement_b, SV** i)
8735 /* Take the intersection of two inversion lists and point <i> to it. *i
8736 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8737 * the reference count to that list will be decremented if not already a
8738 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8739 * The first list, <a>, may be NULL, in which case an empty list is
8740 * returned. If <complement_b> is TRUE, the result will be the
8741 * intersection of <a> and the complement (or inversion) of <b> instead of
8744 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8745 * Richard Gillam, published by Addison-Wesley, and explained at some
8746 * length there. The preface says to incorporate its examples into your
8747 * code at your own risk. In fact, it had bugs
8749 * The algorithm is like a merge sort, and is essentially the same as the
8753 const UV* array_a; /* a's array */
8755 UV len_a; /* length of a's array */
8758 SV* r; /* the resulting intersection */
8762 UV i_a = 0; /* current index into a's array */
8766 /* running count, as explained in the algorithm source book; items are
8767 * stopped accumulating and are output when the count changes to/from 2.
8768 * The count is incremented when we start a range that's in the set, and
8769 * decremented when we start a range that's not in the set. So its range
8770 * is 0 to 2. Only when the count is 2 is something in the intersection.
8774 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8777 /* Special case if either one is empty */
8778 len_a = (a == NULL) ? 0 : _invlist_len(a);
8779 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8780 bool make_temp = FALSE;
8782 if (len_a != 0 && complement_b) {
8784 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8785 * be empty. Here, also we are using 'b's complement, which hence
8786 * must be every possible code point. Thus the intersection is
8790 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8795 *i = invlist_clone(a);
8797 /* else *i is already 'a' */
8805 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8806 * intersection must be empty */
8808 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8813 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8817 *i = _new_invlist(0);
8825 /* Here both lists exist and are non-empty */
8826 array_a = invlist_array(a);
8827 array_b = invlist_array(b);
8829 /* If are to take the intersection of 'a' with the complement of b, set it
8830 * up so are looking at b's complement. */
8833 /* To complement, we invert: if the first element is 0, remove it. To
8834 * do this, we just pretend the array starts one later */
8835 if (array_b[0] == 0) {
8841 /* But if the first element is not zero, we pretend the list starts
8842 * at the 0 that is always stored immediately before the array. */
8848 /* Size the intersection for the worst case: that the intersection ends up
8849 * fragmenting everything to be completely disjoint */
8850 r= _new_invlist(len_a + len_b);
8852 /* Will contain U+0000 iff both components do */
8853 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8854 && len_b > 0 && array_b[0] == 0);
8856 /* Go through each list item by item, stopping when exhausted one of
8858 while (i_a < len_a && i_b < len_b) {
8859 UV cp; /* The element to potentially add to the intersection's
8861 bool cp_in_set; /* Is it in the input list's set or not */
8863 /* We need to take one or the other of the two inputs for the
8864 * intersection. Since we are merging two sorted lists, we take the
8865 * smaller of the next items. In case of a tie, we take the one that
8866 * is not in its set first (a difference from the union algorithm). If
8867 * we took one in the set first, it would increment the count, possibly
8868 * to 2 which would cause it to be output as starting a range in the
8869 * intersection, and the next time through we would take that same
8870 * number, and output it again as ending the set. By doing it the
8871 * opposite of this, there is no possibility that the count will be
8872 * momentarily incremented to 2. (In a tie and both are in the set or
8873 * both not in the set, it doesn't matter which we take first.) */
8874 if (array_a[i_a] < array_b[i_b]
8875 || (array_a[i_a] == array_b[i_b]
8876 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8878 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8882 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8886 /* Here, have chosen which of the two inputs to look at. Only output
8887 * if the running count changes to/from 2, which marks the
8888 * beginning/end of a range that's in the intersection */
8892 array_r[i_r++] = cp;
8897 array_r[i_r++] = cp;
8903 /* Here, we are finished going through at least one of the lists, which
8904 * means there is something remaining in at most one. We check if the list
8905 * that has been exhausted is positioned such that we are in the middle
8906 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8907 * the ones we care about.) There are four cases:
8908 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8909 * nothing left in the intersection.
8910 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8911 * above 2. What should be output is exactly that which is in the
8912 * non-exhausted set, as everything it has is also in the intersection
8913 * set, and everything it doesn't have can't be in the intersection
8914 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8915 * gets incremented to 2. Like the previous case, the intersection is
8916 * everything that remains in the non-exhausted set.
8917 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8918 * remains 1. And the intersection has nothing more. */
8919 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8920 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8925 /* The final length is what we've output so far plus what else is in the
8926 * intersection. At most one of the subexpressions below will be non-zero
8930 len_r += (len_a - i_a) + (len_b - i_b);
8933 /* Set result to final length, which can change the pointer to array_r, so
8935 if (len_r != _invlist_len(r)) {
8936 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8938 array_r = invlist_array(r);
8941 /* Finish outputting any remaining */
8942 if (count >= 2) { /* At most one will have a non-zero copy count */
8944 if ((copy_count = len_a - i_a) > 0) {
8945 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8947 else if ((copy_count = len_b - i_b) > 0) {
8948 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8952 /* We may be removing a reference to one of the inputs. If so, the output
8953 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8954 * count decremented) */
8955 if (a == *i || b == *i) {
8956 assert(! invlist_is_iterating(*i));
8961 SvREFCNT_dec_NN(*i);
8971 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8973 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8974 * set. A pointer to the inversion list is returned. This may actually be
8975 * a new list, in which case the passed in one has been destroyed. The
8976 * passed-in inversion list can be NULL, in which case a new one is created
8977 * with just the one range in it */
8982 if (invlist == NULL) {
8983 invlist = _new_invlist(2);
8987 len = _invlist_len(invlist);
8990 /* If comes after the final entry actually in the list, can just append it
8993 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8994 && start >= invlist_array(invlist)[len - 1]))
8996 _append_range_to_invlist(invlist, start, end);
9000 /* Here, can't just append things, create and return a new inversion list
9001 * which is the union of this range and the existing inversion list */
9002 range_invlist = _new_invlist(2);
9003 _append_range_to_invlist(range_invlist, start, end);
9005 _invlist_union(invlist, range_invlist, &invlist);
9007 /* The temporary can be freed */
9008 SvREFCNT_dec_NN(range_invlist);
9014 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9015 UV** other_elements_ptr)
9017 /* Create and return an inversion list whose contents are to be populated
9018 * by the caller. The caller gives the number of elements (in 'size') and
9019 * the very first element ('element0'). This function will set
9020 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9023 * Obviously there is some trust involved that the caller will properly
9024 * fill in the other elements of the array.
9026 * (The first element needs to be passed in, as the underlying code does
9027 * things differently depending on whether it is zero or non-zero) */
9029 SV* invlist = _new_invlist(size);
9032 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9034 _append_range_to_invlist(invlist, element0, element0);
9035 offset = *get_invlist_offset_addr(invlist);
9037 invlist_set_len(invlist, size, offset);
9038 *other_elements_ptr = invlist_array(invlist) + 1;
9044 PERL_STATIC_INLINE SV*
9045 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9046 return _add_range_to_invlist(invlist, cp, cp);
9049 #ifndef PERL_IN_XSUB_RE
9051 Perl__invlist_invert(pTHX_ SV* const invlist)
9053 /* Complement the input inversion list. This adds a 0 if the list didn't
9054 * have a zero; removes it otherwise. As described above, the data
9055 * structure is set up so that this is very efficient */
9057 PERL_ARGS_ASSERT__INVLIST_INVERT;
9059 assert(! invlist_is_iterating(invlist));
9061 /* The inverse of matching nothing is matching everything */
9062 if (_invlist_len(invlist) == 0) {
9063 _append_range_to_invlist(invlist, 0, UV_MAX);
9067 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9072 PERL_STATIC_INLINE SV*
9073 S_invlist_clone(pTHX_ SV* const invlist)
9076 /* Return a new inversion list that is a copy of the input one, which is
9077 * unchanged. The new list will not be mortal even if the old one was. */
9079 /* Need to allocate extra space to accommodate Perl's addition of a
9080 * trailing NUL to SvPV's, since it thinks they are always strings */
9081 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9082 STRLEN physical_length = SvCUR(invlist);
9083 bool offset = *(get_invlist_offset_addr(invlist));
9085 PERL_ARGS_ASSERT_INVLIST_CLONE;
9087 *(get_invlist_offset_addr(new_invlist)) = offset;
9088 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9089 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9094 PERL_STATIC_INLINE STRLEN*
9095 S_get_invlist_iter_addr(SV* invlist)
9097 /* Return the address of the UV that contains the current iteration
9100 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9102 assert(SvTYPE(invlist) == SVt_INVLIST);
9104 return &(((XINVLIST*) SvANY(invlist))->iterator);
9107 PERL_STATIC_INLINE void
9108 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9110 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9112 *get_invlist_iter_addr(invlist) = 0;
9115 PERL_STATIC_INLINE void
9116 S_invlist_iterfinish(SV* invlist)
9118 /* Terminate iterator for invlist. This is to catch development errors.
9119 * Any iteration that is interrupted before completed should call this
9120 * function. Functions that add code points anywhere else but to the end
9121 * of an inversion list assert that they are not in the middle of an
9122 * iteration. If they were, the addition would make the iteration
9123 * problematical: if the iteration hadn't reached the place where things
9124 * were being added, it would be ok */
9126 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9128 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9132 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9134 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9135 * This call sets in <*start> and <*end>, the next range in <invlist>.
9136 * Returns <TRUE> if successful and the next call will return the next
9137 * range; <FALSE> if was already at the end of the list. If the latter,
9138 * <*start> and <*end> are unchanged, and the next call to this function
9139 * will start over at the beginning of the list */
9141 STRLEN* pos = get_invlist_iter_addr(invlist);
9142 UV len = _invlist_len(invlist);
9145 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9148 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9152 array = invlist_array(invlist);
9154 *start = array[(*pos)++];
9160 *end = array[(*pos)++] - 1;
9166 PERL_STATIC_INLINE UV
9167 S_invlist_highest(SV* const invlist)
9169 /* Returns the highest code point that matches an inversion list. This API
9170 * has an ambiguity, as it returns 0 under either the highest is actually
9171 * 0, or if the list is empty. If this distinction matters to you, check
9172 * for emptiness before calling this function */
9174 UV len = _invlist_len(invlist);
9177 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9183 array = invlist_array(invlist);
9185 /* The last element in the array in the inversion list always starts a
9186 * range that goes to infinity. That range may be for code points that are
9187 * matched in the inversion list, or it may be for ones that aren't
9188 * matched. In the latter case, the highest code point in the set is one
9189 * less than the beginning of this range; otherwise it is the final element
9190 * of this range: infinity */
9191 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9193 : array[len - 1] - 1;
9196 #ifndef PERL_IN_XSUB_RE
9198 Perl__invlist_contents(pTHX_ SV* const invlist)
9200 /* Get the contents of an inversion list into a string SV so that they can
9201 * be printed out. It uses the format traditionally done for debug tracing
9205 SV* output = newSVpvs("\n");
9207 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9209 assert(! invlist_is_iterating(invlist));
9211 invlist_iterinit(invlist);
9212 while (invlist_iternext(invlist, &start, &end)) {
9213 if (end == UV_MAX) {
9214 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9216 else if (end != start) {
9217 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9221 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9229 #ifndef PERL_IN_XSUB_RE
9231 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9232 const char * const indent, SV* const invlist)
9234 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9235 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9236 * the string 'indent'. The output looks like this:
9237 [0] 0x000A .. 0x000D
9239 [4] 0x2028 .. 0x2029
9240 [6] 0x3104 .. INFINITY
9241 * This means that the first range of code points matched by the list are
9242 * 0xA through 0xD; the second range contains only the single code point
9243 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9244 * are used to define each range (except if the final range extends to
9245 * infinity, only a single element is needed). The array index of the
9246 * first element for the corresponding range is given in brackets. */
9251 PERL_ARGS_ASSERT__INVLIST_DUMP;
9253 if (invlist_is_iterating(invlist)) {
9254 Perl_dump_indent(aTHX_ level, file,
9255 "%sCan't dump inversion list because is in middle of iterating\n",
9260 invlist_iterinit(invlist);
9261 while (invlist_iternext(invlist, &start, &end)) {
9262 if (end == UV_MAX) {
9263 Perl_dump_indent(aTHX_ level, file,
9264 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9265 indent, (UV)count, start);
9267 else if (end != start) {
9268 Perl_dump_indent(aTHX_ level, file,
9269 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9270 indent, (UV)count, start, end);
9273 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9274 indent, (UV)count, start);
9281 Perl__load_PL_utf8_foldclosures (pTHX)
9283 assert(! PL_utf8_foldclosures);
9285 /* If the folds haven't been read in, call a fold function
9287 if (! PL_utf8_tofold) {
9288 U8 dummy[UTF8_MAXBYTES_CASE+1];
9290 /* This string is just a short named one above \xff */
9291 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9292 assert(PL_utf8_tofold); /* Verify that worked */
9294 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9298 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9300 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9302 /* Return a boolean as to if the two passed in inversion lists are
9303 * identical. The final argument, if TRUE, says to take the complement of
9304 * the second inversion list before doing the comparison */
9306 const UV* array_a = invlist_array(a);
9307 const UV* array_b = invlist_array(b);
9308 UV len_a = _invlist_len(a);
9309 UV len_b = _invlist_len(b);
9311 UV i = 0; /* current index into the arrays */
9312 bool retval = TRUE; /* Assume are identical until proven otherwise */
9314 PERL_ARGS_ASSERT__INVLISTEQ;
9316 /* If are to compare 'a' with the complement of b, set it
9317 * up so are looking at b's complement. */
9320 /* The complement of nothing is everything, so <a> would have to have
9321 * just one element, starting at zero (ending at infinity) */
9323 return (len_a == 1 && array_a[0] == 0);
9325 else if (array_b[0] == 0) {
9327 /* Otherwise, to complement, we invert. Here, the first element is
9328 * 0, just remove it. To do this, we just pretend the array starts
9336 /* But if the first element is not zero, we pretend the list starts
9337 * at the 0 that is always stored immediately before the array. */
9343 /* Make sure that the lengths are the same, as well as the final element
9344 * before looping through the remainder. (Thus we test the length, final,
9345 * and first elements right off the bat) */
9346 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9349 else for (i = 0; i < len_a - 1; i++) {
9350 if (array_a[i] != array_b[i]) {
9361 * As best we can, determine the characters that can match the start of
9362 * the given EXACTF-ish node.
9364 * Returns the invlist as a new SV*; it is the caller's responsibility to
9365 * call SvREFCNT_dec() when done with it.
9368 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9370 const U8 * s = (U8*)STRING(node);
9371 SSize_t bytelen = STR_LEN(node);
9373 /* Start out big enough for 2 separate code points */
9374 SV* invlist = _new_invlist(4);
9376 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9381 /* We punt and assume can match anything if the node begins
9382 * with a multi-character fold. Things are complicated. For
9383 * example, /ffi/i could match any of:
9384 * "\N{LATIN SMALL LIGATURE FFI}"
9385 * "\N{LATIN SMALL LIGATURE FF}I"
9386 * "F\N{LATIN SMALL LIGATURE FI}"
9387 * plus several other things; and making sure we have all the
9388 * possibilities is hard. */
9389 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9390 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9393 /* Any Latin1 range character can potentially match any
9394 * other depending on the locale */
9395 if (OP(node) == EXACTFL) {
9396 _invlist_union(invlist, PL_Latin1, &invlist);
9399 /* But otherwise, it matches at least itself. We can
9400 * quickly tell if it has a distinct fold, and if so,
9401 * it matches that as well */
9402 invlist = add_cp_to_invlist(invlist, uc);
9403 if (IS_IN_SOME_FOLD_L1(uc))
9404 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9407 /* Some characters match above-Latin1 ones under /i. This
9408 * is true of EXACTFL ones when the locale is UTF-8 */
9409 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9410 && (! isASCII(uc) || (OP(node) != EXACTFA
9411 && OP(node) != EXACTFA_NO_TRIE)))
9413 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9417 else { /* Pattern is UTF-8 */
9418 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9419 STRLEN foldlen = UTF8SKIP(s);
9420 const U8* e = s + bytelen;
9423 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9425 /* The only code points that aren't folded in a UTF EXACTFish
9426 * node are are the problematic ones in EXACTFL nodes */
9427 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9428 /* We need to check for the possibility that this EXACTFL
9429 * node begins with a multi-char fold. Therefore we fold
9430 * the first few characters of it so that we can make that
9435 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9437 *(d++) = (U8) toFOLD(*s);
9442 to_utf8_fold(s, d, &len);
9448 /* And set up so the code below that looks in this folded
9449 * buffer instead of the node's string */
9451 foldlen = UTF8SKIP(folded);
9455 /* When we reach here 's' points to the fold of the first
9456 * character(s) of the node; and 'e' points to far enough along
9457 * the folded string to be just past any possible multi-char
9458 * fold. 'foldlen' is the length in bytes of the first
9461 * Unlike the non-UTF-8 case, the macro for determining if a
9462 * string is a multi-char fold requires all the characters to
9463 * already be folded. This is because of all the complications
9464 * if not. Note that they are folded anyway, except in EXACTFL
9465 * nodes. Like the non-UTF case above, we punt if the node
9466 * begins with a multi-char fold */
9468 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9469 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9471 else { /* Single char fold */
9473 /* It matches all the things that fold to it, which are
9474 * found in PL_utf8_foldclosures (including itself) */
9475 invlist = add_cp_to_invlist(invlist, uc);
9476 if (! PL_utf8_foldclosures)
9477 _load_PL_utf8_foldclosures();
9478 if ((listp = hv_fetch(PL_utf8_foldclosures,
9479 (char *) s, foldlen, FALSE)))
9481 AV* list = (AV*) *listp;
9483 for (k = 0; k <= av_tindex(list); k++) {
9484 SV** c_p = av_fetch(list, k, FALSE);
9490 /* /aa doesn't allow folds between ASCII and non- */
9491 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9492 && isASCII(c) != isASCII(uc))
9497 invlist = add_cp_to_invlist(invlist, c);
9506 #undef HEADER_LENGTH
9507 #undef TO_INTERNAL_SIZE
9508 #undef FROM_INTERNAL_SIZE
9509 #undef INVLIST_VERSION_ID
9511 /* End of inversion list object */
9514 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9516 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9517 * constructs, and updates RExC_flags with them. On input, RExC_parse
9518 * should point to the first flag; it is updated on output to point to the
9519 * final ')' or ':'. There needs to be at least one flag, or this will
9522 /* for (?g), (?gc), and (?o) warnings; warning
9523 about (?c) will warn about (?g) -- japhy */
9525 #define WASTED_O 0x01
9526 #define WASTED_G 0x02
9527 #define WASTED_C 0x04
9528 #define WASTED_GC (WASTED_G|WASTED_C)
9529 I32 wastedflags = 0x00;
9530 U32 posflags = 0, negflags = 0;
9531 U32 *flagsp = &posflags;
9532 char has_charset_modifier = '\0';
9534 bool has_use_defaults = FALSE;
9535 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9536 int x_mod_count = 0;
9538 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9540 /* '^' as an initial flag sets certain defaults */
9541 if (UCHARAT(RExC_parse) == '^') {
9543 has_use_defaults = TRUE;
9544 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9545 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9546 ? REGEX_UNICODE_CHARSET
9547 : REGEX_DEPENDS_CHARSET);
9550 cs = get_regex_charset(RExC_flags);
9551 if (cs == REGEX_DEPENDS_CHARSET
9552 && (RExC_utf8 || RExC_uni_semantics))
9554 cs = REGEX_UNICODE_CHARSET;
9557 while (*RExC_parse) {
9558 /* && strchr("iogcmsx", *RExC_parse) */
9559 /* (?g), (?gc) and (?o) are useless here
9560 and must be globally applied -- japhy */
9561 switch (*RExC_parse) {
9563 /* Code for the imsxn flags */
9564 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9566 case LOCALE_PAT_MOD:
9567 if (has_charset_modifier) {
9568 goto excess_modifier;
9570 else if (flagsp == &negflags) {
9573 cs = REGEX_LOCALE_CHARSET;
9574 has_charset_modifier = LOCALE_PAT_MOD;
9576 case UNICODE_PAT_MOD:
9577 if (has_charset_modifier) {
9578 goto excess_modifier;
9580 else if (flagsp == &negflags) {
9583 cs = REGEX_UNICODE_CHARSET;
9584 has_charset_modifier = UNICODE_PAT_MOD;
9586 case ASCII_RESTRICT_PAT_MOD:
9587 if (flagsp == &negflags) {
9590 if (has_charset_modifier) {
9591 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9592 goto excess_modifier;
9594 /* Doubled modifier implies more restricted */
9595 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9598 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9600 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9602 case DEPENDS_PAT_MOD:
9603 if (has_use_defaults) {
9604 goto fail_modifiers;
9606 else if (flagsp == &negflags) {
9609 else if (has_charset_modifier) {
9610 goto excess_modifier;
9613 /* The dual charset means unicode semantics if the
9614 * pattern (or target, not known until runtime) are
9615 * utf8, or something in the pattern indicates unicode
9617 cs = (RExC_utf8 || RExC_uni_semantics)
9618 ? REGEX_UNICODE_CHARSET
9619 : REGEX_DEPENDS_CHARSET;
9620 has_charset_modifier = DEPENDS_PAT_MOD;
9624 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9625 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9627 else if (has_charset_modifier == *(RExC_parse - 1)) {
9628 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9632 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9634 NOT_REACHED; /*NOTREACHED*/
9637 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9639 NOT_REACHED; /*NOTREACHED*/
9640 case ONCE_PAT_MOD: /* 'o' */
9641 case GLOBAL_PAT_MOD: /* 'g' */
9642 if (PASS2 && ckWARN(WARN_REGEXP)) {
9643 const I32 wflagbit = *RExC_parse == 'o'
9646 if (! (wastedflags & wflagbit) ) {
9647 wastedflags |= wflagbit;
9648 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9651 "Useless (%s%c) - %suse /%c modifier",
9652 flagsp == &negflags ? "?-" : "?",
9654 flagsp == &negflags ? "don't " : "",
9661 case CONTINUE_PAT_MOD: /* 'c' */
9662 if (PASS2 && ckWARN(WARN_REGEXP)) {
9663 if (! (wastedflags & WASTED_C) ) {
9664 wastedflags |= WASTED_GC;
9665 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9668 "Useless (%sc) - %suse /gc modifier",
9669 flagsp == &negflags ? "?-" : "?",
9670 flagsp == &negflags ? "don't " : ""
9675 case KEEPCOPY_PAT_MOD: /* 'p' */
9676 if (flagsp == &negflags) {
9678 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9680 *flagsp |= RXf_PMf_KEEPCOPY;
9684 /* A flag is a default iff it is following a minus, so
9685 * if there is a minus, it means will be trying to
9686 * re-specify a default which is an error */
9687 if (has_use_defaults || flagsp == &negflags) {
9688 goto fail_modifiers;
9691 wastedflags = 0; /* reset so (?g-c) warns twice */
9695 RExC_flags |= posflags;
9696 RExC_flags &= ~negflags;
9697 set_regex_charset(&RExC_flags, cs);
9698 if (RExC_flags & RXf_PMf_FOLD) {
9699 RExC_contains_i = 1;
9702 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9708 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9709 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9710 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9711 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9712 NOT_REACHED; /*NOTREACHED*/
9719 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9724 - reg - regular expression, i.e. main body or parenthesized thing
9726 * Caller must absorb opening parenthesis.
9728 * Combining parenthesis handling with the base level of regular expression
9729 * is a trifle forced, but the need to tie the tails of the branches to what
9730 * follows makes it hard to avoid.
9732 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9734 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9736 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9739 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9740 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9741 needs to be restarted.
9742 Otherwise would only return NULL if regbranch() returns NULL, which
9745 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9746 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9747 * 2 is like 1, but indicates that nextchar() has been called to advance
9748 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9749 * this flag alerts us to the need to check for that */
9751 regnode *ret; /* Will be the head of the group. */
9754 regnode *ender = NULL;
9757 U32 oregflags = RExC_flags;
9758 bool have_branch = 0;
9760 I32 freeze_paren = 0;
9761 I32 after_freeze = 0;
9762 I32 num; /* numeric backreferences */
9764 char * parse_start = RExC_parse; /* MJD */
9765 char * const oregcomp_parse = RExC_parse;
9767 GET_RE_DEBUG_FLAGS_DECL;
9769 PERL_ARGS_ASSERT_REG;
9770 DEBUG_PARSE("reg ");
9772 *flagp = 0; /* Tentatively. */
9775 /* Make an OPEN node, if parenthesized. */
9778 /* Under /x, space and comments can be gobbled up between the '(' and
9779 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9780 * intervening space, as the sequence is a token, and a token should be
9782 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9784 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9785 char *start_verb = RExC_parse;
9786 STRLEN verb_len = 0;
9787 char *start_arg = NULL;
9788 unsigned char op = 0;
9790 int internal_argval = 0; /* internal_argval is only useful if
9793 if (has_intervening_patws) {
9795 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9797 while ( *RExC_parse && *RExC_parse != ')' ) {
9798 if ( *RExC_parse == ':' ) {
9799 start_arg = RExC_parse + 1;
9805 verb_len = RExC_parse - start_verb;
9808 while ( *RExC_parse && *RExC_parse != ')' )
9810 if ( *RExC_parse != ')' )
9811 vFAIL("Unterminated verb pattern argument");
9812 if ( RExC_parse == start_arg )
9815 if ( *RExC_parse != ')' )
9816 vFAIL("Unterminated verb pattern");
9819 switch ( *start_verb ) {
9820 case 'A': /* (*ACCEPT) */
9821 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9823 internal_argval = RExC_nestroot;
9826 case 'C': /* (*COMMIT) */
9827 if ( memEQs(start_verb,verb_len,"COMMIT") )
9830 case 'F': /* (*FAIL) */
9831 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9836 case ':': /* (*:NAME) */
9837 case 'M': /* (*MARK:NAME) */
9838 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9843 case 'P': /* (*PRUNE) */
9844 if ( memEQs(start_verb,verb_len,"PRUNE") )
9847 case 'S': /* (*SKIP) */
9848 if ( memEQs(start_verb,verb_len,"SKIP") )
9851 case 'T': /* (*THEN) */
9852 /* [19:06] <TimToady> :: is then */
9853 if ( memEQs(start_verb,verb_len,"THEN") ) {
9855 RExC_seen |= REG_CUTGROUP_SEEN;
9860 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9862 "Unknown verb pattern '%"UTF8f"'",
9863 UTF8fARG(UTF, verb_len, start_verb));
9866 if ( start_arg && internal_argval ) {
9867 vFAIL3("Verb pattern '%.*s' may not have an argument",
9868 verb_len, start_verb);
9869 } else if ( argok < 0 && !start_arg ) {
9870 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9871 verb_len, start_verb);
9873 ret = reganode(pRExC_state, op, internal_argval);
9874 if ( ! internal_argval && ! SIZE_ONLY ) {
9876 SV *sv = newSVpvn( start_arg,
9877 RExC_parse - start_arg);
9878 ARG(ret) = add_data( pRExC_state,
9880 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9887 if (!internal_argval)
9888 RExC_seen |= REG_VERBARG_SEEN;
9889 } else if ( start_arg ) {
9890 vFAIL3("Verb pattern '%.*s' may not have an argument",
9891 verb_len, start_verb);
9893 ret = reg_node(pRExC_state, op);
9895 nextchar(pRExC_state);
9898 else if (*RExC_parse == '?') { /* (?...) */
9899 bool is_logical = 0;
9900 const char * const seqstart = RExC_parse;
9901 const char * endptr;
9902 if (has_intervening_patws) {
9904 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9908 paren = *RExC_parse++;
9909 ret = NULL; /* For look-ahead/behind. */
9912 case 'P': /* (?P...) variants for those used to PCRE/Python */
9913 paren = *RExC_parse++;
9914 if ( paren == '<') /* (?P<...>) named capture */
9916 else if (paren == '>') { /* (?P>name) named recursion */
9917 goto named_recursion;
9919 else if (paren == '=') { /* (?P=...) named backref */
9920 /* this pretty much dupes the code for \k<NAME> in
9921 * regatom(), if you change this make sure you change that
9923 char* name_start = RExC_parse;
9925 SV *sv_dat = reg_scan_name(pRExC_state,
9926 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9927 if (RExC_parse == name_start || *RExC_parse != ')')
9928 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9929 vFAIL2("Sequence %.3s... not terminated",parse_start);
9932 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9933 RExC_rxi->data->data[num]=(void*)sv_dat;
9934 SvREFCNT_inc_simple_void(sv_dat);
9937 ret = reganode(pRExC_state,
9940 : (ASCII_FOLD_RESTRICTED)
9942 : (AT_LEAST_UNI_SEMANTICS)
9950 Set_Node_Offset(ret, parse_start+1);
9951 Set_Node_Cur_Length(ret, parse_start);
9953 nextchar(pRExC_state);
9957 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9958 vFAIL3("Sequence (%.*s...) not recognized",
9959 RExC_parse-seqstart, seqstart);
9960 NOT_REACHED; /*NOTREACHED*/
9961 case '<': /* (?<...) */
9962 if (*RExC_parse == '!')
9964 else if (*RExC_parse != '=')
9970 case '\'': /* (?'...') */
9971 name_start= RExC_parse;
9972 svname = reg_scan_name(pRExC_state,
9973 SIZE_ONLY /* reverse test from the others */
9974 ? REG_RSN_RETURN_NAME
9975 : REG_RSN_RETURN_NULL);
9976 if (RExC_parse == name_start || *RExC_parse != paren)
9977 vFAIL2("Sequence (?%c... not terminated",
9978 paren=='>' ? '<' : paren);
9982 if (!svname) /* shouldn't happen */
9984 "panic: reg_scan_name returned NULL");
9985 if (!RExC_paren_names) {
9986 RExC_paren_names= newHV();
9987 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9989 RExC_paren_name_list= newAV();
9990 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9993 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9995 sv_dat = HeVAL(he_str);
9997 /* croak baby croak */
9999 "panic: paren_name hash element allocation failed");
10000 } else if ( SvPOK(sv_dat) ) {
10001 /* (?|...) can mean we have dupes so scan to check
10002 its already been stored. Maybe a flag indicating
10003 we are inside such a construct would be useful,
10004 but the arrays are likely to be quite small, so
10005 for now we punt -- dmq */
10006 IV count = SvIV(sv_dat);
10007 I32 *pv = (I32*)SvPVX(sv_dat);
10009 for ( i = 0 ; i < count ; i++ ) {
10010 if ( pv[i] == RExC_npar ) {
10016 pv = (I32*)SvGROW(sv_dat,
10017 SvCUR(sv_dat) + sizeof(I32)+1);
10018 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10019 pv[count] = RExC_npar;
10020 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10023 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10024 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10027 SvIV_set(sv_dat, 1);
10030 /* Yes this does cause a memory leak in debugging Perls
10032 if (!av_store(RExC_paren_name_list,
10033 RExC_npar, SvREFCNT_inc(svname)))
10034 SvREFCNT_dec_NN(svname);
10037 /*sv_dump(sv_dat);*/
10039 nextchar(pRExC_state);
10041 goto capturing_parens;
10043 RExC_seen |= REG_LOOKBEHIND_SEEN;
10044 RExC_in_lookbehind++;
10047 case '=': /* (?=...) */
10048 RExC_seen_zerolen++;
10050 case '!': /* (?!...) */
10051 RExC_seen_zerolen++;
10052 /* check if we're really just a "FAIL" assertion */
10054 nextchar(pRExC_state);
10055 if (*RExC_parse == ')') {
10056 ret=reg_node(pRExC_state, OPFAIL);
10057 nextchar(pRExC_state);
10061 case '|': /* (?|...) */
10062 /* branch reset, behave like a (?:...) except that
10063 buffers in alternations share the same numbers */
10065 after_freeze = freeze_paren = RExC_npar;
10067 case ':': /* (?:...) */
10068 case '>': /* (?>...) */
10070 case '$': /* (?$...) */
10071 case '@': /* (?@...) */
10072 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10074 case '0' : /* (?0) */
10075 case 'R' : /* (?R) */
10076 if (*RExC_parse != ')')
10077 FAIL("Sequence (?R) not terminated");
10078 ret = reg_node(pRExC_state, GOSTART);
10079 RExC_seen |= REG_GOSTART_SEEN;
10080 *flagp |= POSTPONED;
10081 nextchar(pRExC_state);
10084 /* named and numeric backreferences */
10085 case '&': /* (?&NAME) */
10086 parse_start = RExC_parse - 1;
10089 SV *sv_dat = reg_scan_name(pRExC_state,
10090 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10091 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10093 if (RExC_parse == RExC_end || *RExC_parse != ')')
10094 vFAIL("Sequence (?&... not terminated");
10095 goto gen_recurse_regop;
10098 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10100 vFAIL("Illegal pattern");
10102 goto parse_recursion;
10104 case '-': /* (?-1) */
10105 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10106 RExC_parse--; /* rewind to let it be handled later */
10110 case '1': case '2': case '3': case '4': /* (?1) */
10111 case '5': case '6': case '7': case '8': case '9':
10115 bool is_neg = FALSE;
10116 parse_start = RExC_parse - 1; /* MJD */
10117 if (*RExC_parse == '-') {
10121 num = grok_atou(RExC_parse, &endptr);
10123 RExC_parse = (char*)endptr;
10125 /* Some limit for num? */
10129 if (*RExC_parse!=')')
10130 vFAIL("Expecting close bracket");
10133 if ( paren == '-' ) {
10135 Diagram of capture buffer numbering.
10136 Top line is the normal capture buffer numbers
10137 Bottom line is the negative indexing as from
10141 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10145 num = RExC_npar + num;
10148 vFAIL("Reference to nonexistent group");
10150 } else if ( paren == '+' ) {
10151 num = RExC_npar + num - 1;
10154 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10156 if (num > (I32)RExC_rx->nparens) {
10158 vFAIL("Reference to nonexistent group");
10160 RExC_recurse_count++;
10161 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10162 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10163 22, "| |", (int)(depth * 2 + 1), "",
10164 (UV)ARG(ret), (IV)ARG2L(ret)));
10166 RExC_seen |= REG_RECURSE_SEEN;
10167 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10168 Set_Node_Offset(ret, parse_start); /* MJD */
10170 *flagp |= POSTPONED;
10171 nextchar(pRExC_state);
10176 case '?': /* (??...) */
10178 if (*RExC_parse != '{') {
10180 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10182 "Sequence (%"UTF8f"...) not recognized",
10183 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10184 NOT_REACHED; /*NOTREACHED*/
10186 *flagp |= POSTPONED;
10187 paren = *RExC_parse++;
10189 case '{': /* (?{...}) */
10192 struct reg_code_block *cb;
10194 RExC_seen_zerolen++;
10196 if ( !pRExC_state->num_code_blocks
10197 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10198 || pRExC_state->code_blocks[pRExC_state->code_index].start
10199 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10202 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10203 FAIL("panic: Sequence (?{...}): no code block found\n");
10204 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10206 /* this is a pre-compiled code block (?{...}) */
10207 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10208 RExC_parse = RExC_start + cb->end;
10211 if (cb->src_regex) {
10212 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10213 RExC_rxi->data->data[n] =
10214 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10215 RExC_rxi->data->data[n+1] = (void*)o;
10218 n = add_data(pRExC_state,
10219 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10220 RExC_rxi->data->data[n] = (void*)o;
10223 pRExC_state->code_index++;
10224 nextchar(pRExC_state);
10228 ret = reg_node(pRExC_state, LOGICAL);
10230 eval = reg2Lanode(pRExC_state, EVAL,
10233 /* for later propagation into (??{})
10235 RExC_flags & RXf_PMf_COMPILETIME
10240 REGTAIL(pRExC_state, ret, eval);
10241 /* deal with the length of this later - MJD */
10244 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10245 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10246 Set_Node_Offset(ret, parse_start);
10249 case '(': /* (?(?{...})...) and (?(?=...)...) */
10252 const int DEFINE_len = sizeof("DEFINE") - 1;
10253 if (RExC_parse[0] == '?') { /* (?(?...)) */
10254 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10255 || RExC_parse[1] == '<'
10256 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10260 ret = reg_node(pRExC_state, LOGICAL);
10264 tail = reg(pRExC_state, 1, &flag, depth+1);
10265 if (flag & RESTART_UTF8) {
10266 *flagp = RESTART_UTF8;
10269 REGTAIL(pRExC_state, ret, tail);
10272 /* Fall through to ‘Unknown switch condition’ at the
10273 end of the if/else chain. */
10275 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10276 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10278 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10279 char *name_start= RExC_parse++;
10281 SV *sv_dat=reg_scan_name(pRExC_state,
10282 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10283 if (RExC_parse == name_start || *RExC_parse != ch)
10284 vFAIL2("Sequence (?(%c... not terminated",
10285 (ch == '>' ? '<' : ch));
10288 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10289 RExC_rxi->data->data[num]=(void*)sv_dat;
10290 SvREFCNT_inc_simple_void(sv_dat);
10292 ret = reganode(pRExC_state,NGROUPP,num);
10293 goto insert_if_check_paren;
10295 else if (RExC_end - RExC_parse >= DEFINE_len
10296 && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10298 ret = reganode(pRExC_state,DEFINEP,0);
10299 RExC_parse += DEFINE_len;
10301 goto insert_if_check_paren;
10303 else if (RExC_parse[0] == 'R') {
10306 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10307 parno = grok_atou(RExC_parse, &endptr);
10309 RExC_parse = (char*)endptr;
10310 } else if (RExC_parse[0] == '&') {
10313 sv_dat = reg_scan_name(pRExC_state,
10315 ? REG_RSN_RETURN_NULL
10316 : REG_RSN_RETURN_DATA);
10317 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10319 ret = reganode(pRExC_state,INSUBP,parno);
10320 goto insert_if_check_paren;
10322 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10326 parno = grok_atou(RExC_parse, &endptr);
10328 RExC_parse = (char*)endptr;
10329 ret = reganode(pRExC_state, GROUPP, parno);
10331 insert_if_check_paren:
10332 if (*(tmp = nextchar(pRExC_state)) != ')') {
10333 /* nextchar also skips comments, so undo its work
10334 * and skip over the the next character.
10337 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10338 vFAIL("Switch condition not recognized");
10341 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10342 br = regbranch(pRExC_state, &flags, 1,depth+1);
10344 if (flags & RESTART_UTF8) {
10345 *flagp = RESTART_UTF8;
10348 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10351 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10353 c = *nextchar(pRExC_state);
10354 if (flags&HASWIDTH)
10355 *flagp |= HASWIDTH;
10358 vFAIL("(?(DEFINE)....) does not allow branches");
10360 /* Fake one for optimizer. */
10361 lastbr = reganode(pRExC_state, IFTHEN, 0);
10363 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10364 if (flags & RESTART_UTF8) {
10365 *flagp = RESTART_UTF8;
10368 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10371 REGTAIL(pRExC_state, ret, lastbr);
10372 if (flags&HASWIDTH)
10373 *flagp |= HASWIDTH;
10374 c = *nextchar(pRExC_state);
10379 if (RExC_parse>RExC_end)
10380 vFAIL("Switch (?(condition)... not terminated");
10382 vFAIL("Switch (?(condition)... contains too many branches");
10384 ender = reg_node(pRExC_state, TAIL);
10385 REGTAIL(pRExC_state, br, ender);
10387 REGTAIL(pRExC_state, lastbr, ender);
10388 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10391 REGTAIL(pRExC_state, ret, ender);
10392 RExC_size++; /* XXX WHY do we need this?!!
10393 For large programs it seems to be required
10394 but I can't figure out why. -- dmq*/
10397 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10398 vFAIL("Unknown switch condition (?(...))");
10400 case '[': /* (?[ ... ]) */
10401 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10404 RExC_parse--; /* for vFAIL to print correctly */
10405 vFAIL("Sequence (? incomplete");
10407 default: /* e.g., (?i) */
10410 parse_lparen_question_flags(pRExC_state);
10411 if (UCHARAT(RExC_parse) != ':') {
10412 nextchar(pRExC_state);
10417 nextchar(pRExC_state);
10422 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
10427 ret = reganode(pRExC_state, OPEN, parno);
10429 if (!RExC_nestroot)
10430 RExC_nestroot = parno;
10431 if (RExC_seen & REG_RECURSE_SEEN
10432 && !RExC_open_parens[parno-1])
10434 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10435 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10436 22, "| |", (int)(depth * 2 + 1), "",
10437 (IV)parno, REG_NODE_NUM(ret)));
10438 RExC_open_parens[parno-1]= ret;
10441 Set_Node_Length(ret, 1); /* MJD */
10442 Set_Node_Offset(ret, RExC_parse); /* MJD */
10452 /* Pick up the branches, linking them together. */
10453 parse_start = RExC_parse; /* MJD */
10454 br = regbranch(pRExC_state, &flags, 1,depth+1);
10456 /* branch_len = (paren != 0); */
10459 if (flags & RESTART_UTF8) {
10460 *flagp = RESTART_UTF8;
10463 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10465 if (*RExC_parse == '|') {
10466 if (!SIZE_ONLY && RExC_extralen) {
10467 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10470 reginsert(pRExC_state, BRANCH, br, depth+1);
10471 Set_Node_Length(br, paren != 0);
10472 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10476 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10478 else if (paren == ':') {
10479 *flagp |= flags&SIMPLE;
10481 if (is_open) { /* Starts with OPEN. */
10482 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10484 else if (paren != '?') /* Not Conditional */
10486 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10488 while (*RExC_parse == '|') {
10489 if (!SIZE_ONLY && RExC_extralen) {
10490 ender = reganode(pRExC_state, LONGJMP,0);
10492 /* Append to the previous. */
10493 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10496 RExC_extralen += 2; /* Account for LONGJMP. */
10497 nextchar(pRExC_state);
10498 if (freeze_paren) {
10499 if (RExC_npar > after_freeze)
10500 after_freeze = RExC_npar;
10501 RExC_npar = freeze_paren;
10503 br = regbranch(pRExC_state, &flags, 0, depth+1);
10506 if (flags & RESTART_UTF8) {
10507 *flagp = RESTART_UTF8;
10510 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10512 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10514 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10517 if (have_branch || paren != ':') {
10518 /* Make a closing node, and hook it on the end. */
10521 ender = reg_node(pRExC_state, TAIL);
10524 ender = reganode(pRExC_state, CLOSE, parno);
10525 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10526 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10527 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10528 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10529 RExC_close_parens[parno-1]= ender;
10530 if (RExC_nestroot == parno)
10533 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10534 Set_Node_Length(ender,1); /* MJD */
10540 *flagp &= ~HASWIDTH;
10543 ender = reg_node(pRExC_state, SUCCEED);
10546 ender = reg_node(pRExC_state, END);
10548 assert(!RExC_opend); /* there can only be one! */
10549 RExC_opend = ender;
10553 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10554 DEBUG_PARSE_MSG("lsbr");
10555 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10556 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10557 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10558 SvPV_nolen_const(RExC_mysv1),
10559 (IV)REG_NODE_NUM(lastbr),
10560 SvPV_nolen_const(RExC_mysv2),
10561 (IV)REG_NODE_NUM(ender),
10562 (IV)(ender - lastbr)
10565 REGTAIL(pRExC_state, lastbr, ender);
10567 if (have_branch && !SIZE_ONLY) {
10568 char is_nothing= 1;
10570 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10572 /* Hook the tails of the branches to the closing node. */
10573 for (br = ret; br; br = regnext(br)) {
10574 const U8 op = PL_regkind[OP(br)];
10575 if (op == BRANCH) {
10576 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10577 if ( OP(NEXTOPER(br)) != NOTHING
10578 || regnext(NEXTOPER(br)) != ender)
10581 else if (op == BRANCHJ) {
10582 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10583 /* for now we always disable this optimisation * /
10584 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10585 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10591 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10592 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10593 DEBUG_PARSE_MSG("NADA");
10594 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10595 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10596 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10597 SvPV_nolen_const(RExC_mysv1),
10598 (IV)REG_NODE_NUM(ret),
10599 SvPV_nolen_const(RExC_mysv2),
10600 (IV)REG_NODE_NUM(ender),
10605 if (OP(ender) == TAIL) {
10610 for ( opt= br + 1; opt < ender ; opt++ )
10611 OP(opt)= OPTIMIZED;
10612 NEXT_OFF(br)= ender - br;
10620 static const char parens[] = "=!<,>";
10622 if (paren && (p = strchr(parens, paren))) {
10623 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10624 int flag = (p - parens) > 1;
10627 node = SUSPEND, flag = 0;
10628 reginsert(pRExC_state, node,ret, depth+1);
10629 Set_Node_Cur_Length(ret, parse_start);
10630 Set_Node_Offset(ret, parse_start + 1);
10632 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10636 /* Check for proper termination. */
10638 /* restore original flags, but keep (?p) */
10639 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10640 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10641 RExC_parse = oregcomp_parse;
10642 vFAIL("Unmatched (");
10645 else if (!paren && RExC_parse < RExC_end) {
10646 if (*RExC_parse == ')') {
10648 vFAIL("Unmatched )");
10651 FAIL("Junk on end of regexp"); /* "Can't happen". */
10652 NOT_REACHED; /* NOTREACHED */
10655 if (RExC_in_lookbehind) {
10656 RExC_in_lookbehind--;
10658 if (after_freeze > RExC_npar)
10659 RExC_npar = after_freeze;
10664 - regbranch - one alternative of an | operator
10666 * Implements the concatenation operator.
10668 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10672 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10675 regnode *chain = NULL;
10677 I32 flags = 0, c = 0;
10678 GET_RE_DEBUG_FLAGS_DECL;
10680 PERL_ARGS_ASSERT_REGBRANCH;
10682 DEBUG_PARSE("brnc");
10687 if (!SIZE_ONLY && RExC_extralen)
10688 ret = reganode(pRExC_state, BRANCHJ,0);
10690 ret = reg_node(pRExC_state, BRANCH);
10691 Set_Node_Length(ret, 1);
10695 if (!first && SIZE_ONLY)
10696 RExC_extralen += 1; /* BRANCHJ */
10698 *flagp = WORST; /* Tentatively. */
10701 nextchar(pRExC_state);
10702 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10703 flags &= ~TRYAGAIN;
10704 latest = regpiece(pRExC_state, &flags,depth+1);
10705 if (latest == NULL) {
10706 if (flags & TRYAGAIN)
10708 if (flags & RESTART_UTF8) {
10709 *flagp = RESTART_UTF8;
10712 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10714 else if (ret == NULL)
10716 *flagp |= flags&(HASWIDTH|POSTPONED);
10717 if (chain == NULL) /* First piece. */
10718 *flagp |= flags&SPSTART;
10720 /* FIXME adding one for every branch after the first is probably
10721 * excessive now we have TRIE support. (hv) */
10723 REGTAIL(pRExC_state, chain, latest);
10728 if (chain == NULL) { /* Loop ran zero times. */
10729 chain = reg_node(pRExC_state, NOTHING);
10734 *flagp |= flags&SIMPLE;
10741 - regpiece - something followed by possible [*+?]
10743 * Note that the branching code sequences used for ? and the general cases
10744 * of * and + are somewhat optimized: they use the same NOTHING node as
10745 * both the endmarker for their branch list and the body of the last branch.
10746 * It might seem that this node could be dispensed with entirely, but the
10747 * endmarker role is not redundant.
10749 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10751 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10755 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10761 const char * const origparse = RExC_parse;
10763 I32 max = REG_INFTY;
10764 #ifdef RE_TRACK_PATTERN_OFFSETS
10767 const char *maxpos = NULL;
10769 /* Save the original in case we change the emitted regop to a FAIL. */
10770 regnode * const orig_emit = RExC_emit;
10772 GET_RE_DEBUG_FLAGS_DECL;
10774 PERL_ARGS_ASSERT_REGPIECE;
10776 DEBUG_PARSE("piec");
10778 ret = regatom(pRExC_state, &flags,depth+1);
10780 if (flags & (TRYAGAIN|RESTART_UTF8))
10781 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10783 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10789 if (op == '{' && regcurly(RExC_parse)) {
10791 #ifdef RE_TRACK_PATTERN_OFFSETS
10792 parse_start = RExC_parse; /* MJD */
10794 next = RExC_parse + 1;
10795 while (isDIGIT(*next) || *next == ',') {
10796 if (*next == ',') {
10804 if (*next == '}') { /* got one */
10805 const char* endptr;
10809 min = grok_atou(RExC_parse, &endptr);
10810 if (*maxpos == ',')
10813 maxpos = RExC_parse;
10814 max = grok_atou(maxpos, &endptr);
10815 if (!max && *maxpos != '0')
10816 max = REG_INFTY; /* meaning "infinity" */
10817 else if (max >= REG_INFTY)
10818 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10820 nextchar(pRExC_state);
10821 if (max < min) { /* If can't match, warn and optimize to fail
10825 /* We can't back off the size because we have to reserve
10826 * enough space for all the things we are about to throw
10827 * away, but we can shrink it by the ammount we are about
10828 * to re-use here */
10829 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10832 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10833 RExC_emit = orig_emit;
10835 ret = reg_node(pRExC_state, OPFAIL);
10838 else if (min == max
10839 && RExC_parse < RExC_end
10840 && (*RExC_parse == '?' || *RExC_parse == '+'))
10843 ckWARN2reg(RExC_parse + 1,
10844 "Useless use of greediness modifier '%c'",
10847 /* Absorb the modifier, so later code doesn't see nor use
10849 nextchar(pRExC_state);
10853 if ((flags&SIMPLE)) {
10854 MARK_NAUGHTY_EXP(2, 2);
10855 reginsert(pRExC_state, CURLY, ret, depth+1);
10856 Set_Node_Offset(ret, parse_start+1); /* MJD */
10857 Set_Node_Cur_Length(ret, parse_start);
10860 regnode * const w = reg_node(pRExC_state, WHILEM);
10863 REGTAIL(pRExC_state, ret, w);
10864 if (!SIZE_ONLY && RExC_extralen) {
10865 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10866 reginsert(pRExC_state, NOTHING,ret, depth+1);
10867 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10869 reginsert(pRExC_state, CURLYX,ret, depth+1);
10871 Set_Node_Offset(ret, parse_start+1);
10872 Set_Node_Length(ret,
10873 op == '{' ? (RExC_parse - parse_start) : 1);
10875 if (!SIZE_ONLY && RExC_extralen)
10876 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10877 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10879 RExC_whilem_seen++, RExC_extralen += 3;
10880 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
10887 *flagp |= HASWIDTH;
10889 ARG1_SET(ret, (U16)min);
10890 ARG2_SET(ret, (U16)max);
10892 if (max == REG_INFTY)
10893 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10899 if (!ISMULT1(op)) {
10904 #if 0 /* Now runtime fix should be reliable. */
10906 /* if this is reinstated, don't forget to put this back into perldiag:
10908 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10910 (F) The part of the regexp subject to either the * or + quantifier
10911 could match an empty string. The {#} shows in the regular
10912 expression about where the problem was discovered.
10916 if (!(flags&HASWIDTH) && op != '?')
10917 vFAIL("Regexp *+ operand could be empty");
10920 #ifdef RE_TRACK_PATTERN_OFFSETS
10921 parse_start = RExC_parse;
10923 nextchar(pRExC_state);
10925 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10927 if (op == '*' && (flags&SIMPLE)) {
10928 reginsert(pRExC_state, STAR, ret, depth+1);
10931 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10933 else if (op == '*') {
10937 else if (op == '+' && (flags&SIMPLE)) {
10938 reginsert(pRExC_state, PLUS, ret, depth+1);
10941 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10943 else if (op == '+') {
10947 else if (op == '?') {
10952 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10953 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10954 ckWARN2reg(RExC_parse,
10955 "%"UTF8f" matches null string many times",
10956 UTF8fARG(UTF, (RExC_parse >= origparse
10957 ? RExC_parse - origparse
10960 (void)ReREFCNT_inc(RExC_rx_sv);
10963 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10964 nextchar(pRExC_state);
10965 reginsert(pRExC_state, MINMOD, ret, depth+1);
10966 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10969 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10971 nextchar(pRExC_state);
10972 ender = reg_node(pRExC_state, SUCCEED);
10973 REGTAIL(pRExC_state, ret, ender);
10974 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10976 ender = reg_node(pRExC_state, TAIL);
10977 REGTAIL(pRExC_state, ret, ender);
10980 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10982 vFAIL("Nested quantifiers");
10989 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10990 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10994 /* This is expected to be called by a parser routine that has recognized '\N'
10995 and needs to handle the rest. RExC_parse is expected to point at the first
10996 char following the N at the time of the call. On successful return,
10997 RExC_parse has been updated to point to just after the sequence identified
10998 by this routine, <*flagp> has been updated, and the non-NULL input pointers
10999 have been set appropriately.
11001 The typical case for this is \N{some character name}. This is usually
11002 called while parsing the input, filling in or ready to fill in an EXACTish
11003 node, and the code point for the character should be returned, so that it
11004 can be added to the node, and parsing continued with the next input
11005 character. But it may be that instead of a single character the \N{}
11006 expands to more than one, a named sequence. In this case any following
11007 quantifier applies to the whole sequence, and it is easier, given the code
11008 structure that calls this, to handle it from a different area of the code.
11009 For this reason, the input parameters can be set so that it returns valid
11010 only on one or the other of these cases.
11012 Another possibility is for the input to be an empty \N{}, which for
11013 backwards compatibility we accept, but generate a NOTHING node which should
11014 later get optimized out. This is handled from the area of code which can
11015 handle a named sequence, so if called with the parameters for the other, it
11018 Still another possibility is for the \N to mean [^\n], and not a single
11019 character or explicit sequence at all. This is determined by context.
11020 Again, this is handled from the area of code which can handle a named
11021 sequence, so if called with the parameters for the other, it also fails.
11023 And the final possibility is for the \N to be called from within a bracketed
11024 character class. In this case the [^\n] meaning makes no sense, and so is
11025 an error. Other anomalous situations are left to the calling code to handle.
11027 For non-single-quoted regexes, the tokenizer has attempted to decide which
11028 of the above applies, and in the case of a named sequence, has converted it
11029 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11030 where c1... are the characters in the sequence. For single-quoted regexes,
11031 the tokenizer passes the \N sequence through unchanged; this code will not
11032 attempt to determine this nor expand those, instead raising a syntax error.
11033 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11034 or there is no '}', it signals that this \N occurrence means to match a
11035 non-newline. (This mostly was done because of [perl #56444].)
11037 The API is somewhat convoluted due to historical and the above reasons.
11039 The function raises an error (via vFAIL), and doesn't return for various
11040 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
11041 it returns a count of how many characters were accounted for by it. (This
11042 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11043 points in the sequence. It sets <node_p>, <valuep>, and/or
11044 <substitute_parse> on success.
11046 If <valuep> is non-null, it means the caller can accept an input sequence
11047 consisting of just a single code point; <*valuep> is set to the value of the
11048 only or first code point in the input.
11050 If <substitute_parse> is non-null, it means the caller can accept an input
11051 sequence consisting of one or more code points; <*substitute_parse> is a
11052 newly created mortal SV* in this case, containing \x{} escapes representing
11055 Both <valuep> and <substitute_parse> can be non-NULL.
11057 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
11058 that the caller can accept any legal sequence other than a single code
11059 point. To wit, <*node_p> is set as follows:
11060 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11061 2) \N{}: points to a new NOTHING node; return is 0
11062 3) otherwise: points to a new EXACT node containing the resolved
11063 string; return is the number of code points in the
11064 string. This will never be 1.
11065 Note that failure is returned for single code point sequences if <valuep> is
11066 null and <node_p> is not.
11069 char * endbrace; /* '}' following the name */
11071 char *endchar; /* Points to '.' or '}' ending cur char in the input
11073 bool has_multiple_chars; /* true if the input stream contains a sequence of
11074 more than one character */
11075 bool in_char_class = substitute_parse != NULL;
11076 STRLEN count = 0; /* Number of characters in this sequence */
11078 GET_RE_DEBUG_FLAGS_DECL;
11080 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11082 GET_RE_DEBUG_FLAGS;
11084 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
11085 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11087 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11088 * modifier. The other meaning does not, so use a temporary until we find
11089 * out which we are being called with */
11090 p = (RExC_flags & RXf_PMf_EXTENDED)
11091 ? regpatws(pRExC_state, RExC_parse,
11092 TRUE) /* means recognize comments */
11095 /* Disambiguate between \N meaning a named character versus \N meaning
11096 * [^\n]. The former is assumed when it can't be the latter. */
11097 if (*p != '{' || regcurly(p)) {
11100 /* no bare \N allowed in a charclass */
11101 if (in_char_class) {
11102 vFAIL("\\N in a character class must be a named character: \\N{...}");
11104 return (STRLEN) -1;
11106 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11108 nextchar(pRExC_state);
11109 *node_p = reg_node(pRExC_state, REG_ANY);
11110 *flagp |= HASWIDTH|SIMPLE;
11112 Set_Node_Length(*node_p, 1); /* MJD */
11116 /* Here, we have decided it should be a named character or sequence */
11118 /* The test above made sure that the next real character is a '{', but
11119 * under the /x modifier, it could be separated by space (or a comment and
11120 * \n) and this is not allowed (for consistency with \x{...} and the
11121 * tokenizer handling of \N{NAME}). */
11122 if (*RExC_parse != '{') {
11123 vFAIL("Missing braces on \\N{}");
11126 RExC_parse++; /* Skip past the '{' */
11128 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11129 || ! (endbrace == RExC_parse /* nothing between the {} */
11130 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11131 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11134 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11135 vFAIL("\\N{NAME} must be resolved by the lexer");
11138 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11140 if (endbrace == RExC_parse) { /* empty: \N{} */
11142 *node_p = reg_node(pRExC_state,NOTHING);
11144 else if (! in_char_class) {
11145 return (STRLEN) -1;
11147 nextchar(pRExC_state);
11151 RExC_parse += 2; /* Skip past the 'U+' */
11153 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11155 /* Code points are separated by dots. If none, there is only one code
11156 * point, and is terminated by the brace */
11157 has_multiple_chars = (endchar < endbrace);
11159 /* We get the first code point if we want it, and either there is only one,
11160 * or we can accept both cases of one and there is more than one */
11161 if (valuep && (substitute_parse || ! has_multiple_chars)) {
11162 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11163 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11164 | PERL_SCAN_DISALLOW_PREFIX
11166 /* No errors in the first pass (See [perl
11167 * #122671].) We let the code below find the
11168 * errors when there are multiple chars. */
11169 | ((SIZE_ONLY || has_multiple_chars)
11170 ? PERL_SCAN_SILENT_ILLDIGIT
11173 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11175 /* The tokenizer should have guaranteed validity, but it's possible to
11176 * bypass it by using single quoting, so check. Don't do the check
11177 * here when there are multiple chars; we do it below anyway. */
11178 if (! has_multiple_chars) {
11179 if (length_of_hex == 0
11180 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11182 RExC_parse += length_of_hex; /* Includes all the valid */
11183 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11184 ? UTF8SKIP(RExC_parse)
11186 /* Guard against malformed utf8 */
11187 if (RExC_parse >= endchar) {
11188 RExC_parse = endchar;
11190 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11193 RExC_parse = endbrace + 1;
11198 /* Here, we should have already handled the case where a single character
11199 * is expected and found. So it is a failure if we aren't expecting
11200 * multiple chars and got them; or didn't get them but wanted them. We
11201 * fail without advancing the parse, so that the caller can try again with
11202 * different acceptance criteria */
11203 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11205 return (STRLEN) -1;
11209 /* What is done here is to convert this to a sub-pattern of the form
11210 * \x{char1}\x{char2}...
11211 * and then either return it in <*substitute_parse> if non-null; or
11212 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
11213 * way, it retains its atomicness, while not having to worry about
11214 * special handling that some code points may have. toke.c has
11215 * converted the original Unicode values to native, so that we can just
11216 * pass on the hex values unchanged. We do have to set a flag to keep
11217 * recoding from happening in the recursion */
11221 char *orig_end = RExC_end;
11224 if (substitute_parse) {
11225 *substitute_parse = newSVpvs("");
11228 substitute_parse = &dummy;
11229 *substitute_parse = newSVpvs("?:");
11231 *substitute_parse = sv_2mortal(*substitute_parse);
11233 while (RExC_parse < endbrace) {
11235 /* Convert to notation the rest of the code understands */
11236 sv_catpv(*substitute_parse, "\\x{");
11237 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11238 sv_catpv(*substitute_parse, "}");
11240 /* Point to the beginning of the next character in the sequence. */
11241 RExC_parse = endchar + 1;
11242 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11246 if (! in_char_class) {
11247 sv_catpv(*substitute_parse, ")");
11250 RExC_parse = SvPV(*substitute_parse, len);
11252 /* Don't allow empty number */
11253 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11254 RExC_parse = endbrace;
11255 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11257 RExC_end = RExC_parse + len;
11259 /* The values are Unicode, and therefore not subject to recoding */
11260 RExC_override_recoding = 1;
11263 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11264 if (flags & RESTART_UTF8) {
11265 *flagp = RESTART_UTF8;
11266 return (STRLEN) -1;
11268 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11271 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11274 RExC_parse = endbrace;
11275 RExC_end = orig_end;
11276 RExC_override_recoding = 0;
11278 nextchar(pRExC_state);
11288 * It returns the code point in utf8 for the value in *encp.
11289 * value: a code value in the source encoding
11290 * encp: a pointer to an Encode object
11292 * If the result from Encode is not a single character,
11293 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11296 S_reg_recode(pTHX_ const char value, SV **encp)
11299 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11300 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11301 const STRLEN newlen = SvCUR(sv);
11302 UV uv = UNICODE_REPLACEMENT;
11304 PERL_ARGS_ASSERT_REG_RECODE;
11308 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11311 if (!newlen || numlen != newlen) {
11312 uv = UNICODE_REPLACEMENT;
11318 PERL_STATIC_INLINE U8
11319 S_compute_EXACTish(RExC_state_t *pRExC_state)
11323 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11331 op = get_regex_charset(RExC_flags);
11332 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11333 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11334 been, so there is no hole */
11337 return op + EXACTF;
11340 PERL_STATIC_INLINE void
11341 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11342 regnode *node, I32* flagp, STRLEN len, UV code_point,
11345 /* This knows the details about sizing an EXACTish node, setting flags for
11346 * it (by setting <*flagp>, and potentially populating it with a single
11349 * If <len> (the length in bytes) is non-zero, this function assumes that
11350 * the node has already been populated, and just does the sizing. In this
11351 * case <code_point> should be the final code point that has already been
11352 * placed into the node. This value will be ignored except that under some
11353 * circumstances <*flagp> is set based on it.
11355 * If <len> is zero, the function assumes that the node is to contain only
11356 * the single character given by <code_point> and calculates what <len>
11357 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11358 * additionally will populate the node's STRING with <code_point> or its
11361 * In both cases <*flagp> is appropriately set
11363 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11364 * 255, must be folded (the former only when the rules indicate it can
11367 * When it does the populating, it looks at the flag 'downgradable'. If
11368 * true with a node that folds, it checks if the single code point
11369 * participates in a fold, and if not downgrades the node to an EXACT.
11370 * This helps the optimizer */
11372 bool len_passed_in = cBOOL(len != 0);
11373 U8 character[UTF8_MAXBYTES_CASE+1];
11375 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11377 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11378 * sizing difference, and is extra work that is thrown away */
11379 if (downgradable && ! PASS2) {
11380 downgradable = FALSE;
11383 if (! len_passed_in) {
11385 if (UVCHR_IS_INVARIANT(code_point)) {
11386 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11387 *character = (U8) code_point;
11389 else { /* Here is /i and not /l. (toFOLD() is defined on just
11390 ASCII, which isn't the same thing as INVARIANT on
11391 EBCDIC, but it works there, as the extra invariants
11392 fold to themselves) */
11393 *character = toFOLD((U8) code_point);
11395 /* We can downgrade to an EXACT node if this character
11396 * isn't a folding one. Note that this assumes that
11397 * nothing above Latin1 folds to some other invariant than
11398 * one of these alphabetics; otherwise we would also have
11400 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11401 * || ASCII_FOLD_RESTRICTED))
11403 if (downgradable && PL_fold[code_point] == code_point) {
11409 else if (FOLD && (! LOC
11410 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11411 { /* Folding, and ok to do so now */
11412 UV folded = _to_uni_fold_flags(
11416 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11417 ? FOLD_FLAGS_NOMIX_ASCII
11420 && folded == code_point /* This quickly rules out many
11421 cases, avoiding the
11422 _invlist_contains_cp() overhead
11424 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11431 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11433 /* Not folding this cp, and can output it directly */
11434 *character = UTF8_TWO_BYTE_HI(code_point);
11435 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11439 uvchr_to_utf8( character, code_point);
11440 len = UTF8SKIP(character);
11442 } /* Else pattern isn't UTF8. */
11444 *character = (U8) code_point;
11446 } /* Else is folded non-UTF8 */
11447 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11449 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11450 * comments at join_exact()); */
11451 *character = (U8) code_point;
11454 /* Can turn into an EXACT node if we know the fold at compile time,
11455 * and it folds to itself and doesn't particpate in other folds */
11458 && PL_fold_latin1[code_point] == code_point
11459 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11460 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11464 } /* else is Sharp s. May need to fold it */
11465 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11467 *(character + 1) = 's';
11471 *character = LATIN_SMALL_LETTER_SHARP_S;
11477 RExC_size += STR_SZ(len);
11480 RExC_emit += STR_SZ(len);
11481 STR_LEN(node) = len;
11482 if (! len_passed_in) {
11483 Copy((char *) character, STRING(node), len, char);
11487 *flagp |= HASWIDTH;
11489 /* A single character node is SIMPLE, except for the special-cased SHARP S
11491 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11492 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11493 || ! FOLD || ! DEPENDS_SEMANTICS))
11498 /* The OP may not be well defined in PASS1 */
11499 if (PASS2 && OP(node) == EXACTFL) {
11500 RExC_contains_locale = 1;
11505 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11506 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11509 S_backref_value(char *p)
11511 const char* endptr;
11512 UV val = grok_atou(p, &endptr);
11513 if (endptr == p || endptr == NULL || val > I32_MAX)
11520 - regatom - the lowest level
11522 Try to identify anything special at the start of the pattern. If there
11523 is, then handle it as required. This may involve generating a single regop,
11524 such as for an assertion; or it may involve recursing, such as to
11525 handle a () structure.
11527 If the string doesn't start with something special then we gobble up
11528 as much literal text as we can.
11530 Once we have been able to handle whatever type of thing started the
11531 sequence, we return.
11533 Note: we have to be careful with escapes, as they can be both literal
11534 and special, and in the case of \10 and friends, context determines which.
11536 A summary of the code structure is:
11538 switch (first_byte) {
11539 cases for each special:
11540 handle this special;
11543 switch (2nd byte) {
11544 cases for each unambiguous special:
11545 handle this special;
11547 cases for each ambigous special/literal:
11549 if (special) handle here
11551 default: // unambiguously literal:
11554 default: // is a literal char
11557 create EXACTish node for literal;
11558 while (more input and node isn't full) {
11559 switch (input_byte) {
11560 cases for each special;
11561 make sure parse pointer is set so that the next call to
11562 regatom will see this special first
11563 goto loopdone; // EXACTish node terminated by prev. char
11565 append char to EXACTISH node;
11567 get next input byte;
11571 return the generated node;
11573 Specifically there are two separate switches for handling
11574 escape sequences, with the one for handling literal escapes requiring
11575 a dummy entry for all of the special escapes that are actually handled
11578 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11580 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11582 Otherwise does not return NULL.
11586 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11588 regnode *ret = NULL;
11590 char *parse_start = RExC_parse;
11595 GET_RE_DEBUG_FLAGS_DECL;
11597 *flagp = WORST; /* Tentatively. */
11599 DEBUG_PARSE("atom");
11601 PERL_ARGS_ASSERT_REGATOM;
11604 switch ((U8)*RExC_parse) {
11606 RExC_seen_zerolen++;
11607 nextchar(pRExC_state);
11608 if (RExC_flags & RXf_PMf_MULTILINE)
11609 ret = reg_node(pRExC_state, MBOL);
11611 ret = reg_node(pRExC_state, SBOL);
11612 Set_Node_Length(ret, 1); /* MJD */
11615 nextchar(pRExC_state);
11617 RExC_seen_zerolen++;
11618 if (RExC_flags & RXf_PMf_MULTILINE)
11619 ret = reg_node(pRExC_state, MEOL);
11621 ret = reg_node(pRExC_state, SEOL);
11622 Set_Node_Length(ret, 1); /* MJD */
11625 nextchar(pRExC_state);
11626 if (RExC_flags & RXf_PMf_SINGLELINE)
11627 ret = reg_node(pRExC_state, SANY);
11629 ret = reg_node(pRExC_state, REG_ANY);
11630 *flagp |= HASWIDTH|SIMPLE;
11632 Set_Node_Length(ret, 1); /* MJD */
11636 char * const oregcomp_parse = ++RExC_parse;
11637 ret = regclass(pRExC_state, flagp,depth+1,
11638 FALSE, /* means parse the whole char class */
11639 TRUE, /* allow multi-char folds */
11640 FALSE, /* don't silence non-portable warnings. */
11641 (bool) RExC_strict,
11643 if (*RExC_parse != ']') {
11644 RExC_parse = oregcomp_parse;
11645 vFAIL("Unmatched [");
11648 if (*flagp & RESTART_UTF8)
11650 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11653 nextchar(pRExC_state);
11654 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11658 nextchar(pRExC_state);
11659 ret = reg(pRExC_state, 2, &flags,depth+1);
11661 if (flags & TRYAGAIN) {
11662 if (RExC_parse == RExC_end) {
11663 /* Make parent create an empty node if needed. */
11664 *flagp |= TRYAGAIN;
11669 if (flags & RESTART_UTF8) {
11670 *flagp = RESTART_UTF8;
11673 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11676 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11680 if (flags & TRYAGAIN) {
11681 *flagp |= TRYAGAIN;
11684 vFAIL("Internal urp");
11685 /* Supposed to be caught earlier. */
11691 vFAIL("Quantifier follows nothing");
11696 This switch handles escape sequences that resolve to some kind
11697 of special regop and not to literal text. Escape sequnces that
11698 resolve to literal text are handled below in the switch marked
11701 Every entry in this switch *must* have a corresponding entry
11702 in the literal escape switch. However, the opposite is not
11703 required, as the default for this switch is to jump to the
11704 literal text handling code.
11706 switch ((U8)*++RExC_parse) {
11707 /* Special Escapes */
11709 RExC_seen_zerolen++;
11710 ret = reg_node(pRExC_state, SBOL);
11711 /* SBOL is shared with /^/ so we set the flags so we can tell
11712 * /\A/ from /^/ in split. We check ret because first pass we
11713 * have no regop struct to set the flags on. */
11717 goto finish_meta_pat;
11719 ret = reg_node(pRExC_state, GPOS);
11720 RExC_seen |= REG_GPOS_SEEN;
11722 goto finish_meta_pat;
11724 RExC_seen_zerolen++;
11725 ret = reg_node(pRExC_state, KEEPS);
11727 /* XXX:dmq : disabling in-place substitution seems to
11728 * be necessary here to avoid cases of memory corruption, as
11729 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11731 RExC_seen |= REG_LOOKBEHIND_SEEN;
11732 goto finish_meta_pat;
11734 ret = reg_node(pRExC_state, SEOL);
11736 RExC_seen_zerolen++; /* Do not optimize RE away */
11737 goto finish_meta_pat;
11739 ret = reg_node(pRExC_state, EOS);
11741 RExC_seen_zerolen++; /* Do not optimize RE away */
11742 goto finish_meta_pat;
11744 ret = reg_node(pRExC_state, CANY);
11745 RExC_seen |= REG_CANY_SEEN;
11746 *flagp |= HASWIDTH|SIMPLE;
11748 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11750 goto finish_meta_pat;
11752 ret = reg_node(pRExC_state, CLUMP);
11753 *flagp |= HASWIDTH;
11754 goto finish_meta_pat;
11760 arg = ANYOF_WORDCHAR;
11767 RExC_seen_zerolen++;
11768 RExC_seen |= REG_LOOKBEHIND_SEEN;
11769 op = BOUND + get_regex_charset(RExC_flags);
11770 if (op > BOUNDA) { /* /aa is same as /a */
11773 else if (op == BOUNDL) {
11774 RExC_contains_locale = 1;
11778 op += NBOUND - BOUND;
11781 ret = reg_node(pRExC_state, op);
11783 if ((U8) *(RExC_parse + 1) == '{') {
11784 /* diag_listed_as: Use "%s" instead of "%s" */
11785 vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
11787 goto finish_meta_pat;
11794 if (! DEPENDS_SEMANTICS) {
11798 /* \d doesn't have any matches in the upper Latin1 range, hence /d
11799 * is equivalent to /u. Changing to /u saves some branches at
11802 goto join_posix_op_known;
11805 ret = reg_node(pRExC_state, LNBREAK);
11806 *flagp |= HASWIDTH|SIMPLE;
11807 goto finish_meta_pat;
11815 goto join_posix_op_known;
11821 arg = ANYOF_VERTWS;
11823 goto join_posix_op_known;
11833 op = POSIXD + get_regex_charset(RExC_flags);
11834 if (op > POSIXA) { /* /aa is same as /a */
11837 else if (op == POSIXL) {
11838 RExC_contains_locale = 1;
11841 join_posix_op_known:
11844 op += NPOSIXD - POSIXD;
11847 ret = reg_node(pRExC_state, op);
11849 FLAGS(ret) = namedclass_to_classnum(arg);
11852 *flagp |= HASWIDTH|SIMPLE;
11856 nextchar(pRExC_state);
11857 Set_Node_Length(ret, 2); /* MJD */
11863 char* parse_start = RExC_parse - 2;
11868 ret = regclass(pRExC_state, flagp,depth+1,
11869 TRUE, /* means just parse this element */
11870 FALSE, /* don't allow multi-char folds */
11871 FALSE, /* don't silence non-portable warnings.
11872 It would be a bug if these returned
11874 (bool) RExC_strict,
11876 /* regclass() can only return RESTART_UTF8 if multi-char folds
11879 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11884 Set_Node_Offset(ret, parse_start + 2);
11885 Set_Node_Cur_Length(ret, parse_start);
11886 nextchar(pRExC_state);
11890 /* Handle \N and \N{NAME} with multiple code points here and not
11891 * below because it can be multicharacter. join_exact() will join
11892 * them up later on. Also this makes sure that things like
11893 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11894 * The options to the grok function call causes it to fail if the
11895 * sequence is just a single code point. We then go treat it as
11896 * just another character in the current EXACT node, and hence it
11897 * gets uniform treatment with all the other characters. The
11898 * special treatment for quantifiers is not needed for such single
11899 * character sequences */
11901 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11904 if (*flagp & RESTART_UTF8)
11910 case 'k': /* Handle \k<NAME> and \k'NAME' */
11913 char ch= RExC_parse[1];
11914 if (ch != '<' && ch != '\'' && ch != '{') {
11916 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11917 vFAIL2("Sequence %.2s... not terminated",parse_start);
11919 /* this pretty much dupes the code for (?P=...) in reg(), if
11920 you change this make sure you change that */
11921 char* name_start = (RExC_parse += 2);
11923 SV *sv_dat = reg_scan_name(pRExC_state,
11924 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11925 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11926 if (RExC_parse == name_start || *RExC_parse != ch)
11927 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11928 vFAIL2("Sequence %.3s... not terminated",parse_start);
11931 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11932 RExC_rxi->data->data[num]=(void*)sv_dat;
11933 SvREFCNT_inc_simple_void(sv_dat);
11937 ret = reganode(pRExC_state,
11940 : (ASCII_FOLD_RESTRICTED)
11942 : (AT_LEAST_UNI_SEMANTICS)
11948 *flagp |= HASWIDTH;
11950 /* override incorrect value set in reganode MJD */
11951 Set_Node_Offset(ret, parse_start+1);
11952 Set_Node_Cur_Length(ret, parse_start);
11953 nextchar(pRExC_state);
11959 case '1': case '2': case '3': case '4':
11960 case '5': case '6': case '7': case '8': case '9':
11965 if (*RExC_parse == 'g') {
11969 if (*RExC_parse == '{') {
11973 if (*RExC_parse == '-') {
11977 if (hasbrace && !isDIGIT(*RExC_parse)) {
11978 if (isrel) RExC_parse--;
11980 goto parse_named_seq;
11983 num = S_backref_value(RExC_parse);
11985 vFAIL("Reference to invalid group 0");
11986 else if (num == I32_MAX) {
11987 if (isDIGIT(*RExC_parse))
11988 vFAIL("Reference to nonexistent group");
11990 vFAIL("Unterminated \\g... pattern");
11994 num = RExC_npar - num;
11996 vFAIL("Reference to nonexistent or unclosed group");
12000 num = S_backref_value(RExC_parse);
12001 /* bare \NNN might be backref or octal - if it is larger than or equal
12002 * RExC_npar then it is assumed to be and octal escape.
12003 * Note RExC_npar is +1 from the actual number of parens*/
12004 if (num == I32_MAX || (num > 9 && num >= RExC_npar
12005 && *RExC_parse != '8' && *RExC_parse != '9'))
12007 /* Probably a character specified in octal, e.g. \35 */
12012 /* at this point RExC_parse definitely points to a backref
12015 #ifdef RE_TRACK_PATTERN_OFFSETS
12016 char * const parse_start = RExC_parse - 1; /* MJD */
12018 while (isDIGIT(*RExC_parse))
12021 if (*RExC_parse != '}')
12022 vFAIL("Unterminated \\g{...} pattern");
12026 if (num > (I32)RExC_rx->nparens)
12027 vFAIL("Reference to nonexistent group");
12030 ret = reganode(pRExC_state,
12033 : (ASCII_FOLD_RESTRICTED)
12035 : (AT_LEAST_UNI_SEMANTICS)
12041 *flagp |= HASWIDTH;
12043 /* override incorrect value set in reganode MJD */
12044 Set_Node_Offset(ret, parse_start+1);
12045 Set_Node_Cur_Length(ret, parse_start);
12047 nextchar(pRExC_state);
12052 if (RExC_parse >= RExC_end)
12053 FAIL("Trailing \\");
12056 /* Do not generate "unrecognized" warnings here, we fall
12057 back into the quick-grab loop below */
12064 if (RExC_flags & RXf_PMf_EXTENDED) {
12065 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12066 if (RExC_parse < RExC_end)
12073 parse_start = RExC_parse - 1;
12082 #define MAX_NODE_STRING_SIZE 127
12083 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12085 U8 upper_parse = MAX_NODE_STRING_SIZE;
12086 U8 node_type = compute_EXACTish(pRExC_state);
12087 bool next_is_quantifier;
12088 char * oldp = NULL;
12090 /* We can convert EXACTF nodes to EXACTFU if they contain only
12091 * characters that match identically regardless of the target
12092 * string's UTF8ness. The reason to do this is that EXACTF is not
12093 * trie-able, EXACTFU is.
12095 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12096 * contain only above-Latin1 characters (hence must be in UTF8),
12097 * which don't participate in folds with Latin1-range characters,
12098 * as the latter's folds aren't known until runtime. (We don't
12099 * need to figure this out until pass 2) */
12100 bool maybe_exactfu = PASS2
12101 && (node_type == EXACTF || node_type == EXACTFL);
12103 /* If a folding node contains only code points that don't
12104 * participate in folds, it can be changed into an EXACT node,
12105 * which allows the optimizer more things to look for */
12108 ret = reg_node(pRExC_state, node_type);
12110 /* In pass1, folded, we use a temporary buffer instead of the
12111 * actual node, as the node doesn't exist yet */
12112 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12118 /* We do the EXACTFish to EXACT node only if folding. (And we
12119 * don't need to figure this out until pass 2) */
12120 maybe_exact = FOLD && PASS2;
12122 /* XXX The node can hold up to 255 bytes, yet this only goes to
12123 * 127. I (khw) do not know why. Keeping it somewhat less than
12124 * 255 allows us to not have to worry about overflow due to
12125 * converting to utf8 and fold expansion, but that value is
12126 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12127 * split up by this limit into a single one using the real max of
12128 * 255. Even at 127, this breaks under rare circumstances. If
12129 * folding, we do not want to split a node at a character that is a
12130 * non-final in a multi-char fold, as an input string could just
12131 * happen to want to match across the node boundary. The join
12132 * would solve that problem if the join actually happens. But a
12133 * series of more than two nodes in a row each of 127 would cause
12134 * the first join to succeed to get to 254, but then there wouldn't
12135 * be room for the next one, which could at be one of those split
12136 * multi-char folds. I don't know of any fool-proof solution. One
12137 * could back off to end with only a code point that isn't such a
12138 * non-final, but it is possible for there not to be any in the
12140 for (p = RExC_parse - 1;
12141 len < upper_parse && p < RExC_end;
12146 if (RExC_flags & RXf_PMf_EXTENDED)
12147 p = regpatws(pRExC_state, p,
12148 TRUE); /* means recognize comments */
12159 /* Literal Escapes Switch
12161 This switch is meant to handle escape sequences that
12162 resolve to a literal character.
12164 Every escape sequence that represents something
12165 else, like an assertion or a char class, is handled
12166 in the switch marked 'Special Escapes' above in this
12167 routine, but also has an entry here as anything that
12168 isn't explicitly mentioned here will be treated as
12169 an unescaped equivalent literal.
12172 switch ((U8)*++p) {
12173 /* These are all the special escapes. */
12174 case 'A': /* Start assertion */
12175 case 'b': case 'B': /* Word-boundary assertion*/
12176 case 'C': /* Single char !DANGEROUS! */
12177 case 'd': case 'D': /* digit class */
12178 case 'g': case 'G': /* generic-backref, pos assertion */
12179 case 'h': case 'H': /* HORIZWS */
12180 case 'k': case 'K': /* named backref, keep marker */
12181 case 'p': case 'P': /* Unicode property */
12182 case 'R': /* LNBREAK */
12183 case 's': case 'S': /* space class */
12184 case 'v': case 'V': /* VERTWS */
12185 case 'w': case 'W': /* word class */
12186 case 'X': /* eXtended Unicode "combining
12187 character sequence" */
12188 case 'z': case 'Z': /* End of line/string assertion */
12192 /* Anything after here is an escape that resolves to a
12193 literal. (Except digits, which may or may not)
12199 case 'N': /* Handle a single-code point named character. */
12200 /* The options cause it to fail if a multiple code
12201 * point sequence. Handle those in the switch() above
12203 RExC_parse = p + 1;
12204 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12210 if (*flagp & RESTART_UTF8)
12211 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12212 RExC_parse = p = oldp;
12216 if (ender > 0xff) {
12233 ender = ESC_NATIVE;
12243 const char* error_msg;
12245 bool valid = grok_bslash_o(&p,
12248 PASS2, /* out warnings */
12249 (bool) RExC_strict,
12250 TRUE, /* Output warnings
12255 RExC_parse = p; /* going to die anyway; point
12256 to exact spot of failure */
12260 if (IN_ENCODING && ender < 0x100) {
12261 goto recode_encoding;
12263 if (ender > 0xff) {
12270 UV result = UV_MAX; /* initialize to erroneous
12272 const char* error_msg;
12274 bool valid = grok_bslash_x(&p,
12277 PASS2, /* out warnings */
12278 (bool) RExC_strict,
12279 TRUE, /* Silence warnings
12284 RExC_parse = p; /* going to die anyway; point
12285 to exact spot of failure */
12290 if (IN_ENCODING && ender < 0x100) {
12291 goto recode_encoding;
12293 if (ender > 0xff) {
12300 ender = grok_bslash_c(*p++, PASS2);
12302 case '8': case '9': /* must be a backreference */
12305 case '1': case '2': case '3':case '4':
12306 case '5': case '6': case '7':
12307 /* When we parse backslash escapes there is ambiguity
12308 * between backreferences and octal escapes. Any escape
12309 * from \1 - \9 is a backreference, any multi-digit
12310 * escape which does not start with 0 and which when
12311 * evaluated as decimal could refer to an already
12312 * parsed capture buffer is a back reference. Anything
12315 * Note this implies that \118 could be interpreted as
12316 * 118 OR as "\11" . "8" depending on whether there
12317 * were 118 capture buffers defined already in the
12320 /* NOTE, RExC_npar is 1 more than the actual number of
12321 * parens we have seen so far, hence the < RExC_npar below. */
12323 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12324 { /* Not to be treated as an octal constant, go
12332 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12334 ender = grok_oct(p, &numlen, &flags, NULL);
12335 if (ender > 0xff) {
12339 if (PASS2 /* like \08, \178 */
12342 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12344 reg_warn_non_literal_string(
12346 form_short_octal_warning(p, numlen));
12349 if (IN_ENCODING && ender < 0x100)
12350 goto recode_encoding;
12353 if (! RExC_override_recoding) {
12354 SV* enc = _get_encoding();
12355 ender = reg_recode((const char)(U8)ender, &enc);
12357 ckWARNreg(p, "Invalid escape in the specified encoding");
12363 FAIL("Trailing \\");
12366 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12367 /* Include any { following the alpha to emphasize
12368 * that it could be part of an escape at some point
12370 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12371 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12373 goto normal_default;
12374 } /* End of switch on '\' */
12377 /* Currently we don't warn when the lbrace is at the start
12378 * of a construct. This catches it in the middle of a
12379 * literal string, or when its the first thing after
12380 * something like "\b" */
12382 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12384 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12387 default: /* A literal character */
12389 if (UTF8_IS_START(*p) && UTF) {
12391 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12392 &numlen, UTF8_ALLOW_DEFAULT);
12398 } /* End of switch on the literal */
12400 /* Here, have looked at the literal character and <ender>
12401 * contains its ordinal, <p> points to the character after it
12404 if ( RExC_flags & RXf_PMf_EXTENDED)
12405 p = regpatws(pRExC_state, p,
12406 TRUE); /* means recognize comments */
12408 /* If the next thing is a quantifier, it applies to this
12409 * character only, which means that this character has to be in
12410 * its own node and can't just be appended to the string in an
12411 * existing node, so if there are already other characters in
12412 * the node, close the node with just them, and set up to do
12413 * this character again next time through, when it will be the
12414 * only thing in its new node */
12415 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12421 if (! FOLD) { /* The simple case, just append the literal */
12423 /* In the sizing pass, we need only the size of the
12424 * character we are appending, hence we can delay getting
12425 * its representation until PASS2. */
12428 const STRLEN unilen = UNISKIP(ender);
12431 /* We have to subtract 1 just below (and again in
12432 * the corresponding PASS2 code) because the loop
12433 * increments <len> each time, as all but this path
12434 * (and one other) through it add a single byte to
12435 * the EXACTish node. But these paths would change
12436 * len to be the correct final value, so cancel out
12437 * the increment that follows */
12443 } else { /* PASS2 */
12446 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12447 len += (char *) new_s - s - 1;
12448 s = (char *) new_s;
12451 *(s++) = (char) ender;
12455 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12457 /* Here are folding under /l, and the code point is
12458 * problematic. First, we know we can't simplify things */
12459 maybe_exact = FALSE;
12460 maybe_exactfu = FALSE;
12462 /* A problematic code point in this context means that its
12463 * fold isn't known until runtime, so we can't fold it now.
12464 * (The non-problematic code points are the above-Latin1
12465 * ones that fold to also all above-Latin1. Their folds
12466 * don't vary no matter what the locale is.) But here we
12467 * have characters whose fold depends on the locale.
12468 * Unlike the non-folding case above, we have to keep track
12469 * of these in the sizing pass, so that we can make sure we
12470 * don't split too-long nodes in the middle of a potential
12471 * multi-char fold. And unlike the regular fold case
12472 * handled in the else clauses below, we don't actually
12473 * fold and don't have special cases to consider. What we
12474 * do for both passes is the PASS2 code for non-folding */
12475 goto not_fold_common;
12477 else /* A regular FOLD code point */
12479 /* See comments for join_exact() as to why we fold this
12480 * non-UTF at compile time */
12481 || (node_type == EXACTFU
12482 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12484 /* Here, are folding and are not UTF-8 encoded; therefore
12485 * the character must be in the range 0-255, and is not /l
12486 * (Not /l because we already handled these under /l in
12487 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12488 if (IS_IN_SOME_FOLD_L1(ender)) {
12489 maybe_exact = FALSE;
12491 /* See if the character's fold differs between /d and
12492 * /u. This includes the multi-char fold SHARP S to
12495 && (PL_fold[ender] != PL_fold_latin1[ender]
12496 || ender == LATIN_SMALL_LETTER_SHARP_S
12498 && isALPHA_FOLD_EQ(ender, 's')
12499 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12501 maybe_exactfu = FALSE;
12505 /* Even when folding, we store just the input character, as
12506 * we have an array that finds its fold quickly */
12507 *(s++) = (char) ender;
12509 else { /* FOLD and UTF */
12510 /* Unlike the non-fold case, we do actually have to
12511 * calculate the results here in pass 1. This is for two
12512 * reasons, the folded length may be longer than the
12513 * unfolded, and we have to calculate how many EXACTish
12514 * nodes it will take; and we may run out of room in a node
12515 * in the middle of a potential multi-char fold, and have
12516 * to back off accordingly. */
12519 if (isASCII_uni(ender)) {
12520 folded = toFOLD(ender);
12521 *(s)++ = (U8) folded;
12526 folded = _to_uni_fold_flags(
12530 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12531 ? FOLD_FLAGS_NOMIX_ASCII
12535 /* The loop increments <len> each time, as all but this
12536 * path (and one other) through it add a single byte to
12537 * the EXACTish node. But this one has changed len to
12538 * be the correct final value, so subtract one to
12539 * cancel out the increment that follows */
12540 len += foldlen - 1;
12542 /* If this node only contains non-folding code points so
12543 * far, see if this new one is also non-folding */
12545 if (folded != ender) {
12546 maybe_exact = FALSE;
12549 /* Here the fold is the original; we have to check
12550 * further to see if anything folds to it */
12551 if (_invlist_contains_cp(PL_utf8_foldable,
12554 maybe_exact = FALSE;
12561 if (next_is_quantifier) {
12563 /* Here, the next input is a quantifier, and to get here,
12564 * the current character is the only one in the node.
12565 * Also, here <len> doesn't include the final byte for this
12571 } /* End of loop through literal characters */
12573 /* Here we have either exhausted the input or ran out of room in
12574 * the node. (If we encountered a character that can't be in the
12575 * node, transfer is made directly to <loopdone>, and so we
12576 * wouldn't have fallen off the end of the loop.) In the latter
12577 * case, we artificially have to split the node into two, because
12578 * we just don't have enough space to hold everything. This
12579 * creates a problem if the final character participates in a
12580 * multi-character fold in the non-final position, as a match that
12581 * should have occurred won't, due to the way nodes are matched,
12582 * and our artificial boundary. So back off until we find a non-
12583 * problematic character -- one that isn't at the beginning or
12584 * middle of such a fold. (Either it doesn't participate in any
12585 * folds, or appears only in the final position of all the folds it
12586 * does participate in.) A better solution with far fewer false
12587 * positives, and that would fill the nodes more completely, would
12588 * be to actually have available all the multi-character folds to
12589 * test against, and to back-off only far enough to be sure that
12590 * this node isn't ending with a partial one. <upper_parse> is set
12591 * further below (if we need to reparse the node) to include just
12592 * up through that final non-problematic character that this code
12593 * identifies, so when it is set to less than the full node, we can
12594 * skip the rest of this */
12595 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12597 const STRLEN full_len = len;
12599 assert(len >= MAX_NODE_STRING_SIZE);
12601 /* Here, <s> points to the final byte of the final character.
12602 * Look backwards through the string until find a non-
12603 * problematic character */
12607 /* This has no multi-char folds to non-UTF characters */
12608 if (ASCII_FOLD_RESTRICTED) {
12612 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12616 if (! PL_NonL1NonFinalFold) {
12617 PL_NonL1NonFinalFold = _new_invlist_C_array(
12618 NonL1_Perl_Non_Final_Folds_invlist);
12621 /* Point to the first byte of the final character */
12622 s = (char *) utf8_hop((U8 *) s, -1);
12624 while (s >= s0) { /* Search backwards until find
12625 non-problematic char */
12626 if (UTF8_IS_INVARIANT(*s)) {
12628 /* There are no ascii characters that participate
12629 * in multi-char folds under /aa. In EBCDIC, the
12630 * non-ascii invariants are all control characters,
12631 * so don't ever participate in any folds. */
12632 if (ASCII_FOLD_RESTRICTED
12633 || ! IS_NON_FINAL_FOLD(*s))
12638 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12639 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12645 else if (! _invlist_contains_cp(
12646 PL_NonL1NonFinalFold,
12647 valid_utf8_to_uvchr((U8 *) s, NULL)))
12652 /* Here, the current character is problematic in that
12653 * it does occur in the non-final position of some
12654 * fold, so try the character before it, but have to
12655 * special case the very first byte in the string, so
12656 * we don't read outside the string */
12657 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12658 } /* End of loop backwards through the string */
12660 /* If there were only problematic characters in the string,
12661 * <s> will point to before s0, in which case the length
12662 * should be 0, otherwise include the length of the
12663 * non-problematic character just found */
12664 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12667 /* Here, have found the final character, if any, that is
12668 * non-problematic as far as ending the node without splitting
12669 * it across a potential multi-char fold. <len> contains the
12670 * number of bytes in the node up-to and including that
12671 * character, or is 0 if there is no such character, meaning
12672 * the whole node contains only problematic characters. In
12673 * this case, give up and just take the node as-is. We can't
12678 /* If the node ends in an 's' we make sure it stays EXACTF,
12679 * as if it turns into an EXACTFU, it could later get
12680 * joined with another 's' that would then wrongly match
12682 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12684 maybe_exactfu = FALSE;
12688 /* Here, the node does contain some characters that aren't
12689 * problematic. If one such is the final character in the
12690 * node, we are done */
12691 if (len == full_len) {
12694 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12696 /* If the final character is problematic, but the
12697 * penultimate is not, back-off that last character to
12698 * later start a new node with it */
12703 /* Here, the final non-problematic character is earlier
12704 * in the input than the penultimate character. What we do
12705 * is reparse from the beginning, going up only as far as
12706 * this final ok one, thus guaranteeing that the node ends
12707 * in an acceptable character. The reason we reparse is
12708 * that we know how far in the character is, but we don't
12709 * know how to correlate its position with the input parse.
12710 * An alternate implementation would be to build that
12711 * correlation as we go along during the original parse,
12712 * but that would entail extra work for every node, whereas
12713 * this code gets executed only when the string is too
12714 * large for the node, and the final two characters are
12715 * problematic, an infrequent occurrence. Yet another
12716 * possible strategy would be to save the tail of the
12717 * string, and the next time regatom is called, initialize
12718 * with that. The problem with this is that unless you
12719 * back off one more character, you won't be guaranteed
12720 * regatom will get called again, unless regbranch,
12721 * regpiece ... are also changed. If you do back off that
12722 * extra character, so that there is input guaranteed to
12723 * force calling regatom, you can't handle the case where
12724 * just the first character in the node is acceptable. I
12725 * (khw) decided to try this method which doesn't have that
12726 * pitfall; if performance issues are found, we can do a
12727 * combination of the current approach plus that one */
12733 } /* End of verifying node ends with an appropriate char */
12735 loopdone: /* Jumped to when encounters something that shouldn't be
12738 /* I (khw) don't know if you can get here with zero length, but the
12739 * old code handled this situation by creating a zero-length EXACT
12740 * node. Might as well be NOTHING instead */
12746 /* If 'maybe_exact' is still set here, means there are no
12747 * code points in the node that participate in folds;
12748 * similarly for 'maybe_exactfu' and code points that match
12749 * differently depending on UTF8ness of the target string
12750 * (for /u), or depending on locale for /l */
12756 else if (maybe_exactfu) {
12762 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12763 FALSE /* Don't look to see if could
12764 be turned into an EXACT
12765 node, as we have already
12770 RExC_parse = p - 1;
12771 Set_Node_Cur_Length(ret, parse_start);
12772 nextchar(pRExC_state);
12774 /* len is STRLEN which is unsigned, need to copy to signed */
12777 vFAIL("Internal disaster");
12780 } /* End of label 'defchar:' */
12782 } /* End of giant switch on input character */
12788 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12790 /* Returns the next non-pattern-white space, non-comment character (the
12791 * latter only if 'recognize_comment is true) in the string p, which is
12792 * ended by RExC_end. See also reg_skipcomment */
12793 const char *e = RExC_end;
12795 PERL_ARGS_ASSERT_REGPATWS;
12799 if ((len = is_PATWS_safe(p, e, UTF))) {
12802 else if (recognize_comment && *p == '#') {
12803 p = reg_skipcomment(pRExC_state, p);
12812 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12814 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12815 * sets up the bitmap and any flags, removing those code points from the
12816 * inversion list, setting it to NULL should it become completely empty */
12818 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12819 assert(PL_regkind[OP(node)] == ANYOF);
12821 ANYOF_BITMAP_ZERO(node);
12822 if (*invlist_ptr) {
12824 /* This gets set if we actually need to modify things */
12825 bool change_invlist = FALSE;
12829 /* Start looking through *invlist_ptr */
12830 invlist_iterinit(*invlist_ptr);
12831 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12835 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12836 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12838 else if (end >= NUM_ANYOF_CODE_POINTS) {
12839 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12842 /* Quit if are above what we should change */
12843 if (start >= NUM_ANYOF_CODE_POINTS) {
12847 change_invlist = TRUE;
12849 /* Set all the bits in the range, up to the max that we are doing */
12850 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12852 : NUM_ANYOF_CODE_POINTS - 1;
12853 for (i = start; i <= (int) high; i++) {
12854 if (! ANYOF_BITMAP_TEST(node, i)) {
12855 ANYOF_BITMAP_SET(node, i);
12859 invlist_iterfinish(*invlist_ptr);
12861 /* Done with loop; remove any code points that are in the bitmap from
12862 * *invlist_ptr; similarly for code points above the bitmap if we have
12863 * a flag to match all of them anyways */
12864 if (change_invlist) {
12865 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12867 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12868 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12871 /* If have completely emptied it, remove it completely */
12872 if (_invlist_len(*invlist_ptr) == 0) {
12873 SvREFCNT_dec_NN(*invlist_ptr);
12874 *invlist_ptr = NULL;
12879 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12880 Character classes ([:foo:]) can also be negated ([:^foo:]).
12881 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12882 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12883 but trigger failures because they are currently unimplemented. */
12885 #define POSIXCC_DONE(c) ((c) == ':')
12886 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12887 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12889 PERL_STATIC_INLINE I32
12890 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12892 I32 namedclass = OOB_NAMEDCLASS;
12894 PERL_ARGS_ASSERT_REGPPOSIXCC;
12896 if (value == '[' && RExC_parse + 1 < RExC_end &&
12897 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12898 POSIXCC(UCHARAT(RExC_parse)))
12900 const char c = UCHARAT(RExC_parse);
12901 char* const s = RExC_parse++;
12903 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12905 if (RExC_parse == RExC_end) {
12908 /* Try to give a better location for the error (than the end of
12909 * the string) by looking for the matching ']' */
12911 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12914 vFAIL2("Unmatched '%c' in POSIX class", c);
12916 /* Grandfather lone [:, [=, [. */
12920 const char* const t = RExC_parse++; /* skip over the c */
12923 if (UCHARAT(RExC_parse) == ']') {
12924 const char *posixcc = s + 1;
12925 RExC_parse++; /* skip over the ending ] */
12928 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12929 const I32 skip = t - posixcc;
12931 /* Initially switch on the length of the name. */
12934 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12935 this is the Perl \w
12937 namedclass = ANYOF_WORDCHAR;
12940 /* Names all of length 5. */
12941 /* alnum alpha ascii blank cntrl digit graph lower
12942 print punct space upper */
12943 /* Offset 4 gives the best switch position. */
12944 switch (posixcc[4]) {
12946 if (memEQ(posixcc, "alph", 4)) /* alpha */
12947 namedclass = ANYOF_ALPHA;
12950 if (memEQ(posixcc, "spac", 4)) /* space */
12951 namedclass = ANYOF_PSXSPC;
12954 if (memEQ(posixcc, "grap", 4)) /* graph */
12955 namedclass = ANYOF_GRAPH;
12958 if (memEQ(posixcc, "asci", 4)) /* ascii */
12959 namedclass = ANYOF_ASCII;
12962 if (memEQ(posixcc, "blan", 4)) /* blank */
12963 namedclass = ANYOF_BLANK;
12966 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12967 namedclass = ANYOF_CNTRL;
12970 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12971 namedclass = ANYOF_ALPHANUMERIC;
12974 if (memEQ(posixcc, "lowe", 4)) /* lower */
12975 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12976 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12977 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12980 if (memEQ(posixcc, "digi", 4)) /* digit */
12981 namedclass = ANYOF_DIGIT;
12982 else if (memEQ(posixcc, "prin", 4)) /* print */
12983 namedclass = ANYOF_PRINT;
12984 else if (memEQ(posixcc, "punc", 4)) /* punct */
12985 namedclass = ANYOF_PUNCT;
12990 if (memEQ(posixcc, "xdigit", 6))
12991 namedclass = ANYOF_XDIGIT;
12995 if (namedclass == OOB_NAMEDCLASS)
12997 "POSIX class [:%"UTF8f":] unknown",
12998 UTF8fARG(UTF, t - s - 1, s + 1));
13000 /* The #defines are structured so each complement is +1 to
13001 * the normal one */
13005 assert (posixcc[skip] == ':');
13006 assert (posixcc[skip+1] == ']');
13007 } else if (!SIZE_ONLY) {
13008 /* [[=foo=]] and [[.foo.]] are still future. */
13010 /* adjust RExC_parse so the warning shows after
13011 the class closes */
13012 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13014 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13017 /* Maternal grandfather:
13018 * "[:" ending in ":" but not in ":]" */
13020 vFAIL("Unmatched '[' in POSIX class");
13023 /* Grandfather lone [:, [=, [. */
13033 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13035 /* This applies some heuristics at the current parse position (which should
13036 * be at a '[') to see if what follows might be intended to be a [:posix:]
13037 * class. It returns true if it really is a posix class, of course, but it
13038 * also can return true if it thinks that what was intended was a posix
13039 * class that didn't quite make it.
13041 * It will return true for
13043 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13044 * ')' indicating the end of the (?[
13045 * [:any garbage including %^&$ punctuation:]
13047 * This is designed to be called only from S_handle_regex_sets; it could be
13048 * easily adapted to be called from the spot at the beginning of regclass()
13049 * that checks to see in a normal bracketed class if the surrounding []
13050 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13051 * change long-standing behavior, so I (khw) didn't do that */
13052 char* p = RExC_parse + 1;
13053 char first_char = *p;
13055 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13057 assert(*(p - 1) == '[');
13059 if (! POSIXCC(first_char)) {
13064 while (p < RExC_end && isWORDCHAR(*p)) p++;
13066 if (p >= RExC_end) {
13070 if (p - RExC_parse > 2 /* Got at least 1 word character */
13071 && (*p == first_char
13072 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13077 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13080 && p - RExC_parse > 2 /* [:] evaluates to colon;
13081 [::] is a bad posix class. */
13082 && first_char == *(p - 1));
13086 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13087 I32 *flagp, U32 depth,
13088 char * const oregcomp_parse)
13090 /* Handle the (?[...]) construct to do set operations */
13093 UV start, end; /* End points of code point ranges */
13095 char *save_end, *save_parse;
13100 const bool save_fold = FOLD;
13102 GET_RE_DEBUG_FLAGS_DECL;
13104 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13107 vFAIL("(?[...]) not valid in locale");
13109 RExC_uni_semantics = 1;
13111 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13112 * (such as EXACT). Thus we can skip most everything if just sizing. We
13113 * call regclass to handle '[]' so as to not have to reinvent its parsing
13114 * rules here (throwing away the size it computes each time). And, we exit
13115 * upon an unescaped ']' that isn't one ending a regclass. To do both
13116 * these things, we need to realize that something preceded by a backslash
13117 * is escaped, so we have to keep track of backslashes */
13119 Perl_ck_warner_d(aTHX_
13120 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13121 "The regex_sets feature is experimental" REPORT_LOCATION,
13122 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13124 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13125 RExC_precomp + (RExC_parse - RExC_precomp)));
13128 UV depth = 0; /* how many nested (?[...]) constructs */
13130 while (RExC_parse < RExC_end) {
13131 SV* current = NULL;
13132 RExC_parse = regpatws(pRExC_state, RExC_parse,
13133 TRUE); /* means recognize comments */
13134 switch (*RExC_parse) {
13136 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13141 /* Skip the next byte (which could cause us to end up in
13142 * the middle of a UTF-8 character, but since none of those
13143 * are confusable with anything we currently handle in this
13144 * switch (invariants all), it's safe. We'll just hit the
13145 * default: case next time and keep on incrementing until
13146 * we find one of the invariants we do handle. */
13151 /* If this looks like it is a [:posix:] class, leave the
13152 * parse pointer at the '[' to fool regclass() into
13153 * thinking it is part of a '[[:posix:]]'. That function
13154 * will use strict checking to force a syntax error if it
13155 * doesn't work out to a legitimate class */
13156 bool is_posix_class
13157 = could_it_be_a_POSIX_class(pRExC_state);
13158 if (! is_posix_class) {
13162 /* regclass() can only return RESTART_UTF8 if multi-char
13163 folds are allowed. */
13164 if (!regclass(pRExC_state, flagp,depth+1,
13165 is_posix_class, /* parse the whole char
13166 class only if not a
13168 FALSE, /* don't allow multi-char folds */
13169 TRUE, /* silence non-portable warnings. */
13173 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13176 /* function call leaves parse pointing to the ']', except
13177 * if we faked it */
13178 if (is_posix_class) {
13182 SvREFCNT_dec(current); /* In case it returned something */
13187 if (depth--) break;
13189 if (RExC_parse < RExC_end
13190 && *RExC_parse == ')')
13192 node = reganode(pRExC_state, ANYOF, 0);
13193 RExC_size += ANYOF_SKIP;
13194 nextchar(pRExC_state);
13195 Set_Node_Length(node,
13196 RExC_parse - oregcomp_parse + 1); /* MJD */
13205 FAIL("Syntax error in (?[...])");
13208 /* Pass 2 only after this. Everything in this construct is a
13209 * metacharacter. Operands begin with either a '\' (for an escape
13210 * sequence), or a '[' for a bracketed character class. Any other
13211 * character should be an operator, or parenthesis for grouping. Both
13212 * types of operands are handled by calling regclass() to parse them. It
13213 * is called with a parameter to indicate to return the computed inversion
13214 * list. The parsing here is implemented via a stack. Each entry on the
13215 * stack is a single character representing one of the operators, or the
13216 * '('; or else a pointer to an operand inversion list. */
13218 #define IS_OPERAND(a) (! SvIOK(a))
13220 /* The stack starts empty. It is a syntax error if the first thing parsed
13221 * is a binary operator; everything else is pushed on the stack. When an
13222 * operand is parsed, the top of the stack is examined. If it is a binary
13223 * operator, the item before it should be an operand, and both are replaced
13224 * by the result of doing that operation on the new operand and the one on
13225 * the stack. Thus a sequence of binary operands is reduced to a single
13226 * one before the next one is parsed.
13228 * A unary operator may immediately follow a binary in the input, for
13231 * When an operand is parsed and the top of the stack is a unary operator,
13232 * the operation is performed, and then the stack is rechecked to see if
13233 * this new operand is part of a binary operation; if so, it is handled as
13236 * A '(' is simply pushed on the stack; it is valid only if the stack is
13237 * empty, or the top element of the stack is an operator or another '('
13238 * (for which the parenthesized expression will become an operand). By the
13239 * time the corresponding ')' is parsed everything in between should have
13240 * been parsed and evaluated to a single operand (or else is a syntax
13241 * error), and is handled as a regular operand */
13243 sv_2mortal((SV *)(stack = newAV()));
13245 while (RExC_parse < RExC_end) {
13246 I32 top_index = av_tindex(stack);
13248 SV* current = NULL;
13250 /* Skip white space */
13251 RExC_parse = regpatws(pRExC_state, RExC_parse,
13252 TRUE /* means recognize comments */ );
13253 if (RExC_parse >= RExC_end) {
13254 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13256 if ((curchar = UCHARAT(RExC_parse)) == ']') {
13263 if (av_tindex(stack) >= 0 /* This makes sure that we can
13264 safely subtract 1 from
13265 RExC_parse in the next clause.
13266 If we have something on the
13267 stack, we have parsed something
13269 && UCHARAT(RExC_parse - 1) == '('
13270 && RExC_parse < RExC_end)
13272 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13273 * This happens when we have some thing like
13275 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13277 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13279 * Here we would be handling the interpolated
13280 * '$thai_or_lao'. We handle this by a recursive call to
13281 * ourselves which returns the inversion list the
13282 * interpolated expression evaluates to. We use the flags
13283 * from the interpolated pattern. */
13284 U32 save_flags = RExC_flags;
13285 const char * const save_parse = ++RExC_parse;
13287 parse_lparen_question_flags(pRExC_state);
13289 if (RExC_parse == save_parse /* Makes sure there was at
13290 least one flag (or this
13291 embedding wasn't compiled)
13293 || RExC_parse >= RExC_end - 4
13294 || UCHARAT(RExC_parse) != ':'
13295 || UCHARAT(++RExC_parse) != '('
13296 || UCHARAT(++RExC_parse) != '?'
13297 || UCHARAT(++RExC_parse) != '[')
13300 /* In combination with the above, this moves the
13301 * pointer to the point just after the first erroneous
13302 * character (or if there are no flags, to where they
13303 * should have been) */
13304 if (RExC_parse >= RExC_end - 4) {
13305 RExC_parse = RExC_end;
13307 else if (RExC_parse != save_parse) {
13308 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13310 vFAIL("Expecting '(?flags:(?[...'");
13313 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13314 depth+1, oregcomp_parse);
13316 /* Here, 'current' contains the embedded expression's
13317 * inversion list, and RExC_parse points to the trailing
13318 * ']'; the next character should be the ')' which will be
13319 * paired with the '(' that has been put on the stack, so
13320 * the whole embedded expression reduces to '(operand)' */
13323 RExC_flags = save_flags;
13324 goto handle_operand;
13329 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13330 vFAIL("Unexpected character");
13333 /* regclass() can only return RESTART_UTF8 if multi-char
13334 folds are allowed. */
13335 if (!regclass(pRExC_state, flagp,depth+1,
13336 TRUE, /* means parse just the next thing */
13337 FALSE, /* don't allow multi-char folds */
13338 FALSE, /* don't silence non-portable warnings. */
13342 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13344 /* regclass() will return with parsing just the \ sequence,
13345 * leaving the parse pointer at the next thing to parse */
13347 goto handle_operand;
13349 case '[': /* Is a bracketed character class */
13351 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13353 if (! is_posix_class) {
13357 /* regclass() can only return RESTART_UTF8 if multi-char
13358 folds are allowed. */
13359 if(!regclass(pRExC_state, flagp,depth+1,
13360 is_posix_class, /* parse the whole char class
13361 only if not a posix class */
13362 FALSE, /* don't allow multi-char folds */
13363 FALSE, /* don't silence non-portable warnings. */
13367 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13369 /* function call leaves parse pointing to the ']', except if we
13371 if (is_posix_class) {
13375 goto handle_operand;
13384 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13385 || ! IS_OPERAND(*top_ptr))
13388 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13390 av_push(stack, newSVuv(curchar));
13394 av_push(stack, newSVuv(curchar));
13398 if (top_index >= 0) {
13399 top_ptr = av_fetch(stack, top_index, FALSE);
13401 if (IS_OPERAND(*top_ptr)) {
13403 vFAIL("Unexpected '(' with no preceding operator");
13406 av_push(stack, newSVuv(curchar));
13413 || ! (current = av_pop(stack))
13414 || ! IS_OPERAND(current)
13415 || ! (lparen = av_pop(stack))
13416 || IS_OPERAND(lparen)
13417 || SvUV(lparen) != '(')
13419 SvREFCNT_dec(current);
13421 vFAIL("Unexpected ')'");
13424 SvREFCNT_dec_NN(lparen);
13431 /* Here, we have an operand to process, in 'current' */
13433 if (top_index < 0) { /* Just push if stack is empty */
13434 av_push(stack, current);
13437 SV* top = av_pop(stack);
13439 char current_operator;
13441 if (IS_OPERAND(top)) {
13442 SvREFCNT_dec_NN(top);
13443 SvREFCNT_dec_NN(current);
13444 vFAIL("Operand with no preceding operator");
13446 current_operator = (char) SvUV(top);
13447 switch (current_operator) {
13448 case '(': /* Push the '(' back on followed by the new
13450 av_push(stack, top);
13451 av_push(stack, current);
13452 SvREFCNT_inc(top); /* Counters the '_dec' done
13453 just after the 'break', so
13454 it doesn't get wrongly freed
13459 _invlist_invert(current);
13461 /* Unlike binary operators, the top of the stack,
13462 * now that this unary one has been popped off, may
13463 * legally be an operator, and we now have operand
13466 SvREFCNT_dec_NN(top);
13467 goto handle_operand;
13470 prev = av_pop(stack);
13471 _invlist_intersection(prev,
13474 av_push(stack, current);
13479 prev = av_pop(stack);
13480 _invlist_union(prev, current, ¤t);
13481 av_push(stack, current);
13485 prev = av_pop(stack);;
13486 _invlist_subtract(prev, current, ¤t);
13487 av_push(stack, current);
13490 case '^': /* The union minus the intersection */
13496 prev = av_pop(stack);
13497 _invlist_union(prev, current, &u);
13498 _invlist_intersection(prev, current, &i);
13499 /* _invlist_subtract will overwrite current
13500 without freeing what it already contains */
13502 _invlist_subtract(u, i, ¤t);
13503 av_push(stack, current);
13504 SvREFCNT_dec_NN(i);
13505 SvREFCNT_dec_NN(u);
13506 SvREFCNT_dec_NN(element);
13511 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13513 SvREFCNT_dec_NN(top);
13514 SvREFCNT_dec(prev);
13518 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13521 if (av_tindex(stack) < 0 /* Was empty */
13522 || ((final = av_pop(stack)) == NULL)
13523 || ! IS_OPERAND(final)
13524 || av_tindex(stack) >= 0) /* More left on stack */
13526 vFAIL("Incomplete expression within '(?[ ])'");
13529 /* Here, 'final' is the resultant inversion list from evaluating the
13530 * expression. Return it if so requested */
13531 if (return_invlist) {
13532 *return_invlist = final;
13536 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13537 * expecting a string of ranges and individual code points */
13538 invlist_iterinit(final);
13539 result_string = newSVpvs("");
13540 while (invlist_iternext(final, &start, &end)) {
13541 if (start == end) {
13542 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13545 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13550 save_parse = RExC_parse;
13551 RExC_parse = SvPV(result_string, len);
13552 save_end = RExC_end;
13553 RExC_end = RExC_parse + len;
13555 /* We turn off folding around the call, as the class we have constructed
13556 * already has all folding taken into consideration, and we don't want
13557 * regclass() to add to that */
13558 RExC_flags &= ~RXf_PMf_FOLD;
13559 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13561 node = regclass(pRExC_state, flagp,depth+1,
13562 FALSE, /* means parse the whole char class */
13563 FALSE, /* don't allow multi-char folds */
13564 TRUE, /* silence non-portable warnings. The above may very
13565 well have generated non-portable code points, but
13566 they're valid on this machine */
13567 FALSE, /* similarly, no need for strict */
13571 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13574 RExC_flags |= RXf_PMf_FOLD;
13576 RExC_parse = save_parse + 1;
13577 RExC_end = save_end;
13578 SvREFCNT_dec_NN(final);
13579 SvREFCNT_dec_NN(result_string);
13581 nextchar(pRExC_state);
13582 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13588 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13590 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13591 * innocent-looking character class, like /[ks]/i won't have to go out to
13592 * disk to find the possible matches.
13594 * This should be called only for a Latin1-range code points, cp, which is
13595 * known to be involved in a simple fold with other code points above
13596 * Latin1. It would give false results if /aa has been specified.
13597 * Multi-char folds are outside the scope of this, and must be handled
13600 * XXX It would be better to generate these via regen, in case a new
13601 * version of the Unicode standard adds new mappings, though that is not
13602 * really likely, and may be caught by the default: case of the switch
13605 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13607 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13613 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13617 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13620 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13621 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13623 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13624 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13625 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13627 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13628 *invlist = add_cp_to_invlist(*invlist,
13629 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13631 case LATIN_SMALL_LETTER_SHARP_S:
13632 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13635 /* Use deprecated warning to increase the chances of this being
13638 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13645 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13647 /* This adds the string scalar <multi_string> to the array
13648 * <multi_char_matches>. <multi_string> is known to have exactly
13649 * <cp_count> code points in it. This is used when constructing a
13650 * bracketed character class and we find something that needs to match more
13651 * than a single character.
13653 * <multi_char_matches> is actually an array of arrays. Each top-level
13654 * element is an array that contains all the strings known so far that are
13655 * the same length. And that length (in number of code points) is the same
13656 * as the index of the top-level array. Hence, the [2] element is an
13657 * array, each element thereof is a string containing TWO code points;
13658 * while element [3] is for strings of THREE characters, and so on. Since
13659 * this is for multi-char strings there can never be a [0] nor [1] element.
13661 * When we rewrite the character class below, we will do so such that the
13662 * longest strings are written first, so that it prefers the longest
13663 * matching strings first. This is done even if it turns out that any
13664 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13665 * Christiansen has agreed that this is ok. This makes the test for the
13666 * ligature 'ffi' come before the test for 'ff', for example */
13669 AV** this_array_ptr;
13671 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13673 if (! multi_char_matches) {
13674 multi_char_matches = newAV();
13677 if (av_exists(multi_char_matches, cp_count)) {
13678 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13679 this_array = *this_array_ptr;
13682 this_array = newAV();
13683 av_store(multi_char_matches, cp_count,
13686 av_push(this_array, multi_string);
13688 return multi_char_matches;
13691 /* The names of properties whose definitions are not known at compile time are
13692 * stored in this SV, after a constant heading. So if the length has been
13693 * changed since initialization, then there is a run-time definition. */
13694 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13695 (SvCUR(listsv) != initial_listsv_len)
13698 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13699 const bool stop_at_1, /* Just parse the next thing, don't
13700 look for a full character class */
13701 bool allow_multi_folds,
13702 const bool silence_non_portable, /* Don't output warnings
13706 SV** ret_invlist /* Return an inversion list, not a node */
13709 /* parse a bracketed class specification. Most of these will produce an
13710 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13711 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13712 * under /i with multi-character folds: it will be rewritten following the
13713 * paradigm of this example, where the <multi-fold>s are characters which
13714 * fold to multiple character sequences:
13715 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13716 * gets effectively rewritten as:
13717 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13718 * reg() gets called (recursively) on the rewritten version, and this
13719 * function will return what it constructs. (Actually the <multi-fold>s
13720 * aren't physically removed from the [abcdefghi], it's just that they are
13721 * ignored in the recursion by means of a flag:
13722 * <RExC_in_multi_char_class>.)
13724 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13725 * characters, with the corresponding bit set if that character is in the
13726 * list. For characters above this, a range list or swash is used. There
13727 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13728 * determinable at compile time
13730 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13731 * to be restarted. This can only happen if ret_invlist is non-NULL.
13734 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13736 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13739 IV namedclass = OOB_NAMEDCLASS;
13740 char *rangebegin = NULL;
13741 bool need_class = 0;
13743 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13744 than just initialized. */
13745 SV* properties = NULL; /* Code points that match \p{} \P{} */
13746 SV* posixes = NULL; /* Code points that match classes like [:word:],
13747 extended beyond the Latin1 range. These have to
13748 be kept separate from other code points for much
13749 of this function because their handling is
13750 different under /i, and for most classes under
13752 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13753 separate for a while from the non-complemented
13754 versions because of complications with /d
13756 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
13757 treated more simply than the general case,
13758 leading to less compilation and execution
13760 UV element_count = 0; /* Number of distinct elements in the class.
13761 Optimizations may be possible if this is tiny */
13762 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13763 character; used under /i */
13765 char * stop_ptr = RExC_end; /* where to stop parsing */
13766 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13769 /* Unicode properties are stored in a swash; this holds the current one
13770 * being parsed. If this swash is the only above-latin1 component of the
13771 * character class, an optimization is to pass it directly on to the
13772 * execution engine. Otherwise, it is set to NULL to indicate that there
13773 * are other things in the class that have to be dealt with at execution
13775 SV* swash = NULL; /* Code points that match \p{} \P{} */
13777 /* Set if a component of this character class is user-defined; just passed
13778 * on to the engine */
13779 bool has_user_defined_property = FALSE;
13781 /* inversion list of code points this node matches only when the target
13782 * string is in UTF-8. (Because is under /d) */
13783 SV* depends_list = NULL;
13785 /* Inversion list of code points this node matches regardless of things
13786 * like locale, folding, utf8ness of the target string */
13787 SV* cp_list = NULL;
13789 /* Like cp_list, but code points on this list need to be checked for things
13790 * that fold to/from them under /i */
13791 SV* cp_foldable_list = NULL;
13793 /* Like cp_list, but code points on this list are valid only when the
13794 * runtime locale is UTF-8 */
13795 SV* only_utf8_locale_list = NULL;
13797 /* In a range, if one of the endpoints is non-character-set portable,
13798 * meaning that it hard-codes a code point that may mean a different
13799 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
13800 * mnemonic '\t' which each mean the same character no matter which
13801 * character set the platform is on. */
13802 unsigned int non_portable_endpoint = 0;
13804 /* Is the range unicode? which means on a platform that isn't 1-1 native
13805 * to Unicode (i.e. non-ASCII), each code point in it should be considered
13806 * to be a Unicode value. */
13807 bool unicode_range = FALSE;
13808 bool invert = FALSE; /* Is this class to be complemented */
13810 bool warn_super = ALWAYS_WARN_SUPER;
13812 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13813 case we need to change the emitted regop to an EXACT. */
13814 const char * orig_parse = RExC_parse;
13815 const SSize_t orig_size = RExC_size;
13816 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13817 GET_RE_DEBUG_FLAGS_DECL;
13819 PERL_ARGS_ASSERT_REGCLASS;
13821 PERL_UNUSED_ARG(depth);
13824 DEBUG_PARSE("clas");
13826 /* Assume we are going to generate an ANYOF node. */
13827 ret = reganode(pRExC_state,
13834 RExC_size += ANYOF_SKIP;
13835 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13838 ANYOF_FLAGS(ret) = 0;
13840 RExC_emit += ANYOF_SKIP;
13841 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13842 initial_listsv_len = SvCUR(listsv);
13843 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13847 RExC_parse = regpatws(pRExC_state, RExC_parse,
13848 FALSE /* means don't recognize comments */ );
13851 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13854 allow_multi_folds = FALSE;
13857 RExC_parse = regpatws(pRExC_state, RExC_parse,
13858 FALSE /* means don't recognize comments */ );
13862 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13863 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13864 const char *s = RExC_parse;
13865 const char c = *s++;
13867 while (isWORDCHAR(*s))
13869 if (*s && c == *s && s[1] == ']') {
13870 SAVEFREESV(RExC_rx_sv);
13872 "POSIX syntax [%c %c] belongs inside character classes",
13874 (void)ReREFCNT_inc(RExC_rx_sv);
13878 /* If the caller wants us to just parse a single element, accomplish this
13879 * by faking the loop ending condition */
13880 if (stop_at_1 && RExC_end > RExC_parse) {
13881 stop_ptr = RExC_parse + 1;
13884 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13885 if (UCHARAT(RExC_parse) == ']')
13886 goto charclassloop;
13889 if (RExC_parse >= stop_ptr) {
13894 RExC_parse = regpatws(pRExC_state, RExC_parse,
13895 FALSE /* means don't recognize comments */ );
13898 if (UCHARAT(RExC_parse) == ']') {
13904 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13905 save_value = value;
13906 save_prevvalue = prevvalue;
13909 rangebegin = RExC_parse;
13911 non_portable_endpoint = 0;
13914 value = utf8n_to_uvchr((U8*)RExC_parse,
13915 RExC_end - RExC_parse,
13916 &numlen, UTF8_ALLOW_DEFAULT);
13917 RExC_parse += numlen;
13920 value = UCHARAT(RExC_parse++);
13923 && RExC_parse < RExC_end
13924 && POSIXCC(UCHARAT(RExC_parse)))
13926 namedclass = regpposixcc(pRExC_state, value, strict);
13928 else if (value == '\\') {
13929 /* Is a backslash; get the code point of the char after it */
13930 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13931 value = utf8n_to_uvchr((U8*)RExC_parse,
13932 RExC_end - RExC_parse,
13933 &numlen, UTF8_ALLOW_DEFAULT);
13934 RExC_parse += numlen;
13937 value = UCHARAT(RExC_parse++);
13939 /* Some compilers cannot handle switching on 64-bit integer
13940 * values, therefore value cannot be an UV. Yes, this will
13941 * be a problem later if we want switch on Unicode.
13942 * A similar issue a little bit later when switching on
13943 * namedclass. --jhi */
13945 /* If the \ is escaping white space when white space is being
13946 * skipped, it means that that white space is wanted literally, and
13947 * is already in 'value'. Otherwise, need to translate the escape
13948 * into what it signifies. */
13949 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13951 case 'w': namedclass = ANYOF_WORDCHAR; break;
13952 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13953 case 's': namedclass = ANYOF_SPACE; break;
13954 case 'S': namedclass = ANYOF_NSPACE; break;
13955 case 'd': namedclass = ANYOF_DIGIT; break;
13956 case 'D': namedclass = ANYOF_NDIGIT; break;
13957 case 'v': namedclass = ANYOF_VERTWS; break;
13958 case 'V': namedclass = ANYOF_NVERTWS; break;
13959 case 'h': namedclass = ANYOF_HORIZWS; break;
13960 case 'H': namedclass = ANYOF_NHORIZWS; break;
13961 case 'N': /* Handle \N{NAME} in class */
13964 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13965 flagp, depth, &as_text);
13966 if (*flagp & RESTART_UTF8)
13967 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13968 if (cp_count != 1) { /* The typical case drops through */
13969 assert(cp_count != (STRLEN) -1);
13970 if (cp_count == 0) {
13972 RExC_parse++; /* Position after the "}" */
13973 vFAIL("Zero length \\N{}");
13976 ckWARNreg(RExC_parse,
13977 "Ignoring zero length \\N{} in character class");
13980 else { /* cp_count > 1 */
13981 if (! RExC_in_multi_char_class) {
13982 if (invert || range || *RExC_parse == '-') {
13985 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13988 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13993 = add_multi_match(multi_char_matches,
13997 break; /* <value> contains the first code
13998 point. Drop out of the switch to
14001 } /* End of cp_count != 1 */
14003 /* This element should not be processed further in this
14006 value = save_value;
14007 prevvalue = save_prevvalue;
14008 continue; /* Back to top of loop to get next char */
14011 /* Here, is a single code point, and <value> contains it */
14012 unicode_range = TRUE; /* \N{} are Unicode */
14020 /* We will handle any undefined properties ourselves */
14021 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14022 /* And we actually would prefer to get
14023 * the straight inversion list of the
14024 * swash, since we will be accessing it
14025 * anyway, to save a little time */
14026 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14028 if (RExC_parse >= RExC_end)
14029 vFAIL2("Empty \\%c{}", (U8)value);
14030 if (*RExC_parse == '{') {
14031 const U8 c = (U8)value;
14032 e = strchr(RExC_parse++, '}');
14034 vFAIL2("Missing right brace on \\%c{}", c);
14035 while (isSPACE(*RExC_parse))
14037 if (e == RExC_parse)
14038 vFAIL2("Empty \\%c{}", c);
14039 n = e - RExC_parse;
14040 while (isSPACE(*(RExC_parse + n - 1)))
14051 if (UCHARAT(RExC_parse) == '^') {
14054 /* toggle. (The rhs xor gets the single bit that
14055 * differs between P and p; the other xor inverts just
14057 value ^= 'P' ^ 'p';
14059 while (isSPACE(*RExC_parse)) {
14064 /* Try to get the definition of the property into
14065 * <invlist>. If /i is in effect, the effective property
14066 * will have its name be <__NAME_i>. The design is
14067 * discussed in commit
14068 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14069 name = savepv(Perl_form(aTHX_
14071 (FOLD) ? "__" : "",
14077 /* Look up the property name, and get its swash and
14078 * inversion list, if the property is found */
14080 SvREFCNT_dec_NN(swash);
14082 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14085 NULL, /* No inversion list */
14088 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14089 HV* curpkg = (IN_PERL_COMPILETIME)
14091 : CopSTASH(PL_curcop);
14093 SvREFCNT_dec_NN(swash);
14097 /* Here didn't find it. It could be a user-defined
14098 * property that will be available at run-time. If we
14099 * accept only compile-time properties, is an error;
14100 * otherwise add it to the list for run-time look up */
14102 RExC_parse = e + 1;
14104 "Property '%"UTF8f"' is unknown",
14105 UTF8fARG(UTF, n, name));
14108 /* If the property name doesn't already have a package
14109 * name, add the current one to it so that it can be
14110 * referred to outside it. [perl #121777] */
14111 if (curpkg && ! instr(name, "::")) {
14112 char* pkgname = HvNAME(curpkg);
14113 if (strNE(pkgname, "main")) {
14114 char* full_name = Perl_form(aTHX_
14118 n = strlen(full_name);
14120 name = savepvn(full_name, n);
14123 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14124 (value == 'p' ? '+' : '!'),
14125 UTF8fARG(UTF, n, name));
14126 has_user_defined_property = TRUE;
14128 /* We don't know yet, so have to assume that the
14129 * property could match something in the Latin1 range,
14130 * hence something that isn't utf8. Note that this
14131 * would cause things in <depends_list> to match
14132 * inappropriately, except that any \p{}, including
14133 * this one forces Unicode semantics, which means there
14134 * is no <depends_list> */
14136 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14140 /* Here, did get the swash and its inversion list. If
14141 * the swash is from a user-defined property, then this
14142 * whole character class should be regarded as such */
14143 if (swash_init_flags
14144 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14146 has_user_defined_property = TRUE;
14149 /* We warn on matching an above-Unicode code point
14150 * if the match would return true, except don't
14151 * warn for \p{All}, which has exactly one element
14153 (_invlist_contains_cp(invlist, 0x110000)
14154 && (! (_invlist_len(invlist) == 1
14155 && *invlist_array(invlist) == 0)))
14161 /* Invert if asking for the complement */
14162 if (value == 'P') {
14163 _invlist_union_complement_2nd(properties,
14167 /* The swash can't be used as-is, because we've
14168 * inverted things; delay removing it to here after
14169 * have copied its invlist above */
14170 SvREFCNT_dec_NN(swash);
14174 _invlist_union(properties, invlist, &properties);
14179 RExC_parse = e + 1;
14180 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14183 /* \p means they want Unicode semantics */
14184 RExC_uni_semantics = 1;
14187 case 'n': value = '\n'; break;
14188 case 'r': value = '\r'; break;
14189 case 't': value = '\t'; break;
14190 case 'f': value = '\f'; break;
14191 case 'b': value = '\b'; break;
14192 case 'e': value = ESC_NATIVE; break;
14193 case 'a': value = '\a'; break;
14195 RExC_parse--; /* function expects to be pointed at the 'o' */
14197 const char* error_msg;
14198 bool valid = grok_bslash_o(&RExC_parse,
14201 PASS2, /* warnings only in
14204 silence_non_portable,
14210 non_portable_endpoint++;
14211 if (IN_ENCODING && value < 0x100) {
14212 goto recode_encoding;
14216 RExC_parse--; /* function expects to be pointed at the 'x' */
14218 const char* error_msg;
14219 bool valid = grok_bslash_x(&RExC_parse,
14222 PASS2, /* Output warnings */
14224 silence_non_portable,
14230 non_portable_endpoint++;
14231 if (IN_ENCODING && value < 0x100)
14232 goto recode_encoding;
14235 value = grok_bslash_c(*RExC_parse++, PASS2);
14236 non_portable_endpoint++;
14238 case '0': case '1': case '2': case '3': case '4':
14239 case '5': case '6': case '7':
14241 /* Take 1-3 octal digits */
14242 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14243 numlen = (strict) ? 4 : 3;
14244 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14245 RExC_parse += numlen;
14248 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14249 vFAIL("Need exactly 3 octal digits");
14251 else if (! SIZE_ONLY /* like \08, \178 */
14253 && RExC_parse < RExC_end
14254 && isDIGIT(*RExC_parse)
14255 && ckWARN(WARN_REGEXP))
14257 SAVEFREESV(RExC_rx_sv);
14258 reg_warn_non_literal_string(
14260 form_short_octal_warning(RExC_parse, numlen));
14261 (void)ReREFCNT_inc(RExC_rx_sv);
14264 non_portable_endpoint++;
14265 if (IN_ENCODING && value < 0x100)
14266 goto recode_encoding;
14270 if (! RExC_override_recoding) {
14271 SV* enc = _get_encoding();
14272 value = reg_recode((const char)(U8)value, &enc);
14275 vFAIL("Invalid escape in the specified encoding");
14278 ckWARNreg(RExC_parse,
14279 "Invalid escape in the specified encoding");
14285 /* Allow \_ to not give an error */
14286 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14288 vFAIL2("Unrecognized escape \\%c in character class",
14292 SAVEFREESV(RExC_rx_sv);
14293 ckWARN2reg(RExC_parse,
14294 "Unrecognized escape \\%c in character class passed through",
14296 (void)ReREFCNT_inc(RExC_rx_sv);
14300 } /* End of switch on char following backslash */
14301 } /* end of handling backslash escape sequences */
14303 /* Here, we have the current token in 'value' */
14305 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14308 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14309 * literal, as is the character that began the false range, i.e.
14310 * the 'a' in the examples */
14313 const int w = (RExC_parse >= rangebegin)
14314 ? RExC_parse - rangebegin
14318 "False [] range \"%"UTF8f"\"",
14319 UTF8fARG(UTF, w, rangebegin));
14322 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14323 ckWARN2reg(RExC_parse,
14324 "False [] range \"%"UTF8f"\"",
14325 UTF8fARG(UTF, w, rangebegin));
14326 (void)ReREFCNT_inc(RExC_rx_sv);
14327 cp_list = add_cp_to_invlist(cp_list, '-');
14328 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14333 range = 0; /* this was not a true range */
14334 element_count += 2; /* So counts for three values */
14337 classnum = namedclass_to_classnum(namedclass);
14339 if (LOC && namedclass < ANYOF_POSIXL_MAX
14340 #ifndef HAS_ISASCII
14341 && classnum != _CC_ASCII
14344 /* What the Posix classes (like \w, [:space:]) match in locale
14345 * isn't knowable under locale until actual match time. Room
14346 * must be reserved (one time per outer bracketed class) to
14347 * store such classes. The space will contain a bit for each
14348 * named class that is to be matched against. This isn't
14349 * needed for \p{} and pseudo-classes, as they are not affected
14350 * by locale, and hence are dealt with separately */
14351 if (! need_class) {
14354 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14357 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14359 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14360 ANYOF_POSIXL_ZERO(ret);
14363 /* Coverity thinks it is possible for this to be negative; both
14364 * jhi and khw think it's not, but be safer */
14365 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14366 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14368 /* See if it already matches the complement of this POSIX
14370 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14371 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14375 posixl_matches_all = TRUE;
14376 break; /* No need to continue. Since it matches both
14377 e.g., \w and \W, it matches everything, and the
14378 bracketed class can be optimized into qr/./s */
14381 /* Add this class to those that should be checked at runtime */
14382 ANYOF_POSIXL_SET(ret, namedclass);
14384 /* The above-Latin1 characters are not subject to locale rules.
14385 * Just add them, in the second pass, to the
14386 * unconditionally-matched list */
14388 SV* scratch_list = NULL;
14390 /* Get the list of the above-Latin1 code points this
14392 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14393 PL_XPosix_ptrs[classnum],
14395 /* Odd numbers are complements, like
14396 * NDIGIT, NASCII, ... */
14397 namedclass % 2 != 0,
14399 /* Checking if 'cp_list' is NULL first saves an extra
14400 * clone. Its reference count will be decremented at the
14401 * next union, etc, or if this is the only instance, at the
14402 * end of the routine */
14404 cp_list = scratch_list;
14407 _invlist_union(cp_list, scratch_list, &cp_list);
14408 SvREFCNT_dec_NN(scratch_list);
14410 continue; /* Go get next character */
14413 else if (! SIZE_ONLY) {
14415 /* Here, not in pass1 (in that pass we skip calculating the
14416 * contents of this class), and is /l, or is a POSIX class for
14417 * which /l doesn't matter (or is a Unicode property, which is
14418 * skipped here). */
14419 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14420 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14422 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14423 * nor /l make a difference in what these match,
14424 * therefore we just add what they match to cp_list. */
14425 if (classnum != _CC_VERTSPACE) {
14426 assert( namedclass == ANYOF_HORIZWS
14427 || namedclass == ANYOF_NHORIZWS);
14429 /* It turns out that \h is just a synonym for
14431 classnum = _CC_BLANK;
14434 _invlist_union_maybe_complement_2nd(
14436 PL_XPosix_ptrs[classnum],
14437 namedclass % 2 != 0, /* Complement if odd
14438 (NHORIZWS, NVERTWS)
14443 else if (UNI_SEMANTICS
14444 || classnum == _CC_ASCII
14445 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
14446 || classnum == _CC_XDIGIT)))
14448 /* We usually have to worry about /d and /a affecting what
14449 * POSIX classes match, with special code needed for /d
14450 * because we won't know until runtime what all matches.
14451 * But there is no extra work needed under /u, and
14452 * [:ascii:] is unaffected by /a and /d; and :digit: and
14453 * :xdigit: don't have runtime differences under /d. So we
14454 * can special case these, and avoid some extra work below,
14455 * and at runtime. */
14456 _invlist_union_maybe_complement_2nd(
14458 PL_XPosix_ptrs[classnum],
14459 namedclass % 2 != 0,
14462 else { /* Garden variety class. If is NUPPER, NALPHA, ...
14463 complement and use nposixes */
14464 SV** posixes_ptr = namedclass % 2 == 0
14467 _invlist_union_maybe_complement_2nd(
14469 PL_XPosix_ptrs[classnum],
14470 namedclass % 2 != 0,
14474 } /* end of namedclass \blah */
14477 RExC_parse = regpatws(pRExC_state, RExC_parse,
14478 FALSE /* means don't recognize comments */ );
14481 /* If 'range' is set, 'value' is the ending of a range--check its
14482 * validity. (If value isn't a single code point in the case of a
14483 * range, we should have figured that out above in the code that
14484 * catches false ranges). Later, we will handle each individual code
14485 * point in the range. If 'range' isn't set, this could be the
14486 * beginning of a range, so check for that by looking ahead to see if
14487 * the next real character to be processed is the range indicator--the
14492 /* For unicode ranges, we have to test that the Unicode as opposed
14493 * to the native values are not decreasing. (Above 255, there is
14494 * no difference between native and Unicode) */
14495 if (unicode_range && prevvalue < 255 && value < 255) {
14496 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14497 goto backwards_range;
14502 if (prevvalue > value) /* b-a */ {
14507 w = RExC_parse - rangebegin;
14509 "Invalid [] range \"%"UTF8f"\"",
14510 UTF8fARG(UTF, w, rangebegin));
14511 NOT_REACHED; /* NOT REACHED */
14515 prevvalue = value; /* save the beginning of the potential range */
14516 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14517 && *RExC_parse == '-')
14519 char* next_char_ptr = RExC_parse + 1;
14520 if (skip_white) { /* Get the next real char after the '-' */
14521 next_char_ptr = regpatws(pRExC_state,
14523 FALSE); /* means don't recognize
14527 /* If the '-' is at the end of the class (just before the ']',
14528 * it is a literal minus; otherwise it is a range */
14529 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14530 RExC_parse = next_char_ptr;
14532 /* a bad range like \w-, [:word:]- ? */
14533 if (namedclass > OOB_NAMEDCLASS) {
14534 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14535 const int w = RExC_parse >= rangebegin
14536 ? RExC_parse - rangebegin
14539 vFAIL4("False [] range \"%*.*s\"",
14544 "False [] range \"%*.*s\"",
14549 cp_list = add_cp_to_invlist(cp_list, '-');
14553 range = 1; /* yeah, it's a range! */
14554 continue; /* but do it the next time */
14559 if (namedclass > OOB_NAMEDCLASS) {
14563 /* Here, we have a single value this time through the loop, and
14564 * <prevvalue> is the beginning of the range, if any; or <value> if
14567 /* non-Latin1 code point implies unicode semantics. Must be set in
14568 * pass1 so is there for the whole of pass 2 */
14570 RExC_uni_semantics = 1;
14573 /* Ready to process either the single value, or the completed range.
14574 * For single-valued non-inverted ranges, we consider the possibility
14575 * of multi-char folds. (We made a conscious decision to not do this
14576 * for the other cases because it can often lead to non-intuitive
14577 * results. For example, you have the peculiar case that:
14578 * "s s" =~ /^[^\xDF]+$/i => Y
14579 * "ss" =~ /^[^\xDF]+$/i => N
14581 * See [perl #89750] */
14582 if (FOLD && allow_multi_folds && value == prevvalue) {
14583 if (value == LATIN_SMALL_LETTER_SHARP_S
14584 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14587 /* Here <value> is indeed a multi-char fold. Get what it is */
14589 U8 foldbuf[UTF8_MAXBYTES_CASE];
14592 UV folded = _to_uni_fold_flags(
14596 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14597 ? FOLD_FLAGS_NOMIX_ASCII
14601 /* Here, <folded> should be the first character of the
14602 * multi-char fold of <value>, with <foldbuf> containing the
14603 * whole thing. But, if this fold is not allowed (because of
14604 * the flags), <fold> will be the same as <value>, and should
14605 * be processed like any other character, so skip the special
14607 if (folded != value) {
14609 /* Skip if we are recursed, currently parsing the class
14610 * again. Otherwise add this character to the list of
14611 * multi-char folds. */
14612 if (! RExC_in_multi_char_class) {
14613 STRLEN cp_count = utf8_length(foldbuf,
14614 foldbuf + foldlen);
14615 SV* multi_fold = sv_2mortal(newSVpvs(""));
14617 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14620 = add_multi_match(multi_char_matches,
14626 /* This element should not be processed further in this
14629 value = save_value;
14630 prevvalue = save_prevvalue;
14636 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
14639 /* If the range starts above 255, everything is portable and
14640 * likely to be so for any forseeable character set, so don't
14642 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
14643 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
14645 else if (prevvalue != value) {
14647 /* Under strict, ranges that stop and/or end in an ASCII
14648 * printable should have each end point be a portable value
14649 * for it (preferably like 'A', but we don't warn if it is
14650 * a (portable) Unicode name or code point), and the range
14651 * must be be all digits or all letters of the same case.
14652 * Otherwise, the range is non-portable and unclear as to
14653 * what it contains */
14654 if ((isPRINT_A(prevvalue) || isPRINT_A(value))
14655 && (non_portable_endpoint
14656 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
14657 || (isLOWER_A(prevvalue) && isLOWER_A(value))
14658 || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
14660 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
14662 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
14664 /* But the nature of Unicode and languages mean we
14665 * can't do the same checks for above-ASCII ranges,
14666 * except in the case of digit ones. These should
14667 * contain only digits from the same group of 10. The
14668 * ASCII case is handled just above. 0x660 is the
14669 * first digit character beyond ASCII. Hence here, the
14670 * range could be a range of digits. Find out. */
14671 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
14673 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
14676 /* If the range start and final points are in the same
14677 * inversion list element, it means that either both
14678 * are not digits, or both are digits in a consecutive
14679 * sequence of digits. (So far, Unicode has kept all
14680 * such sequences as distinct groups of 10, but assert
14681 * to make sure). If the end points are not in the
14682 * same element, neither should be a digit. */
14683 if (index_start == index_final) {
14684 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
14685 || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
14686 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
14689 else if ((index_start >= 0
14690 && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
14691 || (index_final >= 0
14692 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
14694 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
14699 if ((! range || prevvalue == value) && non_portable_endpoint) {
14700 if (isPRINT_A(value)) {
14703 if (isBACKSLASHED_PUNCT(value)) {
14704 literal[d++] = '\\';
14706 literal[d++] = (char) value;
14707 literal[d++] = '\0';
14710 "\"%.*s\" is more clearly written simply as \"%s\"",
14711 (int) (RExC_parse - rangebegin),
14716 else if isMNEMONIC_CNTRL(value) {
14718 "\"%.*s\" is more clearly written simply as \"%s\"",
14719 (int) (RExC_parse - rangebegin),
14721 cntrl_to_mnemonic((char) value)
14727 /* Deal with this element of the class */
14731 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14734 /* On non-ASCII platforms, for ranges that span all of 0..255, and
14735 * ones that don't require special handling, we can just add the
14736 * range like we do for ASCII platforms */
14737 if ((UNLIKELY(prevvalue == 0) && value >= 255)
14738 || ! (prevvalue < 256
14740 || (! non_portable_endpoint
14741 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14742 || (isUPPER_A(prevvalue)
14743 && isUPPER_A(value)))))))
14745 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14749 /* Here, requires special handling. This can be because it is
14750 * a range whose code points are considered to be Unicode, and
14751 * so must be individually translated into native, or because
14752 * its a subrange of 'A-Z' or 'a-z' which each aren't
14753 * contiguous in EBCDIC, but we have defined them to include
14754 * only the "expected" upper or lower case ASCII alphabetics.
14755 * Subranges above 255 are the same in native and Unicode, so
14756 * can be added as a range */
14757 U8 start = NATIVE_TO_LATIN1(prevvalue);
14759 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
14760 for (j = start; j <= end; j++) {
14761 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
14764 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14771 range = 0; /* this range (if it was one) is done now */
14772 } /* End of loop through all the text within the brackets */
14774 /* If anything in the class expands to more than one character, we have to
14775 * deal with them by building up a substitute parse string, and recursively
14776 * calling reg() on it, instead of proceeding */
14777 if (multi_char_matches) {
14778 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14781 char *save_end = RExC_end;
14782 char *save_parse = RExC_parse;
14783 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14788 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14789 because too confusing */
14791 sv_catpv(substitute_parse, "(?:");
14795 /* Look at the longest folds first */
14796 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14798 if (av_exists(multi_char_matches, cp_count)) {
14799 AV** this_array_ptr;
14802 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14804 while ((this_sequence = av_pop(*this_array_ptr)) !=
14807 if (! first_time) {
14808 sv_catpv(substitute_parse, "|");
14810 first_time = FALSE;
14812 sv_catpv(substitute_parse, SvPVX(this_sequence));
14817 /* If the character class contains anything else besides these
14818 * multi-character folds, have to include it in recursive parsing */
14819 if (element_count) {
14820 sv_catpv(substitute_parse, "|[");
14821 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14822 sv_catpv(substitute_parse, "]");
14825 sv_catpv(substitute_parse, ")");
14828 /* This is a way to get the parse to skip forward a whole named
14829 * sequence instead of matching the 2nd character when it fails the
14831 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14835 RExC_parse = SvPV(substitute_parse, len);
14836 RExC_end = RExC_parse + len;
14837 RExC_in_multi_char_class = 1;
14838 RExC_override_recoding = 1;
14839 RExC_emit = (regnode *)orig_emit;
14841 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14843 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14845 RExC_parse = save_parse;
14846 RExC_end = save_end;
14847 RExC_in_multi_char_class = 0;
14848 RExC_override_recoding = 0;
14849 SvREFCNT_dec_NN(multi_char_matches);
14853 /* Here, we've gone through the entire class and dealt with multi-char
14854 * folds. We are now in a position that we can do some checks to see if we
14855 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14856 * Currently we only do two checks:
14857 * 1) is in the unlikely event that the user has specified both, eg. \w and
14858 * \W under /l, then the class matches everything. (This optimization
14859 * is done only to make the optimizer code run later work.)
14860 * 2) if the character class contains only a single element (including a
14861 * single range), we see if there is an equivalent node for it.
14862 * Other checks are possible */
14863 if (! ret_invlist /* Can't optimize if returning the constructed
14865 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14870 if (UNLIKELY(posixl_matches_all)) {
14873 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14874 \w or [:digit:] or \p{foo}
14877 /* All named classes are mapped into POSIXish nodes, with its FLAG
14878 * argument giving which class it is */
14879 switch ((I32)namedclass) {
14880 case ANYOF_UNIPROP:
14883 /* These don't depend on the charset modifiers. They always
14884 * match under /u rules */
14885 case ANYOF_NHORIZWS:
14886 case ANYOF_HORIZWS:
14887 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14890 case ANYOF_NVERTWS:
14895 /* The actual POSIXish node for all the rest depends on the
14896 * charset modifier. The ones in the first set depend only on
14897 * ASCII or, if available on this platform, also locale */
14901 op = (LOC) ? POSIXL : POSIXA;
14907 /* The following don't have any matches in the upper Latin1
14908 * range, hence /d is equivalent to /u for them. Making it /u
14909 * saves some branches at runtime */
14913 case ANYOF_NXDIGIT:
14914 if (! DEPENDS_SEMANTICS) {
14915 goto treat_as_default;
14921 /* The following change to CASED under /i */
14927 namedclass = ANYOF_CASED + (namedclass % 2);
14931 /* The rest have more possibilities depending on the charset.
14932 * We take advantage of the enum ordering of the charset
14933 * modifiers to get the exact node type, */
14936 op = POSIXD + get_regex_charset(RExC_flags);
14937 if (op > POSIXA) { /* /aa is same as /a */
14942 /* The odd numbered ones are the complements of the
14943 * next-lower even number one */
14944 if (namedclass % 2 == 1) {
14948 arg = namedclass_to_classnum(namedclass);
14952 else if (value == prevvalue) {
14954 /* Here, the class consists of just a single code point */
14957 if (! LOC && value == '\n') {
14958 op = REG_ANY; /* Optimize [^\n] */
14959 *flagp |= HASWIDTH|SIMPLE;
14963 else if (value < 256 || UTF) {
14965 /* Optimize a single value into an EXACTish node, but not if it
14966 * would require converting the pattern to UTF-8. */
14967 op = compute_EXACTish(pRExC_state);
14969 } /* Otherwise is a range */
14970 else if (! LOC) { /* locale could vary these */
14971 if (prevvalue == '0') {
14972 if (value == '9') {
14977 else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
14978 /* We can optimize A-Z or a-z, but not if they could match
14979 * something like the KELVIN SIGN under /i (/a means they
14981 if (prevvalue == 'A') {
14984 && ! non_portable_end_point
14987 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14991 else if (prevvalue == 'a') {
14994 && ! non_portable_end_point
14997 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15004 /* Here, we have changed <op> away from its initial value iff we found
15005 * an optimization */
15008 /* Throw away this ANYOF regnode, and emit the calculated one,
15009 * which should correspond to the beginning, not current, state of
15011 const char * cur_parse = RExC_parse;
15012 RExC_parse = (char *)orig_parse;
15016 /* To get locale nodes to not use the full ANYOF size would
15017 * require moving the code above that writes the portions
15018 * of it that aren't in other nodes to after this point.
15019 * e.g. ANYOF_POSIXL_SET */
15020 RExC_size = orig_size;
15024 RExC_emit = (regnode *)orig_emit;
15025 if (PL_regkind[op] == POSIXD) {
15026 if (op == POSIXL) {
15027 RExC_contains_locale = 1;
15030 op += NPOSIXD - POSIXD;
15035 ret = reg_node(pRExC_state, op);
15037 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15041 *flagp |= HASWIDTH|SIMPLE;
15043 else if (PL_regkind[op] == EXACT) {
15044 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15045 TRUE /* downgradable to EXACT */
15049 RExC_parse = (char *) cur_parse;
15051 SvREFCNT_dec(posixes);
15052 SvREFCNT_dec(nposixes);
15053 SvREFCNT_dec(simple_posixes);
15054 SvREFCNT_dec(cp_list);
15055 SvREFCNT_dec(cp_foldable_list);
15062 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15064 /* If folding, we calculate all characters that could fold to or from the
15065 * ones already on the list */
15066 if (cp_foldable_list) {
15068 UV start, end; /* End points of code point ranges */
15070 SV* fold_intersection = NULL;
15073 /* Our calculated list will be for Unicode rules. For locale
15074 * matching, we have to keep a separate list that is consulted at
15075 * runtime only when the locale indicates Unicode rules. For
15076 * non-locale, we just use to the general list */
15078 use_list = &only_utf8_locale_list;
15081 use_list = &cp_list;
15084 /* Only the characters in this class that participate in folds need
15085 * be checked. Get the intersection of this class and all the
15086 * possible characters that are foldable. This can quickly narrow
15087 * down a large class */
15088 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15089 &fold_intersection);
15091 /* The folds for all the Latin1 characters are hard-coded into this
15092 * program, but we have to go out to disk to get the others. */
15093 if (invlist_highest(cp_foldable_list) >= 256) {
15095 /* This is a hash that for a particular fold gives all
15096 * characters that are involved in it */
15097 if (! PL_utf8_foldclosures) {
15098 _load_PL_utf8_foldclosures();
15102 /* Now look at the foldable characters in this class individually */
15103 invlist_iterinit(fold_intersection);
15104 while (invlist_iternext(fold_intersection, &start, &end)) {
15107 /* Look at every character in the range */
15108 for (j = start; j <= end; j++) {
15109 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15115 if (IS_IN_SOME_FOLD_L1(j)) {
15117 /* ASCII is always matched; non-ASCII is matched
15118 * only under Unicode rules (which could happen
15119 * under /l if the locale is a UTF-8 one */
15120 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15121 *use_list = add_cp_to_invlist(*use_list,
15122 PL_fold_latin1[j]);
15126 add_cp_to_invlist(depends_list,
15127 PL_fold_latin1[j]);
15131 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15132 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15134 add_above_Latin1_folds(pRExC_state,
15141 /* Here is an above Latin1 character. We don't have the
15142 * rules hard-coded for it. First, get its fold. This is
15143 * the simple fold, as the multi-character folds have been
15144 * handled earlier and separated out */
15145 _to_uni_fold_flags(j, foldbuf, &foldlen,
15146 (ASCII_FOLD_RESTRICTED)
15147 ? FOLD_FLAGS_NOMIX_ASCII
15150 /* Single character fold of above Latin1. Add everything in
15151 * its fold closure to the list that this node should match.
15152 * The fold closures data structure is a hash with the keys
15153 * being the UTF-8 of every character that is folded to, like
15154 * 'k', and the values each an array of all code points that
15155 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
15156 * Multi-character folds are not included */
15157 if ((listp = hv_fetch(PL_utf8_foldclosures,
15158 (char *) foldbuf, foldlen, FALSE)))
15160 AV* list = (AV*) *listp;
15162 for (k = 0; k <= av_tindex(list); k++) {
15163 SV** c_p = av_fetch(list, k, FALSE);
15169 /* /aa doesn't allow folds between ASCII and non- */
15170 if ((ASCII_FOLD_RESTRICTED
15171 && (isASCII(c) != isASCII(j))))
15176 /* Folds under /l which cross the 255/256 boundary
15177 * are added to a separate list. (These are valid
15178 * only when the locale is UTF-8.) */
15179 if (c < 256 && LOC) {
15180 *use_list = add_cp_to_invlist(*use_list, c);
15184 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15186 cp_list = add_cp_to_invlist(cp_list, c);
15189 /* Similarly folds involving non-ascii Latin1
15190 * characters under /d are added to their list */
15191 depends_list = add_cp_to_invlist(depends_list,
15198 SvREFCNT_dec_NN(fold_intersection);
15201 /* Now that we have finished adding all the folds, there is no reason
15202 * to keep the foldable list separate */
15203 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15204 SvREFCNT_dec_NN(cp_foldable_list);
15207 /* And combine the result (if any) with any inversion list from posix
15208 * classes. The lists are kept separate up to now because we don't want to
15209 * fold the classes (folding of those is automatically handled by the swash
15210 * fetching code) */
15211 if (simple_posixes) {
15212 _invlist_union(cp_list, simple_posixes, &cp_list);
15213 SvREFCNT_dec_NN(simple_posixes);
15215 if (posixes || nposixes) {
15216 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15217 /* Under /a and /aa, nothing above ASCII matches these */
15218 _invlist_intersection(posixes,
15219 PL_XPosix_ptrs[_CC_ASCII],
15223 if (DEPENDS_SEMANTICS) {
15224 /* Under /d, everything in the upper half of the Latin1 range
15225 * matches these complements */
15226 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15228 else if (AT_LEAST_ASCII_RESTRICTED) {
15229 /* Under /a and /aa, everything above ASCII matches these
15231 _invlist_union_complement_2nd(nposixes,
15232 PL_XPosix_ptrs[_CC_ASCII],
15236 _invlist_union(posixes, nposixes, &posixes);
15237 SvREFCNT_dec_NN(nposixes);
15240 posixes = nposixes;
15243 if (! DEPENDS_SEMANTICS) {
15245 _invlist_union(cp_list, posixes, &cp_list);
15246 SvREFCNT_dec_NN(posixes);
15253 /* Under /d, we put into a separate list the Latin1 things that
15254 * match only when the target string is utf8 */
15255 SV* nonascii_but_latin1_properties = NULL;
15256 _invlist_intersection(posixes, PL_UpperLatin1,
15257 &nonascii_but_latin1_properties);
15258 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15261 _invlist_union(cp_list, posixes, &cp_list);
15262 SvREFCNT_dec_NN(posixes);
15268 if (depends_list) {
15269 _invlist_union(depends_list, nonascii_but_latin1_properties,
15271 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15274 depends_list = nonascii_but_latin1_properties;
15279 /* And combine the result (if any) with any inversion list from properties.
15280 * The lists are kept separate up to now so that we can distinguish the two
15281 * in regards to matching above-Unicode. A run-time warning is generated
15282 * if a Unicode property is matched against a non-Unicode code point. But,
15283 * we allow user-defined properties to match anything, without any warning,
15284 * and we also suppress the warning if there is a portion of the character
15285 * class that isn't a Unicode property, and which matches above Unicode, \W
15286 * or [\x{110000}] for example.
15287 * (Note that in this case, unlike the Posix one above, there is no
15288 * <depends_list>, because having a Unicode property forces Unicode
15293 /* If it matters to the final outcome, see if a non-property
15294 * component of the class matches above Unicode. If so, the
15295 * warning gets suppressed. This is true even if just a single
15296 * such code point is specified, as though not strictly correct if
15297 * another such code point is matched against, the fact that they
15298 * are using above-Unicode code points indicates they should know
15299 * the issues involved */
15301 warn_super = ! (invert
15302 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15305 _invlist_union(properties, cp_list, &cp_list);
15306 SvREFCNT_dec_NN(properties);
15309 cp_list = properties;
15313 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15317 /* Here, we have calculated what code points should be in the character
15320 * Now we can see about various optimizations. Fold calculation (which we
15321 * did above) needs to take place before inversion. Otherwise /[^k]/i
15322 * would invert to include K, which under /i would match k, which it
15323 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15324 * folded until runtime */
15326 /* If we didn't do folding, it's because some information isn't available
15327 * until runtime; set the run-time fold flag for these. (We don't have to
15328 * worry about properties folding, as that is taken care of by the swash
15329 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15330 * locales, or the class matches at least one 0-255 range code point */
15332 if (only_utf8_locale_list) {
15333 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15335 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15338 invlist_iterinit(cp_list);
15339 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15340 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15342 invlist_iterfinish(cp_list);
15346 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15347 * at compile time. Besides not inverting folded locale now, we can't
15348 * invert if there are things such as \w, which aren't known until runtime
15352 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15354 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15356 _invlist_invert(cp_list);
15358 /* Any swash can't be used as-is, because we've inverted things */
15360 SvREFCNT_dec_NN(swash);
15364 /* Clear the invert flag since have just done it here */
15369 *ret_invlist = cp_list;
15370 SvREFCNT_dec(swash);
15372 /* Discard the generated node */
15374 RExC_size = orig_size;
15377 RExC_emit = orig_emit;
15382 /* Some character classes are equivalent to other nodes. Such nodes take
15383 * up less room and generally fewer operations to execute than ANYOF nodes.
15384 * Above, we checked for and optimized into some such equivalents for
15385 * certain common classes that are easy to test. Getting to this point in
15386 * the code means that the class didn't get optimized there. Since this
15387 * code is only executed in Pass 2, it is too late to save space--it has
15388 * been allocated in Pass 1, and currently isn't given back. But turning
15389 * things into an EXACTish node can allow the optimizer to join it to any
15390 * adjacent such nodes. And if the class is equivalent to things like /./,
15391 * expensive run-time swashes can be avoided. Now that we have more
15392 * complete information, we can find things necessarily missed by the
15393 * earlier code. I (khw) am not sure how much to look for here. It would
15394 * be easy, but perhaps too slow, to check any candidates against all the
15395 * node types they could possibly match using _invlistEQ(). */
15400 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15401 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15403 /* We don't optimize if we are supposed to make sure all non-Unicode
15404 * code points raise a warning, as only ANYOF nodes have this check.
15406 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15409 U8 op = END; /* The optimzation node-type */
15410 const char * cur_parse= RExC_parse;
15412 invlist_iterinit(cp_list);
15413 if (! invlist_iternext(cp_list, &start, &end)) {
15415 /* Here, the list is empty. This happens, for example, when a
15416 * Unicode property is the only thing in the character class, and
15417 * it doesn't match anything. (perluniprops.pod notes such
15420 *flagp |= HASWIDTH|SIMPLE;
15422 else if (start == end) { /* The range is a single code point */
15423 if (! invlist_iternext(cp_list, &start, &end)
15425 /* Don't do this optimization if it would require changing
15426 * the pattern to UTF-8 */
15427 && (start < 256 || UTF))
15429 /* Here, the list contains a single code point. Can optimize
15430 * into an EXACTish node */
15441 /* A locale node under folding with one code point can be
15442 * an EXACTFL, as its fold won't be calculated until
15448 /* Here, we are generally folding, but there is only one
15449 * code point to match. If we have to, we use an EXACT
15450 * node, but it would be better for joining with adjacent
15451 * nodes in the optimization pass if we used the same
15452 * EXACTFish node that any such are likely to be. We can
15453 * do this iff the code point doesn't participate in any
15454 * folds. For example, an EXACTF of a colon is the same as
15455 * an EXACT one, since nothing folds to or from a colon. */
15457 if (IS_IN_SOME_FOLD_L1(value)) {
15462 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15467 /* If we haven't found the node type, above, it means we
15468 * can use the prevailing one */
15470 op = compute_EXACTish(pRExC_state);
15475 else if (start == 0) {
15476 if (end == UV_MAX) {
15478 *flagp |= HASWIDTH|SIMPLE;
15481 else if (end == '\n' - 1
15482 && invlist_iternext(cp_list, &start, &end)
15483 && start == '\n' + 1 && end == UV_MAX)
15486 *flagp |= HASWIDTH|SIMPLE;
15490 invlist_iterfinish(cp_list);
15493 RExC_parse = (char *)orig_parse;
15494 RExC_emit = (regnode *)orig_emit;
15496 ret = reg_node(pRExC_state, op);
15498 RExC_parse = (char *)cur_parse;
15500 if (PL_regkind[op] == EXACT) {
15501 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15502 TRUE /* downgradable to EXACT */
15506 SvREFCNT_dec_NN(cp_list);
15511 /* Here, <cp_list> contains all the code points we can determine at
15512 * compile time that match under all conditions. Go through it, and
15513 * for things that belong in the bitmap, put them there, and delete from
15514 * <cp_list>. While we are at it, see if everything above 255 is in the
15515 * list, and if so, set a flag to speed up execution */
15517 populate_ANYOF_from_invlist(ret, &cp_list);
15520 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15523 /* Here, the bitmap has been populated with all the Latin1 code points that
15524 * always match. Can now add to the overall list those that match only
15525 * when the target string is UTF-8 (<depends_list>). */
15526 if (depends_list) {
15528 _invlist_union(cp_list, depends_list, &cp_list);
15529 SvREFCNT_dec_NN(depends_list);
15532 cp_list = depends_list;
15534 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15537 /* If there is a swash and more than one element, we can't use the swash in
15538 * the optimization below. */
15539 if (swash && element_count > 1) {
15540 SvREFCNT_dec_NN(swash);
15544 /* Note that the optimization of using 'swash' if it is the only thing in
15545 * the class doesn't have us change swash at all, so it can include things
15546 * that are also in the bitmap; otherwise we have purposely deleted that
15547 * duplicate information */
15548 set_ANYOF_arg(pRExC_state, ret, cp_list,
15549 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15551 only_utf8_locale_list,
15552 swash, has_user_defined_property);
15554 *flagp |= HASWIDTH|SIMPLE;
15556 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15557 RExC_contains_locale = 1;
15563 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15566 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15567 regnode* const node,
15569 SV* const runtime_defns,
15570 SV* const only_utf8_locale_list,
15572 const bool has_user_defined_property)
15574 /* Sets the arg field of an ANYOF-type node 'node', using information about
15575 * the node passed-in. If there is nothing outside the node's bitmap, the
15576 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15577 * the count returned by add_data(), having allocated and stored an array,
15578 * av, that that count references, as follows:
15579 * av[0] stores the character class description in its textual form.
15580 * This is used later (regexec.c:Perl_regclass_swash()) to
15581 * initialize the appropriate swash, and is also useful for dumping
15582 * the regnode. This is set to &PL_sv_undef if the textual
15583 * description is not needed at run-time (as happens if the other
15584 * elements completely define the class)
15585 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15586 * computed from av[0]. But if no further computation need be done,
15587 * the swash is stored here now (and av[0] is &PL_sv_undef).
15588 * av[2] stores the inversion list of code points that match only if the
15589 * current locale is UTF-8
15590 * av[3] stores the cp_list inversion list for use in addition or instead
15591 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15592 * (Otherwise everything needed is already in av[0] and av[1])
15593 * av[4] is set if any component of the class is from a user-defined
15594 * property; used only if av[3] exists */
15598 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15600 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15601 assert(! (ANYOF_FLAGS(node)
15602 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15603 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15604 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15607 AV * const av = newAV();
15610 assert(ANYOF_FLAGS(node)
15611 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15612 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15614 av_store(av, 0, (runtime_defns)
15615 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15618 av_store(av, 1, swash);
15619 SvREFCNT_dec_NN(cp_list);
15622 av_store(av, 1, &PL_sv_undef);
15624 av_store(av, 3, cp_list);
15625 av_store(av, 4, newSVuv(has_user_defined_property));
15629 if (only_utf8_locale_list) {
15630 av_store(av, 2, only_utf8_locale_list);
15633 av_store(av, 2, &PL_sv_undef);
15636 rv = newRV_noinc(MUTABLE_SV(av));
15637 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15638 RExC_rxi->data->data[n] = (void*)rv;
15643 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15645 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15646 const regnode* node,
15649 SV** only_utf8_locale_ptr,
15653 /* For internal core use only.
15654 * Returns the swash for the input 'node' in the regex 'prog'.
15655 * If <doinit> is 'true', will attempt to create the swash if not already
15657 * If <listsvp> is non-null, will return the printable contents of the
15658 * swash. This can be used to get debugging information even before the
15659 * swash exists, by calling this function with 'doinit' set to false, in
15660 * which case the components that will be used to eventually create the
15661 * swash are returned (in a printable form).
15662 * If <exclude_list> is not NULL, it is an inversion list of things to
15663 * exclude from what's returned in <listsvp>.
15664 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15665 * that, in spite of this function's name, the swash it returns may include
15666 * the bitmap data as well */
15669 SV *si = NULL; /* Input swash initialization string */
15670 SV* invlist = NULL;
15672 RXi_GET_DECL(prog,progi);
15673 const struct reg_data * const data = prog ? progi->data : NULL;
15675 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15677 assert(ANYOF_FLAGS(node)
15678 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15679 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15681 if (data && data->count) {
15682 const U32 n = ARG(node);
15684 if (data->what[n] == 's') {
15685 SV * const rv = MUTABLE_SV(data->data[n]);
15686 AV * const av = MUTABLE_AV(SvRV(rv));
15687 SV **const ary = AvARRAY(av);
15688 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15690 si = *ary; /* ary[0] = the string to initialize the swash with */
15692 /* Elements 3 and 4 are either both present or both absent. [3] is
15693 * any inversion list generated at compile time; [4] indicates if
15694 * that inversion list has any user-defined properties in it. */
15695 if (av_tindex(av) >= 2) {
15696 if (only_utf8_locale_ptr
15698 && ary[2] != &PL_sv_undef)
15700 *only_utf8_locale_ptr = ary[2];
15703 assert(only_utf8_locale_ptr);
15704 *only_utf8_locale_ptr = NULL;
15707 if (av_tindex(av) >= 3) {
15709 if (SvUV(ary[4])) {
15710 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15718 /* Element [1] is reserved for the set-up swash. If already there,
15719 * return it; if not, create it and store it there */
15720 if (ary[1] && SvROK(ary[1])) {
15723 else if (doinit && ((si && si != &PL_sv_undef)
15724 || (invlist && invlist != &PL_sv_undef))) {
15726 sw = _core_swash_init("utf8", /* the utf8 package */
15730 0, /* not from tr/// */
15732 &swash_init_flags);
15733 (void)av_store(av, 1, sw);
15738 /* If requested, return a printable version of what this swash matches */
15740 SV* matches_string = newSVpvs("");
15742 /* The swash should be used, if possible, to get the data, as it
15743 * contains the resolved data. But this function can be called at
15744 * compile-time, before everything gets resolved, in which case we
15745 * return the currently best available information, which is the string
15746 * that will eventually be used to do that resolving, 'si' */
15747 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15748 && (si && si != &PL_sv_undef))
15750 sv_catsv(matches_string, si);
15753 /* Add the inversion list to whatever we have. This may have come from
15754 * the swash, or from an input parameter */
15756 if (exclude_list) {
15757 SV* clone = invlist_clone(invlist);
15758 _invlist_subtract(clone, exclude_list, &clone);
15759 sv_catsv(matches_string, _invlist_contents(clone));
15760 SvREFCNT_dec_NN(clone);
15763 sv_catsv(matches_string, _invlist_contents(invlist));
15766 *listsvp = matches_string;
15771 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15773 /* reg_skipcomment()
15775 Absorbs an /x style # comment from the input stream,
15776 returning a pointer to the first character beyond the comment, or if the
15777 comment terminates the pattern without anything following it, this returns
15778 one past the final character of the pattern (in other words, RExC_end) and
15779 sets the REG_RUN_ON_COMMENT_SEEN flag.
15781 Note it's the callers responsibility to ensure that we are
15782 actually in /x mode
15786 PERL_STATIC_INLINE char*
15787 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15789 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15793 while (p < RExC_end) {
15794 if (*(++p) == '\n') {
15799 /* we ran off the end of the pattern without ending the comment, so we have
15800 * to add an \n when wrapping */
15801 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15807 Advances the parse position, and optionally absorbs
15808 "whitespace" from the inputstream.
15810 Without /x "whitespace" means (?#...) style comments only,
15811 with /x this means (?#...) and # comments and whitespace proper.
15813 Returns the RExC_parse point from BEFORE the scan occurs.
15815 This is the /x friendly way of saying RExC_parse++.
15819 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15821 char* const retval = RExC_parse++;
15823 PERL_ARGS_ASSERT_NEXTCHAR;
15826 if (RExC_end - RExC_parse >= 3
15827 && *RExC_parse == '('
15828 && RExC_parse[1] == '?'
15829 && RExC_parse[2] == '#')
15831 while (*RExC_parse != ')') {
15832 if (RExC_parse == RExC_end)
15833 FAIL("Sequence (?#... not terminated");
15839 if (RExC_flags & RXf_PMf_EXTENDED) {
15840 char * p = regpatws(pRExC_state, RExC_parse,
15841 TRUE); /* means recognize comments */
15842 if (p != RExC_parse) {
15852 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15854 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15855 * space. In pass1, it aligns and increments RExC_size; in pass2,
15858 regnode * const ret = RExC_emit;
15859 GET_RE_DEBUG_FLAGS_DECL;
15861 PERL_ARGS_ASSERT_REGNODE_GUTS;
15863 assert(extra_size >= regarglen[op]);
15866 SIZE_ALIGN(RExC_size);
15867 RExC_size += 1 + extra_size;
15870 if (RExC_emit >= RExC_emit_bound)
15871 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15872 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15874 NODE_ALIGN_FILL(ret);
15875 #ifndef RE_TRACK_PATTERN_OFFSETS
15876 PERL_UNUSED_ARG(name);
15878 if (RExC_offsets) { /* MJD */
15880 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15883 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15884 ? "Overwriting end of array!\n" : "OK",
15885 (UV)(RExC_emit - RExC_emit_start),
15886 (UV)(RExC_parse - RExC_start),
15887 (UV)RExC_offsets[0]));
15888 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15895 - reg_node - emit a node
15897 STATIC regnode * /* Location. */
15898 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15900 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15902 PERL_ARGS_ASSERT_REG_NODE;
15904 assert(regarglen[op] == 0);
15907 regnode *ptr = ret;
15908 FILL_ADVANCE_NODE(ptr, op);
15915 - reganode - emit a node with an argument
15917 STATIC regnode * /* Location. */
15918 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15920 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15922 PERL_ARGS_ASSERT_REGANODE;
15924 assert(regarglen[op] == 1);
15927 regnode *ptr = ret;
15928 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15935 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15937 /* emit a node with U32 and I32 arguments */
15939 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15941 PERL_ARGS_ASSERT_REG2LANODE;
15943 assert(regarglen[op] == 2);
15946 regnode *ptr = ret;
15947 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15954 - reginsert - insert an operator in front of already-emitted operand
15956 * Means relocating the operand.
15959 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15964 const int offset = regarglen[(U8)op];
15965 const int size = NODE_STEP_REGNODE + offset;
15966 GET_RE_DEBUG_FLAGS_DECL;
15968 PERL_ARGS_ASSERT_REGINSERT;
15969 PERL_UNUSED_CONTEXT;
15970 PERL_UNUSED_ARG(depth);
15971 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15972 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15981 if (RExC_open_parens) {
15983 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15984 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15985 if ( RExC_open_parens[paren] >= opnd ) {
15986 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15987 RExC_open_parens[paren] += size;
15989 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15991 if ( RExC_close_parens[paren] >= opnd ) {
15992 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15993 RExC_close_parens[paren] += size;
15995 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16000 while (src > opnd) {
16001 StructCopy(--src, --dst, regnode);
16002 #ifdef RE_TRACK_PATTERN_OFFSETS
16003 if (RExC_offsets) { /* MJD 20010112 */
16005 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16009 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16010 ? "Overwriting end of array!\n" : "OK",
16011 (UV)(src - RExC_emit_start),
16012 (UV)(dst - RExC_emit_start),
16013 (UV)RExC_offsets[0]));
16014 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16015 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16021 place = opnd; /* Op node, where operand used to be. */
16022 #ifdef RE_TRACK_PATTERN_OFFSETS
16023 if (RExC_offsets) { /* MJD */
16025 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16029 (UV)(place - RExC_emit_start) > RExC_offsets[0]
16030 ? "Overwriting end of array!\n" : "OK",
16031 (UV)(place - RExC_emit_start),
16032 (UV)(RExC_parse - RExC_start),
16033 (UV)RExC_offsets[0]));
16034 Set_Node_Offset(place, RExC_parse);
16035 Set_Node_Length(place, 1);
16038 src = NEXTOPER(place);
16039 FILL_ADVANCE_NODE(place, op);
16040 Zero(src, offset, regnode);
16044 - regtail - set the next-pointer at the end of a node chain of p to val.
16045 - SEE ALSO: regtail_study
16047 /* TODO: All three parms should be const */
16049 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16050 const regnode *val,U32 depth)
16053 GET_RE_DEBUG_FLAGS_DECL;
16055 PERL_ARGS_ASSERT_REGTAIL;
16057 PERL_UNUSED_ARG(depth);
16063 /* Find last node. */
16066 regnode * const temp = regnext(scan);
16068 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
16069 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16070 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
16071 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
16072 (temp == NULL ? "->" : ""),
16073 (temp == NULL ? PL_reg_name[OP(val)] : "")
16081 if (reg_off_by_arg[OP(scan)]) {
16082 ARG_SET(scan, val - scan);
16085 NEXT_OFF(scan) = val - scan;
16091 - regtail_study - set the next-pointer at the end of a node chain of p to val.
16092 - Look for optimizable sequences at the same time.
16093 - currently only looks for EXACT chains.
16095 This is experimental code. The idea is to use this routine to perform
16096 in place optimizations on branches and groups as they are constructed,
16097 with the long term intention of removing optimization from study_chunk so
16098 that it is purely analytical.
16100 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
16101 to control which is which.
16104 /* TODO: All four parms should be const */
16107 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16108 const regnode *val,U32 depth)
16112 #ifdef EXPERIMENTAL_INPLACESCAN
16115 GET_RE_DEBUG_FLAGS_DECL;
16117 PERL_ARGS_ASSERT_REGTAIL_STUDY;
16123 /* Find last node. */
16127 regnode * const temp = regnext(scan);
16128 #ifdef EXPERIMENTAL_INPLACESCAN
16129 if (PL_regkind[OP(scan)] == EXACT) {
16130 bool unfolded_multi_char; /* Unexamined in this routine */
16131 if (join_exact(pRExC_state, scan, &min,
16132 &unfolded_multi_char, 1, val, depth+1))
16137 switch (OP(scan)) {
16141 case EXACTFA_NO_TRIE:
16147 if( exact == PSEUDO )
16149 else if ( exact != OP(scan) )
16158 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16159 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16160 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16161 SvPV_nolen_const(RExC_mysv),
16162 REG_NODE_NUM(scan),
16163 PL_reg_name[exact]);
16170 DEBUG_PARSE_MSG("");
16171 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16172 PerlIO_printf(Perl_debug_log,
16173 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16174 SvPV_nolen_const(RExC_mysv),
16175 (IV)REG_NODE_NUM(val),
16179 if (reg_off_by_arg[OP(scan)]) {
16180 ARG_SET(scan, val - scan);
16183 NEXT_OFF(scan) = val - scan;
16191 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16196 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16201 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16203 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16204 if (flags & (1<<bit)) {
16205 if (!set++ && lead)
16206 PerlIO_printf(Perl_debug_log, "%s",lead);
16207 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16212 PerlIO_printf(Perl_debug_log, "\n");
16214 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16219 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16225 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16227 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16228 if (flags & (1<<bit)) {
16229 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16232 if (!set++ && lead)
16233 PerlIO_printf(Perl_debug_log, "%s",lead);
16234 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16237 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16238 if (!set++ && lead) {
16239 PerlIO_printf(Perl_debug_log, "%s",lead);
16242 case REGEX_UNICODE_CHARSET:
16243 PerlIO_printf(Perl_debug_log, "UNICODE");
16245 case REGEX_LOCALE_CHARSET:
16246 PerlIO_printf(Perl_debug_log, "LOCALE");
16248 case REGEX_ASCII_RESTRICTED_CHARSET:
16249 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16251 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16252 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16255 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16261 PerlIO_printf(Perl_debug_log, "\n");
16263 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16269 Perl_regdump(pTHX_ const regexp *r)
16272 SV * const sv = sv_newmortal();
16273 SV *dsv= sv_newmortal();
16274 RXi_GET_DECL(r,ri);
16275 GET_RE_DEBUG_FLAGS_DECL;
16277 PERL_ARGS_ASSERT_REGDUMP;
16279 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16281 /* Header fields of interest. */
16282 if (r->anchored_substr) {
16283 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16284 RE_SV_DUMPLEN(r->anchored_substr), 30);
16285 PerlIO_printf(Perl_debug_log,
16286 "anchored %s%s at %"IVdf" ",
16287 s, RE_SV_TAIL(r->anchored_substr),
16288 (IV)r->anchored_offset);
16289 } else if (r->anchored_utf8) {
16290 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16291 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16292 PerlIO_printf(Perl_debug_log,
16293 "anchored utf8 %s%s at %"IVdf" ",
16294 s, RE_SV_TAIL(r->anchored_utf8),
16295 (IV)r->anchored_offset);
16297 if (r->float_substr) {
16298 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16299 RE_SV_DUMPLEN(r->float_substr), 30);
16300 PerlIO_printf(Perl_debug_log,
16301 "floating %s%s at %"IVdf"..%"UVuf" ",
16302 s, RE_SV_TAIL(r->float_substr),
16303 (IV)r->float_min_offset, (UV)r->float_max_offset);
16304 } else if (r->float_utf8) {
16305 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16306 RE_SV_DUMPLEN(r->float_utf8), 30);
16307 PerlIO_printf(Perl_debug_log,
16308 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16309 s, RE_SV_TAIL(r->float_utf8),
16310 (IV)r->float_min_offset, (UV)r->float_max_offset);
16312 if (r->check_substr || r->check_utf8)
16313 PerlIO_printf(Perl_debug_log,
16315 (r->check_substr == r->float_substr
16316 && r->check_utf8 == r->float_utf8
16317 ? "(checking floating" : "(checking anchored"));
16318 if (r->intflags & PREGf_NOSCAN)
16319 PerlIO_printf(Perl_debug_log, " noscan");
16320 if (r->extflags & RXf_CHECK_ALL)
16321 PerlIO_printf(Perl_debug_log, " isall");
16322 if (r->check_substr || r->check_utf8)
16323 PerlIO_printf(Perl_debug_log, ") ");
16325 if (ri->regstclass) {
16326 regprop(r, sv, ri->regstclass, NULL, NULL);
16327 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16329 if (r->intflags & PREGf_ANCH) {
16330 PerlIO_printf(Perl_debug_log, "anchored");
16331 if (r->intflags & PREGf_ANCH_MBOL)
16332 PerlIO_printf(Perl_debug_log, "(MBOL)");
16333 if (r->intflags & PREGf_ANCH_SBOL)
16334 PerlIO_printf(Perl_debug_log, "(SBOL)");
16335 if (r->intflags & PREGf_ANCH_GPOS)
16336 PerlIO_printf(Perl_debug_log, "(GPOS)");
16337 PerlIO_putc(Perl_debug_log, ' ');
16339 if (r->intflags & PREGf_GPOS_SEEN)
16340 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16341 if (r->intflags & PREGf_SKIP)
16342 PerlIO_printf(Perl_debug_log, "plus ");
16343 if (r->intflags & PREGf_IMPLICIT)
16344 PerlIO_printf(Perl_debug_log, "implicit ");
16345 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16346 if (r->extflags & RXf_EVAL_SEEN)
16347 PerlIO_printf(Perl_debug_log, "with eval ");
16348 PerlIO_printf(Perl_debug_log, "\n");
16350 regdump_extflags("r->extflags: ",r->extflags);
16351 regdump_intflags("r->intflags: ",r->intflags);
16354 PERL_ARGS_ASSERT_REGDUMP;
16355 PERL_UNUSED_CONTEXT;
16356 PERL_UNUSED_ARG(r);
16357 #endif /* DEBUGGING */
16361 - regprop - printable representation of opcode, with run time support
16365 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16370 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16371 static const char * const anyofs[] = {
16372 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16373 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16374 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16375 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16376 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
16377 || _CC_VERTSPACE != 16
16378 #error Need to adjust order of anyofs[]
16415 RXi_GET_DECL(prog,progi);
16416 GET_RE_DEBUG_FLAGS_DECL;
16418 PERL_ARGS_ASSERT_REGPROP;
16420 sv_setpvn(sv, "", 0);
16422 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16423 /* It would be nice to FAIL() here, but this may be called from
16424 regexec.c, and it would be hard to supply pRExC_state. */
16425 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16426 (int)OP(o), (int)REGNODE_MAX);
16427 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16429 k = PL_regkind[OP(o)];
16432 sv_catpvs(sv, " ");
16433 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16434 * is a crude hack but it may be the best for now since
16435 * we have no flag "this EXACTish node was UTF-8"
16437 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16438 PERL_PV_ESCAPE_UNI_DETECT |
16439 PERL_PV_ESCAPE_NONASCII |
16440 PERL_PV_PRETTY_ELLIPSES |
16441 PERL_PV_PRETTY_LTGT |
16442 PERL_PV_PRETTY_NOCLEAR
16444 } else if (k == TRIE) {
16445 /* print the details of the trie in dumpuntil instead, as
16446 * progi->data isn't available here */
16447 const char op = OP(o);
16448 const U32 n = ARG(o);
16449 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16450 (reg_ac_data *)progi->data->data[n] :
16452 const reg_trie_data * const trie
16453 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16455 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16456 DEBUG_TRIE_COMPILE_r(
16457 Perl_sv_catpvf(aTHX_ sv,
16458 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16459 (UV)trie->startstate,
16460 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16461 (UV)trie->wordcount,
16464 (UV)TRIE_CHARCOUNT(trie),
16465 (UV)trie->uniquecharcount
16468 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16469 sv_catpvs(sv, "[");
16470 (void) put_charclass_bitmap_innards(sv,
16471 (IS_ANYOF_TRIE(op))
16473 : TRIE_BITMAP(trie),
16475 sv_catpvs(sv, "]");
16478 } else if (k == CURLY) {
16479 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16480 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16481 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16483 else if (k == WHILEM && o->flags) /* Ordinal/of */
16484 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16485 else if (k == REF || k == OPEN || k == CLOSE
16486 || k == GROUPP || OP(o)==ACCEPT)
16488 AV *name_list= NULL;
16489 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16490 if ( RXp_PAREN_NAMES(prog) ) {
16491 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16492 } else if ( pRExC_state ) {
16493 name_list= RExC_paren_name_list;
16496 if ( k != REF || (OP(o) < NREF)) {
16497 SV **name= av_fetch(name_list, ARG(o), 0 );
16499 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16502 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16503 I32 *nums=(I32*)SvPVX(sv_dat);
16504 SV **name= av_fetch(name_list, nums[0], 0 );
16507 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16508 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16509 (n ? "," : ""), (IV)nums[n]);
16511 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16515 if ( k == REF && reginfo) {
16516 U32 n = ARG(o); /* which paren pair */
16517 I32 ln = prog->offs[n].start;
16518 if (prog->lastparen < n || ln == -1)
16519 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16520 else if (ln == prog->offs[n].end)
16521 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16523 const char *s = reginfo->strbeg + ln;
16524 Perl_sv_catpvf(aTHX_ sv, ": ");
16525 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16526 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16529 } else if (k == GOSUB) {
16530 AV *name_list= NULL;
16531 if ( RXp_PAREN_NAMES(prog) ) {
16532 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16533 } else if ( pRExC_state ) {
16534 name_list= RExC_paren_name_list;
16537 /* Paren and offset */
16538 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16540 SV **name= av_fetch(name_list, ARG(o), 0 );
16542 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16545 else if (k == VERB) {
16547 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16548 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16549 } else if (k == LOGICAL)
16550 /* 2: embedded, otherwise 1 */
16551 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16552 else if (k == ANYOF) {
16553 const U8 flags = ANYOF_FLAGS(o);
16555 SV* bitmap_invlist; /* Will hold what the bit map contains */
16558 if (OP(o) == ANYOFL)
16559 sv_catpvs(sv, "{loc}");
16560 if (flags & ANYOF_LOC_FOLD)
16561 sv_catpvs(sv, "{i}");
16562 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16563 if (flags & ANYOF_INVERT)
16564 sv_catpvs(sv, "^");
16566 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16568 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16571 /* output any special charclass tests (used entirely under use
16573 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16575 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16576 if (ANYOF_POSIXL_TEST(o,i)) {
16577 sv_catpv(sv, anyofs[i]);
16583 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16584 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16585 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16589 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16590 if (flags & ANYOF_INVERT)
16591 /*make sure the invert info is in each */
16592 sv_catpvs(sv, "^");
16595 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16596 sv_catpvs(sv, "{non-utf8-latin1-all}");
16599 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16600 sv_catpvs(sv, "{above_bitmap_all}");
16602 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16603 SV *lv; /* Set if there is something outside the bit map. */
16604 bool byte_output = FALSE; /* If something has been output */
16605 SV *only_utf8_locale;
16607 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16608 * is used to guarantee that nothing in the bitmap gets
16610 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16611 &lv, &only_utf8_locale,
16613 if (lv && lv != &PL_sv_undef) {
16614 char *s = savesvpv(lv);
16615 char * const origs = s;
16617 while (*s && *s != '\n')
16621 const char * const t = ++s;
16623 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16624 sv_catpvs(sv, "{outside bitmap}");
16627 sv_catpvs(sv, "{utf8}");
16631 sv_catpvs(sv, " ");
16637 /* Truncate very long output */
16638 if (s - origs > 256) {
16639 Perl_sv_catpvf(aTHX_ sv,
16641 (int) (s - origs - 1),
16647 else if (*s == '\t') {
16661 SvREFCNT_dec_NN(lv);
16664 if ((flags & ANYOF_LOC_FOLD)
16665 && only_utf8_locale
16666 && only_utf8_locale != &PL_sv_undef)
16669 int max_entries = 256;
16671 sv_catpvs(sv, "{utf8 locale}");
16672 invlist_iterinit(only_utf8_locale);
16673 while (invlist_iternext(only_utf8_locale,
16675 put_range(sv, start, end, FALSE);
16677 if (max_entries < 0) {
16678 sv_catpvs(sv, "...");
16682 invlist_iterfinish(only_utf8_locale);
16686 SvREFCNT_dec(bitmap_invlist);
16689 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16691 else if (k == POSIXD || k == NPOSIXD) {
16692 U8 index = FLAGS(o) * 2;
16693 if (index < C_ARRAY_LENGTH(anyofs)) {
16694 if (*anyofs[index] != '[') {
16697 sv_catpv(sv, anyofs[index]);
16698 if (*anyofs[index] != '[') {
16703 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16706 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16707 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16708 else if (OP(o) == SBOL)
16709 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16711 PERL_UNUSED_CONTEXT;
16712 PERL_UNUSED_ARG(sv);
16713 PERL_UNUSED_ARG(o);
16714 PERL_UNUSED_ARG(prog);
16715 PERL_UNUSED_ARG(reginfo);
16716 PERL_UNUSED_ARG(pRExC_state);
16717 #endif /* DEBUGGING */
16723 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16724 { /* Assume that RE_INTUIT is set */
16725 struct regexp *const prog = ReANY(r);
16726 GET_RE_DEBUG_FLAGS_DECL;
16728 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16729 PERL_UNUSED_CONTEXT;
16733 const char * const s = SvPV_nolen_const(RX_UTF8(r)
16734 ? prog->check_utf8 : prog->check_substr);
16736 if (!PL_colorset) reginitcolors();
16737 PerlIO_printf(Perl_debug_log,
16738 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16740 RX_UTF8(r) ? "utf8 " : "",
16741 PL_colors[5],PL_colors[0],
16744 (strlen(s) > 60 ? "..." : ""));
16747 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
16748 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
16754 handles refcounting and freeing the perl core regexp structure. When
16755 it is necessary to actually free the structure the first thing it
16756 does is call the 'free' method of the regexp_engine associated to
16757 the regexp, allowing the handling of the void *pprivate; member
16758 first. (This routine is not overridable by extensions, which is why
16759 the extensions free is called first.)
16761 See regdupe and regdupe_internal if you change anything here.
16763 #ifndef PERL_IN_XSUB_RE
16765 Perl_pregfree(pTHX_ REGEXP *r)
16771 Perl_pregfree2(pTHX_ REGEXP *rx)
16773 struct regexp *const r = ReANY(rx);
16774 GET_RE_DEBUG_FLAGS_DECL;
16776 PERL_ARGS_ASSERT_PREGFREE2;
16778 if (r->mother_re) {
16779 ReREFCNT_dec(r->mother_re);
16781 CALLREGFREE_PVT(rx); /* free the private data */
16782 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16783 Safefree(r->xpv_len_u.xpvlenu_pv);
16786 SvREFCNT_dec(r->anchored_substr);
16787 SvREFCNT_dec(r->anchored_utf8);
16788 SvREFCNT_dec(r->float_substr);
16789 SvREFCNT_dec(r->float_utf8);
16790 Safefree(r->substrs);
16792 RX_MATCH_COPY_FREE(rx);
16793 #ifdef PERL_ANY_COW
16794 SvREFCNT_dec(r->saved_copy);
16797 SvREFCNT_dec(r->qr_anoncv);
16798 rx->sv_u.svu_rx = 0;
16803 This is a hacky workaround to the structural issue of match results
16804 being stored in the regexp structure which is in turn stored in
16805 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16806 could be PL_curpm in multiple contexts, and could require multiple
16807 result sets being associated with the pattern simultaneously, such
16808 as when doing a recursive match with (??{$qr})
16810 The solution is to make a lightweight copy of the regexp structure
16811 when a qr// is returned from the code executed by (??{$qr}) this
16812 lightweight copy doesn't actually own any of its data except for
16813 the starp/end and the actual regexp structure itself.
16819 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16821 struct regexp *ret;
16822 struct regexp *const r = ReANY(rx);
16823 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16825 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16828 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16830 SvOK_off((SV *)ret_x);
16832 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16833 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16834 made both spots point to the same regexp body.) */
16835 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16836 assert(!SvPVX(ret_x));
16837 ret_x->sv_u.svu_rx = temp->sv_any;
16838 temp->sv_any = NULL;
16839 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16840 SvREFCNT_dec_NN(temp);
16841 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16842 ing below will not set it. */
16843 SvCUR_set(ret_x, SvCUR(rx));
16846 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16847 sv_force_normal(sv) is called. */
16849 ret = ReANY(ret_x);
16851 SvFLAGS(ret_x) |= SvUTF8(rx);
16852 /* We share the same string buffer as the original regexp, on which we
16853 hold a reference count, incremented when mother_re is set below.
16854 The string pointer is copied here, being part of the regexp struct.
16856 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16857 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16859 const I32 npar = r->nparens+1;
16860 Newx(ret->offs, npar, regexp_paren_pair);
16861 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16864 Newx(ret->substrs, 1, struct reg_substr_data);
16865 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16867 SvREFCNT_inc_void(ret->anchored_substr);
16868 SvREFCNT_inc_void(ret->anchored_utf8);
16869 SvREFCNT_inc_void(ret->float_substr);
16870 SvREFCNT_inc_void(ret->float_utf8);
16872 /* check_substr and check_utf8, if non-NULL, point to either their
16873 anchored or float namesakes, and don't hold a second reference. */
16875 RX_MATCH_COPIED_off(ret_x);
16876 #ifdef PERL_ANY_COW
16877 ret->saved_copy = NULL;
16879 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16880 SvREFCNT_inc_void(ret->qr_anoncv);
16886 /* regfree_internal()
16888 Free the private data in a regexp. This is overloadable by
16889 extensions. Perl takes care of the regexp structure in pregfree(),
16890 this covers the *pprivate pointer which technically perl doesn't
16891 know about, however of course we have to handle the
16892 regexp_internal structure when no extension is in use.
16894 Note this is called before freeing anything in the regexp
16899 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16901 struct regexp *const r = ReANY(rx);
16902 RXi_GET_DECL(r,ri);
16903 GET_RE_DEBUG_FLAGS_DECL;
16905 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16911 SV *dsv= sv_newmortal();
16912 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16913 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16914 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16915 PL_colors[4],PL_colors[5],s);
16918 #ifdef RE_TRACK_PATTERN_OFFSETS
16920 Safefree(ri->u.offsets); /* 20010421 MJD */
16922 if (ri->code_blocks) {
16924 for (n = 0; n < ri->num_code_blocks; n++)
16925 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16926 Safefree(ri->code_blocks);
16930 int n = ri->data->count;
16933 /* If you add a ->what type here, update the comment in regcomp.h */
16934 switch (ri->data->what[n]) {
16940 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16943 Safefree(ri->data->data[n]);
16949 { /* Aho Corasick add-on structure for a trie node.
16950 Used in stclass optimization only */
16952 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16953 #ifdef USE_ITHREADS
16957 refcount = --aho->refcount;
16960 PerlMemShared_free(aho->states);
16961 PerlMemShared_free(aho->fail);
16962 /* do this last!!!! */
16963 PerlMemShared_free(ri->data->data[n]);
16964 /* we should only ever get called once, so
16965 * assert as much, and also guard the free
16966 * which /might/ happen twice. At the least
16967 * it will make code anlyzers happy and it
16968 * doesn't cost much. - Yves */
16969 assert(ri->regstclass);
16970 if (ri->regstclass) {
16971 PerlMemShared_free(ri->regstclass);
16972 ri->regstclass = 0;
16979 /* trie structure. */
16981 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16982 #ifdef USE_ITHREADS
16986 refcount = --trie->refcount;
16989 PerlMemShared_free(trie->charmap);
16990 PerlMemShared_free(trie->states);
16991 PerlMemShared_free(trie->trans);
16993 PerlMemShared_free(trie->bitmap);
16995 PerlMemShared_free(trie->jump);
16996 PerlMemShared_free(trie->wordinfo);
16997 /* do this last!!!! */
16998 PerlMemShared_free(ri->data->data[n]);
17003 Perl_croak(aTHX_ "panic: regfree data code '%c'",
17004 ri->data->what[n]);
17007 Safefree(ri->data->what);
17008 Safefree(ri->data);
17014 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17015 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17016 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17019 re_dup - duplicate a regexp.
17021 This routine is expected to clone a given regexp structure. It is only
17022 compiled under USE_ITHREADS.
17024 After all of the core data stored in struct regexp is duplicated
17025 the regexp_engine.dupe method is used to copy any private data
17026 stored in the *pprivate pointer. This allows extensions to handle
17027 any duplication it needs to do.
17029 See pregfree() and regfree_internal() if you change anything here.
17031 #if defined(USE_ITHREADS)
17032 #ifndef PERL_IN_XSUB_RE
17034 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17038 const struct regexp *r = ReANY(sstr);
17039 struct regexp *ret = ReANY(dstr);
17041 PERL_ARGS_ASSERT_RE_DUP_GUTS;
17043 npar = r->nparens+1;
17044 Newx(ret->offs, npar, regexp_paren_pair);
17045 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17047 if (ret->substrs) {
17048 /* Do it this way to avoid reading from *r after the StructCopy().
17049 That way, if any of the sv_dup_inc()s dislodge *r from the L1
17050 cache, it doesn't matter. */
17051 const bool anchored = r->check_substr
17052 ? r->check_substr == r->anchored_substr
17053 : r->check_utf8 == r->anchored_utf8;
17054 Newx(ret->substrs, 1, struct reg_substr_data);
17055 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17057 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
17058 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
17059 ret->float_substr = sv_dup_inc(ret->float_substr, param);
17060 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
17062 /* check_substr and check_utf8, if non-NULL, point to either their
17063 anchored or float namesakes, and don't hold a second reference. */
17065 if (ret->check_substr) {
17067 assert(r->check_utf8 == r->anchored_utf8);
17068 ret->check_substr = ret->anchored_substr;
17069 ret->check_utf8 = ret->anchored_utf8;
17071 assert(r->check_substr == r->float_substr);
17072 assert(r->check_utf8 == r->float_utf8);
17073 ret->check_substr = ret->float_substr;
17074 ret->check_utf8 = ret->float_utf8;
17076 } else if (ret->check_utf8) {
17078 ret->check_utf8 = ret->anchored_utf8;
17080 ret->check_utf8 = ret->float_utf8;
17085 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
17086 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
17089 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
17091 if (RX_MATCH_COPIED(dstr))
17092 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
17094 ret->subbeg = NULL;
17095 #ifdef PERL_ANY_COW
17096 ret->saved_copy = NULL;
17099 /* Whether mother_re be set or no, we need to copy the string. We
17100 cannot refrain from copying it when the storage points directly to
17101 our mother regexp, because that's
17102 1: a buffer in a different thread
17103 2: something we no longer hold a reference on
17104 so we need to copy it locally. */
17105 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
17106 ret->mother_re = NULL;
17108 #endif /* PERL_IN_XSUB_RE */
17113 This is the internal complement to regdupe() which is used to copy
17114 the structure pointed to by the *pprivate pointer in the regexp.
17115 This is the core version of the extension overridable cloning hook.
17116 The regexp structure being duplicated will be copied by perl prior
17117 to this and will be provided as the regexp *r argument, however
17118 with the /old/ structures pprivate pointer value. Thus this routine
17119 may override any copying normally done by perl.
17121 It returns a pointer to the new regexp_internal structure.
17125 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17128 struct regexp *const r = ReANY(rx);
17129 regexp_internal *reti;
17131 RXi_GET_DECL(r,ri);
17133 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17137 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17138 char, regexp_internal);
17139 Copy(ri->program, reti->program, len+1, regnode);
17141 reti->num_code_blocks = ri->num_code_blocks;
17142 if (ri->code_blocks) {
17144 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17145 struct reg_code_block);
17146 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17147 struct reg_code_block);
17148 for (n = 0; n < ri->num_code_blocks; n++)
17149 reti->code_blocks[n].src_regex = (REGEXP*)
17150 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17153 reti->code_blocks = NULL;
17155 reti->regstclass = NULL;
17158 struct reg_data *d;
17159 const int count = ri->data->count;
17162 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17163 char, struct reg_data);
17164 Newx(d->what, count, U8);
17167 for (i = 0; i < count; i++) {
17168 d->what[i] = ri->data->what[i];
17169 switch (d->what[i]) {
17170 /* see also regcomp.h and regfree_internal() */
17171 case 'a': /* actually an AV, but the dup function is identical. */
17175 case 'u': /* actually an HV, but the dup function is identical. */
17176 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17179 /* This is cheating. */
17180 Newx(d->data[i], 1, regnode_ssc);
17181 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17182 reti->regstclass = (regnode*)d->data[i];
17185 /* Trie stclasses are readonly and can thus be shared
17186 * without duplication. We free the stclass in pregfree
17187 * when the corresponding reg_ac_data struct is freed.
17189 reti->regstclass= ri->regstclass;
17193 ((reg_trie_data*)ri->data->data[i])->refcount++;
17198 d->data[i] = ri->data->data[i];
17201 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17202 ri->data->what[i]);
17211 reti->name_list_idx = ri->name_list_idx;
17213 #ifdef RE_TRACK_PATTERN_OFFSETS
17214 if (ri->u.offsets) {
17215 Newx(reti->u.offsets, 2*len+1, U32);
17216 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17219 SetProgLen(reti,len);
17222 return (void*)reti;
17225 #endif /* USE_ITHREADS */
17227 #ifndef PERL_IN_XSUB_RE
17230 - regnext - dig the "next" pointer out of a node
17233 Perl_regnext(pTHX_ regnode *p)
17240 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17241 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17242 (int)OP(p), (int)REGNODE_MAX);
17245 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17254 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17257 STRLEN l1 = strlen(pat1);
17258 STRLEN l2 = strlen(pat2);
17261 const char *message;
17263 PERL_ARGS_ASSERT_RE_CROAK2;
17269 Copy(pat1, buf, l1 , char);
17270 Copy(pat2, buf + l1, l2 , char);
17271 buf[l1 + l2] = '\n';
17272 buf[l1 + l2 + 1] = '\0';
17273 va_start(args, pat2);
17274 msv = vmess(buf, &args);
17276 message = SvPV_const(msv,l1);
17279 Copy(message, buf, l1 , char);
17280 /* l1-1 to avoid \n */
17281 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17287 S_put_code_point(pTHX_ SV *sv, UV c)
17289 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17292 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17294 else if (isPRINT(c)) {
17295 const char string = (char) c;
17296 if (isBACKSLASHED_PUNCT(c))
17297 sv_catpvs(sv, "\\");
17298 sv_catpvn(sv, &string, 1);
17301 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17303 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17306 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17311 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17314 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17316 /* Appends to 'sv' a displayable version of the range of code points from
17317 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17318 * as-is (though some of these will be escaped by put_code_point()). */
17320 const unsigned int min_range_count = 3;
17322 assert(start <= end);
17324 PERL_ARGS_ASSERT_PUT_RANGE;
17326 while (start <= end) {
17328 const char * format;
17330 if (end - start < min_range_count) {
17332 /* Individual chars in short ranges */
17333 for (; start <= end; start++) {
17334 put_code_point(sv, start);
17339 /* If permitted by the input options, and there is a possibility that
17340 * this range contains a printable literal, look to see if there is
17342 if (allow_literals && start <= MAX_PRINT_A) {
17344 /* If the range begin isn't an ASCII printable, effectively split
17345 * the range into two parts:
17346 * 1) the portion before the first such printable,
17348 * and output them separately. */
17349 if (! isPRINT_A(start)) {
17350 UV temp_end = start + 1;
17352 /* There is no point looking beyond the final possible
17353 * printable, in MAX_PRINT_A */
17354 UV max = MIN(end, MAX_PRINT_A);
17356 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17360 /* Here, temp_end points to one beyond the first printable if
17361 * found, or to one beyond 'max' if not. If none found, make
17362 * sure that we use the entire range */
17363 if (temp_end > MAX_PRINT_A) {
17364 temp_end = end + 1;
17367 /* Output the first part of the split range, the part that
17368 * doesn't have printables, with no looking for literals
17369 * (otherwise we would infinitely recurse) */
17370 put_range(sv, start, temp_end - 1, FALSE);
17372 /* The 2nd part of the range (if any) starts here. */
17375 /* We continue instead of dropping down because even if the 2nd
17376 * part is non-empty, it could be so short that we want to
17377 * output it specially, as tested for at the top of this loop.
17382 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17383 * output a sub-range of just the digits or letters, then process
17384 * the remaining portion as usual. */
17385 if (isALPHANUMERIC_A(start)) {
17386 UV mask = (isDIGIT_A(start))
17391 UV temp_end = start + 1;
17393 /* Find the end of the sub-range that includes just the
17394 * characters in the same class as the first character in it */
17395 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17400 /* For short ranges, don't duplicate the code above to output
17401 * them; just call recursively */
17402 if (temp_end - start < min_range_count) {
17403 put_range(sv, start, temp_end, FALSE);
17405 else { /* Output as a range */
17406 put_code_point(sv, start);
17407 sv_catpvs(sv, "-");
17408 put_code_point(sv, temp_end);
17410 start = temp_end + 1;
17414 /* We output any other printables as individual characters */
17415 if (isPUNCT_A(start) || isSPACE_A(start)) {
17416 while (start <= end && (isPUNCT_A(start)
17417 || isSPACE_A(start)))
17419 put_code_point(sv, start);
17424 } /* End of looking for literals */
17426 /* Here is not to output as a literal. Some control characters have
17427 * mnemonic names. Split off any of those at the beginning and end of
17428 * the range to print mnemonically. It isn't possible for many of
17429 * these to be in a row, so this won't overwhelm with output */
17430 while (isMNEMONIC_CNTRL(start) && start <= end) {
17431 put_code_point(sv, start);
17434 if (start < end && isMNEMONIC_CNTRL(end)) {
17436 /* Here, the final character in the range has a mnemonic name.
17437 * Work backwards from the end to find the final non-mnemonic */
17438 UV temp_end = end - 1;
17439 while (isMNEMONIC_CNTRL(temp_end)) {
17443 /* And separately output the range that doesn't have mnemonics */
17444 put_range(sv, start, temp_end, FALSE);
17446 /* Then output the mnemonic trailing controls */
17447 start = temp_end + 1;
17448 while (start <= end) {
17449 put_code_point(sv, start);
17455 /* As a final resort, output the range or subrange as hex. */
17457 this_end = (end < NUM_ANYOF_CODE_POINTS)
17459 : NUM_ANYOF_CODE_POINTS - 1;
17460 format = (this_end < 256)
17461 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17462 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17463 GCC_DIAG_IGNORE(-Wformat-nonliteral);
17464 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17471 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17473 /* Appends to 'sv' a displayable version of the innards of the bracketed
17474 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17475 * output anything, and bitmap_invlist, if not NULL, will point to an
17476 * inversion list of what is in the bit map */
17480 unsigned int punct_count = 0;
17481 SV* invlist = NULL;
17482 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17483 bool allow_literals = TRUE;
17485 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17487 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17489 /* Worst case is exactly every-other code point is in the list */
17490 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17492 /* Convert the bit map to an inversion list, keeping track of how many
17493 * ASCII puncts are set, including an extra amount for the backslashed
17495 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17496 if (BITMAP_TEST(bitmap, i)) {
17497 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17498 if (isPUNCT_A(i)) {
17500 if isBACKSLASHED_PUNCT(i) {
17507 /* Nothing to output */
17508 if (_invlist_len(*invlist_ptr) == 0) {
17509 SvREFCNT_dec(invlist);
17513 /* Generally, it is more readable if printable characters are output as
17514 * literals, but if a range (nearly) spans all of them, it's best to output
17515 * it as a single range. This code will use a single range if all but 2
17516 * printables are in it */
17517 invlist_iterinit(*invlist_ptr);
17518 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17520 /* If range starts beyond final printable, it doesn't have any in it */
17521 if (start > MAX_PRINT_A) {
17525 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17526 * all but two, the range must start and end no later than 2 from
17528 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17529 if (end > MAX_PRINT_A) {
17535 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17536 allow_literals = FALSE;
17541 invlist_iterfinish(*invlist_ptr);
17543 /* The legibility of the output depends mostly on how many punctuation
17544 * characters are output. There are 32 possible ASCII ones, and some have
17545 * an additional backslash, bringing it to currently 36, so if any more
17546 * than 18 are to be output, we can instead output it as its complement,
17547 * yielding fewer puncts, and making it more legible. But give some weight
17548 * to the fact that outputting it as a complement is less legible than a
17549 * straight output, so don't complement unless we are somewhat over the 18
17551 if (allow_literals && punct_count > 22) {
17552 sv_catpvs(sv, "^");
17554 /* Add everything remaining to the list, so when we invert it just
17555 * below, it will be excluded */
17556 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17557 _invlist_invert(*invlist_ptr);
17560 /* Here we have figured things out. Output each range */
17561 invlist_iterinit(*invlist_ptr);
17562 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17563 if (start >= NUM_ANYOF_CODE_POINTS) {
17566 put_range(sv, start, end, allow_literals);
17568 invlist_iterfinish(*invlist_ptr);
17573 #define CLEAR_OPTSTART \
17574 if (optstart) STMT_START { \
17575 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17576 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17580 #define DUMPUNTIL(b,e) \
17582 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17584 STATIC const regnode *
17585 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17586 const regnode *last, const regnode *plast,
17587 SV* sv, I32 indent, U32 depth)
17589 U8 op = PSEUDO; /* Arbitrary non-END op. */
17590 const regnode *next;
17591 const regnode *optstart= NULL;
17593 RXi_GET_DECL(r,ri);
17594 GET_RE_DEBUG_FLAGS_DECL;
17596 PERL_ARGS_ASSERT_DUMPUNTIL;
17598 #ifdef DEBUG_DUMPUNTIL
17599 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17600 last ? last-start : 0,plast ? plast-start : 0);
17603 if (plast && plast < last)
17606 while (PL_regkind[op] != END && (!last || node < last)) {
17608 /* While that wasn't END last time... */
17611 if (op == CLOSE || op == WHILEM)
17613 next = regnext((regnode *)node);
17616 if (OP(node) == OPTIMIZED) {
17617 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17624 regprop(r, sv, node, NULL, NULL);
17625 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17626 (int)(2*indent + 1), "", SvPVX_const(sv));
17628 if (OP(node) != OPTIMIZED) {
17629 if (next == NULL) /* Next ptr. */
17630 PerlIO_printf(Perl_debug_log, " (0)");
17631 else if (PL_regkind[(U8)op] == BRANCH
17632 && PL_regkind[OP(next)] != BRANCH )
17633 PerlIO_printf(Perl_debug_log, " (FAIL)");
17635 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17636 (void)PerlIO_putc(Perl_debug_log, '\n');
17640 if (PL_regkind[(U8)op] == BRANCHJ) {
17643 const regnode *nnode = (OP(next) == LONGJMP
17644 ? regnext((regnode *)next)
17646 if (last && nnode > last)
17648 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17651 else if (PL_regkind[(U8)op] == BRANCH) {
17653 DUMPUNTIL(NEXTOPER(node), next);
17655 else if ( PL_regkind[(U8)op] == TRIE ) {
17656 const regnode *this_trie = node;
17657 const char op = OP(node);
17658 const U32 n = ARG(node);
17659 const reg_ac_data * const ac = op>=AHOCORASICK ?
17660 (reg_ac_data *)ri->data->data[n] :
17662 const reg_trie_data * const trie =
17663 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17665 AV *const trie_words
17666 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17668 const regnode *nextbranch= NULL;
17671 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17672 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17674 PerlIO_printf(Perl_debug_log, "%*s%s ",
17675 (int)(2*(indent+3)), "",
17677 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17678 SvCUR(*elem_ptr), 60,
17679 PL_colors[0], PL_colors[1],
17681 ? PERL_PV_ESCAPE_UNI
17683 | PERL_PV_PRETTY_ELLIPSES
17684 | PERL_PV_PRETTY_LTGT
17689 U16 dist= trie->jump[word_idx+1];
17690 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17691 (UV)((dist ? this_trie + dist : next) - start));
17694 nextbranch= this_trie + trie->jump[0];
17695 DUMPUNTIL(this_trie + dist, nextbranch);
17697 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17698 nextbranch= regnext((regnode *)nextbranch);
17700 PerlIO_printf(Perl_debug_log, "\n");
17703 if (last && next > last)
17708 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17709 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17710 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17712 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17714 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17716 else if ( op == PLUS || op == STAR) {
17717 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17719 else if (PL_regkind[(U8)op] == ANYOF) {
17720 /* arglen 1 + class block */
17721 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17722 ? ANYOF_POSIXL_SKIP
17724 node = NEXTOPER(node);
17726 else if (PL_regkind[(U8)op] == EXACT) {
17727 /* Literal string, where present. */
17728 node += NODE_SZ_STR(node) - 1;
17729 node = NEXTOPER(node);
17732 node = NEXTOPER(node);
17733 node += regarglen[(U8)op];
17735 if (op == CURLYX || op == OPEN)
17739 #ifdef DEBUG_DUMPUNTIL
17740 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17745 #endif /* DEBUGGING */
17749 * c-indentation-style: bsd
17750 * c-basic-offset: 4
17751 * indent-tabs-mode: nil
17754 * ex: set ts=8 sts=4 sw=4 et: