5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define STATIC static
105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
108 /* this is a chain of data about sub patterns we are processing that
109 need to be handled separately/specially in study_chunk. Its so
110 we can simulate recursion without losing state. */
112 typedef struct scan_frame {
113 regnode *last_regnode; /* last node to process in this frame */
114 regnode *next_regnode; /* next node to process when last is reached */
115 U32 prev_recursed_depth;
116 I32 stopparen; /* what stopparen do we use */
117 U32 is_top_frame; /* what flags do we use? */
119 struct scan_frame *this_prev_frame; /* this previous frame */
120 struct scan_frame *prev_frame; /* previous frame */
121 struct scan_frame *next_frame; /* next frame */
124 /* Certain characters are output as a sequence with the first being a
126 #define isBACKSLASHED_PUNCT(c) \
127 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
130 struct RExC_state_t {
131 U32 flags; /* RXf_* are we folding, multilining? */
132 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
133 char *precomp; /* uncompiled string. */
134 REGEXP *rx_sv; /* The SV that is the regexp. */
135 regexp *rx; /* perl core regexp structure */
136 regexp_internal *rxi; /* internal data for regexp object
138 char *start; /* Start of input for compile */
139 char *end; /* End of input for compile */
140 char *parse; /* Input-scan pointer. */
141 SSize_t whilem_seen; /* number of WHILEM in this expr */
142 regnode *emit_start; /* Start of emitted-code area */
143 regnode *emit_bound; /* First regnode outside of the
145 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
146 implies compiling, so don't emit */
147 regnode_ssc emit_dummy; /* placeholder for emit to point to;
148 large enough for the largest
149 non-EXACTish node, so can use it as
151 I32 naughty; /* How bad is this pattern? */
152 I32 sawback; /* Did we see \1, ...? */
154 SSize_t size; /* Code size. */
155 I32 npar; /* Capture buffer count, (OPEN) plus
156 one. ("par" 0 is the whole
158 I32 nestroot; /* root parens we are in - used by
162 regnode **open_parens; /* pointers to open parens */
163 regnode **close_parens; /* pointers to close parens */
164 regnode *opend; /* END node in program */
165 I32 utf8; /* whether the pattern is utf8 or not */
166 I32 orig_utf8; /* whether the pattern was originally in utf8 */
167 /* XXX use this for future optimisation of case
168 * where pattern must be upgraded to utf8. */
169 I32 uni_semantics; /* If a d charset modifier should use unicode
170 rules, even if the pattern is not in
172 HV *paren_names; /* Paren names */
174 regnode **recurse; /* Recurse regops */
175 I32 recurse_count; /* Number of recurse regops */
176 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
178 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
182 I32 override_recoding;
184 I32 recode_x_to_native;
186 I32 in_multi_char_class;
187 struct reg_code_block *code_blocks; /* positions of literal (?{})
189 int num_code_blocks; /* size of code_blocks[] */
190 int code_index; /* next code_blocks[] slot */
191 SSize_t maxlen; /* mininum possible number of chars in string to match */
192 scan_frame *frame_head;
193 scan_frame *frame_last;
196 #ifdef ADD_TO_REGEXEC
197 char *starttry; /* -Dr: where regtry was called. */
198 #define RExC_starttry (pRExC_state->starttry)
200 SV *runtime_code_qr; /* qr with the runtime code blocks */
202 const char *lastparse;
204 AV *paren_name_list; /* idx -> name */
205 U32 study_chunk_recursed_count;
208 #define RExC_lastparse (pRExC_state->lastparse)
209 #define RExC_lastnum (pRExC_state->lastnum)
210 #define RExC_paren_name_list (pRExC_state->paren_name_list)
211 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
212 #define RExC_mysv (pRExC_state->mysv1)
213 #define RExC_mysv1 (pRExC_state->mysv1)
214 #define RExC_mysv2 (pRExC_state->mysv2)
219 #define RExC_flags (pRExC_state->flags)
220 #define RExC_pm_flags (pRExC_state->pm_flags)
221 #define RExC_precomp (pRExC_state->precomp)
222 #define RExC_rx_sv (pRExC_state->rx_sv)
223 #define RExC_rx (pRExC_state->rx)
224 #define RExC_rxi (pRExC_state->rxi)
225 #define RExC_start (pRExC_state->start)
226 #define RExC_end (pRExC_state->end)
227 #define RExC_parse (pRExC_state->parse)
228 #define RExC_whilem_seen (pRExC_state->whilem_seen)
229 #ifdef RE_TRACK_PATTERN_OFFSETS
230 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
233 #define RExC_emit (pRExC_state->emit)
234 #define RExC_emit_dummy (pRExC_state->emit_dummy)
235 #define RExC_emit_start (pRExC_state->emit_start)
236 #define RExC_emit_bound (pRExC_state->emit_bound)
237 #define RExC_sawback (pRExC_state->sawback)
238 #define RExC_seen (pRExC_state->seen)
239 #define RExC_size (pRExC_state->size)
240 #define RExC_maxlen (pRExC_state->maxlen)
241 #define RExC_npar (pRExC_state->npar)
242 #define RExC_nestroot (pRExC_state->nestroot)
243 #define RExC_extralen (pRExC_state->extralen)
244 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
245 #define RExC_utf8 (pRExC_state->utf8)
246 #define RExC_uni_semantics (pRExC_state->uni_semantics)
247 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
248 #define RExC_open_parens (pRExC_state->open_parens)
249 #define RExC_close_parens (pRExC_state->close_parens)
250 #define RExC_opend (pRExC_state->opend)
251 #define RExC_paren_names (pRExC_state->paren_names)
252 #define RExC_recurse (pRExC_state->recurse)
253 #define RExC_recurse_count (pRExC_state->recurse_count)
254 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
255 #define RExC_study_chunk_recursed_bytes \
256 (pRExC_state->study_chunk_recursed_bytes)
257 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
258 #define RExC_contains_locale (pRExC_state->contains_locale)
259 #define RExC_contains_i (pRExC_state->contains_i)
260 #define RExC_override_recoding (pRExC_state->override_recoding)
262 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
264 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
265 #define RExC_frame_head (pRExC_state->frame_head)
266 #define RExC_frame_last (pRExC_state->frame_last)
267 #define RExC_frame_count (pRExC_state->frame_count)
268 #define RExC_strict (pRExC_state->strict)
270 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
271 * a flag to disable back-off on the fixed/floating substrings - if it's
272 * a high complexity pattern we assume the benefit of avoiding a full match
273 * is worth the cost of checking for the substrings even if they rarely help.
275 #define RExC_naughty (pRExC_state->naughty)
276 #define TOO_NAUGHTY (10)
277 #define MARK_NAUGHTY(add) \
278 if (RExC_naughty < TOO_NAUGHTY) \
279 RExC_naughty += (add)
280 #define MARK_NAUGHTY_EXP(exp, add) \
281 if (RExC_naughty < TOO_NAUGHTY) \
282 RExC_naughty += RExC_naughty / (exp) + (add)
284 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
285 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
286 ((*s) == '{' && regcurly(s)))
289 * Flags to be passed up and down.
291 #define WORST 0 /* Worst case. */
292 #define HASWIDTH 0x01 /* Known to match non-null strings. */
294 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
295 * character. (There needs to be a case: in the switch statement in regexec.c
296 * for any node marked SIMPLE.) Note that this is not the same thing as
299 #define SPSTART 0x04 /* Starts with * or + */
300 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
301 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
302 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
304 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
306 /* whether trie related optimizations are enabled */
307 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
308 #define TRIE_STUDY_OPT
309 #define FULL_TRIE_STUDY
315 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
316 #define PBITVAL(paren) (1 << ((paren) & 7))
317 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
318 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
319 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
321 #define REQUIRE_UTF8 STMT_START { \
323 *flagp = RESTART_UTF8; \
328 /* This converts the named class defined in regcomp.h to its equivalent class
329 * number defined in handy.h. */
330 #define namedclass_to_classnum(class) ((int) ((class) / 2))
331 #define classnum_to_namedclass(classnum) ((classnum) * 2)
333 #define _invlist_union_complement_2nd(a, b, output) \
334 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
335 #define _invlist_intersection_complement_2nd(a, b, output) \
336 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
338 /* About scan_data_t.
340 During optimisation we recurse through the regexp program performing
341 various inplace (keyhole style) optimisations. In addition study_chunk
342 and scan_commit populate this data structure with information about
343 what strings MUST appear in the pattern. We look for the longest
344 string that must appear at a fixed location, and we look for the
345 longest string that may appear at a floating location. So for instance
350 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
351 strings (because they follow a .* construct). study_chunk will identify
352 both FOO and BAR as being the longest fixed and floating strings respectively.
354 The strings can be composites, for instance
358 will result in a composite fixed substring 'foo'.
360 For each string some basic information is maintained:
362 - offset or min_offset
363 This is the position the string must appear at, or not before.
364 It also implicitly (when combined with minlenp) tells us how many
365 characters must match before the string we are searching for.
366 Likewise when combined with minlenp and the length of the string it
367 tells us how many characters must appear after the string we have
371 Only used for floating strings. This is the rightmost point that
372 the string can appear at. If set to SSize_t_MAX it indicates that the
373 string can occur infinitely far to the right.
376 A pointer to the minimum number of characters of the pattern that the
377 string was found inside. This is important as in the case of positive
378 lookahead or positive lookbehind we can have multiple patterns
383 The minimum length of the pattern overall is 3, the minimum length
384 of the lookahead part is 3, but the minimum length of the part that
385 will actually match is 1. So 'FOO's minimum length is 3, but the
386 minimum length for the F is 1. This is important as the minimum length
387 is used to determine offsets in front of and behind the string being
388 looked for. Since strings can be composites this is the length of the
389 pattern at the time it was committed with a scan_commit. Note that
390 the length is calculated by study_chunk, so that the minimum lengths
391 are not known until the full pattern has been compiled, thus the
392 pointer to the value.
396 In the case of lookbehind the string being searched for can be
397 offset past the start point of the final matching string.
398 If this value was just blithely removed from the min_offset it would
399 invalidate some of the calculations for how many chars must match
400 before or after (as they are derived from min_offset and minlen and
401 the length of the string being searched for).
402 When the final pattern is compiled and the data is moved from the
403 scan_data_t structure into the regexp structure the information
404 about lookbehind is factored in, with the information that would
405 have been lost precalculated in the end_shift field for the
408 The fields pos_min and pos_delta are used to store the minimum offset
409 and the delta to the maximum offset at the current point in the pattern.
413 typedef struct scan_data_t {
414 /*I32 len_min; unused */
415 /*I32 len_delta; unused */
419 SSize_t last_end; /* min value, <0 unless valid. */
420 SSize_t last_start_min;
421 SSize_t last_start_max;
422 SV **longest; /* Either &l_fixed, or &l_float. */
423 SV *longest_fixed; /* longest fixed string found in pattern */
424 SSize_t offset_fixed; /* offset where it starts */
425 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
426 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
427 SV *longest_float; /* longest floating string found in pattern */
428 SSize_t offset_float_min; /* earliest point in string it can appear */
429 SSize_t offset_float_max; /* latest point in string it can appear */
430 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
431 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
434 SSize_t *last_closep;
435 regnode_ssc *start_class;
439 * Forward declarations for pregcomp()'s friends.
442 static const scan_data_t zero_scan_data =
443 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
445 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
446 #define SF_BEFORE_SEOL 0x0001
447 #define SF_BEFORE_MEOL 0x0002
448 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
449 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
451 #define SF_FIX_SHIFT_EOL (+2)
452 #define SF_FL_SHIFT_EOL (+4)
454 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
455 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
457 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
458 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
459 #define SF_IS_INF 0x0040
460 #define SF_HAS_PAR 0x0080
461 #define SF_IN_PAR 0x0100
462 #define SF_HAS_EVAL 0x0200
463 #define SCF_DO_SUBSTR 0x0400
464 #define SCF_DO_STCLASS_AND 0x0800
465 #define SCF_DO_STCLASS_OR 0x1000
466 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
467 #define SCF_WHILEM_VISITED_POS 0x2000
469 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
470 #define SCF_SEEN_ACCEPT 0x8000
471 #define SCF_TRIE_DOING_RESTUDY 0x10000
472 #define SCF_IN_DEFINE 0x20000
477 #define UTF cBOOL(RExC_utf8)
479 /* The enums for all these are ordered so things work out correctly */
480 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
481 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
482 == REGEX_DEPENDS_CHARSET)
483 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
484 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
485 >= REGEX_UNICODE_CHARSET)
486 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
487 == REGEX_ASCII_RESTRICTED_CHARSET)
488 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
489 >= REGEX_ASCII_RESTRICTED_CHARSET)
490 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
491 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
493 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
495 /* For programs that want to be strictly Unicode compatible by dying if any
496 * attempt is made to match a non-Unicode code point against a Unicode
498 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
500 #define OOB_NAMEDCLASS -1
502 /* There is no code point that is out-of-bounds, so this is problematic. But
503 * its only current use is to initialize a variable that is always set before
505 #define OOB_UNICODE 0xDEADBEEF
507 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
508 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
511 /* length of regex to show in messages that don't mark a position within */
512 #define RegexLengthToShowInErrorMessages 127
515 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
516 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
517 * op/pragma/warn/regcomp.
519 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
520 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
522 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
523 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
525 #define REPORT_LOCATION_ARGS(offset) \
526 UTF8fARG(UTF, offset, RExC_precomp), \
527 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
529 /* Used to point after bad bytes for an error message, but avoid skipping
530 * past a nul byte. */
531 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
534 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
535 * arg. Show regex, up to a maximum length. If it's too long, chop and add
538 #define _FAIL(code) STMT_START { \
539 const char *ellipses = ""; \
540 IV len = RExC_end - RExC_precomp; \
543 SAVEFREESV(RExC_rx_sv); \
544 if (len > RegexLengthToShowInErrorMessages) { \
545 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
546 len = RegexLengthToShowInErrorMessages - 10; \
552 #define FAIL(msg) _FAIL( \
553 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
554 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
556 #define FAIL2(msg,arg) _FAIL( \
557 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
558 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
561 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
563 #define Simple_vFAIL(m) STMT_START { \
565 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
566 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
567 m, REPORT_LOCATION_ARGS(offset)); \
571 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
573 #define vFAIL(m) STMT_START { \
575 SAVEFREESV(RExC_rx_sv); \
580 * Like Simple_vFAIL(), but accepts two arguments.
582 #define Simple_vFAIL2(m,a1) STMT_START { \
583 const IV offset = RExC_parse - RExC_precomp; \
584 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
585 REPORT_LOCATION_ARGS(offset)); \
589 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
591 #define vFAIL2(m,a1) STMT_START { \
593 SAVEFREESV(RExC_rx_sv); \
594 Simple_vFAIL2(m, a1); \
599 * Like Simple_vFAIL(), but accepts three arguments.
601 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
602 const IV offset = RExC_parse - RExC_precomp; \
603 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
604 REPORT_LOCATION_ARGS(offset)); \
608 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
610 #define vFAIL3(m,a1,a2) STMT_START { \
612 SAVEFREESV(RExC_rx_sv); \
613 Simple_vFAIL3(m, a1, a2); \
617 * Like Simple_vFAIL(), but accepts four arguments.
619 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
620 const IV offset = RExC_parse - RExC_precomp; \
621 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
622 REPORT_LOCATION_ARGS(offset)); \
625 #define vFAIL4(m,a1,a2,a3) STMT_START { \
627 SAVEFREESV(RExC_rx_sv); \
628 Simple_vFAIL4(m, a1, a2, a3); \
631 /* A specialized version of vFAIL2 that works with UTF8f */
632 #define vFAIL2utf8f(m, a1) STMT_START { \
633 const IV offset = RExC_parse - RExC_precomp; \
635 SAVEFREESV(RExC_rx_sv); \
636 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
637 REPORT_LOCATION_ARGS(offset)); \
640 /* These have asserts in them because of [perl #122671] Many warnings in
641 * regcomp.c can occur twice. If they get output in pass1 and later in that
642 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
643 * would get output again. So they should be output in pass2, and these
644 * asserts make sure new warnings follow that paradigm. */
646 /* m is not necessarily a "literal string", in this macro */
647 #define reg_warn_non_literal_string(loc, m) STMT_START { \
648 const IV offset = loc - RExC_precomp; \
649 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
650 m, REPORT_LOCATION_ARGS(offset)); \
653 #define ckWARNreg(loc,m) STMT_START { \
654 const IV offset = loc - RExC_precomp; \
655 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
656 REPORT_LOCATION_ARGS(offset)); \
659 #define vWARN(loc, m) STMT_START { \
660 const IV offset = loc - RExC_precomp; \
661 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
662 REPORT_LOCATION_ARGS(offset)); \
665 #define vWARN_dep(loc, m) STMT_START { \
666 const IV offset = loc - RExC_precomp; \
667 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
668 REPORT_LOCATION_ARGS(offset)); \
671 #define ckWARNdep(loc,m) STMT_START { \
672 const IV offset = loc - RExC_precomp; \
673 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
675 REPORT_LOCATION_ARGS(offset)); \
678 #define ckWARNregdep(loc,m) STMT_START { \
679 const IV offset = loc - RExC_precomp; \
680 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
682 REPORT_LOCATION_ARGS(offset)); \
685 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
686 const IV offset = loc - RExC_precomp; \
687 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
689 a1, REPORT_LOCATION_ARGS(offset)); \
692 #define ckWARN2reg(loc, m, a1) STMT_START { \
693 const IV offset = loc - RExC_precomp; \
694 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
695 a1, REPORT_LOCATION_ARGS(offset)); \
698 #define vWARN3(loc, m, a1, a2) STMT_START { \
699 const IV offset = loc - RExC_precomp; \
700 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
701 a1, a2, REPORT_LOCATION_ARGS(offset)); \
704 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
705 const IV offset = loc - RExC_precomp; \
706 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
707 a1, a2, REPORT_LOCATION_ARGS(offset)); \
710 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
711 const IV offset = loc - RExC_precomp; \
712 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
713 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
716 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
717 const IV offset = loc - RExC_precomp; \
718 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
719 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
722 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
723 const IV offset = loc - RExC_precomp; \
724 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
725 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
728 /* Macros for recording node offsets. 20001227 mjd@plover.com
729 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
730 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
731 * Element 0 holds the number n.
732 * Position is 1 indexed.
734 #ifndef RE_TRACK_PATTERN_OFFSETS
735 #define Set_Node_Offset_To_R(node,byte)
736 #define Set_Node_Offset(node,byte)
737 #define Set_Cur_Node_Offset
738 #define Set_Node_Length_To_R(node,len)
739 #define Set_Node_Length(node,len)
740 #define Set_Node_Cur_Length(node,start)
741 #define Node_Offset(n)
742 #define Node_Length(n)
743 #define Set_Node_Offset_Length(node,offset,len)
744 #define ProgLen(ri) ri->u.proglen
745 #define SetProgLen(ri,x) ri->u.proglen = x
747 #define ProgLen(ri) ri->u.offsets[0]
748 #define SetProgLen(ri,x) ri->u.offsets[0] = x
749 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
751 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
752 __LINE__, (int)(node), (int)(byte))); \
754 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
757 RExC_offsets[2*(node)-1] = (byte); \
762 #define Set_Node_Offset(node,byte) \
763 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
764 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
766 #define Set_Node_Length_To_R(node,len) STMT_START { \
768 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
769 __LINE__, (int)(node), (int)(len))); \
771 Perl_croak(aTHX_ "value of node is %d in Length macro", \
774 RExC_offsets[2*(node)] = (len); \
779 #define Set_Node_Length(node,len) \
780 Set_Node_Length_To_R((node)-RExC_emit_start, len)
781 #define Set_Node_Cur_Length(node, start) \
782 Set_Node_Length(node, RExC_parse - start)
784 /* Get offsets and lengths */
785 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
786 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
788 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
789 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
790 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
794 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
795 #define EXPERIMENTAL_INPLACESCAN
796 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
798 #define DEBUG_RExC_seen() \
799 DEBUG_OPTIMISE_MORE_r({ \
800 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
802 if (RExC_seen & REG_ZERO_LEN_SEEN) \
803 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
805 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
806 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
808 if (RExC_seen & REG_GPOS_SEEN) \
809 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
811 if (RExC_seen & REG_RECURSE_SEEN) \
812 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
814 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
815 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
817 if (RExC_seen & REG_VERBARG_SEEN) \
818 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
820 if (RExC_seen & REG_CUTGROUP_SEEN) \
821 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
823 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
824 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
826 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
827 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
829 if (RExC_seen & REG_GOSTART_SEEN) \
830 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
832 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
833 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
835 PerlIO_printf(Perl_debug_log,"\n"); \
838 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
839 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
841 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
843 PerlIO_printf(Perl_debug_log, "%s", open_str); \
844 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
845 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
846 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
847 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
848 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
849 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
850 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
851 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
852 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
853 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
854 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
855 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
856 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
857 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
858 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
859 PerlIO_printf(Perl_debug_log, "%s", close_str); \
863 #define DEBUG_STUDYDATA(str,data,depth) \
864 DEBUG_OPTIMISE_MORE_r(if(data){ \
865 PerlIO_printf(Perl_debug_log, \
866 "%*s" str "Pos:%"IVdf"/%"IVdf \
868 (int)(depth)*2, "", \
869 (IV)((data)->pos_min), \
870 (IV)((data)->pos_delta), \
871 (UV)((data)->flags) \
873 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
874 PerlIO_printf(Perl_debug_log, \
875 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
876 (IV)((data)->whilem_c), \
877 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
878 is_inf ? "INF " : "" \
880 if ((data)->last_found) \
881 PerlIO_printf(Perl_debug_log, \
882 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
883 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
884 SvPVX_const((data)->last_found), \
885 (IV)((data)->last_end), \
886 (IV)((data)->last_start_min), \
887 (IV)((data)->last_start_max), \
888 ((data)->longest && \
889 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
890 SvPVX_const((data)->longest_fixed), \
891 (IV)((data)->offset_fixed), \
892 ((data)->longest && \
893 (data)->longest==&((data)->longest_float)) ? "*" : "", \
894 SvPVX_const((data)->longest_float), \
895 (IV)((data)->offset_float_min), \
896 (IV)((data)->offset_float_max) \
898 PerlIO_printf(Perl_debug_log,"\n"); \
901 /* is c a control character for which we have a mnemonic? */
902 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
905 S_cntrl_to_mnemonic(const U8 c)
907 /* Returns the mnemonic string that represents character 'c', if one
908 * exists; NULL otherwise. The only ones that exist for the purposes of
909 * this routine are a few control characters */
912 case '\a': return "\\a";
913 case '\b': return "\\b";
914 case ESC_NATIVE: return "\\e";
915 case '\f': return "\\f";
916 case '\n': return "\\n";
917 case '\r': return "\\r";
918 case '\t': return "\\t";
924 /* Mark that we cannot extend a found fixed substring at this point.
925 Update the longest found anchored substring and the longest found
926 floating substrings if needed. */
929 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
930 SSize_t *minlenp, int is_inf)
932 const STRLEN l = CHR_SVLEN(data->last_found);
933 const STRLEN old_l = CHR_SVLEN(*data->longest);
934 GET_RE_DEBUG_FLAGS_DECL;
936 PERL_ARGS_ASSERT_SCAN_COMMIT;
938 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
939 SvSetMagicSV(*data->longest, data->last_found);
940 if (*data->longest == data->longest_fixed) {
941 data->offset_fixed = l ? data->last_start_min : data->pos_min;
942 if (data->flags & SF_BEFORE_EOL)
944 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
946 data->flags &= ~SF_FIX_BEFORE_EOL;
947 data->minlen_fixed=minlenp;
948 data->lookbehind_fixed=0;
950 else { /* *data->longest == data->longest_float */
951 data->offset_float_min = l ? data->last_start_min : data->pos_min;
952 data->offset_float_max = (l
953 ? data->last_start_max
954 : (data->pos_delta > SSize_t_MAX - data->pos_min
956 : data->pos_min + data->pos_delta));
958 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
959 data->offset_float_max = SSize_t_MAX;
960 if (data->flags & SF_BEFORE_EOL)
962 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
964 data->flags &= ~SF_FL_BEFORE_EOL;
965 data->minlen_float=minlenp;
966 data->lookbehind_float=0;
969 SvCUR_set(data->last_found, 0);
971 SV * const sv = data->last_found;
972 if (SvUTF8(sv) && SvMAGICAL(sv)) {
973 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
979 data->flags &= ~SF_BEFORE_EOL;
980 DEBUG_STUDYDATA("commit: ",data,0);
983 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
984 * list that describes which code points it matches */
987 S_ssc_anything(pTHX_ regnode_ssc *ssc)
989 /* Set the SSC 'ssc' to match an empty string or any code point */
991 PERL_ARGS_ASSERT_SSC_ANYTHING;
993 assert(is_ANYOF_SYNTHETIC(ssc));
995 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
996 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
997 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1001 S_ssc_is_anything(const regnode_ssc *ssc)
1003 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1004 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1005 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1006 * in any way, so there's no point in using it */
1011 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1013 assert(is_ANYOF_SYNTHETIC(ssc));
1015 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1019 /* See if the list consists solely of the range 0 - Infinity */
1020 invlist_iterinit(ssc->invlist);
1021 ret = invlist_iternext(ssc->invlist, &start, &end)
1025 invlist_iterfinish(ssc->invlist);
1031 /* If e.g., both \w and \W are set, matches everything */
1032 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1034 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1035 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1045 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1047 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1048 * string, any code point, or any posix class under locale */
1050 PERL_ARGS_ASSERT_SSC_INIT;
1052 Zero(ssc, 1, regnode_ssc);
1053 set_ANYOF_SYNTHETIC(ssc);
1054 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1057 /* If any portion of the regex is to operate under locale rules that aren't
1058 * fully known at compile time, initialization includes it. The reason
1059 * this isn't done for all regexes is that the optimizer was written under
1060 * the assumption that locale was all-or-nothing. Given the complexity and
1061 * lack of documentation in the optimizer, and that there are inadequate
1062 * test cases for locale, many parts of it may not work properly, it is
1063 * safest to avoid locale unless necessary. */
1064 if (RExC_contains_locale) {
1065 ANYOF_POSIXL_SETALL(ssc);
1068 ANYOF_POSIXL_ZERO(ssc);
1073 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1074 const regnode_ssc *ssc)
1076 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1077 * to the list of code points matched, and locale posix classes; hence does
1078 * not check its flags) */
1083 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1085 assert(is_ANYOF_SYNTHETIC(ssc));
1087 invlist_iterinit(ssc->invlist);
1088 ret = invlist_iternext(ssc->invlist, &start, &end)
1092 invlist_iterfinish(ssc->invlist);
1098 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1106 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1107 const regnode_charclass* const node)
1109 /* Returns a mortal inversion list defining which code points are matched
1110 * by 'node', which is of type ANYOF. Handles complementing the result if
1111 * appropriate. If some code points aren't knowable at this time, the
1112 * returned list must, and will, contain every code point that is a
1115 SV* invlist = sv_2mortal(_new_invlist(0));
1116 SV* only_utf8_locale_invlist = NULL;
1118 const U32 n = ARG(node);
1119 bool new_node_has_latin1 = FALSE;
1121 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1123 /* Look at the data structure created by S_set_ANYOF_arg() */
1124 if (n != ANYOF_ONLY_HAS_BITMAP) {
1125 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1126 AV * const av = MUTABLE_AV(SvRV(rv));
1127 SV **const ary = AvARRAY(av);
1128 assert(RExC_rxi->data->what[n] == 's');
1130 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1131 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1133 else if (ary[0] && ary[0] != &PL_sv_undef) {
1135 /* Here, no compile-time swash, and there are things that won't be
1136 * known until runtime -- we have to assume it could be anything */
1137 return _add_range_to_invlist(invlist, 0, UV_MAX);
1139 else if (ary[3] && ary[3] != &PL_sv_undef) {
1141 /* Here no compile-time swash, and no run-time only data. Use the
1142 * node's inversion list */
1143 invlist = sv_2mortal(invlist_clone(ary[3]));
1146 /* Get the code points valid only under UTF-8 locales */
1147 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1148 && ary[2] && ary[2] != &PL_sv_undef)
1150 only_utf8_locale_invlist = ary[2];
1154 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1155 * code points, and an inversion list for the others, but if there are code
1156 * points that should match only conditionally on the target string being
1157 * UTF-8, those are placed in the inversion list, and not the bitmap.
1158 * Since there are circumstances under which they could match, they are
1159 * included in the SSC. But if the ANYOF node is to be inverted, we have
1160 * to exclude them here, so that when we invert below, the end result
1161 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1162 * have to do this here before we add the unconditionally matched code
1164 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1165 _invlist_intersection_complement_2nd(invlist,
1170 /* Add in the points from the bit map */
1171 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1172 if (ANYOF_BITMAP_TEST(node, i)) {
1173 invlist = add_cp_to_invlist(invlist, i);
1174 new_node_has_latin1 = TRUE;
1178 /* If this can match all upper Latin1 code points, have to add them
1180 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1181 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1184 /* Similarly for these */
1185 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1186 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1189 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1190 _invlist_invert(invlist);
1192 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1194 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1195 * locale. We can skip this if there are no 0-255 at all. */
1196 _invlist_union(invlist, PL_Latin1, &invlist);
1199 /* Similarly add the UTF-8 locale possible matches. These have to be
1200 * deferred until after the non-UTF-8 locale ones are taken care of just
1201 * above, or it leads to wrong results under ANYOF_INVERT */
1202 if (only_utf8_locale_invlist) {
1203 _invlist_union_maybe_complement_2nd(invlist,
1204 only_utf8_locale_invlist,
1205 ANYOF_FLAGS(node) & ANYOF_INVERT,
1212 /* These two functions currently do the exact same thing */
1213 #define ssc_init_zero ssc_init
1215 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1216 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1218 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1219 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1220 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1223 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1224 const regnode_charclass *and_with)
1226 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1227 * another SSC or a regular ANYOF class. Can create false positives. */
1232 PERL_ARGS_ASSERT_SSC_AND;
1234 assert(is_ANYOF_SYNTHETIC(ssc));
1236 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1237 * the code point inversion list and just the relevant flags */
1238 if (is_ANYOF_SYNTHETIC(and_with)) {
1239 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1240 anded_flags = ANYOF_FLAGS(and_with);
1242 /* XXX This is a kludge around what appears to be deficiencies in the
1243 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1244 * there are paths through the optimizer where it doesn't get weeded
1245 * out when it should. And if we don't make some extra provision for
1246 * it like the code just below, it doesn't get added when it should.
1247 * This solution is to add it only when AND'ing, which is here, and
1248 * only when what is being AND'ed is the pristine, original node
1249 * matching anything. Thus it is like adding it to ssc_anything() but
1250 * only when the result is to be AND'ed. Probably the same solution
1251 * could be adopted for the same problem we have with /l matching,
1252 * which is solved differently in S_ssc_init(), and that would lead to
1253 * fewer false positives than that solution has. But if this solution
1254 * creates bugs, the consequences are only that a warning isn't raised
1255 * that should be; while the consequences for having /l bugs is
1256 * incorrect matches */
1257 if (ssc_is_anything((regnode_ssc *)and_with)) {
1258 anded_flags |= ANYOF_WARN_SUPER;
1262 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1263 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1266 ANYOF_FLAGS(ssc) &= anded_flags;
1268 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1269 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1270 * 'and_with' may be inverted. When not inverted, we have the situation of
1272 * (C1 | P1) & (C2 | P2)
1273 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1274 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1275 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1276 * <= ((C1 & C2) | P1 | P2)
1277 * Alternatively, the last few steps could be:
1278 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1279 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1280 * <= (C1 | C2 | (P1 & P2))
1281 * We favor the second approach if either P1 or P2 is non-empty. This is
1282 * because these components are a barrier to doing optimizations, as what
1283 * they match cannot be known until the moment of matching as they are
1284 * dependent on the current locale, 'AND"ing them likely will reduce or
1286 * But we can do better if we know that C1,P1 are in their initial state (a
1287 * frequent occurrence), each matching everything:
1288 * (<everything>) & (C2 | P2) = C2 | P2
1289 * Similarly, if C2,P2 are in their initial state (again a frequent
1290 * occurrence), the result is a no-op
1291 * (C1 | P1) & (<everything>) = C1 | P1
1294 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1295 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1296 * <= (C1 & ~C2) | (P1 & ~P2)
1299 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1300 && ! is_ANYOF_SYNTHETIC(and_with))
1304 ssc_intersection(ssc,
1306 FALSE /* Has already been inverted */
1309 /* If either P1 or P2 is empty, the intersection will be also; can skip
1311 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1312 ANYOF_POSIXL_ZERO(ssc);
1314 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1316 /* Note that the Posix class component P from 'and_with' actually
1318 * P = Pa | Pb | ... | Pn
1319 * where each component is one posix class, such as in [\w\s].
1321 * ~P = ~(Pa | Pb | ... | Pn)
1322 * = ~Pa & ~Pb & ... & ~Pn
1323 * <= ~Pa | ~Pb | ... | ~Pn
1324 * The last is something we can easily calculate, but unfortunately
1325 * is likely to have many false positives. We could do better
1326 * in some (but certainly not all) instances if two classes in
1327 * P have known relationships. For example
1328 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1330 * :lower: & :print: = :lower:
1331 * And similarly for classes that must be disjoint. For example,
1332 * since \s and \w can have no elements in common based on rules in
1333 * the POSIX standard,
1334 * \w & ^\S = nothing
1335 * Unfortunately, some vendor locales do not meet the Posix
1336 * standard, in particular almost everything by Microsoft.
1337 * The loop below just changes e.g., \w into \W and vice versa */
1339 regnode_charclass_posixl temp;
1340 int add = 1; /* To calculate the index of the complement */
1342 ANYOF_POSIXL_ZERO(&temp);
1343 for (i = 0; i < ANYOF_MAX; i++) {
1345 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1346 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1348 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1349 ANYOF_POSIXL_SET(&temp, i + add);
1351 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1353 ANYOF_POSIXL_AND(&temp, ssc);
1355 } /* else ssc already has no posixes */
1356 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1357 in its initial state */
1358 else if (! is_ANYOF_SYNTHETIC(and_with)
1359 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1361 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1362 * copy it over 'ssc' */
1363 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1364 if (is_ANYOF_SYNTHETIC(and_with)) {
1365 StructCopy(and_with, ssc, regnode_ssc);
1368 ssc->invlist = anded_cp_list;
1369 ANYOF_POSIXL_ZERO(ssc);
1370 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1371 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1375 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1376 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1378 /* One or the other of P1, P2 is non-empty. */
1379 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1380 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1382 ssc_union(ssc, anded_cp_list, FALSE);
1384 else { /* P1 = P2 = empty */
1385 ssc_intersection(ssc, anded_cp_list, FALSE);
1391 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1392 const regnode_charclass *or_with)
1394 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1395 * another SSC or a regular ANYOF class. Can create false positives if
1396 * 'or_with' is to be inverted. */
1401 PERL_ARGS_ASSERT_SSC_OR;
1403 assert(is_ANYOF_SYNTHETIC(ssc));
1405 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1406 * the code point inversion list and just the relevant flags */
1407 if (is_ANYOF_SYNTHETIC(or_with)) {
1408 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1409 ored_flags = ANYOF_FLAGS(or_with);
1412 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1413 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1416 ANYOF_FLAGS(ssc) |= ored_flags;
1418 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1419 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1420 * 'or_with' may be inverted. When not inverted, we have the simple
1421 * situation of computing:
1422 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1423 * If P1|P2 yields a situation with both a class and its complement are
1424 * set, like having both \w and \W, this matches all code points, and we
1425 * can delete these from the P component of the ssc going forward. XXX We
1426 * might be able to delete all the P components, but I (khw) am not certain
1427 * about this, and it is better to be safe.
1430 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1431 * <= (C1 | P1) | ~C2
1432 * <= (C1 | ~C2) | P1
1433 * (which results in actually simpler code than the non-inverted case)
1436 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1437 && ! is_ANYOF_SYNTHETIC(or_with))
1439 /* We ignore P2, leaving P1 going forward */
1440 } /* else Not inverted */
1441 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1442 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1443 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1445 for (i = 0; i < ANYOF_MAX; i += 2) {
1446 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1448 ssc_match_all_cp(ssc);
1449 ANYOF_POSIXL_CLEAR(ssc, i);
1450 ANYOF_POSIXL_CLEAR(ssc, i+1);
1458 FALSE /* Already has been inverted */
1462 PERL_STATIC_INLINE void
1463 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1465 PERL_ARGS_ASSERT_SSC_UNION;
1467 assert(is_ANYOF_SYNTHETIC(ssc));
1469 _invlist_union_maybe_complement_2nd(ssc->invlist,
1475 PERL_STATIC_INLINE void
1476 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1478 const bool invert2nd)
1480 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1482 assert(is_ANYOF_SYNTHETIC(ssc));
1484 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1490 PERL_STATIC_INLINE void
1491 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1493 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1495 assert(is_ANYOF_SYNTHETIC(ssc));
1497 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1500 PERL_STATIC_INLINE void
1501 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1503 /* AND just the single code point 'cp' into the SSC 'ssc' */
1505 SV* cp_list = _new_invlist(2);
1507 PERL_ARGS_ASSERT_SSC_CP_AND;
1509 assert(is_ANYOF_SYNTHETIC(ssc));
1511 cp_list = add_cp_to_invlist(cp_list, cp);
1512 ssc_intersection(ssc, cp_list,
1513 FALSE /* Not inverted */
1515 SvREFCNT_dec_NN(cp_list);
1518 PERL_STATIC_INLINE void
1519 S_ssc_clear_locale(regnode_ssc *ssc)
1521 /* Set the SSC 'ssc' to not match any locale things */
1522 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1524 assert(is_ANYOF_SYNTHETIC(ssc));
1526 ANYOF_POSIXL_ZERO(ssc);
1527 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1530 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1533 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1535 /* The synthetic start class is used to hopefully quickly winnow down
1536 * places where a pattern could start a match in the target string. If it
1537 * doesn't really narrow things down that much, there isn't much point to
1538 * having the overhead of using it. This function uses some very crude
1539 * heuristics to decide if to use the ssc or not.
1541 * It returns TRUE if 'ssc' rules out more than half what it considers to
1542 * be the "likely" possible matches, but of course it doesn't know what the
1543 * actual things being matched are going to be; these are only guesses
1545 * For /l matches, it assumes that the only likely matches are going to be
1546 * in the 0-255 range, uniformly distributed, so half of that is 127
1547 * For /a and /d matches, it assumes that the likely matches will be just
1548 * the ASCII range, so half of that is 63
1549 * For /u and there isn't anything matching above the Latin1 range, it
1550 * assumes that that is the only range likely to be matched, and uses
1551 * half that as the cut-off: 127. If anything matches above Latin1,
1552 * it assumes that all of Unicode could match (uniformly), except for
1553 * non-Unicode code points and things in the General Category "Other"
1554 * (unassigned, private use, surrogates, controls and formats). This
1555 * is a much large number. */
1557 const U32 max_match = (LOC)
1561 : (invlist_highest(ssc->invlist) < 256)
1563 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1564 U32 count = 0; /* Running total of number of code points matched by
1566 UV start, end; /* Start and end points of current range in inversion
1569 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1571 invlist_iterinit(ssc->invlist);
1572 while (invlist_iternext(ssc->invlist, &start, &end)) {
1574 /* /u is the only thing that we expect to match above 255; so if not /u
1575 * and even if there are matches above 255, ignore them. This catches
1576 * things like \d under /d which does match the digits above 255, but
1577 * since the pattern is /d, it is not likely to be expecting them */
1578 if (! UNI_SEMANTICS) {
1582 end = MIN(end, 255);
1584 count += end - start + 1;
1585 if (count > max_match) {
1586 invlist_iterfinish(ssc->invlist);
1596 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1598 /* The inversion list in the SSC is marked mortal; now we need a more
1599 * permanent copy, which is stored the same way that is done in a regular
1600 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1603 SV* invlist = invlist_clone(ssc->invlist);
1605 PERL_ARGS_ASSERT_SSC_FINALIZE;
1607 assert(is_ANYOF_SYNTHETIC(ssc));
1609 /* The code in this file assumes that all but these flags aren't relevant
1610 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1611 * by the time we reach here */
1612 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1614 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1616 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1617 NULL, NULL, NULL, FALSE);
1619 /* Make sure is clone-safe */
1620 ssc->invlist = NULL;
1622 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1623 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1626 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1629 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1630 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1631 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1632 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1633 ? (TRIE_LIST_CUR( idx ) - 1) \
1639 dump_trie(trie,widecharmap,revcharmap)
1640 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1641 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1643 These routines dump out a trie in a somewhat readable format.
1644 The _interim_ variants are used for debugging the interim
1645 tables that are used to generate the final compressed
1646 representation which is what dump_trie expects.
1648 Part of the reason for their existence is to provide a form
1649 of documentation as to how the different representations function.
1654 Dumps the final compressed table form of the trie to Perl_debug_log.
1655 Used for debugging make_trie().
1659 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1660 AV *revcharmap, U32 depth)
1663 SV *sv=sv_newmortal();
1664 int colwidth= widecharmap ? 6 : 4;
1666 GET_RE_DEBUG_FLAGS_DECL;
1668 PERL_ARGS_ASSERT_DUMP_TRIE;
1670 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1671 (int)depth * 2 + 2,"",
1672 "Match","Base","Ofs" );
1674 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1675 SV ** const tmp = av_fetch( revcharmap, state, 0);
1677 PerlIO_printf( Perl_debug_log, "%*s",
1679 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1680 PL_colors[0], PL_colors[1],
1681 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1682 PERL_PV_ESCAPE_FIRSTCHAR
1687 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1688 (int)depth * 2 + 2,"");
1690 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1691 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1692 PerlIO_printf( Perl_debug_log, "\n");
1694 for( state = 1 ; state < trie->statecount ; state++ ) {
1695 const U32 base = trie->states[ state ].trans.base;
1697 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1698 (int)depth * 2 + 2,"", (UV)state);
1700 if ( trie->states[ state ].wordnum ) {
1701 PerlIO_printf( Perl_debug_log, " W%4X",
1702 trie->states[ state ].wordnum );
1704 PerlIO_printf( Perl_debug_log, "%6s", "" );
1707 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1712 while( ( base + ofs < trie->uniquecharcount ) ||
1713 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1714 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1718 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1720 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1721 if ( ( base + ofs >= trie->uniquecharcount )
1722 && ( base + ofs - trie->uniquecharcount
1724 && trie->trans[ base + ofs
1725 - trie->uniquecharcount ].check == state )
1727 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1729 (UV)trie->trans[ base + ofs
1730 - trie->uniquecharcount ].next );
1732 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1736 PerlIO_printf( Perl_debug_log, "]");
1739 PerlIO_printf( Perl_debug_log, "\n" );
1741 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1743 for (word=1; word <= trie->wordcount; word++) {
1744 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1745 (int)word, (int)(trie->wordinfo[word].prev),
1746 (int)(trie->wordinfo[word].len));
1748 PerlIO_printf(Perl_debug_log, "\n" );
1751 Dumps a fully constructed but uncompressed trie in list form.
1752 List tries normally only are used for construction when the number of
1753 possible chars (trie->uniquecharcount) is very high.
1754 Used for debugging make_trie().
1757 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1758 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1762 SV *sv=sv_newmortal();
1763 int colwidth= widecharmap ? 6 : 4;
1764 GET_RE_DEBUG_FLAGS_DECL;
1766 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1768 /* print out the table precompression. */
1769 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1770 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1771 "------:-----+-----------------\n" );
1773 for( state=1 ; state < next_alloc ; state ++ ) {
1776 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1777 (int)depth * 2 + 2,"", (UV)state );
1778 if ( ! trie->states[ state ].wordnum ) {
1779 PerlIO_printf( Perl_debug_log, "%5s| ","");
1781 PerlIO_printf( Perl_debug_log, "W%4x| ",
1782 trie->states[ state ].wordnum
1785 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1786 SV ** const tmp = av_fetch( revcharmap,
1787 TRIE_LIST_ITEM(state,charid).forid, 0);
1789 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1791 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1793 PL_colors[0], PL_colors[1],
1794 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1795 | PERL_PV_ESCAPE_FIRSTCHAR
1797 TRIE_LIST_ITEM(state,charid).forid,
1798 (UV)TRIE_LIST_ITEM(state,charid).newstate
1801 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1802 (int)((depth * 2) + 14), "");
1805 PerlIO_printf( Perl_debug_log, "\n");
1810 Dumps a fully constructed but uncompressed trie in table form.
1811 This is the normal DFA style state transition table, with a few
1812 twists to facilitate compression later.
1813 Used for debugging make_trie().
1816 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1817 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1822 SV *sv=sv_newmortal();
1823 int colwidth= widecharmap ? 6 : 4;
1824 GET_RE_DEBUG_FLAGS_DECL;
1826 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1829 print out the table precompression so that we can do a visual check
1830 that they are identical.
1833 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1835 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1836 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1838 PerlIO_printf( Perl_debug_log, "%*s",
1840 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1841 PL_colors[0], PL_colors[1],
1842 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1843 PERL_PV_ESCAPE_FIRSTCHAR
1849 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1851 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1852 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1855 PerlIO_printf( Perl_debug_log, "\n" );
1857 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1859 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1860 (int)depth * 2 + 2,"",
1861 (UV)TRIE_NODENUM( state ) );
1863 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1864 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1866 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1868 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1870 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1871 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1872 (UV)trie->trans[ state ].check );
1874 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1875 (UV)trie->trans[ state ].check,
1876 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1884 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1885 startbranch: the first branch in the whole branch sequence
1886 first : start branch of sequence of branch-exact nodes.
1887 May be the same as startbranch
1888 last : Thing following the last branch.
1889 May be the same as tail.
1890 tail : item following the branch sequence
1891 count : words in the sequence
1892 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1893 depth : indent depth
1895 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1897 A trie is an N'ary tree where the branches are determined by digital
1898 decomposition of the key. IE, at the root node you look up the 1st character and
1899 follow that branch repeat until you find the end of the branches. Nodes can be
1900 marked as "accepting" meaning they represent a complete word. Eg:
1904 would convert into the following structure. Numbers represent states, letters
1905 following numbers represent valid transitions on the letter from that state, if
1906 the number is in square brackets it represents an accepting state, otherwise it
1907 will be in parenthesis.
1909 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1913 (1) +-i->(6)-+-s->[7]
1915 +-s->(3)-+-h->(4)-+-e->[5]
1917 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1919 This shows that when matching against the string 'hers' we will begin at state 1
1920 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1921 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1922 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1923 single traverse. We store a mapping from accepting to state to which word was
1924 matched, and then when we have multiple possibilities we try to complete the
1925 rest of the regex in the order in which they occurred in the alternation.
1927 The only prior NFA like behaviour that would be changed by the TRIE support is
1928 the silent ignoring of duplicate alternations which are of the form:
1930 / (DUPE|DUPE) X? (?{ ... }) Y /x
1932 Thus EVAL blocks following a trie may be called a different number of times with
1933 and without the optimisation. With the optimisations dupes will be silently
1934 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1935 the following demonstrates:
1937 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1939 which prints out 'word' three times, but
1941 'words'=~/(word|word|word)(?{ print $1 })S/
1943 which doesnt print it out at all. This is due to other optimisations kicking in.
1945 Example of what happens on a structural level:
1947 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1949 1: CURLYM[1] {1,32767}(18)
1960 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1961 and should turn into:
1963 1: CURLYM[1] {1,32767}(18)
1965 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1973 Cases where tail != last would be like /(?foo|bar)baz/:
1983 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1984 and would end up looking like:
1987 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1994 d = uvchr_to_utf8_flags(d, uv, 0);
1996 is the recommended Unicode-aware way of saying
2001 #define TRIE_STORE_REVCHAR(val) \
2004 SV *zlopp = newSV(7); /* XXX: optimize me */ \
2005 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2006 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2007 SvCUR_set(zlopp, kapow - flrbbbbb); \
2010 av_push(revcharmap, zlopp); \
2012 char ooooff = (char)val; \
2013 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2017 /* This gets the next character from the input, folding it if not already
2019 #define TRIE_READ_CHAR STMT_START { \
2022 /* if it is UTF then it is either already folded, or does not need \
2024 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2026 else if (folder == PL_fold_latin1) { \
2027 /* This folder implies Unicode rules, which in the range expressible \
2028 * by not UTF is the lower case, with the two exceptions, one of \
2029 * which should have been taken care of before calling this */ \
2030 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2031 uvc = toLOWER_L1(*uc); \
2032 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2035 /* raw data, will be folded later if needed */ \
2043 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2044 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2045 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2046 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2048 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2049 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2050 TRIE_LIST_CUR( state )++; \
2053 #define TRIE_LIST_NEW(state) STMT_START { \
2054 Newxz( trie->states[ state ].trans.list, \
2055 4, reg_trie_trans_le ); \
2056 TRIE_LIST_CUR( state ) = 1; \
2057 TRIE_LIST_LEN( state ) = 4; \
2060 #define TRIE_HANDLE_WORD(state) STMT_START { \
2061 U16 dupe= trie->states[ state ].wordnum; \
2062 regnode * const noper_next = regnext( noper ); \
2065 /* store the word for dumping */ \
2067 if (OP(noper) != NOTHING) \
2068 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2070 tmp = newSVpvn_utf8( "", 0, UTF ); \
2071 av_push( trie_words, tmp ); \
2075 trie->wordinfo[curword].prev = 0; \
2076 trie->wordinfo[curword].len = wordlen; \
2077 trie->wordinfo[curword].accept = state; \
2079 if ( noper_next < tail ) { \
2081 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2083 trie->jump[curword] = (U16)(noper_next - convert); \
2085 jumper = noper_next; \
2087 nextbranch= regnext(cur); \
2091 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2092 /* chain, so that when the bits of chain are later */\
2093 /* linked together, the dups appear in the chain */\
2094 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2095 trie->wordinfo[dupe].prev = curword; \
2097 /* we haven't inserted this word yet. */ \
2098 trie->states[ state ].wordnum = curword; \
2103 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2104 ( ( base + charid >= ucharcount \
2105 && base + charid < ubound \
2106 && state == trie->trans[ base - ucharcount + charid ].check \
2107 && trie->trans[ base - ucharcount + charid ].next ) \
2108 ? trie->trans[ base - ucharcount + charid ].next \
2109 : ( state==1 ? special : 0 ) \
2113 #define MADE_JUMP_TRIE 2
2114 #define MADE_EXACT_TRIE 4
2117 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2118 regnode *first, regnode *last, regnode *tail,
2119 U32 word_count, U32 flags, U32 depth)
2121 /* first pass, loop through and scan words */
2122 reg_trie_data *trie;
2123 HV *widecharmap = NULL;
2124 AV *revcharmap = newAV();
2130 regnode *jumper = NULL;
2131 regnode *nextbranch = NULL;
2132 regnode *convert = NULL;
2133 U32 *prev_states; /* temp array mapping each state to previous one */
2134 /* we just use folder as a flag in utf8 */
2135 const U8 * folder = NULL;
2138 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2139 AV *trie_words = NULL;
2140 /* along with revcharmap, this only used during construction but both are
2141 * useful during debugging so we store them in the struct when debugging.
2144 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2145 STRLEN trie_charcount=0;
2147 SV *re_trie_maxbuff;
2148 GET_RE_DEBUG_FLAGS_DECL;
2150 PERL_ARGS_ASSERT_MAKE_TRIE;
2152 PERL_UNUSED_ARG(depth);
2156 case EXACT: case EXACTL: break;
2160 case EXACTFLU8: folder = PL_fold_latin1; break;
2161 case EXACTF: folder = PL_fold; break;
2162 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2165 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2167 trie->startstate = 1;
2168 trie->wordcount = word_count;
2169 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2170 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2171 if (flags == EXACT || flags == EXACTL)
2172 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2173 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2174 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2177 trie_words = newAV();
2180 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2181 assert(re_trie_maxbuff);
2182 if (!SvIOK(re_trie_maxbuff)) {
2183 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2185 DEBUG_TRIE_COMPILE_r({
2186 PerlIO_printf( Perl_debug_log,
2187 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2188 (int)depth * 2 + 2, "",
2189 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2190 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2193 /* Find the node we are going to overwrite */
2194 if ( first == startbranch && OP( last ) != BRANCH ) {
2195 /* whole branch chain */
2198 /* branch sub-chain */
2199 convert = NEXTOPER( first );
2202 /* -- First loop and Setup --
2204 We first traverse the branches and scan each word to determine if it
2205 contains widechars, and how many unique chars there are, this is
2206 important as we have to build a table with at least as many columns as we
2209 We use an array of integers to represent the character codes 0..255
2210 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2211 the native representation of the character value as the key and IV's for
2214 *TODO* If we keep track of how many times each character is used we can
2215 remap the columns so that the table compression later on is more
2216 efficient in terms of memory by ensuring the most common value is in the
2217 middle and the least common are on the outside. IMO this would be better
2218 than a most to least common mapping as theres a decent chance the most
2219 common letter will share a node with the least common, meaning the node
2220 will not be compressible. With a middle is most common approach the worst
2221 case is when we have the least common nodes twice.
2225 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2226 regnode *noper = NEXTOPER( cur );
2227 const U8 *uc = (U8*)STRING( noper );
2228 const U8 *e = uc + STR_LEN( noper );
2230 U32 wordlen = 0; /* required init */
2231 STRLEN minchars = 0;
2232 STRLEN maxchars = 0;
2233 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2236 if (OP(noper) == NOTHING) {
2237 regnode *noper_next= regnext(noper);
2238 if (noper_next != tail && OP(noper_next) == flags) {
2240 uc= (U8*)STRING(noper);
2241 e= uc + STR_LEN(noper);
2242 trie->minlen= STR_LEN(noper);
2249 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2250 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2251 regardless of encoding */
2252 if (OP( noper ) == EXACTFU_SS) {
2253 /* false positives are ok, so just set this */
2254 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2257 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2259 TRIE_CHARCOUNT(trie)++;
2262 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2263 * is in effect. Under /i, this character can match itself, or
2264 * anything that folds to it. If not under /i, it can match just
2265 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2266 * all fold to k, and all are single characters. But some folds
2267 * expand to more than one character, so for example LATIN SMALL
2268 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2269 * the string beginning at 'uc' is 'ffi', it could be matched by
2270 * three characters, or just by the one ligature character. (It
2271 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2272 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2273 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2274 * match.) The trie needs to know the minimum and maximum number
2275 * of characters that could match so that it can use size alone to
2276 * quickly reject many match attempts. The max is simple: it is
2277 * the number of folded characters in this branch (since a fold is
2278 * never shorter than what folds to it. */
2282 /* And the min is equal to the max if not under /i (indicated by
2283 * 'folder' being NULL), or there are no multi-character folds. If
2284 * there is a multi-character fold, the min is incremented just
2285 * once, for the character that folds to the sequence. Each
2286 * character in the sequence needs to be added to the list below of
2287 * characters in the trie, but we count only the first towards the
2288 * min number of characters needed. This is done through the
2289 * variable 'foldlen', which is returned by the macros that look
2290 * for these sequences as the number of bytes the sequence
2291 * occupies. Each time through the loop, we decrement 'foldlen' by
2292 * how many bytes the current char occupies. Only when it reaches
2293 * 0 do we increment 'minchars' or look for another multi-character
2295 if (folder == NULL) {
2298 else if (foldlen > 0) {
2299 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2304 /* See if *uc is the beginning of a multi-character fold. If
2305 * so, we decrement the length remaining to look at, to account
2306 * for the current character this iteration. (We can use 'uc'
2307 * instead of the fold returned by TRIE_READ_CHAR because for
2308 * non-UTF, the latin1_safe macro is smart enough to account
2309 * for all the unfolded characters, and because for UTF, the
2310 * string will already have been folded earlier in the
2311 * compilation process */
2313 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2314 foldlen -= UTF8SKIP(uc);
2317 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2322 /* The current character (and any potential folds) should be added
2323 * to the possible matching characters for this position in this
2327 U8 folded= folder[ (U8) uvc ];
2328 if ( !trie->charmap[ folded ] ) {
2329 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2330 TRIE_STORE_REVCHAR( folded );
2333 if ( !trie->charmap[ uvc ] ) {
2334 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2335 TRIE_STORE_REVCHAR( uvc );
2338 /* store the codepoint in the bitmap, and its folded
2340 TRIE_BITMAP_SET(trie, uvc);
2342 /* store the folded codepoint */
2343 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2346 /* store first byte of utf8 representation of
2347 variant codepoints */
2348 if (! UVCHR_IS_INVARIANT(uvc)) {
2349 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2352 set_bit = 0; /* We've done our bit :-) */
2356 /* XXX We could come up with the list of code points that fold
2357 * to this using PL_utf8_foldclosures, except not for
2358 * multi-char folds, as there may be multiple combinations
2359 * there that could work, which needs to wait until runtime to
2360 * resolve (The comment about LIGATURE FFI above is such an
2365 widecharmap = newHV();
2367 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2370 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2372 if ( !SvTRUE( *svpp ) ) {
2373 sv_setiv( *svpp, ++trie->uniquecharcount );
2374 TRIE_STORE_REVCHAR(uvc);
2377 } /* end loop through characters in this branch of the trie */
2379 /* We take the min and max for this branch and combine to find the min
2380 * and max for all branches processed so far */
2381 if( cur == first ) {
2382 trie->minlen = minchars;
2383 trie->maxlen = maxchars;
2384 } else if (minchars < trie->minlen) {
2385 trie->minlen = minchars;
2386 } else if (maxchars > trie->maxlen) {
2387 trie->maxlen = maxchars;
2389 } /* end first pass */
2390 DEBUG_TRIE_COMPILE_r(
2391 PerlIO_printf( Perl_debug_log,
2392 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2393 (int)depth * 2 + 2,"",
2394 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2395 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2396 (int)trie->minlen, (int)trie->maxlen )
2400 We now know what we are dealing with in terms of unique chars and
2401 string sizes so we can calculate how much memory a naive
2402 representation using a flat table will take. If it's over a reasonable
2403 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2404 conservative but potentially much slower representation using an array
2407 At the end we convert both representations into the same compressed
2408 form that will be used in regexec.c for matching with. The latter
2409 is a form that cannot be used to construct with but has memory
2410 properties similar to the list form and access properties similar
2411 to the table form making it both suitable for fast searches and
2412 small enough that its feasable to store for the duration of a program.
2414 See the comment in the code where the compressed table is produced
2415 inplace from the flat tabe representation for an explanation of how
2416 the compression works.
2421 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2424 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2425 > SvIV(re_trie_maxbuff) )
2428 Second Pass -- Array Of Lists Representation
2430 Each state will be represented by a list of charid:state records
2431 (reg_trie_trans_le) the first such element holds the CUR and LEN
2432 points of the allocated array. (See defines above).
2434 We build the initial structure using the lists, and then convert
2435 it into the compressed table form which allows faster lookups
2436 (but cant be modified once converted).
2439 STRLEN transcount = 1;
2441 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2442 "%*sCompiling trie using list compiler\n",
2443 (int)depth * 2 + 2, ""));
2445 trie->states = (reg_trie_state *)
2446 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2447 sizeof(reg_trie_state) );
2451 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2453 regnode *noper = NEXTOPER( cur );
2454 U8 *uc = (U8*)STRING( noper );
2455 const U8 *e = uc + STR_LEN( noper );
2456 U32 state = 1; /* required init */
2457 U16 charid = 0; /* sanity init */
2458 U32 wordlen = 0; /* required init */
2460 if (OP(noper) == NOTHING) {
2461 regnode *noper_next= regnext(noper);
2462 if (noper_next != tail && OP(noper_next) == flags) {
2464 uc= (U8*)STRING(noper);
2465 e= uc + STR_LEN(noper);
2469 if (OP(noper) != NOTHING) {
2470 for ( ; uc < e ; uc += len ) {
2475 charid = trie->charmap[ uvc ];
2477 SV** const svpp = hv_fetch( widecharmap,
2484 charid=(U16)SvIV( *svpp );
2487 /* charid is now 0 if we dont know the char read, or
2488 * nonzero if we do */
2495 if ( !trie->states[ state ].trans.list ) {
2496 TRIE_LIST_NEW( state );
2499 check <= TRIE_LIST_USED( state );
2502 if ( TRIE_LIST_ITEM( state, check ).forid
2505 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2510 newstate = next_alloc++;
2511 prev_states[newstate] = state;
2512 TRIE_LIST_PUSH( state, charid, newstate );
2517 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2521 TRIE_HANDLE_WORD(state);
2523 } /* end second pass */
2525 /* next alloc is the NEXT state to be allocated */
2526 trie->statecount = next_alloc;
2527 trie->states = (reg_trie_state *)
2528 PerlMemShared_realloc( trie->states,
2530 * sizeof(reg_trie_state) );
2532 /* and now dump it out before we compress it */
2533 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2534 revcharmap, next_alloc,
2538 trie->trans = (reg_trie_trans *)
2539 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2546 for( state=1 ; state < next_alloc ; state ++ ) {
2550 DEBUG_TRIE_COMPILE_MORE_r(
2551 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2555 if (trie->states[state].trans.list) {
2556 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2560 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2561 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2562 if ( forid < minid ) {
2564 } else if ( forid > maxid ) {
2568 if ( transcount < tp + maxid - minid + 1) {
2570 trie->trans = (reg_trie_trans *)
2571 PerlMemShared_realloc( trie->trans,
2573 * sizeof(reg_trie_trans) );
2574 Zero( trie->trans + (transcount / 2),
2578 base = trie->uniquecharcount + tp - minid;
2579 if ( maxid == minid ) {
2581 for ( ; zp < tp ; zp++ ) {
2582 if ( ! trie->trans[ zp ].next ) {
2583 base = trie->uniquecharcount + zp - minid;
2584 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2586 trie->trans[ zp ].check = state;
2592 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2594 trie->trans[ tp ].check = state;
2599 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2600 const U32 tid = base
2601 - trie->uniquecharcount
2602 + TRIE_LIST_ITEM( state, idx ).forid;
2603 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2605 trie->trans[ tid ].check = state;
2607 tp += ( maxid - minid + 1 );
2609 Safefree(trie->states[ state ].trans.list);
2612 DEBUG_TRIE_COMPILE_MORE_r(
2613 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2616 trie->states[ state ].trans.base=base;
2618 trie->lasttrans = tp + 1;
2622 Second Pass -- Flat Table Representation.
2624 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2625 each. We know that we will need Charcount+1 trans at most to store
2626 the data (one row per char at worst case) So we preallocate both
2627 structures assuming worst case.
2629 We then construct the trie using only the .next slots of the entry
2632 We use the .check field of the first entry of the node temporarily
2633 to make compression both faster and easier by keeping track of how
2634 many non zero fields are in the node.
2636 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2639 There are two terms at use here: state as a TRIE_NODEIDX() which is
2640 a number representing the first entry of the node, and state as a
2641 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2642 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2643 if there are 2 entrys per node. eg:
2651 The table is internally in the right hand, idx form. However as we
2652 also have to deal with the states array which is indexed by nodenum
2653 we have to use TRIE_NODENUM() to convert.
2656 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2657 "%*sCompiling trie using table compiler\n",
2658 (int)depth * 2 + 2, ""));
2660 trie->trans = (reg_trie_trans *)
2661 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2662 * trie->uniquecharcount + 1,
2663 sizeof(reg_trie_trans) );
2664 trie->states = (reg_trie_state *)
2665 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2666 sizeof(reg_trie_state) );
2667 next_alloc = trie->uniquecharcount + 1;
2670 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2672 regnode *noper = NEXTOPER( cur );
2673 const U8 *uc = (U8*)STRING( noper );
2674 const U8 *e = uc + STR_LEN( noper );
2676 U32 state = 1; /* required init */
2678 U16 charid = 0; /* sanity init */
2679 U32 accept_state = 0; /* sanity init */
2681 U32 wordlen = 0; /* required init */
2683 if (OP(noper) == NOTHING) {
2684 regnode *noper_next= regnext(noper);
2685 if (noper_next != tail && OP(noper_next) == flags) {
2687 uc= (U8*)STRING(noper);
2688 e= uc + STR_LEN(noper);
2692 if ( OP(noper) != NOTHING ) {
2693 for ( ; uc < e ; uc += len ) {
2698 charid = trie->charmap[ uvc ];
2700 SV* const * const svpp = hv_fetch( widecharmap,
2704 charid = svpp ? (U16)SvIV(*svpp) : 0;
2708 if ( !trie->trans[ state + charid ].next ) {
2709 trie->trans[ state + charid ].next = next_alloc;
2710 trie->trans[ state ].check++;
2711 prev_states[TRIE_NODENUM(next_alloc)]
2712 = TRIE_NODENUM(state);
2713 next_alloc += trie->uniquecharcount;
2715 state = trie->trans[ state + charid ].next;
2717 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2719 /* charid is now 0 if we dont know the char read, or
2720 * nonzero if we do */
2723 accept_state = TRIE_NODENUM( state );
2724 TRIE_HANDLE_WORD(accept_state);
2726 } /* end second pass */
2728 /* and now dump it out before we compress it */
2729 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2731 next_alloc, depth+1));
2735 * Inplace compress the table.*
2737 For sparse data sets the table constructed by the trie algorithm will
2738 be mostly 0/FAIL transitions or to put it another way mostly empty.
2739 (Note that leaf nodes will not contain any transitions.)
2741 This algorithm compresses the tables by eliminating most such
2742 transitions, at the cost of a modest bit of extra work during lookup:
2744 - Each states[] entry contains a .base field which indicates the
2745 index in the state[] array wheres its transition data is stored.
2747 - If .base is 0 there are no valid transitions from that node.
2749 - If .base is nonzero then charid is added to it to find an entry in
2752 -If trans[states[state].base+charid].check!=state then the
2753 transition is taken to be a 0/Fail transition. Thus if there are fail
2754 transitions at the front of the node then the .base offset will point
2755 somewhere inside the previous nodes data (or maybe even into a node
2756 even earlier), but the .check field determines if the transition is
2760 The following process inplace converts the table to the compressed
2761 table: We first do not compress the root node 1,and mark all its
2762 .check pointers as 1 and set its .base pointer as 1 as well. This
2763 allows us to do a DFA construction from the compressed table later,
2764 and ensures that any .base pointers we calculate later are greater
2767 - We set 'pos' to indicate the first entry of the second node.
2769 - We then iterate over the columns of the node, finding the first and
2770 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2771 and set the .check pointers accordingly, and advance pos
2772 appropriately and repreat for the next node. Note that when we copy
2773 the next pointers we have to convert them from the original
2774 NODEIDX form to NODENUM form as the former is not valid post
2777 - If a node has no transitions used we mark its base as 0 and do not
2778 advance the pos pointer.
2780 - If a node only has one transition we use a second pointer into the
2781 structure to fill in allocated fail transitions from other states.
2782 This pointer is independent of the main pointer and scans forward
2783 looking for null transitions that are allocated to a state. When it
2784 finds one it writes the single transition into the "hole". If the
2785 pointer doesnt find one the single transition is appended as normal.
2787 - Once compressed we can Renew/realloc the structures to release the
2790 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2791 specifically Fig 3.47 and the associated pseudocode.
2795 const U32 laststate = TRIE_NODENUM( next_alloc );
2798 trie->statecount = laststate;
2800 for ( state = 1 ; state < laststate ; state++ ) {
2802 const U32 stateidx = TRIE_NODEIDX( state );
2803 const U32 o_used = trie->trans[ stateidx ].check;
2804 U32 used = trie->trans[ stateidx ].check;
2805 trie->trans[ stateidx ].check = 0;
2808 used && charid < trie->uniquecharcount;
2811 if ( flag || trie->trans[ stateidx + charid ].next ) {
2812 if ( trie->trans[ stateidx + charid ].next ) {
2814 for ( ; zp < pos ; zp++ ) {
2815 if ( ! trie->trans[ zp ].next ) {
2819 trie->states[ state ].trans.base
2821 + trie->uniquecharcount
2823 trie->trans[ zp ].next
2824 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2826 trie->trans[ zp ].check = state;
2827 if ( ++zp > pos ) pos = zp;
2834 trie->states[ state ].trans.base
2835 = pos + trie->uniquecharcount - charid ;
2837 trie->trans[ pos ].next
2838 = SAFE_TRIE_NODENUM(
2839 trie->trans[ stateidx + charid ].next );
2840 trie->trans[ pos ].check = state;
2845 trie->lasttrans = pos + 1;
2846 trie->states = (reg_trie_state *)
2847 PerlMemShared_realloc( trie->states, laststate
2848 * sizeof(reg_trie_state) );
2849 DEBUG_TRIE_COMPILE_MORE_r(
2850 PerlIO_printf( Perl_debug_log,
2851 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2852 (int)depth * 2 + 2,"",
2853 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2857 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2860 } /* end table compress */
2862 DEBUG_TRIE_COMPILE_MORE_r(
2863 PerlIO_printf(Perl_debug_log,
2864 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2865 (int)depth * 2 + 2, "",
2866 (UV)trie->statecount,
2867 (UV)trie->lasttrans)
2869 /* resize the trans array to remove unused space */
2870 trie->trans = (reg_trie_trans *)
2871 PerlMemShared_realloc( trie->trans, trie->lasttrans
2872 * sizeof(reg_trie_trans) );
2874 { /* Modify the program and insert the new TRIE node */
2875 U8 nodetype =(U8)(flags & 0xFF);
2879 regnode *optimize = NULL;
2880 #ifdef RE_TRACK_PATTERN_OFFSETS
2883 U32 mjd_nodelen = 0;
2884 #endif /* RE_TRACK_PATTERN_OFFSETS */
2885 #endif /* DEBUGGING */
2887 This means we convert either the first branch or the first Exact,
2888 depending on whether the thing following (in 'last') is a branch
2889 or not and whther first is the startbranch (ie is it a sub part of
2890 the alternation or is it the whole thing.)
2891 Assuming its a sub part we convert the EXACT otherwise we convert
2892 the whole branch sequence, including the first.
2894 /* Find the node we are going to overwrite */
2895 if ( first != startbranch || OP( last ) == BRANCH ) {
2896 /* branch sub-chain */
2897 NEXT_OFF( first ) = (U16)(last - first);
2898 #ifdef RE_TRACK_PATTERN_OFFSETS
2900 mjd_offset= Node_Offset((convert));
2901 mjd_nodelen= Node_Length((convert));
2904 /* whole branch chain */
2906 #ifdef RE_TRACK_PATTERN_OFFSETS
2909 const regnode *nop = NEXTOPER( convert );
2910 mjd_offset= Node_Offset((nop));
2911 mjd_nodelen= Node_Length((nop));
2915 PerlIO_printf(Perl_debug_log,
2916 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2917 (int)depth * 2 + 2, "",
2918 (UV)mjd_offset, (UV)mjd_nodelen)
2921 /* But first we check to see if there is a common prefix we can
2922 split out as an EXACT and put in front of the TRIE node. */
2923 trie->startstate= 1;
2924 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2926 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2930 const U32 base = trie->states[ state ].trans.base;
2932 if ( trie->states[state].wordnum )
2935 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2936 if ( ( base + ofs >= trie->uniquecharcount ) &&
2937 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2938 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2940 if ( ++count > 1 ) {
2941 SV **tmp = av_fetch( revcharmap, ofs, 0);
2942 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2943 if ( state == 1 ) break;
2945 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2947 PerlIO_printf(Perl_debug_log,
2948 "%*sNew Start State=%"UVuf" Class: [",
2949 (int)depth * 2 + 2, "",
2952 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2953 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2955 TRIE_BITMAP_SET(trie,*ch);
2957 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2959 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2963 TRIE_BITMAP_SET(trie,*ch);
2965 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2966 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2972 SV **tmp = av_fetch( revcharmap, idx, 0);
2974 char *ch = SvPV( *tmp, len );
2976 SV *sv=sv_newmortal();
2977 PerlIO_printf( Perl_debug_log,
2978 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2979 (int)depth * 2 + 2, "",
2981 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2982 PL_colors[0], PL_colors[1],
2983 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2984 PERL_PV_ESCAPE_FIRSTCHAR
2989 OP( convert ) = nodetype;
2990 str=STRING(convert);
2993 STR_LEN(convert) += len;
2999 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3004 trie->prefixlen = (state-1);
3006 regnode *n = convert+NODE_SZ_STR(convert);
3007 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3008 trie->startstate = state;
3009 trie->minlen -= (state - 1);
3010 trie->maxlen -= (state - 1);
3012 /* At least the UNICOS C compiler choked on this
3013 * being argument to DEBUG_r(), so let's just have
3016 #ifdef PERL_EXT_RE_BUILD
3022 regnode *fix = convert;
3023 U32 word = trie->wordcount;
3025 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3026 while( ++fix < n ) {
3027 Set_Node_Offset_Length(fix, 0, 0);
3030 SV ** const tmp = av_fetch( trie_words, word, 0 );
3032 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3033 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3035 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3043 NEXT_OFF(convert) = (U16)(tail - convert);
3044 DEBUG_r(optimize= n);
3050 if ( trie->maxlen ) {
3051 NEXT_OFF( convert ) = (U16)(tail - convert);
3052 ARG_SET( convert, data_slot );
3053 /* Store the offset to the first unabsorbed branch in
3054 jump[0], which is otherwise unused by the jump logic.
3055 We use this when dumping a trie and during optimisation. */
3057 trie->jump[0] = (U16)(nextbranch - convert);
3059 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3060 * and there is a bitmap
3061 * and the first "jump target" node we found leaves enough room
3062 * then convert the TRIE node into a TRIEC node, with the bitmap
3063 * embedded inline in the opcode - this is hypothetically faster.
3065 if ( !trie->states[trie->startstate].wordnum
3067 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3069 OP( convert ) = TRIEC;
3070 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3071 PerlMemShared_free(trie->bitmap);
3074 OP( convert ) = TRIE;
3076 /* store the type in the flags */
3077 convert->flags = nodetype;
3081 + regarglen[ OP( convert ) ];
3083 /* XXX We really should free up the resource in trie now,
3084 as we won't use them - (which resources?) dmq */
3086 /* needed for dumping*/
3087 DEBUG_r(if (optimize) {
3088 regnode *opt = convert;
3090 while ( ++opt < optimize) {
3091 Set_Node_Offset_Length(opt,0,0);
3094 Try to clean up some of the debris left after the
3097 while( optimize < jumper ) {
3098 mjd_nodelen += Node_Length((optimize));
3099 OP( optimize ) = OPTIMIZED;
3100 Set_Node_Offset_Length(optimize,0,0);
3103 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3105 } /* end node insert */
3107 /* Finish populating the prev field of the wordinfo array. Walk back
3108 * from each accept state until we find another accept state, and if
3109 * so, point the first word's .prev field at the second word. If the
3110 * second already has a .prev field set, stop now. This will be the
3111 * case either if we've already processed that word's accept state,
3112 * or that state had multiple words, and the overspill words were
3113 * already linked up earlier.
3120 for (word=1; word <= trie->wordcount; word++) {
3122 if (trie->wordinfo[word].prev)
3124 state = trie->wordinfo[word].accept;
3126 state = prev_states[state];
3129 prev = trie->states[state].wordnum;
3133 trie->wordinfo[word].prev = prev;
3135 Safefree(prev_states);
3139 /* and now dump out the compressed format */
3140 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3142 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3144 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3145 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3147 SvREFCNT_dec_NN(revcharmap);
3151 : trie->startstate>1
3157 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3159 /* The Trie is constructed and compressed now so we can build a fail array if
3162 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3164 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3168 We find the fail state for each state in the trie, this state is the longest
3169 proper suffix of the current state's 'word' that is also a proper prefix of
3170 another word in our trie. State 1 represents the word '' and is thus the
3171 default fail state. This allows the DFA not to have to restart after its
3172 tried and failed a word at a given point, it simply continues as though it
3173 had been matching the other word in the first place.
3175 'abcdgu'=~/abcdefg|cdgu/
3176 When we get to 'd' we are still matching the first word, we would encounter
3177 'g' which would fail, which would bring us to the state representing 'd' in
3178 the second word where we would try 'g' and succeed, proceeding to match
3181 /* add a fail transition */
3182 const U32 trie_offset = ARG(source);
3183 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3185 const U32 ucharcount = trie->uniquecharcount;
3186 const U32 numstates = trie->statecount;
3187 const U32 ubound = trie->lasttrans + ucharcount;
3191 U32 base = trie->states[ 1 ].trans.base;
3194 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3196 GET_RE_DEBUG_FLAGS_DECL;
3198 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3199 PERL_UNUSED_CONTEXT;
3201 PERL_UNUSED_ARG(depth);
3204 if ( OP(source) == TRIE ) {
3205 struct regnode_1 *op = (struct regnode_1 *)
3206 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3207 StructCopy(source,op,struct regnode_1);
3208 stclass = (regnode *)op;
3210 struct regnode_charclass *op = (struct regnode_charclass *)
3211 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3212 StructCopy(source,op,struct regnode_charclass);
3213 stclass = (regnode *)op;
3215 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3217 ARG_SET( stclass, data_slot );
3218 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3219 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3220 aho->trie=trie_offset;
3221 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3222 Copy( trie->states, aho->states, numstates, reg_trie_state );
3223 Newxz( q, numstates, U32);
3224 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3227 /* initialize fail[0..1] to be 1 so that we always have
3228 a valid final fail state */
3229 fail[ 0 ] = fail[ 1 ] = 1;
3231 for ( charid = 0; charid < ucharcount ; charid++ ) {
3232 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3234 q[ q_write ] = newstate;
3235 /* set to point at the root */
3236 fail[ q[ q_write++ ] ]=1;
3239 while ( q_read < q_write) {
3240 const U32 cur = q[ q_read++ % numstates ];
3241 base = trie->states[ cur ].trans.base;
3243 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3244 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3246 U32 fail_state = cur;
3249 fail_state = fail[ fail_state ];
3250 fail_base = aho->states[ fail_state ].trans.base;
3251 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3253 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3254 fail[ ch_state ] = fail_state;
3255 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3257 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3259 q[ q_write++ % numstates] = ch_state;
3263 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3264 when we fail in state 1, this allows us to use the
3265 charclass scan to find a valid start char. This is based on the principle
3266 that theres a good chance the string being searched contains lots of stuff
3267 that cant be a start char.
3269 fail[ 0 ] = fail[ 1 ] = 0;
3270 DEBUG_TRIE_COMPILE_r({
3271 PerlIO_printf(Perl_debug_log,
3272 "%*sStclass Failtable (%"UVuf" states): 0",
3273 (int)(depth * 2), "", (UV)numstates
3275 for( q_read=1; q_read<numstates; q_read++ ) {
3276 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3278 PerlIO_printf(Perl_debug_log, "\n");
3281 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3286 #define DEBUG_PEEP(str,scan,depth) \
3287 DEBUG_OPTIMISE_r({if (scan){ \
3288 regnode *Next = regnext(scan); \
3289 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3290 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3291 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3292 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3293 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3294 PerlIO_printf(Perl_debug_log, "\n"); \
3297 /* The below joins as many adjacent EXACTish nodes as possible into a single
3298 * one. The regop may be changed if the node(s) contain certain sequences that
3299 * require special handling. The joining is only done if:
3300 * 1) there is room in the current conglomerated node to entirely contain the
3302 * 2) they are the exact same node type
3304 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3305 * these get optimized out
3307 * If a node is to match under /i (folded), the number of characters it matches
3308 * can be different than its character length if it contains a multi-character
3309 * fold. *min_subtract is set to the total delta number of characters of the
3312 * And *unfolded_multi_char is set to indicate whether or not the node contains
3313 * an unfolded multi-char fold. This happens when whether the fold is valid or
3314 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3315 * SMALL LETTER SHARP S, as only if the target string being matched against
3316 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3317 * folding rules depend on the locale in force at runtime. (Multi-char folds
3318 * whose components are all above the Latin1 range are not run-time locale
3319 * dependent, and have already been folded by the time this function is
3322 * This is as good a place as any to discuss the design of handling these
3323 * multi-character fold sequences. It's been wrong in Perl for a very long
3324 * time. There are three code points in Unicode whose multi-character folds
3325 * were long ago discovered to mess things up. The previous designs for
3326 * dealing with these involved assigning a special node for them. This
3327 * approach doesn't always work, as evidenced by this example:
3328 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3329 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3330 * would match just the \xDF, it won't be able to handle the case where a
3331 * successful match would have to cross the node's boundary. The new approach
3332 * that hopefully generally solves the problem generates an EXACTFU_SS node
3333 * that is "sss" in this case.
3335 * It turns out that there are problems with all multi-character folds, and not
3336 * just these three. Now the code is general, for all such cases. The
3337 * approach taken is:
3338 * 1) This routine examines each EXACTFish node that could contain multi-
3339 * character folded sequences. Since a single character can fold into
3340 * such a sequence, the minimum match length for this node is less than
3341 * the number of characters in the node. This routine returns in
3342 * *min_subtract how many characters to subtract from the the actual
3343 * length of the string to get a real minimum match length; it is 0 if
3344 * there are no multi-char foldeds. This delta is used by the caller to
3345 * adjust the min length of the match, and the delta between min and max,
3346 * so that the optimizer doesn't reject these possibilities based on size
3348 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3349 * is used for an EXACTFU node that contains at least one "ss" sequence in
3350 * it. For non-UTF-8 patterns and strings, this is the only case where
3351 * there is a possible fold length change. That means that a regular
3352 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3353 * with length changes, and so can be processed faster. regexec.c takes
3354 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3355 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3356 * known until runtime). This saves effort in regex matching. However,
3357 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3358 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3359 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3360 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3361 * possibilities for the non-UTF8 patterns are quite simple, except for
3362 * the sharp s. All the ones that don't involve a UTF-8 target string are
3363 * members of a fold-pair, and arrays are set up for all of them so that
3364 * the other member of the pair can be found quickly. Code elsewhere in
3365 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3366 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3367 * described in the next item.
3368 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3369 * validity of the fold won't be known until runtime, and so must remain
3370 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3371 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3372 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3373 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3374 * The reason this is a problem is that the optimizer part of regexec.c
3375 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3376 * that a character in the pattern corresponds to at most a single
3377 * character in the target string. (And I do mean character, and not byte
3378 * here, unlike other parts of the documentation that have never been
3379 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3380 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3381 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3382 * nodes, violate the assumption, and they are the only instances where it
3383 * is violated. I'm reluctant to try to change the assumption, as the
3384 * code involved is impenetrable to me (khw), so instead the code here
3385 * punts. This routine examines EXACTFL nodes, and (when the pattern
3386 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3387 * boolean indicating whether or not the node contains such a fold. When
3388 * it is true, the caller sets a flag that later causes the optimizer in
3389 * this file to not set values for the floating and fixed string lengths,
3390 * and thus avoids the optimizer code in regexec.c that makes the invalid
3391 * assumption. Thus, there is no optimization based on string lengths for
3392 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3393 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3394 * assumption is wrong only in these cases is that all other non-UTF-8
3395 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3396 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3397 * EXACTF nodes because we don't know at compile time if it actually
3398 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3399 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3400 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3401 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3402 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3403 * string would require the pattern to be forced into UTF-8, the overhead
3404 * of which we want to avoid. Similarly the unfolded multi-char folds in
3405 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3408 * Similarly, the code that generates tries doesn't currently handle
3409 * not-already-folded multi-char folds, and it looks like a pain to change
3410 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3411 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3412 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3413 * using /iaa matching will be doing so almost entirely with ASCII
3414 * strings, so this should rarely be encountered in practice */
3416 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3417 if (PL_regkind[OP(scan)] == EXACT) \
3418 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3421 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3422 UV *min_subtract, bool *unfolded_multi_char,
3423 U32 flags,regnode *val, U32 depth)
3425 /* Merge several consecutive EXACTish nodes into one. */
3426 regnode *n = regnext(scan);
3428 regnode *next = scan + NODE_SZ_STR(scan);
3432 regnode *stop = scan;
3433 GET_RE_DEBUG_FLAGS_DECL;
3435 PERL_UNUSED_ARG(depth);
3438 PERL_ARGS_ASSERT_JOIN_EXACT;
3439 #ifndef EXPERIMENTAL_INPLACESCAN
3440 PERL_UNUSED_ARG(flags);
3441 PERL_UNUSED_ARG(val);
3443 DEBUG_PEEP("join",scan,depth);
3445 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3446 * EXACT ones that are mergeable to the current one. */
3448 && (PL_regkind[OP(n)] == NOTHING
3449 || (stringok && OP(n) == OP(scan)))
3451 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3454 if (OP(n) == TAIL || n > next)
3456 if (PL_regkind[OP(n)] == NOTHING) {
3457 DEBUG_PEEP("skip:",n,depth);
3458 NEXT_OFF(scan) += NEXT_OFF(n);
3459 next = n + NODE_STEP_REGNODE;
3466 else if (stringok) {
3467 const unsigned int oldl = STR_LEN(scan);
3468 regnode * const nnext = regnext(n);
3470 /* XXX I (khw) kind of doubt that this works on platforms (should
3471 * Perl ever run on one) where U8_MAX is above 255 because of lots
3472 * of other assumptions */
3473 /* Don't join if the sum can't fit into a single node */
3474 if (oldl + STR_LEN(n) > U8_MAX)
3477 DEBUG_PEEP("merg",n,depth);
3480 NEXT_OFF(scan) += NEXT_OFF(n);
3481 STR_LEN(scan) += STR_LEN(n);
3482 next = n + NODE_SZ_STR(n);
3483 /* Now we can overwrite *n : */
3484 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3492 #ifdef EXPERIMENTAL_INPLACESCAN
3493 if (flags && !NEXT_OFF(n)) {
3494 DEBUG_PEEP("atch", val, depth);
3495 if (reg_off_by_arg[OP(n)]) {
3496 ARG_SET(n, val - n);
3499 NEXT_OFF(n) = val - n;
3507 *unfolded_multi_char = FALSE;
3509 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3510 * can now analyze for sequences of problematic code points. (Prior to
3511 * this final joining, sequences could have been split over boundaries, and
3512 * hence missed). The sequences only happen in folding, hence for any
3513 * non-EXACT EXACTish node */
3514 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3515 U8* s0 = (U8*) STRING(scan);
3517 U8* s_end = s0 + STR_LEN(scan);
3519 int total_count_delta = 0; /* Total delta number of characters that
3520 multi-char folds expand to */
3522 /* One pass is made over the node's string looking for all the
3523 * possibilities. To avoid some tests in the loop, there are two main
3524 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3529 if (OP(scan) == EXACTFL) {
3532 /* An EXACTFL node would already have been changed to another
3533 * node type unless there is at least one character in it that
3534 * is problematic; likely a character whose fold definition
3535 * won't be known until runtime, and so has yet to be folded.
3536 * For all but the UTF-8 locale, folds are 1-1 in length, but
3537 * to handle the UTF-8 case, we need to create a temporary
3538 * folded copy using UTF-8 locale rules in order to analyze it.
3539 * This is because our macros that look to see if a sequence is
3540 * a multi-char fold assume everything is folded (otherwise the
3541 * tests in those macros would be too complicated and slow).
3542 * Note that here, the non-problematic folds will have already
3543 * been done, so we can just copy such characters. We actually
3544 * don't completely fold the EXACTFL string. We skip the
3545 * unfolded multi-char folds, as that would just create work
3546 * below to figure out the size they already are */
3548 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3551 STRLEN s_len = UTF8SKIP(s);
3552 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3553 Copy(s, d, s_len, U8);
3556 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3557 *unfolded_multi_char = TRUE;
3558 Copy(s, d, s_len, U8);
3561 else if (isASCII(*s)) {
3562 *(d++) = toFOLD(*s);
3566 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3572 /* Point the remainder of the routine to look at our temporary
3576 } /* End of creating folded copy of EXACTFL string */
3578 /* Examine the string for a multi-character fold sequence. UTF-8
3579 * patterns have all characters pre-folded by the time this code is
3581 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3582 length sequence we are looking for is 2 */
3584 int count = 0; /* How many characters in a multi-char fold */
3585 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3586 if (! len) { /* Not a multi-char fold: get next char */
3591 /* Nodes with 'ss' require special handling, except for
3592 * EXACTFA-ish for which there is no multi-char fold to this */
3593 if (len == 2 && *s == 's' && *(s+1) == 's'
3594 && OP(scan) != EXACTFA
3595 && OP(scan) != EXACTFA_NO_TRIE)
3598 if (OP(scan) != EXACTFL) {
3599 OP(scan) = EXACTFU_SS;
3603 else { /* Here is a generic multi-char fold. */
3604 U8* multi_end = s + len;
3606 /* Count how many characters are in it. In the case of
3607 * /aa, no folds which contain ASCII code points are
3608 * allowed, so check for those, and skip if found. */
3609 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3610 count = utf8_length(s, multi_end);
3614 while (s < multi_end) {
3617 goto next_iteration;
3627 /* The delta is how long the sequence is minus 1 (1 is how long
3628 * the character that folds to the sequence is) */
3629 total_count_delta += count - 1;
3633 /* We created a temporary folded copy of the string in EXACTFL
3634 * nodes. Therefore we need to be sure it doesn't go below zero,
3635 * as the real string could be shorter */
3636 if (OP(scan) == EXACTFL) {
3637 int total_chars = utf8_length((U8*) STRING(scan),
3638 (U8*) STRING(scan) + STR_LEN(scan));
3639 if (total_count_delta > total_chars) {
3640 total_count_delta = total_chars;
3644 *min_subtract += total_count_delta;
3647 else if (OP(scan) == EXACTFA) {
3649 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3650 * fold to the ASCII range (and there are no existing ones in the
3651 * upper latin1 range). But, as outlined in the comments preceding
3652 * this function, we need to flag any occurrences of the sharp s.
3653 * This character forbids trie formation (because of added
3655 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3656 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3657 || UNICODE_DOT_DOT_VERSION > 0)
3659 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3660 OP(scan) = EXACTFA_NO_TRIE;
3661 *unfolded_multi_char = TRUE;
3669 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3670 * folds that are all Latin1. As explained in the comments
3671 * preceding this function, we look also for the sharp s in EXACTF
3672 * and EXACTFL nodes; it can be in the final position. Otherwise
3673 * we can stop looking 1 byte earlier because have to find at least
3674 * two characters for a multi-fold */
3675 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3680 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3681 if (! len) { /* Not a multi-char fold. */
3682 if (*s == LATIN_SMALL_LETTER_SHARP_S
3683 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3685 *unfolded_multi_char = TRUE;
3692 && isALPHA_FOLD_EQ(*s, 's')
3693 && isALPHA_FOLD_EQ(*(s+1), 's'))
3696 /* EXACTF nodes need to know that the minimum length
3697 * changed so that a sharp s in the string can match this
3698 * ss in the pattern, but they remain EXACTF nodes, as they
3699 * won't match this unless the target string is is UTF-8,
3700 * which we don't know until runtime. EXACTFL nodes can't
3701 * transform into EXACTFU nodes */
3702 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3703 OP(scan) = EXACTFU_SS;
3707 *min_subtract += len - 1;
3715 /* Allow dumping but overwriting the collection of skipped
3716 * ops and/or strings with fake optimized ops */
3717 n = scan + NODE_SZ_STR(scan);
3725 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3729 /* REx optimizer. Converts nodes into quicker variants "in place".
3730 Finds fixed substrings. */
3732 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3733 to the position after last scanned or to NULL. */
3735 #define INIT_AND_WITHP \
3736 assert(!and_withp); \
3737 Newx(and_withp,1, regnode_ssc); \
3738 SAVEFREEPV(and_withp)
3742 S_unwind_scan_frames(pTHX_ const void *p)
3744 scan_frame *f= (scan_frame *)p;
3746 scan_frame *n= f->next_frame;
3754 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3755 SSize_t *minlenp, SSize_t *deltap,
3760 regnode_ssc *and_withp,
3761 U32 flags, U32 depth)
3762 /* scanp: Start here (read-write). */
3763 /* deltap: Write maxlen-minlen here. */
3764 /* last: Stop before this one. */
3765 /* data: string data about the pattern */
3766 /* stopparen: treat close N as END */
3767 /* recursed: which subroutines have we recursed into */
3768 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3770 /* There must be at least this number of characters to match */
3773 regnode *scan = *scanp, *next;
3775 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3776 int is_inf_internal = 0; /* The studied chunk is infinite */
3777 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3778 scan_data_t data_fake;
3779 SV *re_trie_maxbuff = NULL;
3780 regnode *first_non_open = scan;
3781 SSize_t stopmin = SSize_t_MAX;
3782 scan_frame *frame = NULL;
3783 GET_RE_DEBUG_FLAGS_DECL;
3785 PERL_ARGS_ASSERT_STUDY_CHUNK;
3789 while (first_non_open && OP(first_non_open) == OPEN)
3790 first_non_open=regnext(first_non_open);
3796 RExC_study_chunk_recursed_count++;
3798 DEBUG_OPTIMISE_MORE_r(
3800 PerlIO_printf(Perl_debug_log,
3801 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3802 (int)(depth*2), "", (long)stopparen,
3803 (unsigned long)RExC_study_chunk_recursed_count,
3804 (unsigned long)depth, (unsigned long)recursed_depth,
3807 if (recursed_depth) {
3810 for ( j = 0 ; j < recursed_depth ; j++ ) {
3811 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3813 PAREN_TEST(RExC_study_chunk_recursed +
3814 ( j * RExC_study_chunk_recursed_bytes), i )
3817 !PAREN_TEST(RExC_study_chunk_recursed +
3818 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3821 PerlIO_printf(Perl_debug_log," %d",(int)i);
3825 if ( j + 1 < recursed_depth ) {
3826 PerlIO_printf(Perl_debug_log, ",");
3830 PerlIO_printf(Perl_debug_log,"\n");
3833 while ( scan && OP(scan) != END && scan < last ){
3834 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3835 node length to get a real minimum (because
3836 the folded version may be shorter) */
3837 bool unfolded_multi_char = FALSE;
3838 /* Peephole optimizer: */
3839 DEBUG_STUDYDATA("Peep:", data, depth);
3840 DEBUG_PEEP("Peep", scan, depth);
3843 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3844 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3845 * by a different invocation of reg() -- Yves
3847 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3849 /* Follow the next-chain of the current node and optimize
3850 away all the NOTHINGs from it. */
3851 if (OP(scan) != CURLYX) {
3852 const int max = (reg_off_by_arg[OP(scan)]
3854 /* I32 may be smaller than U16 on CRAYs! */
3855 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3856 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3860 /* Skip NOTHING and LONGJMP. */
3861 while ((n = regnext(n))
3862 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3863 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3864 && off + noff < max)
3866 if (reg_off_by_arg[OP(scan)])
3869 NEXT_OFF(scan) = off;
3872 /* The principal pseudo-switch. Cannot be a switch, since we
3873 look into several different things. */
3874 if ( OP(scan) == DEFINEP ) {
3876 SSize_t deltanext = 0;
3877 SSize_t fake_last_close = 0;
3878 I32 f = SCF_IN_DEFINE;
3880 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3881 scan = regnext(scan);
3882 assert( OP(scan) == IFTHEN );
3883 DEBUG_PEEP("expect IFTHEN", scan, depth);
3885 data_fake.last_closep= &fake_last_close;
3887 next = regnext(scan);
3888 scan = NEXTOPER(NEXTOPER(scan));
3889 DEBUG_PEEP("scan", scan, depth);
3890 DEBUG_PEEP("next", next, depth);
3892 /* we suppose the run is continuous, last=next...
3893 * NOTE we dont use the return here! */
3894 (void)study_chunk(pRExC_state, &scan, &minlen,
3895 &deltanext, next, &data_fake, stopparen,
3896 recursed_depth, NULL, f, depth+1);
3901 OP(scan) == BRANCH ||
3902 OP(scan) == BRANCHJ ||
3905 next = regnext(scan);
3908 /* The op(next)==code check below is to see if we
3909 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3910 * IFTHEN is special as it might not appear in pairs.
3911 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3912 * we dont handle it cleanly. */
3913 if (OP(next) == code || code == IFTHEN) {
3914 /* NOTE - There is similar code to this block below for
3915 * handling TRIE nodes on a re-study. If you change stuff here
3916 * check there too. */
3917 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3919 regnode * const startbranch=scan;
3921 if (flags & SCF_DO_SUBSTR) {
3922 /* Cannot merge strings after this. */
3923 scan_commit(pRExC_state, data, minlenp, is_inf);
3926 if (flags & SCF_DO_STCLASS)
3927 ssc_init_zero(pRExC_state, &accum);
3929 while (OP(scan) == code) {
3930 SSize_t deltanext, minnext, fake;
3932 regnode_ssc this_class;
3934 DEBUG_PEEP("Branch", scan, depth);
3937 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3939 data_fake.whilem_c = data->whilem_c;
3940 data_fake.last_closep = data->last_closep;
3943 data_fake.last_closep = &fake;
3945 data_fake.pos_delta = delta;
3946 next = regnext(scan);
3948 scan = NEXTOPER(scan); /* everything */
3949 if (code != BRANCH) /* everything but BRANCH */
3950 scan = NEXTOPER(scan);
3952 if (flags & SCF_DO_STCLASS) {
3953 ssc_init(pRExC_state, &this_class);
3954 data_fake.start_class = &this_class;
3955 f = SCF_DO_STCLASS_AND;
3957 if (flags & SCF_WHILEM_VISITED_POS)
3958 f |= SCF_WHILEM_VISITED_POS;
3960 /* we suppose the run is continuous, last=next...*/
3961 minnext = study_chunk(pRExC_state, &scan, minlenp,
3962 &deltanext, next, &data_fake, stopparen,
3963 recursed_depth, NULL, f,depth+1);
3967 if (deltanext == SSize_t_MAX) {
3968 is_inf = is_inf_internal = 1;
3970 } else if (max1 < minnext + deltanext)
3971 max1 = minnext + deltanext;
3973 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3975 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3976 if ( stopmin > minnext)
3977 stopmin = min + min1;
3978 flags &= ~SCF_DO_SUBSTR;
3980 data->flags |= SCF_SEEN_ACCEPT;
3983 if (data_fake.flags & SF_HAS_EVAL)
3984 data->flags |= SF_HAS_EVAL;
3985 data->whilem_c = data_fake.whilem_c;
3987 if (flags & SCF_DO_STCLASS)
3988 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3990 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3992 if (flags & SCF_DO_SUBSTR) {
3993 data->pos_min += min1;
3994 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3995 data->pos_delta = SSize_t_MAX;
3997 data->pos_delta += max1 - min1;
3998 if (max1 != min1 || is_inf)
3999 data->longest = &(data->longest_float);
4002 if (delta == SSize_t_MAX
4003 || SSize_t_MAX - delta - (max1 - min1) < 0)
4004 delta = SSize_t_MAX;
4006 delta += max1 - min1;
4007 if (flags & SCF_DO_STCLASS_OR) {
4008 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4010 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4011 flags &= ~SCF_DO_STCLASS;
4014 else if (flags & SCF_DO_STCLASS_AND) {
4016 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4017 flags &= ~SCF_DO_STCLASS;
4020 /* Switch to OR mode: cache the old value of
4021 * data->start_class */
4023 StructCopy(data->start_class, and_withp, regnode_ssc);
4024 flags &= ~SCF_DO_STCLASS_AND;
4025 StructCopy(&accum, data->start_class, regnode_ssc);
4026 flags |= SCF_DO_STCLASS_OR;
4030 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4031 OP( startbranch ) == BRANCH )
4035 Assuming this was/is a branch we are dealing with: 'scan'
4036 now points at the item that follows the branch sequence,
4037 whatever it is. We now start at the beginning of the
4038 sequence and look for subsequences of
4044 which would be constructed from a pattern like
4047 If we can find such a subsequence we need to turn the first
4048 element into a trie and then add the subsequent branch exact
4049 strings to the trie.
4053 1. patterns where the whole set of branches can be
4056 2. patterns where only a subset can be converted.
4058 In case 1 we can replace the whole set with a single regop
4059 for the trie. In case 2 we need to keep the start and end
4062 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4063 becomes BRANCH TRIE; BRANCH X;
4065 There is an additional case, that being where there is a
4066 common prefix, which gets split out into an EXACT like node
4067 preceding the TRIE node.
4069 If x(1..n)==tail then we can do a simple trie, if not we make
4070 a "jump" trie, such that when we match the appropriate word
4071 we "jump" to the appropriate tail node. Essentially we turn
4072 a nested if into a case structure of sorts.
4077 if (!re_trie_maxbuff) {
4078 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4079 if (!SvIOK(re_trie_maxbuff))
4080 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4082 if ( SvIV(re_trie_maxbuff)>=0 ) {
4084 regnode *first = (regnode *)NULL;
4085 regnode *last = (regnode *)NULL;
4086 regnode *tail = scan;
4090 /* var tail is used because there may be a TAIL
4091 regop in the way. Ie, the exacts will point to the
4092 thing following the TAIL, but the last branch will
4093 point at the TAIL. So we advance tail. If we
4094 have nested (?:) we may have to move through several
4098 while ( OP( tail ) == TAIL ) {
4099 /* this is the TAIL generated by (?:) */
4100 tail = regnext( tail );
4104 DEBUG_TRIE_COMPILE_r({
4105 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4106 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4107 (int)depth * 2 + 2, "",
4108 "Looking for TRIE'able sequences. Tail node is: ",
4109 SvPV_nolen_const( RExC_mysv )
4115 Step through the branches
4116 cur represents each branch,
4117 noper is the first thing to be matched as part
4119 noper_next is the regnext() of that node.
4121 We normally handle a case like this
4122 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4123 support building with NOJUMPTRIE, which restricts
4124 the trie logic to structures like /FOO|BAR/.
4126 If noper is a trieable nodetype then the branch is
4127 a possible optimization target. If we are building
4128 under NOJUMPTRIE then we require that noper_next is
4129 the same as scan (our current position in the regex
4132 Once we have two or more consecutive such branches
4133 we can create a trie of the EXACT's contents and
4134 stitch it in place into the program.
4136 If the sequence represents all of the branches in
4137 the alternation we replace the entire thing with a
4140 Otherwise when it is a subsequence we need to
4141 stitch it in place and replace only the relevant
4142 branches. This means the first branch has to remain
4143 as it is used by the alternation logic, and its
4144 next pointer, and needs to be repointed at the item
4145 on the branch chain following the last branch we
4146 have optimized away.
4148 This could be either a BRANCH, in which case the
4149 subsequence is internal, or it could be the item
4150 following the branch sequence in which case the
4151 subsequence is at the end (which does not
4152 necessarily mean the first node is the start of the
4155 TRIE_TYPE(X) is a define which maps the optype to a
4159 ----------------+-----------
4163 EXACTFU_SS | EXACTFU
4166 EXACTFLU8 | EXACTFLU8
4170 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4172 : ( EXACT == (X) ) \
4174 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4176 : ( EXACTFA == (X) ) \
4178 : ( EXACTL == (X) ) \
4180 : ( EXACTFLU8 == (X) ) \
4184 /* dont use tail as the end marker for this traverse */
4185 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4186 regnode * const noper = NEXTOPER( cur );
4187 U8 noper_type = OP( noper );
4188 U8 noper_trietype = TRIE_TYPE( noper_type );
4189 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4190 regnode * const noper_next = regnext( noper );
4191 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4192 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4195 DEBUG_TRIE_COMPILE_r({
4196 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4197 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4198 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4200 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4201 PerlIO_printf( Perl_debug_log, " -> %s",
4202 SvPV_nolen_const(RExC_mysv));
4205 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4206 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4207 SvPV_nolen_const(RExC_mysv));
4209 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4210 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4211 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4215 /* Is noper a trieable nodetype that can be merged
4216 * with the current trie (if there is one)? */
4220 ( noper_trietype == NOTHING)
4221 || ( trietype == NOTHING )
4222 || ( trietype == noper_trietype )
4225 && noper_next == tail
4229 /* Handle mergable triable node Either we are
4230 * the first node in a new trieable sequence,
4231 * in which case we do some bookkeeping,
4232 * otherwise we update the end pointer. */
4235 if ( noper_trietype == NOTHING ) {
4236 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4237 regnode * const noper_next = regnext( noper );
4238 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4239 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4242 if ( noper_next_trietype ) {
4243 trietype = noper_next_trietype;
4244 } else if (noper_next_type) {
4245 /* a NOTHING regop is 1 regop wide.
4246 * We need at least two for a trie
4247 * so we can't merge this in */
4251 trietype = noper_trietype;
4254 if ( trietype == NOTHING )
4255 trietype = noper_trietype;
4260 } /* end handle mergable triable node */
4262 /* handle unmergable node -
4263 * noper may either be a triable node which can
4264 * not be tried together with the current trie,
4265 * or a non triable node */
4267 /* If last is set and trietype is not
4268 * NOTHING then we have found at least two
4269 * triable branch sequences in a row of a
4270 * similar trietype so we can turn them
4271 * into a trie. If/when we allow NOTHING to
4272 * start a trie sequence this condition
4273 * will be required, and it isn't expensive
4274 * so we leave it in for now. */
4275 if ( trietype && trietype != NOTHING )
4276 make_trie( pRExC_state,
4277 startbranch, first, cur, tail,
4278 count, trietype, depth+1 );
4279 last = NULL; /* note: we clear/update
4280 first, trietype etc below,
4281 so we dont do it here */
4285 && noper_next == tail
4288 /* noper is triable, so we can start a new
4292 trietype = noper_trietype;
4294 /* if we already saw a first but the
4295 * current node is not triable then we have
4296 * to reset the first information. */
4301 } /* end handle unmergable node */
4302 } /* loop over branches */
4303 DEBUG_TRIE_COMPILE_r({
4304 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4305 PerlIO_printf( Perl_debug_log,
4306 "%*s- %s (%d) <SCAN FINISHED>\n",
4308 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4311 if ( last && trietype ) {
4312 if ( trietype != NOTHING ) {
4313 /* the last branch of the sequence was part of
4314 * a trie, so we have to construct it here
4315 * outside of the loop */
4316 made= make_trie( pRExC_state, startbranch,
4317 first, scan, tail, count,
4318 trietype, depth+1 );
4319 #ifdef TRIE_STUDY_OPT
4320 if ( ((made == MADE_EXACT_TRIE &&
4321 startbranch == first)
4322 || ( first_non_open == first )) &&
4324 flags |= SCF_TRIE_RESTUDY;
4325 if ( startbranch == first
4328 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4333 /* at this point we know whatever we have is a
4334 * NOTHING sequence/branch AND if 'startbranch'
4335 * is 'first' then we can turn the whole thing
4338 if ( startbranch == first ) {
4340 /* the entire thing is a NOTHING sequence,
4341 * something like this: (?:|) So we can
4342 * turn it into a plain NOTHING op. */
4343 DEBUG_TRIE_COMPILE_r({
4344 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4345 PerlIO_printf( Perl_debug_log,
4346 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4347 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4350 OP(startbranch)= NOTHING;
4351 NEXT_OFF(startbranch)= tail - startbranch;
4352 for ( opt= startbranch + 1; opt < tail ; opt++ )
4356 } /* end if ( last) */
4357 } /* TRIE_MAXBUF is non zero */
4362 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4363 scan = NEXTOPER(NEXTOPER(scan));
4364 } else /* single branch is optimized. */
4365 scan = NEXTOPER(scan);
4367 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4369 regnode *start = NULL;
4370 regnode *end = NULL;
4371 U32 my_recursed_depth= recursed_depth;
4374 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4375 /* Do setup, note this code has side effects beyond
4376 * the rest of this block. Specifically setting
4377 * RExC_recurse[] must happen at least once during
4379 if (OP(scan) == GOSUB) {
4381 RExC_recurse[ARG2L(scan)] = scan;
4382 start = RExC_open_parens[paren-1];
4383 end = RExC_close_parens[paren-1];
4385 start = RExC_rxi->program + 1;
4388 /* NOTE we MUST always execute the above code, even
4389 * if we do nothing with a GOSUB/GOSTART */
4391 ( flags & SCF_IN_DEFINE )
4394 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4396 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4399 /* no need to do anything here if we are in a define. */
4400 /* or we are after some kind of infinite construct
4401 * so we can skip recursing into this item.
4402 * Since it is infinite we will not change the maxlen
4403 * or delta, and if we miss something that might raise
4404 * the minlen it will merely pessimise a little.
4406 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4407 * might result in a minlen of 1 and not of 4,
4408 * but this doesn't make us mismatch, just try a bit
4409 * harder than we should.
4411 scan= regnext(scan);
4418 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4420 /* it is quite possible that there are more efficient ways
4421 * to do this. We maintain a bitmap per level of recursion
4422 * of which patterns we have entered so we can detect if a
4423 * pattern creates a possible infinite loop. When we
4424 * recurse down a level we copy the previous levels bitmap
4425 * down. When we are at recursion level 0 we zero the top
4426 * level bitmap. It would be nice to implement a different
4427 * more efficient way of doing this. In particular the top
4428 * level bitmap may be unnecessary.
4430 if (!recursed_depth) {
4431 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4433 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4434 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4435 RExC_study_chunk_recursed_bytes, U8);
4437 /* we havent recursed into this paren yet, so recurse into it */
4438 DEBUG_STUDYDATA("set:", data,depth);
4439 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4440 my_recursed_depth= recursed_depth + 1;
4442 DEBUG_STUDYDATA("inf:", data,depth);
4443 /* some form of infinite recursion, assume infinite length
4445 if (flags & SCF_DO_SUBSTR) {
4446 scan_commit(pRExC_state, data, minlenp, is_inf);
4447 data->longest = &(data->longest_float);
4449 is_inf = is_inf_internal = 1;
4450 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4451 ssc_anything(data->start_class);
4452 flags &= ~SCF_DO_STCLASS;
4454 start= NULL; /* reset start so we dont recurse later on. */
4459 end = regnext(scan);
4462 scan_frame *newframe;
4464 if (!RExC_frame_last) {
4465 Newxz(newframe, 1, scan_frame);
4466 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4467 RExC_frame_head= newframe;
4469 } else if (!RExC_frame_last->next_frame) {
4470 Newxz(newframe,1,scan_frame);
4471 RExC_frame_last->next_frame= newframe;
4472 newframe->prev_frame= RExC_frame_last;
4475 newframe= RExC_frame_last->next_frame;
4477 RExC_frame_last= newframe;
4479 newframe->next_regnode = regnext(scan);
4480 newframe->last_regnode = last;
4481 newframe->stopparen = stopparen;
4482 newframe->prev_recursed_depth = recursed_depth;
4483 newframe->this_prev_frame= frame;
4485 DEBUG_STUDYDATA("frame-new:",data,depth);
4486 DEBUG_PEEP("fnew", scan, depth);
4493 recursed_depth= my_recursed_depth;
4498 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4499 SSize_t l = STR_LEN(scan);
4502 const U8 * const s = (U8*)STRING(scan);
4503 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4504 l = utf8_length(s, s + l);
4506 uc = *((U8*)STRING(scan));
4509 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4510 /* The code below prefers earlier match for fixed
4511 offset, later match for variable offset. */
4512 if (data->last_end == -1) { /* Update the start info. */
4513 data->last_start_min = data->pos_min;
4514 data->last_start_max = is_inf
4515 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4517 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4519 SvUTF8_on(data->last_found);
4521 SV * const sv = data->last_found;
4522 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4523 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4524 if (mg && mg->mg_len >= 0)
4525 mg->mg_len += utf8_length((U8*)STRING(scan),
4526 (U8*)STRING(scan)+STR_LEN(scan));
4528 data->last_end = data->pos_min + l;
4529 data->pos_min += l; /* As in the first entry. */
4530 data->flags &= ~SF_BEFORE_EOL;
4533 /* ANDing the code point leaves at most it, and not in locale, and
4534 * can't match null string */
4535 if (flags & SCF_DO_STCLASS_AND) {
4536 ssc_cp_and(data->start_class, uc);
4537 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4538 ssc_clear_locale(data->start_class);
4540 else if (flags & SCF_DO_STCLASS_OR) {
4541 ssc_add_cp(data->start_class, uc);
4542 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4544 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4545 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4547 flags &= ~SCF_DO_STCLASS;
4549 else if (PL_regkind[OP(scan)] == EXACT) {
4550 /* But OP != EXACT!, so is EXACTFish */
4551 SSize_t l = STR_LEN(scan);
4552 const U8 * s = (U8*)STRING(scan);
4554 /* Search for fixed substrings supports EXACT only. */
4555 if (flags & SCF_DO_SUBSTR) {
4557 scan_commit(pRExC_state, data, minlenp, is_inf);
4560 l = utf8_length(s, s + l);
4562 if (unfolded_multi_char) {
4563 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4565 min += l - min_subtract;
4567 delta += min_subtract;
4568 if (flags & SCF_DO_SUBSTR) {
4569 data->pos_min += l - min_subtract;
4570 if (data->pos_min < 0) {
4573 data->pos_delta += min_subtract;
4575 data->longest = &(data->longest_float);
4579 if (flags & SCF_DO_STCLASS) {
4580 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4582 assert(EXACTF_invlist);
4583 if (flags & SCF_DO_STCLASS_AND) {
4584 if (OP(scan) != EXACTFL)
4585 ssc_clear_locale(data->start_class);
4586 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4587 ANYOF_POSIXL_ZERO(data->start_class);
4588 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4590 else { /* SCF_DO_STCLASS_OR */
4591 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4592 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4594 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4595 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4597 flags &= ~SCF_DO_STCLASS;
4598 SvREFCNT_dec(EXACTF_invlist);
4601 else if (REGNODE_VARIES(OP(scan))) {
4602 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4603 I32 fl = 0, f = flags;
4604 regnode * const oscan = scan;
4605 regnode_ssc this_class;
4606 regnode_ssc *oclass = NULL;
4607 I32 next_is_eval = 0;
4609 switch (PL_regkind[OP(scan)]) {
4610 case WHILEM: /* End of (?:...)* . */
4611 scan = NEXTOPER(scan);
4614 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4615 next = NEXTOPER(scan);
4616 if (OP(next) == EXACT
4617 || OP(next) == EXACTL
4618 || (flags & SCF_DO_STCLASS))
4621 maxcount = REG_INFTY;
4622 next = regnext(scan);
4623 scan = NEXTOPER(scan);
4627 if (flags & SCF_DO_SUBSTR)
4632 if (flags & SCF_DO_STCLASS) {
4634 maxcount = REG_INFTY;
4635 next = regnext(scan);
4636 scan = NEXTOPER(scan);
4639 if (flags & SCF_DO_SUBSTR) {
4640 scan_commit(pRExC_state, data, minlenp, is_inf);
4641 /* Cannot extend fixed substrings */
4642 data->longest = &(data->longest_float);
4644 is_inf = is_inf_internal = 1;
4645 scan = regnext(scan);
4646 goto optimize_curly_tail;
4648 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4649 && (scan->flags == stopparen))
4654 mincount = ARG1(scan);
4655 maxcount = ARG2(scan);
4657 next = regnext(scan);
4658 if (OP(scan) == CURLYX) {
4659 I32 lp = (data ? *(data->last_closep) : 0);
4660 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4662 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4663 next_is_eval = (OP(scan) == EVAL);
4665 if (flags & SCF_DO_SUBSTR) {
4667 scan_commit(pRExC_state, data, minlenp, is_inf);
4668 /* Cannot extend fixed substrings */
4669 pos_before = data->pos_min;
4673 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4675 data->flags |= SF_IS_INF;
4677 if (flags & SCF_DO_STCLASS) {
4678 ssc_init(pRExC_state, &this_class);
4679 oclass = data->start_class;
4680 data->start_class = &this_class;
4681 f |= SCF_DO_STCLASS_AND;
4682 f &= ~SCF_DO_STCLASS_OR;
4684 /* Exclude from super-linear cache processing any {n,m}
4685 regops for which the combination of input pos and regex
4686 pos is not enough information to determine if a match
4689 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4690 regex pos at the \s*, the prospects for a match depend not
4691 only on the input position but also on how many (bar\s*)
4692 repeats into the {4,8} we are. */
4693 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4694 f &= ~SCF_WHILEM_VISITED_POS;
4696 /* This will finish on WHILEM, setting scan, or on NULL: */
4697 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4698 last, data, stopparen, recursed_depth, NULL,
4700 ? (f & ~SCF_DO_SUBSTR)
4704 if (flags & SCF_DO_STCLASS)
4705 data->start_class = oclass;
4706 if (mincount == 0 || minnext == 0) {
4707 if (flags & SCF_DO_STCLASS_OR) {
4708 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4710 else if (flags & SCF_DO_STCLASS_AND) {
4711 /* Switch to OR mode: cache the old value of
4712 * data->start_class */
4714 StructCopy(data->start_class, and_withp, regnode_ssc);
4715 flags &= ~SCF_DO_STCLASS_AND;
4716 StructCopy(&this_class, data->start_class, regnode_ssc);
4717 flags |= SCF_DO_STCLASS_OR;
4718 ANYOF_FLAGS(data->start_class)
4719 |= SSC_MATCHES_EMPTY_STRING;
4721 } else { /* Non-zero len */
4722 if (flags & SCF_DO_STCLASS_OR) {
4723 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4724 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4726 else if (flags & SCF_DO_STCLASS_AND)
4727 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4728 flags &= ~SCF_DO_STCLASS;
4730 if (!scan) /* It was not CURLYX, but CURLY. */
4732 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4733 /* ? quantifier ok, except for (?{ ... }) */
4734 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4735 && (minnext == 0) && (deltanext == 0)
4736 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4737 && maxcount <= REG_INFTY/3) /* Complement check for big
4740 /* Fatal warnings may leak the regexp without this: */
4741 SAVEFREESV(RExC_rx_sv);
4742 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4743 "Quantifier unexpected on zero-length expression "
4744 "in regex m/%"UTF8f"/",
4745 UTF8fARG(UTF, RExC_end - RExC_precomp,
4747 (void)ReREFCNT_inc(RExC_rx_sv);
4750 min += minnext * mincount;
4751 is_inf_internal |= deltanext == SSize_t_MAX
4752 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4753 is_inf |= is_inf_internal;
4755 delta = SSize_t_MAX;
4757 delta += (minnext + deltanext) * maxcount
4758 - minnext * mincount;
4760 /* Try powerful optimization CURLYX => CURLYN. */
4761 if ( OP(oscan) == CURLYX && data
4762 && data->flags & SF_IN_PAR
4763 && !(data->flags & SF_HAS_EVAL)
4764 && !deltanext && minnext == 1 ) {
4765 /* Try to optimize to CURLYN. */
4766 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4767 regnode * const nxt1 = nxt;
4774 if (!REGNODE_SIMPLE(OP(nxt))
4775 && !(PL_regkind[OP(nxt)] == EXACT
4776 && STR_LEN(nxt) == 1))
4782 if (OP(nxt) != CLOSE)
4784 if (RExC_open_parens) {
4785 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4786 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4788 /* Now we know that nxt2 is the only contents: */
4789 oscan->flags = (U8)ARG(nxt);
4791 OP(nxt1) = NOTHING; /* was OPEN. */
4794 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4795 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4796 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4797 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4798 OP(nxt + 1) = OPTIMIZED; /* was count. */
4799 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4804 /* Try optimization CURLYX => CURLYM. */
4805 if ( OP(oscan) == CURLYX && data
4806 && !(data->flags & SF_HAS_PAR)
4807 && !(data->flags & SF_HAS_EVAL)
4808 && !deltanext /* atom is fixed width */
4809 && minnext != 0 /* CURLYM can't handle zero width */
4811 /* Nor characters whose fold at run-time may be
4812 * multi-character */
4813 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4815 /* XXXX How to optimize if data == 0? */
4816 /* Optimize to a simpler form. */
4817 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4821 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4822 && (OP(nxt2) != WHILEM))
4824 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4825 /* Need to optimize away parenths. */
4826 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4827 /* Set the parenth number. */
4828 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4830 oscan->flags = (U8)ARG(nxt);
4831 if (RExC_open_parens) {
4832 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4833 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4835 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4836 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4839 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4840 OP(nxt + 1) = OPTIMIZED; /* was count. */
4841 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4842 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4845 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4846 regnode *nnxt = regnext(nxt1);
4848 if (reg_off_by_arg[OP(nxt1)])
4849 ARG_SET(nxt1, nxt2 - nxt1);
4850 else if (nxt2 - nxt1 < U16_MAX)
4851 NEXT_OFF(nxt1) = nxt2 - nxt1;
4853 OP(nxt) = NOTHING; /* Cannot beautify */
4858 /* Optimize again: */
4859 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4860 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4865 else if ((OP(oscan) == CURLYX)
4866 && (flags & SCF_WHILEM_VISITED_POS)
4867 /* See the comment on a similar expression above.
4868 However, this time it's not a subexpression
4869 we care about, but the expression itself. */
4870 && (maxcount == REG_INFTY)
4871 && data && ++data->whilem_c < 16) {
4872 /* This stays as CURLYX, we can put the count/of pair. */
4873 /* Find WHILEM (as in regexec.c) */
4874 regnode *nxt = oscan + NEXT_OFF(oscan);
4876 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4878 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4879 | (RExC_whilem_seen << 4)); /* On WHILEM */
4881 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4883 if (flags & SCF_DO_SUBSTR) {
4884 SV *last_str = NULL;
4885 STRLEN last_chrs = 0;
4886 int counted = mincount != 0;
4888 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4890 SSize_t b = pos_before >= data->last_start_min
4891 ? pos_before : data->last_start_min;
4893 const char * const s = SvPV_const(data->last_found, l);
4894 SSize_t old = b - data->last_start_min;
4897 old = utf8_hop((U8*)s, old) - (U8*)s;
4899 /* Get the added string: */
4900 last_str = newSVpvn_utf8(s + old, l, UTF);
4901 last_chrs = UTF ? utf8_length((U8*)(s + old),
4902 (U8*)(s + old + l)) : l;
4903 if (deltanext == 0 && pos_before == b) {
4904 /* What was added is a constant string */
4907 SvGROW(last_str, (mincount * l) + 1);
4908 repeatcpy(SvPVX(last_str) + l,
4909 SvPVX_const(last_str), l,
4911 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4912 /* Add additional parts. */
4913 SvCUR_set(data->last_found,
4914 SvCUR(data->last_found) - l);
4915 sv_catsv(data->last_found, last_str);
4917 SV * sv = data->last_found;
4919 SvUTF8(sv) && SvMAGICAL(sv) ?
4920 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4921 if (mg && mg->mg_len >= 0)
4922 mg->mg_len += last_chrs * (mincount-1);
4924 last_chrs *= mincount;
4925 data->last_end += l * (mincount - 1);
4928 /* start offset must point into the last copy */
4929 data->last_start_min += minnext * (mincount - 1);
4930 data->last_start_max =
4933 : data->last_start_max +
4934 (maxcount - 1) * (minnext + data->pos_delta);
4937 /* It is counted once already... */
4938 data->pos_min += minnext * (mincount - counted);
4940 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4941 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4942 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4943 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4945 if (deltanext != SSize_t_MAX)
4946 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4947 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4948 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4950 if (deltanext == SSize_t_MAX
4951 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4952 data->pos_delta = SSize_t_MAX;
4954 data->pos_delta += - counted * deltanext +
4955 (minnext + deltanext) * maxcount - minnext * mincount;
4956 if (mincount != maxcount) {
4957 /* Cannot extend fixed substrings found inside
4959 scan_commit(pRExC_state, data, minlenp, is_inf);
4960 if (mincount && last_str) {
4961 SV * const sv = data->last_found;
4962 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4963 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4967 sv_setsv(sv, last_str);
4968 data->last_end = data->pos_min;
4969 data->last_start_min = data->pos_min - last_chrs;
4970 data->last_start_max = is_inf
4972 : data->pos_min + data->pos_delta - last_chrs;
4974 data->longest = &(data->longest_float);
4976 SvREFCNT_dec(last_str);
4978 if (data && (fl & SF_HAS_EVAL))
4979 data->flags |= SF_HAS_EVAL;
4980 optimize_curly_tail:
4981 if (OP(oscan) != CURLYX) {
4982 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4984 NEXT_OFF(oscan) += NEXT_OFF(next);
4990 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4995 if (flags & SCF_DO_SUBSTR) {
4996 /* Cannot expect anything... */
4997 scan_commit(pRExC_state, data, minlenp, is_inf);
4998 data->longest = &(data->longest_float);
5000 is_inf = is_inf_internal = 1;
5001 if (flags & SCF_DO_STCLASS_OR) {
5002 if (OP(scan) == CLUMP) {
5003 /* Actually is any start char, but very few code points
5004 * aren't start characters */
5005 ssc_match_all_cp(data->start_class);
5008 ssc_anything(data->start_class);
5011 flags &= ~SCF_DO_STCLASS;
5015 else if (OP(scan) == LNBREAK) {
5016 if (flags & SCF_DO_STCLASS) {
5017 if (flags & SCF_DO_STCLASS_AND) {
5018 ssc_intersection(data->start_class,
5019 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5020 ssc_clear_locale(data->start_class);
5021 ANYOF_FLAGS(data->start_class)
5022 &= ~SSC_MATCHES_EMPTY_STRING;
5024 else if (flags & SCF_DO_STCLASS_OR) {
5025 ssc_union(data->start_class,
5026 PL_XPosix_ptrs[_CC_VERTSPACE],
5028 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5030 /* See commit msg for
5031 * 749e076fceedeb708a624933726e7989f2302f6a */
5032 ANYOF_FLAGS(data->start_class)
5033 &= ~SSC_MATCHES_EMPTY_STRING;
5035 flags &= ~SCF_DO_STCLASS;
5038 if (delta != SSize_t_MAX)
5039 delta++; /* Because of the 2 char string cr-lf */
5040 if (flags & SCF_DO_SUBSTR) {
5041 /* Cannot expect anything... */
5042 scan_commit(pRExC_state, data, minlenp, is_inf);
5044 data->pos_delta += 1;
5045 data->longest = &(data->longest_float);
5048 else if (REGNODE_SIMPLE(OP(scan))) {
5050 if (flags & SCF_DO_SUBSTR) {
5051 scan_commit(pRExC_state, data, minlenp, is_inf);
5055 if (flags & SCF_DO_STCLASS) {
5057 SV* my_invlist = NULL;
5060 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5061 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5063 /* Some of the logic below assumes that switching
5064 locale on will only add false positives. */
5069 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5073 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5074 ssc_match_all_cp(data->start_class);
5079 SV* REG_ANY_invlist = _new_invlist(2);
5080 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5082 if (flags & SCF_DO_STCLASS_OR) {
5083 ssc_union(data->start_class,
5085 TRUE /* TRUE => invert, hence all but \n
5089 else if (flags & SCF_DO_STCLASS_AND) {
5090 ssc_intersection(data->start_class,
5092 TRUE /* TRUE => invert */
5094 ssc_clear_locale(data->start_class);
5096 SvREFCNT_dec_NN(REG_ANY_invlist);
5102 if (flags & SCF_DO_STCLASS_AND)
5103 ssc_and(pRExC_state, data->start_class,
5104 (regnode_charclass *) scan);
5106 ssc_or(pRExC_state, data->start_class,
5107 (regnode_charclass *) scan);
5115 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5116 if (flags & SCF_DO_STCLASS_AND) {
5117 bool was_there = cBOOL(
5118 ANYOF_POSIXL_TEST(data->start_class,
5120 ANYOF_POSIXL_ZERO(data->start_class);
5121 if (was_there) { /* Do an AND */
5122 ANYOF_POSIXL_SET(data->start_class, namedclass);
5124 /* No individual code points can now match */
5125 data->start_class->invlist
5126 = sv_2mortal(_new_invlist(0));
5129 int complement = namedclass + ((invert) ? -1 : 1);
5131 assert(flags & SCF_DO_STCLASS_OR);
5133 /* If the complement of this class was already there,
5134 * the result is that they match all code points,
5135 * (\d + \D == everything). Remove the classes from
5136 * future consideration. Locale is not relevant in
5138 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5139 ssc_match_all_cp(data->start_class);
5140 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5141 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5143 else { /* The usual case; just add this class to the
5145 ANYOF_POSIXL_SET(data->start_class, namedclass);
5150 case NPOSIXA: /* For these, we always know the exact set of
5155 if (FLAGS(scan) == _CC_ASCII) {
5156 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5159 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5160 PL_XPosix_ptrs[_CC_ASCII],
5171 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5173 /* NPOSIXD matches all upper Latin1 code points unless the
5174 * target string being matched is UTF-8, which is
5175 * unknowable until match time. Since we are going to
5176 * invert, we want to get rid of all of them so that the
5177 * inversion will match all */
5178 if (OP(scan) == NPOSIXD) {
5179 _invlist_subtract(my_invlist, PL_UpperLatin1,
5185 if (flags & SCF_DO_STCLASS_AND) {
5186 ssc_intersection(data->start_class, my_invlist, invert);
5187 ssc_clear_locale(data->start_class);
5190 assert(flags & SCF_DO_STCLASS_OR);
5191 ssc_union(data->start_class, my_invlist, invert);
5193 SvREFCNT_dec(my_invlist);
5195 if (flags & SCF_DO_STCLASS_OR)
5196 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5197 flags &= ~SCF_DO_STCLASS;
5200 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5201 data->flags |= (OP(scan) == MEOL
5204 scan_commit(pRExC_state, data, minlenp, is_inf);
5207 else if ( PL_regkind[OP(scan)] == BRANCHJ
5208 /* Lookbehind, or need to calculate parens/evals/stclass: */
5209 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5210 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5212 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5213 || OP(scan) == UNLESSM )
5215 /* Negative Lookahead/lookbehind
5216 In this case we can't do fixed string optimisation.
5219 SSize_t deltanext, minnext, fake = 0;
5224 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5226 data_fake.whilem_c = data->whilem_c;
5227 data_fake.last_closep = data->last_closep;
5230 data_fake.last_closep = &fake;
5231 data_fake.pos_delta = delta;
5232 if ( flags & SCF_DO_STCLASS && !scan->flags
5233 && OP(scan) == IFMATCH ) { /* Lookahead */
5234 ssc_init(pRExC_state, &intrnl);
5235 data_fake.start_class = &intrnl;
5236 f |= SCF_DO_STCLASS_AND;
5238 if (flags & SCF_WHILEM_VISITED_POS)
5239 f |= SCF_WHILEM_VISITED_POS;
5240 next = regnext(scan);
5241 nscan = NEXTOPER(NEXTOPER(scan));
5242 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5243 last, &data_fake, stopparen,
5244 recursed_depth, NULL, f, depth+1);
5247 FAIL("Variable length lookbehind not implemented");
5249 else if (minnext > (I32)U8_MAX) {
5250 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5253 scan->flags = (U8)minnext;
5256 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5258 if (data_fake.flags & SF_HAS_EVAL)
5259 data->flags |= SF_HAS_EVAL;
5260 data->whilem_c = data_fake.whilem_c;
5262 if (f & SCF_DO_STCLASS_AND) {
5263 if (flags & SCF_DO_STCLASS_OR) {
5264 /* OR before, AND after: ideally we would recurse with
5265 * data_fake to get the AND applied by study of the
5266 * remainder of the pattern, and then derecurse;
5267 * *** HACK *** for now just treat as "no information".
5268 * See [perl #56690].
5270 ssc_init(pRExC_state, data->start_class);
5272 /* AND before and after: combine and continue. These
5273 * assertions are zero-length, so can match an EMPTY
5275 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5276 ANYOF_FLAGS(data->start_class)
5277 |= SSC_MATCHES_EMPTY_STRING;
5281 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5283 /* Positive Lookahead/lookbehind
5284 In this case we can do fixed string optimisation,
5285 but we must be careful about it. Note in the case of
5286 lookbehind the positions will be offset by the minimum
5287 length of the pattern, something we won't know about
5288 until after the recurse.
5290 SSize_t deltanext, fake = 0;
5294 /* We use SAVEFREEPV so that when the full compile
5295 is finished perl will clean up the allocated
5296 minlens when it's all done. This way we don't
5297 have to worry about freeing them when we know
5298 they wont be used, which would be a pain.
5301 Newx( minnextp, 1, SSize_t );
5302 SAVEFREEPV(minnextp);
5305 StructCopy(data, &data_fake, scan_data_t);
5306 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5309 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5310 data_fake.last_found=newSVsv(data->last_found);
5314 data_fake.last_closep = &fake;
5315 data_fake.flags = 0;
5316 data_fake.pos_delta = delta;
5318 data_fake.flags |= SF_IS_INF;
5319 if ( flags & SCF_DO_STCLASS && !scan->flags
5320 && OP(scan) == IFMATCH ) { /* Lookahead */
5321 ssc_init(pRExC_state, &intrnl);
5322 data_fake.start_class = &intrnl;
5323 f |= SCF_DO_STCLASS_AND;
5325 if (flags & SCF_WHILEM_VISITED_POS)
5326 f |= SCF_WHILEM_VISITED_POS;
5327 next = regnext(scan);
5328 nscan = NEXTOPER(NEXTOPER(scan));
5330 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5331 &deltanext, last, &data_fake,
5332 stopparen, recursed_depth, NULL,
5336 FAIL("Variable length lookbehind not implemented");
5338 else if (*minnextp > (I32)U8_MAX) {
5339 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5342 scan->flags = (U8)*minnextp;
5347 if (f & SCF_DO_STCLASS_AND) {
5348 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5349 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5352 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5354 if (data_fake.flags & SF_HAS_EVAL)
5355 data->flags |= SF_HAS_EVAL;
5356 data->whilem_c = data_fake.whilem_c;
5357 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5358 if (RExC_rx->minlen<*minnextp)
5359 RExC_rx->minlen=*minnextp;
5360 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5361 SvREFCNT_dec_NN(data_fake.last_found);
5363 if ( data_fake.minlen_fixed != minlenp )
5365 data->offset_fixed= data_fake.offset_fixed;
5366 data->minlen_fixed= data_fake.minlen_fixed;
5367 data->lookbehind_fixed+= scan->flags;
5369 if ( data_fake.minlen_float != minlenp )
5371 data->minlen_float= data_fake.minlen_float;
5372 data->offset_float_min=data_fake.offset_float_min;
5373 data->offset_float_max=data_fake.offset_float_max;
5374 data->lookbehind_float+= scan->flags;
5381 else if (OP(scan) == OPEN) {
5382 if (stopparen != (I32)ARG(scan))
5385 else if (OP(scan) == CLOSE) {
5386 if (stopparen == (I32)ARG(scan)) {
5389 if ((I32)ARG(scan) == is_par) {
5390 next = regnext(scan);
5392 if ( next && (OP(next) != WHILEM) && next < last)
5393 is_par = 0; /* Disable optimization */
5396 *(data->last_closep) = ARG(scan);
5398 else if (OP(scan) == EVAL) {
5400 data->flags |= SF_HAS_EVAL;
5402 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5403 if (flags & SCF_DO_SUBSTR) {
5404 scan_commit(pRExC_state, data, minlenp, is_inf);
5405 flags &= ~SCF_DO_SUBSTR;
5407 if (data && OP(scan)==ACCEPT) {
5408 data->flags |= SCF_SEEN_ACCEPT;
5413 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5415 if (flags & SCF_DO_SUBSTR) {
5416 scan_commit(pRExC_state, data, minlenp, is_inf);
5417 data->longest = &(data->longest_float);
5419 is_inf = is_inf_internal = 1;
5420 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5421 ssc_anything(data->start_class);
5422 flags &= ~SCF_DO_STCLASS;
5424 else if (OP(scan) == GPOS) {
5425 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5426 !(delta || is_inf || (data && data->pos_delta)))
5428 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5429 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5430 if (RExC_rx->gofs < (STRLEN)min)
5431 RExC_rx->gofs = min;
5433 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5437 #ifdef TRIE_STUDY_OPT
5438 #ifdef FULL_TRIE_STUDY
5439 else if (PL_regkind[OP(scan)] == TRIE) {
5440 /* NOTE - There is similar code to this block above for handling
5441 BRANCH nodes on the initial study. If you change stuff here
5443 regnode *trie_node= scan;
5444 regnode *tail= regnext(scan);
5445 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5446 SSize_t max1 = 0, min1 = SSize_t_MAX;
5449 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5450 /* Cannot merge strings after this. */
5451 scan_commit(pRExC_state, data, minlenp, is_inf);
5453 if (flags & SCF_DO_STCLASS)
5454 ssc_init_zero(pRExC_state, &accum);
5460 const regnode *nextbranch= NULL;
5463 for ( word=1 ; word <= trie->wordcount ; word++)
5465 SSize_t deltanext=0, minnext=0, f = 0, fake;
5466 regnode_ssc this_class;
5468 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5470 data_fake.whilem_c = data->whilem_c;
5471 data_fake.last_closep = data->last_closep;
5474 data_fake.last_closep = &fake;
5475 data_fake.pos_delta = delta;
5476 if (flags & SCF_DO_STCLASS) {
5477 ssc_init(pRExC_state, &this_class);
5478 data_fake.start_class = &this_class;
5479 f = SCF_DO_STCLASS_AND;
5481 if (flags & SCF_WHILEM_VISITED_POS)
5482 f |= SCF_WHILEM_VISITED_POS;
5484 if (trie->jump[word]) {
5486 nextbranch = trie_node + trie->jump[0];
5487 scan= trie_node + trie->jump[word];
5488 /* We go from the jump point to the branch that follows
5489 it. Note this means we need the vestigal unused
5490 branches even though they arent otherwise used. */
5491 minnext = study_chunk(pRExC_state, &scan, minlenp,
5492 &deltanext, (regnode *)nextbranch, &data_fake,
5493 stopparen, recursed_depth, NULL, f,depth+1);
5495 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5496 nextbranch= regnext((regnode*)nextbranch);
5498 if (min1 > (SSize_t)(minnext + trie->minlen))
5499 min1 = minnext + trie->minlen;
5500 if (deltanext == SSize_t_MAX) {
5501 is_inf = is_inf_internal = 1;
5503 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5504 max1 = minnext + deltanext + trie->maxlen;
5506 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5508 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5509 if ( stopmin > min + min1)
5510 stopmin = min + min1;
5511 flags &= ~SCF_DO_SUBSTR;
5513 data->flags |= SCF_SEEN_ACCEPT;
5516 if (data_fake.flags & SF_HAS_EVAL)
5517 data->flags |= SF_HAS_EVAL;
5518 data->whilem_c = data_fake.whilem_c;
5520 if (flags & SCF_DO_STCLASS)
5521 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5524 if (flags & SCF_DO_SUBSTR) {
5525 data->pos_min += min1;
5526 data->pos_delta += max1 - min1;
5527 if (max1 != min1 || is_inf)
5528 data->longest = &(data->longest_float);
5531 if (delta != SSize_t_MAX)
5532 delta += max1 - min1;
5533 if (flags & SCF_DO_STCLASS_OR) {
5534 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5536 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5537 flags &= ~SCF_DO_STCLASS;
5540 else if (flags & SCF_DO_STCLASS_AND) {
5542 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5543 flags &= ~SCF_DO_STCLASS;
5546 /* Switch to OR mode: cache the old value of
5547 * data->start_class */
5549 StructCopy(data->start_class, and_withp, regnode_ssc);
5550 flags &= ~SCF_DO_STCLASS_AND;
5551 StructCopy(&accum, data->start_class, regnode_ssc);
5552 flags |= SCF_DO_STCLASS_OR;
5559 else if (PL_regkind[OP(scan)] == TRIE) {
5560 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5563 min += trie->minlen;
5564 delta += (trie->maxlen - trie->minlen);
5565 flags &= ~SCF_DO_STCLASS; /* xxx */
5566 if (flags & SCF_DO_SUBSTR) {
5567 /* Cannot expect anything... */
5568 scan_commit(pRExC_state, data, minlenp, is_inf);
5569 data->pos_min += trie->minlen;
5570 data->pos_delta += (trie->maxlen - trie->minlen);
5571 if (trie->maxlen != trie->minlen)
5572 data->longest = &(data->longest_float);
5574 if (trie->jump) /* no more substrings -- for now /grr*/
5575 flags &= ~SCF_DO_SUBSTR;
5577 #endif /* old or new */
5578 #endif /* TRIE_STUDY_OPT */
5580 /* Else: zero-length, ignore. */
5581 scan = regnext(scan);
5583 /* If we are exiting a recursion we can unset its recursed bit
5584 * and allow ourselves to enter it again - no danger of an
5585 * infinite loop there.
5586 if (stopparen > -1 && recursed) {
5587 DEBUG_STUDYDATA("unset:", data,depth);
5588 PAREN_UNSET( recursed, stopparen);
5594 DEBUG_STUDYDATA("frame-end:",data,depth);
5595 DEBUG_PEEP("fend", scan, depth);
5597 /* restore previous context */
5598 last = frame->last_regnode;
5599 scan = frame->next_regnode;
5600 stopparen = frame->stopparen;
5601 recursed_depth = frame->prev_recursed_depth;
5603 RExC_frame_last = frame->prev_frame;
5604 frame = frame->this_prev_frame;
5605 goto fake_study_recurse;
5610 DEBUG_STUDYDATA("pre-fin:",data,depth);
5613 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5615 if (flags & SCF_DO_SUBSTR && is_inf)
5616 data->pos_delta = SSize_t_MAX - data->pos_min;
5617 if (is_par > (I32)U8_MAX)
5619 if (is_par && pars==1 && data) {
5620 data->flags |= SF_IN_PAR;
5621 data->flags &= ~SF_HAS_PAR;
5623 else if (pars && data) {
5624 data->flags |= SF_HAS_PAR;
5625 data->flags &= ~SF_IN_PAR;
5627 if (flags & SCF_DO_STCLASS_OR)
5628 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5629 if (flags & SCF_TRIE_RESTUDY)
5630 data->flags |= SCF_TRIE_RESTUDY;
5632 DEBUG_STUDYDATA("post-fin:",data,depth);
5635 SSize_t final_minlen= min < stopmin ? min : stopmin;
5637 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5638 if (final_minlen > SSize_t_MAX - delta)
5639 RExC_maxlen = SSize_t_MAX;
5640 else if (RExC_maxlen < final_minlen + delta)
5641 RExC_maxlen = final_minlen + delta;
5643 return final_minlen;
5645 NOT_REACHED; /* NOTREACHED */
5649 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5651 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5653 PERL_ARGS_ASSERT_ADD_DATA;
5655 Renewc(RExC_rxi->data,
5656 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5657 char, struct reg_data);
5659 Renew(RExC_rxi->data->what, count + n, U8);
5661 Newx(RExC_rxi->data->what, n, U8);
5662 RExC_rxi->data->count = count + n;
5663 Copy(s, RExC_rxi->data->what + count, n, U8);
5667 /*XXX: todo make this not included in a non debugging perl, but appears to be
5668 * used anyway there, in 'use re' */
5669 #ifndef PERL_IN_XSUB_RE
5671 Perl_reginitcolors(pTHX)
5673 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5675 char *t = savepv(s);
5679 t = strchr(t, '\t');
5685 PL_colors[i] = t = (char *)"";
5690 PL_colors[i++] = (char *)"";
5697 #ifdef TRIE_STUDY_OPT
5698 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5701 (data.flags & SCF_TRIE_RESTUDY) \
5709 #define CHECK_RESTUDY_GOTO_butfirst
5713 * pregcomp - compile a regular expression into internal code
5715 * Decides which engine's compiler to call based on the hint currently in
5719 #ifndef PERL_IN_XSUB_RE
5721 /* return the currently in-scope regex engine (or the default if none) */
5723 regexp_engine const *
5724 Perl_current_re_engine(pTHX)
5726 if (IN_PERL_COMPILETIME) {
5727 HV * const table = GvHV(PL_hintgv);
5730 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5731 return &PL_core_reg_engine;
5732 ptr = hv_fetchs(table, "regcomp", FALSE);
5733 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5734 return &PL_core_reg_engine;
5735 return INT2PTR(regexp_engine*,SvIV(*ptr));
5739 if (!PL_curcop->cop_hints_hash)
5740 return &PL_core_reg_engine;
5741 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5742 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5743 return &PL_core_reg_engine;
5744 return INT2PTR(regexp_engine*,SvIV(ptr));
5750 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5752 regexp_engine const *eng = current_re_engine();
5753 GET_RE_DEBUG_FLAGS_DECL;
5755 PERL_ARGS_ASSERT_PREGCOMP;
5757 /* Dispatch a request to compile a regexp to correct regexp engine. */
5759 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5762 return CALLREGCOMP_ENG(eng, pattern, flags);
5766 /* public(ish) entry point for the perl core's own regex compiling code.
5767 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5768 * pattern rather than a list of OPs, and uses the internal engine rather
5769 * than the current one */
5772 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5774 SV *pat = pattern; /* defeat constness! */
5775 PERL_ARGS_ASSERT_RE_COMPILE;
5776 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5777 #ifdef PERL_IN_XSUB_RE
5780 &PL_core_reg_engine,
5782 NULL, NULL, rx_flags, 0);
5786 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5787 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5788 * point to the realloced string and length.
5790 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5794 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5795 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5797 U8 *const src = (U8*)*pat_p;
5802 GET_RE_DEBUG_FLAGS_DECL;
5804 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5805 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5807 Newx(dst, *plen_p * 2 + 1, U8);
5810 while (s < *plen_p) {
5811 append_utf8_from_native_byte(src[s], &d);
5812 if (n < num_code_blocks) {
5813 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5814 pRExC_state->code_blocks[n].start = d - dst - 1;
5815 assert(*(d - 1) == '(');
5818 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5819 pRExC_state->code_blocks[n].end = d - dst - 1;
5820 assert(*(d - 1) == ')');
5829 *pat_p = (char*) dst;
5831 RExC_orig_utf8 = RExC_utf8 = 1;
5836 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5837 * while recording any code block indices, and handling overloading,
5838 * nested qr// objects etc. If pat is null, it will allocate a new
5839 * string, or just return the first arg, if there's only one.
5841 * Returns the malloced/updated pat.
5842 * patternp and pat_count is the array of SVs to be concatted;
5843 * oplist is the optional list of ops that generated the SVs;
5844 * recompile_p is a pointer to a boolean that will be set if
5845 * the regex will need to be recompiled.
5846 * delim, if non-null is an SV that will be inserted between each element
5850 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5851 SV *pat, SV ** const patternp, int pat_count,
5852 OP *oplist, bool *recompile_p, SV *delim)
5856 bool use_delim = FALSE;
5857 bool alloced = FALSE;
5859 /* if we know we have at least two args, create an empty string,
5860 * then concatenate args to that. For no args, return an empty string */
5861 if (!pat && pat_count != 1) {
5867 for (svp = patternp; svp < patternp + pat_count; svp++) {
5870 STRLEN orig_patlen = 0;
5872 SV *msv = use_delim ? delim : *svp;
5873 if (!msv) msv = &PL_sv_undef;
5875 /* if we've got a delimiter, we go round the loop twice for each
5876 * svp slot (except the last), using the delimiter the second
5885 if (SvTYPE(msv) == SVt_PVAV) {
5886 /* we've encountered an interpolated array within
5887 * the pattern, e.g. /...@a..../. Expand the list of elements,
5888 * then recursively append elements.
5889 * The code in this block is based on S_pushav() */
5891 AV *const av = (AV*)msv;
5892 const SSize_t maxarg = AvFILL(av) + 1;
5896 assert(oplist->op_type == OP_PADAV
5897 || oplist->op_type == OP_RV2AV);
5898 oplist = OpSIBLING(oplist);
5901 if (SvRMAGICAL(av)) {
5904 Newx(array, maxarg, SV*);
5906 for (i=0; i < maxarg; i++) {
5907 SV ** const svp = av_fetch(av, i, FALSE);
5908 array[i] = svp ? *svp : &PL_sv_undef;
5912 array = AvARRAY(av);
5914 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5915 array, maxarg, NULL, recompile_p,
5917 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5923 /* we make the assumption here that each op in the list of
5924 * op_siblings maps to one SV pushed onto the stack,
5925 * except for code blocks, with have both an OP_NULL and
5927 * This allows us to match up the list of SVs against the
5928 * list of OPs to find the next code block.
5930 * Note that PUSHMARK PADSV PADSV ..
5932 * PADRANGE PADSV PADSV ..
5933 * so the alignment still works. */
5936 if (oplist->op_type == OP_NULL
5937 && (oplist->op_flags & OPf_SPECIAL))
5939 assert(n < pRExC_state->num_code_blocks);
5940 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5941 pRExC_state->code_blocks[n].block = oplist;
5942 pRExC_state->code_blocks[n].src_regex = NULL;
5945 oplist = OpSIBLING(oplist); /* skip CONST */
5948 oplist = OpSIBLING(oplist);;
5951 /* apply magic and QR overloading to arg */
5954 if (SvROK(msv) && SvAMAGIC(msv)) {
5955 SV *sv = AMG_CALLunary(msv, regexp_amg);
5959 if (SvTYPE(sv) != SVt_REGEXP)
5960 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5965 /* try concatenation overload ... */
5966 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5967 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5970 /* overloading involved: all bets are off over literal
5971 * code. Pretend we haven't seen it */
5972 pRExC_state->num_code_blocks -= n;
5976 /* ... or failing that, try "" overload */
5977 while (SvAMAGIC(msv)
5978 && (sv = AMG_CALLunary(msv, string_amg))
5982 && SvRV(msv) == SvRV(sv))
5987 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5991 /* this is a partially unrolled
5992 * sv_catsv_nomg(pat, msv);
5993 * that allows us to adjust code block indices if
5996 char *dst = SvPV_force_nomg(pat, dlen);
5998 if (SvUTF8(msv) && !SvUTF8(pat)) {
5999 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6000 sv_setpvn(pat, dst, dlen);
6003 sv_catsv_nomg(pat, msv);
6010 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6013 /* extract any code blocks within any embedded qr//'s */
6014 if (rx && SvTYPE(rx) == SVt_REGEXP
6015 && RX_ENGINE((REGEXP*)rx)->op_comp)
6018 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6019 if (ri->num_code_blocks) {
6021 /* the presence of an embedded qr// with code means
6022 * we should always recompile: the text of the
6023 * qr// may not have changed, but it may be a
6024 * different closure than last time */
6026 Renew(pRExC_state->code_blocks,
6027 pRExC_state->num_code_blocks + ri->num_code_blocks,
6028 struct reg_code_block);
6029 pRExC_state->num_code_blocks += ri->num_code_blocks;
6031 for (i=0; i < ri->num_code_blocks; i++) {
6032 struct reg_code_block *src, *dst;
6033 STRLEN offset = orig_patlen
6034 + ReANY((REGEXP *)rx)->pre_prefix;
6035 assert(n < pRExC_state->num_code_blocks);
6036 src = &ri->code_blocks[i];
6037 dst = &pRExC_state->code_blocks[n];
6038 dst->start = src->start + offset;
6039 dst->end = src->end + offset;
6040 dst->block = src->block;
6041 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6050 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6059 /* see if there are any run-time code blocks in the pattern.
6060 * False positives are allowed */
6063 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6064 char *pat, STRLEN plen)
6069 PERL_UNUSED_CONTEXT;
6071 for (s = 0; s < plen; s++) {
6072 if (n < pRExC_state->num_code_blocks
6073 && s == pRExC_state->code_blocks[n].start)
6075 s = pRExC_state->code_blocks[n].end;
6079 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6081 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6083 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6090 /* Handle run-time code blocks. We will already have compiled any direct
6091 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6092 * copy of it, but with any literal code blocks blanked out and
6093 * appropriate chars escaped; then feed it into
6095 * eval "qr'modified_pattern'"
6099 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6103 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6105 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6106 * and merge them with any code blocks of the original regexp.
6108 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6109 * instead, just save the qr and return FALSE; this tells our caller that
6110 * the original pattern needs upgrading to utf8.
6114 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6115 char *pat, STRLEN plen)
6119 GET_RE_DEBUG_FLAGS_DECL;
6121 if (pRExC_state->runtime_code_qr) {
6122 /* this is the second time we've been called; this should
6123 * only happen if the main pattern got upgraded to utf8
6124 * during compilation; re-use the qr we compiled first time
6125 * round (which should be utf8 too)
6127 qr = pRExC_state->runtime_code_qr;
6128 pRExC_state->runtime_code_qr = NULL;
6129 assert(RExC_utf8 && SvUTF8(qr));
6135 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6139 /* determine how many extra chars we need for ' and \ escaping */
6140 for (s = 0; s < plen; s++) {
6141 if (pat[s] == '\'' || pat[s] == '\\')
6145 Newx(newpat, newlen, char);
6147 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6149 for (s = 0; s < plen; s++) {
6150 if (n < pRExC_state->num_code_blocks
6151 && s == pRExC_state->code_blocks[n].start)
6153 /* blank out literal code block */
6154 assert(pat[s] == '(');
6155 while (s <= pRExC_state->code_blocks[n].end) {
6163 if (pat[s] == '\'' || pat[s] == '\\')
6168 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6172 PerlIO_printf(Perl_debug_log,
6173 "%sre-parsing pattern for runtime code:%s %s\n",
6174 PL_colors[4],PL_colors[5],newpat);
6177 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6183 PUSHSTACKi(PERLSI_REQUIRE);
6184 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6185 * parsing qr''; normally only q'' does this. It also alters
6187 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6188 SvREFCNT_dec_NN(sv);
6193 SV * const errsv = ERRSV;
6194 if (SvTRUE_NN(errsv))
6196 Safefree(pRExC_state->code_blocks);
6197 /* use croak_sv ? */
6198 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6201 assert(SvROK(qr_ref));
6203 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6204 /* the leaving below frees the tmp qr_ref.
6205 * Give qr a life of its own */
6213 if (!RExC_utf8 && SvUTF8(qr)) {
6214 /* first time through; the pattern got upgraded; save the
6215 * qr for the next time through */
6216 assert(!pRExC_state->runtime_code_qr);
6217 pRExC_state->runtime_code_qr = qr;
6222 /* extract any code blocks within the returned qr// */
6225 /* merge the main (r1) and run-time (r2) code blocks into one */
6227 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6228 struct reg_code_block *new_block, *dst;
6229 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6232 if (!r2->num_code_blocks) /* we guessed wrong */
6234 SvREFCNT_dec_NN(qr);
6239 r1->num_code_blocks + r2->num_code_blocks,
6240 struct reg_code_block);
6243 while ( i1 < r1->num_code_blocks
6244 || i2 < r2->num_code_blocks)
6246 struct reg_code_block *src;
6249 if (i1 == r1->num_code_blocks) {
6250 src = &r2->code_blocks[i2++];
6253 else if (i2 == r2->num_code_blocks)
6254 src = &r1->code_blocks[i1++];
6255 else if ( r1->code_blocks[i1].start
6256 < r2->code_blocks[i2].start)
6258 src = &r1->code_blocks[i1++];
6259 assert(src->end < r2->code_blocks[i2].start);
6262 assert( r1->code_blocks[i1].start
6263 > r2->code_blocks[i2].start);
6264 src = &r2->code_blocks[i2++];
6266 assert(src->end < r1->code_blocks[i1].start);
6269 assert(pat[src->start] == '(');
6270 assert(pat[src->end] == ')');
6271 dst->start = src->start;
6272 dst->end = src->end;
6273 dst->block = src->block;
6274 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6278 r1->num_code_blocks += r2->num_code_blocks;
6279 Safefree(r1->code_blocks);
6280 r1->code_blocks = new_block;
6283 SvREFCNT_dec_NN(qr);
6289 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6290 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6291 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6292 STRLEN longest_length, bool eol, bool meol)
6294 /* This is the common code for setting up the floating and fixed length
6295 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6296 * as to whether succeeded or not */
6301 if (! (longest_length
6302 || (eol /* Can't have SEOL and MULTI */
6303 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6305 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6306 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6311 /* copy the information about the longest from the reg_scan_data
6312 over to the program. */
6313 if (SvUTF8(sv_longest)) {
6314 *rx_utf8 = sv_longest;
6317 *rx_substr = sv_longest;
6320 /* end_shift is how many chars that must be matched that
6321 follow this item. We calculate it ahead of time as once the
6322 lookbehind offset is added in we lose the ability to correctly
6324 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6325 *rx_end_shift = ml - offset
6326 - longest_length + (SvTAIL(sv_longest) != 0)
6329 t = (eol/* Can't have SEOL and MULTI */
6330 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6331 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6337 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6338 * regular expression into internal code.
6339 * The pattern may be passed either as:
6340 * a list of SVs (patternp plus pat_count)
6341 * a list of OPs (expr)
6342 * If both are passed, the SV list is used, but the OP list indicates
6343 * which SVs are actually pre-compiled code blocks
6345 * The SVs in the list have magic and qr overloading applied to them (and
6346 * the list may be modified in-place with replacement SVs in the latter
6349 * If the pattern hasn't changed from old_re, then old_re will be
6352 * eng is the current engine. If that engine has an op_comp method, then
6353 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6354 * do the initial concatenation of arguments and pass on to the external
6357 * If is_bare_re is not null, set it to a boolean indicating whether the
6358 * arg list reduced (after overloading) to a single bare regex which has
6359 * been returned (i.e. /$qr/).
6361 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6363 * pm_flags contains the PMf_* flags, typically based on those from the
6364 * pm_flags field of the related PMOP. Currently we're only interested in
6365 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6367 * We can't allocate space until we know how big the compiled form will be,
6368 * but we can't compile it (and thus know how big it is) until we've got a
6369 * place to put the code. So we cheat: we compile it twice, once with code
6370 * generation turned off and size counting turned on, and once "for real".
6371 * This also means that we don't allocate space until we are sure that the
6372 * thing really will compile successfully, and we never have to move the
6373 * code and thus invalidate pointers into it. (Note that it has to be in
6374 * one piece because free() must be able to free it all.) [NB: not true in perl]
6376 * Beware that the optimization-preparation code in here knows about some
6377 * of the structure of the compiled regexp. [I'll say.]
6381 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6382 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6383 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6387 regexp_internal *ri;
6395 SV *code_blocksv = NULL;
6396 SV** new_patternp = patternp;
6398 /* these are all flags - maybe they should be turned
6399 * into a single int with different bit masks */
6400 I32 sawlookahead = 0;
6405 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6407 bool runtime_code = 0;
6409 RExC_state_t RExC_state;
6410 RExC_state_t * const pRExC_state = &RExC_state;
6411 #ifdef TRIE_STUDY_OPT
6413 RExC_state_t copyRExC_state;
6415 GET_RE_DEBUG_FLAGS_DECL;
6417 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6419 DEBUG_r(if (!PL_colorset) reginitcolors());
6421 /* Initialize these here instead of as-needed, as is quick and avoids
6422 * having to test them each time otherwise */
6423 if (! PL_AboveLatin1) {
6424 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6425 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6426 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6427 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6428 PL_HasMultiCharFold =
6429 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6431 /* This is calculated here, because the Perl program that generates the
6432 * static global ones doesn't currently have access to
6433 * NUM_ANYOF_CODE_POINTS */
6434 PL_InBitmap = _new_invlist(2);
6435 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6436 NUM_ANYOF_CODE_POINTS - 1);
6439 pRExC_state->code_blocks = NULL;
6440 pRExC_state->num_code_blocks = 0;
6443 *is_bare_re = FALSE;
6445 if (expr && (expr->op_type == OP_LIST ||
6446 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6447 /* allocate code_blocks if needed */
6451 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6452 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6453 ncode++; /* count of DO blocks */
6455 pRExC_state->num_code_blocks = ncode;
6456 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6461 /* compile-time pattern with just OP_CONSTs and DO blocks */
6466 /* find how many CONSTs there are */
6469 if (expr->op_type == OP_CONST)
6472 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6473 if (o->op_type == OP_CONST)
6477 /* fake up an SV array */
6479 assert(!new_patternp);
6480 Newx(new_patternp, n, SV*);
6481 SAVEFREEPV(new_patternp);
6485 if (expr->op_type == OP_CONST)
6486 new_patternp[n] = cSVOPx_sv(expr);
6488 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6489 if (o->op_type == OP_CONST)
6490 new_patternp[n++] = cSVOPo_sv;
6495 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6496 "Assembling pattern from %d elements%s\n", pat_count,
6497 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6499 /* set expr to the first arg op */
6501 if (pRExC_state->num_code_blocks
6502 && expr->op_type != OP_CONST)
6504 expr = cLISTOPx(expr)->op_first;
6505 assert( expr->op_type == OP_PUSHMARK
6506 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6507 || expr->op_type == OP_PADRANGE);
6508 expr = OpSIBLING(expr);
6511 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6512 expr, &recompile, NULL);
6514 /* handle bare (possibly after overloading) regex: foo =~ $re */
6519 if (SvTYPE(re) == SVt_REGEXP) {
6523 Safefree(pRExC_state->code_blocks);
6524 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6525 "Precompiled pattern%s\n",
6526 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6532 exp = SvPV_nomg(pat, plen);
6534 if (!eng->op_comp) {
6535 if ((SvUTF8(pat) && IN_BYTES)
6536 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6538 /* make a temporary copy; either to convert to bytes,
6539 * or to avoid repeating get-magic / overloaded stringify */
6540 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6541 (IN_BYTES ? 0 : SvUTF8(pat)));
6543 Safefree(pRExC_state->code_blocks);
6544 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6547 /* ignore the utf8ness if the pattern is 0 length */
6548 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6549 RExC_uni_semantics = 0;
6550 RExC_contains_locale = 0;
6551 RExC_contains_i = 0;
6552 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6553 pRExC_state->runtime_code_qr = NULL;
6554 RExC_frame_head= NULL;
6555 RExC_frame_last= NULL;
6556 RExC_frame_count= 0;
6559 RExC_mysv1= sv_newmortal();
6560 RExC_mysv2= sv_newmortal();
6563 SV *dsv= sv_newmortal();
6564 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6565 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6566 PL_colors[4],PL_colors[5],s);
6570 /* we jump here if we upgrade the pattern to utf8 and have to
6573 if ((pm_flags & PMf_USE_RE_EVAL)
6574 /* this second condition covers the non-regex literal case,
6575 * i.e. $foo =~ '(?{})'. */
6576 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6578 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6580 /* return old regex if pattern hasn't changed */
6581 /* XXX: note in the below we have to check the flags as well as the
6584 * Things get a touch tricky as we have to compare the utf8 flag
6585 * independently from the compile flags. */
6589 && !!RX_UTF8(old_re) == !!RExC_utf8
6590 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6591 && RX_PRECOMP(old_re)
6592 && RX_PRELEN(old_re) == plen
6593 && memEQ(RX_PRECOMP(old_re), exp, plen)
6594 && !runtime_code /* with runtime code, always recompile */ )
6596 Safefree(pRExC_state->code_blocks);
6600 rx_flags = orig_rx_flags;
6602 if (rx_flags & PMf_FOLD) {
6603 RExC_contains_i = 1;
6605 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6607 /* Set to use unicode semantics if the pattern is in utf8 and has the
6608 * 'depends' charset specified, as it means unicode when utf8 */
6609 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6613 RExC_flags = rx_flags;
6614 RExC_pm_flags = pm_flags;
6617 if (TAINTING_get && TAINT_get)
6618 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6620 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6621 /* whoops, we have a non-utf8 pattern, whilst run-time code
6622 * got compiled as utf8. Try again with a utf8 pattern */
6623 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6624 pRExC_state->num_code_blocks);
6625 goto redo_first_pass;
6628 assert(!pRExC_state->runtime_code_qr);
6634 RExC_in_lookbehind = 0;
6635 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6637 RExC_override_recoding = 0;
6639 RExC_recode_x_to_native = 0;
6641 RExC_in_multi_char_class = 0;
6643 /* First pass: determine size, legality. */
6646 RExC_end = exp + plen;
6651 RExC_emit = (regnode *) &RExC_emit_dummy;
6652 RExC_whilem_seen = 0;
6653 RExC_open_parens = NULL;
6654 RExC_close_parens = NULL;
6656 RExC_paren_names = NULL;
6658 RExC_paren_name_list = NULL;
6660 RExC_recurse = NULL;
6661 RExC_study_chunk_recursed = NULL;
6662 RExC_study_chunk_recursed_bytes= 0;
6663 RExC_recurse_count = 0;
6664 pRExC_state->code_index = 0;
6667 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6669 RExC_lastparse=NULL;
6671 /* reg may croak on us, not giving us a chance to free
6672 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6673 need it to survive as long as the regexp (qr/(?{})/).
6674 We must check that code_blocksv is not already set, because we may
6675 have jumped back to restart the sizing pass. */
6676 if (pRExC_state->code_blocks && !code_blocksv) {
6677 code_blocksv = newSV_type(SVt_PV);
6678 SAVEFREESV(code_blocksv);
6679 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6680 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6682 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6683 /* It's possible to write a regexp in ascii that represents Unicode
6684 codepoints outside of the byte range, such as via \x{100}. If we
6685 detect such a sequence we have to convert the entire pattern to utf8
6686 and then recompile, as our sizing calculation will have been based
6687 on 1 byte == 1 character, but we will need to use utf8 to encode
6688 at least some part of the pattern, and therefore must convert the whole
6691 if (flags & RESTART_UTF8) {
6692 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6693 pRExC_state->num_code_blocks);
6694 goto redo_first_pass;
6696 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6699 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6702 PerlIO_printf(Perl_debug_log,
6703 "Required size %"IVdf" nodes\n"
6704 "Starting second pass (creation)\n",
6707 RExC_lastparse=NULL;
6710 /* The first pass could have found things that force Unicode semantics */
6711 if ((RExC_utf8 || RExC_uni_semantics)
6712 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6714 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6717 /* Small enough for pointer-storage convention?
6718 If extralen==0, this means that we will not need long jumps. */
6719 if (RExC_size >= 0x10000L && RExC_extralen)
6720 RExC_size += RExC_extralen;
6723 if (RExC_whilem_seen > 15)
6724 RExC_whilem_seen = 15;
6726 /* Allocate space and zero-initialize. Note, the two step process
6727 of zeroing when in debug mode, thus anything assigned has to
6728 happen after that */
6729 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6731 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6732 char, regexp_internal);
6733 if ( r == NULL || ri == NULL )
6734 FAIL("Regexp out of space");
6736 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6737 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6740 /* bulk initialize base fields with 0. */
6741 Zero(ri, sizeof(regexp_internal), char);
6744 /* non-zero initialization begins here */
6747 r->extflags = rx_flags;
6748 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6750 if (pm_flags & PMf_IS_QR) {
6751 ri->code_blocks = pRExC_state->code_blocks;
6752 ri->num_code_blocks = pRExC_state->num_code_blocks;
6757 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6758 if (pRExC_state->code_blocks[n].src_regex)
6759 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6760 SAVEFREEPV(pRExC_state->code_blocks);
6764 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6765 bool has_charset = (get_regex_charset(r->extflags)
6766 != REGEX_DEPENDS_CHARSET);
6768 /* The caret is output if there are any defaults: if not all the STD
6769 * flags are set, or if no character set specifier is needed */
6771 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6773 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6774 == REG_RUN_ON_COMMENT_SEEN);
6775 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6776 >> RXf_PMf_STD_PMMOD_SHIFT);
6777 const char *fptr = STD_PAT_MODS; /*"msixn"*/
6779 /* Allocate for the worst case, which is all the std flags are turned
6780 * on. If more precision is desired, we could do a population count of
6781 * the flags set. This could be done with a small lookup table, or by
6782 * shifting, masking and adding, or even, when available, assembly
6783 * language for a machine-language population count.
6784 * We never output a minus, as all those are defaults, so are
6785 * covered by the caret */
6786 const STRLEN wraplen = plen + has_p + has_runon
6787 + has_default /* If needs a caret */
6789 /* If needs a character set specifier */
6790 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6791 + (sizeof(STD_PAT_MODS) - 1)
6792 + (sizeof("(?:)") - 1);
6794 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6795 r->xpv_len_u.xpvlenu_pv = p;
6797 SvFLAGS(rx) |= SVf_UTF8;
6800 /* If a default, cover it using the caret */
6802 *p++= DEFAULT_PAT_MOD;
6806 const char* const name = get_regex_charset_name(r->extflags, &len);
6807 Copy(name, p, len, char);
6811 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6814 while((ch = *fptr++)) {
6822 Copy(RExC_precomp, p, plen, char);
6823 assert ((RX_WRAPPED(rx) - p) < 16);
6824 r->pre_prefix = p - RX_WRAPPED(rx);
6830 SvCUR_set(rx, p - RX_WRAPPED(rx));
6834 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6836 /* setup various meta data about recursion, this all requires
6837 * RExC_npar to be correctly set, and a bit later on we clear it */
6838 if (RExC_seen & REG_RECURSE_SEEN) {
6839 Newxz(RExC_open_parens, RExC_npar,regnode *);
6840 SAVEFREEPV(RExC_open_parens);
6841 Newxz(RExC_close_parens,RExC_npar,regnode *);
6842 SAVEFREEPV(RExC_close_parens);
6844 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6845 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6846 * So its 1 if there are no parens. */
6847 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6848 ((RExC_npar & 0x07) != 0);
6849 Newx(RExC_study_chunk_recursed,
6850 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6851 SAVEFREEPV(RExC_study_chunk_recursed);
6854 /* Useful during FAIL. */
6855 #ifdef RE_TRACK_PATTERN_OFFSETS
6856 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6857 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6858 "%s %"UVuf" bytes for offset annotations.\n",
6859 ri->u.offsets ? "Got" : "Couldn't get",
6860 (UV)((2*RExC_size+1) * sizeof(U32))));
6862 SetProgLen(ri,RExC_size);
6867 /* Second pass: emit code. */
6868 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6869 RExC_pm_flags = pm_flags;
6871 RExC_end = exp + plen;
6874 RExC_emit_start = ri->program;
6875 RExC_emit = ri->program;
6876 RExC_emit_bound = ri->program + RExC_size + 1;
6877 pRExC_state->code_index = 0;
6879 *((char*) RExC_emit++) = (char) REG_MAGIC;
6880 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6882 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6884 /* XXXX To minimize changes to RE engine we always allocate
6885 3-units-long substrs field. */
6886 Newx(r->substrs, 1, struct reg_substr_data);
6887 if (RExC_recurse_count) {
6888 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6889 SAVEFREEPV(RExC_recurse);
6893 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6895 RExC_study_chunk_recursed_count= 0;
6897 Zero(r->substrs, 1, struct reg_substr_data);
6898 if (RExC_study_chunk_recursed) {
6899 Zero(RExC_study_chunk_recursed,
6900 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6904 #ifdef TRIE_STUDY_OPT
6906 StructCopy(&zero_scan_data, &data, scan_data_t);
6907 copyRExC_state = RExC_state;
6910 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6912 RExC_state = copyRExC_state;
6913 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6914 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6916 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6917 StructCopy(&zero_scan_data, &data, scan_data_t);
6920 StructCopy(&zero_scan_data, &data, scan_data_t);
6923 /* Dig out information for optimizations. */
6924 r->extflags = RExC_flags; /* was pm_op */
6925 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6928 SvUTF8_on(rx); /* Unicode in it? */
6929 ri->regstclass = NULL;
6930 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
6931 r->intflags |= PREGf_NAUGHTY;
6932 scan = ri->program + 1; /* First BRANCH. */
6934 /* testing for BRANCH here tells us whether there is "must appear"
6935 data in the pattern. If there is then we can use it for optimisations */
6936 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6939 STRLEN longest_float_length, longest_fixed_length;
6940 regnode_ssc ch_class; /* pointed to by data */
6942 SSize_t last_close = 0; /* pointed to by data */
6943 regnode *first= scan;
6944 regnode *first_next= regnext(first);
6946 * Skip introductions and multiplicators >= 1
6947 * so that we can extract the 'meat' of the pattern that must
6948 * match in the large if() sequence following.
6949 * NOTE that EXACT is NOT covered here, as it is normally
6950 * picked up by the optimiser separately.
6952 * This is unfortunate as the optimiser isnt handling lookahead
6953 * properly currently.
6956 while ((OP(first) == OPEN && (sawopen = 1)) ||
6957 /* An OR of *one* alternative - should not happen now. */
6958 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6959 /* for now we can't handle lookbehind IFMATCH*/
6960 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6961 (OP(first) == PLUS) ||
6962 (OP(first) == MINMOD) ||
6963 /* An {n,m} with n>0 */
6964 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6965 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6968 * the only op that could be a regnode is PLUS, all the rest
6969 * will be regnode_1 or regnode_2.
6971 * (yves doesn't think this is true)
6973 if (OP(first) == PLUS)
6976 if (OP(first) == MINMOD)
6978 first += regarglen[OP(first)];
6980 first = NEXTOPER(first);
6981 first_next= regnext(first);
6984 /* Starting-point info. */
6986 DEBUG_PEEP("first:",first,0);
6987 /* Ignore EXACT as we deal with it later. */
6988 if (PL_regkind[OP(first)] == EXACT) {
6989 if (OP(first) == EXACT || OP(first) == EXACTL)
6990 NOOP; /* Empty, get anchored substr later. */
6992 ri->regstclass = first;
6995 else if (PL_regkind[OP(first)] == TRIE &&
6996 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6998 /* this can happen only on restudy */
6999 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7002 else if (REGNODE_SIMPLE(OP(first)))
7003 ri->regstclass = first;
7004 else if (PL_regkind[OP(first)] == BOUND ||
7005 PL_regkind[OP(first)] == NBOUND)
7006 ri->regstclass = first;
7007 else if (PL_regkind[OP(first)] == BOL) {
7008 r->intflags |= (OP(first) == MBOL
7011 first = NEXTOPER(first);
7014 else if (OP(first) == GPOS) {
7015 r->intflags |= PREGf_ANCH_GPOS;
7016 first = NEXTOPER(first);
7019 else if ((!sawopen || !RExC_sawback) &&
7021 (OP(first) == STAR &&
7022 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7023 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7025 /* turn .* into ^.* with an implied $*=1 */
7027 (OP(NEXTOPER(first)) == REG_ANY)
7030 r->intflags |= (type | PREGf_IMPLICIT);
7031 first = NEXTOPER(first);
7034 if (sawplus && !sawminmod && !sawlookahead
7035 && (!sawopen || !RExC_sawback)
7036 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7037 /* x+ must match at the 1st pos of run of x's */
7038 r->intflags |= PREGf_SKIP;
7040 /* Scan is after the zeroth branch, first is atomic matcher. */
7041 #ifdef TRIE_STUDY_OPT
7044 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7045 (IV)(first - scan + 1))
7049 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7050 (IV)(first - scan + 1))
7056 * If there's something expensive in the r.e., find the
7057 * longest literal string that must appear and make it the
7058 * regmust. Resolve ties in favor of later strings, since
7059 * the regstart check works with the beginning of the r.e.
7060 * and avoiding duplication strengthens checking. Not a
7061 * strong reason, but sufficient in the absence of others.
7062 * [Now we resolve ties in favor of the earlier string if
7063 * it happens that c_offset_min has been invalidated, since the
7064 * earlier string may buy us something the later one won't.]
7067 data.longest_fixed = newSVpvs("");
7068 data.longest_float = newSVpvs("");
7069 data.last_found = newSVpvs("");
7070 data.longest = &(data.longest_fixed);
7071 ENTER_with_name("study_chunk");
7072 SAVEFREESV(data.longest_fixed);
7073 SAVEFREESV(data.longest_float);
7074 SAVEFREESV(data.last_found);
7076 if (!ri->regstclass) {
7077 ssc_init(pRExC_state, &ch_class);
7078 data.start_class = &ch_class;
7079 stclass_flag = SCF_DO_STCLASS_AND;
7080 } else /* XXXX Check for BOUND? */
7082 data.last_closep = &last_close;
7085 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7086 scan + RExC_size, /* Up to end */
7088 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7089 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7093 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7096 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7097 && data.last_start_min == 0 && data.last_end > 0
7098 && !RExC_seen_zerolen
7099 && !(RExC_seen & REG_VERBARG_SEEN)
7100 && !(RExC_seen & REG_GPOS_SEEN)
7102 r->extflags |= RXf_CHECK_ALL;
7104 scan_commit(pRExC_state, &data,&minlen,0);
7106 longest_float_length = CHR_SVLEN(data.longest_float);
7108 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7109 && data.offset_fixed == data.offset_float_min
7110 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7111 && S_setup_longest (aTHX_ pRExC_state,
7115 &(r->float_end_shift),
7116 data.lookbehind_float,
7117 data.offset_float_min,
7119 longest_float_length,
7120 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7121 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7123 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7124 r->float_max_offset = data.offset_float_max;
7125 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7126 r->float_max_offset -= data.lookbehind_float;
7127 SvREFCNT_inc_simple_void_NN(data.longest_float);
7130 r->float_substr = r->float_utf8 = NULL;
7131 longest_float_length = 0;
7134 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7136 if (S_setup_longest (aTHX_ pRExC_state,
7138 &(r->anchored_utf8),
7139 &(r->anchored_substr),
7140 &(r->anchored_end_shift),
7141 data.lookbehind_fixed,
7144 longest_fixed_length,
7145 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7146 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7148 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7149 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7152 r->anchored_substr = r->anchored_utf8 = NULL;
7153 longest_fixed_length = 0;
7155 LEAVE_with_name("study_chunk");
7158 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7159 ri->regstclass = NULL;
7161 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7163 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7164 && is_ssc_worth_it(pRExC_state, data.start_class))
7166 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7168 ssc_finalize(pRExC_state, data.start_class);
7170 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7171 StructCopy(data.start_class,
7172 (regnode_ssc*)RExC_rxi->data->data[n],
7174 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7175 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7176 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7177 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7178 PerlIO_printf(Perl_debug_log,
7179 "synthetic stclass \"%s\".\n",
7180 SvPVX_const(sv));});
7181 data.start_class = NULL;
7184 /* A temporary algorithm prefers floated substr to fixed one to dig
7186 if (longest_fixed_length > longest_float_length) {
7187 r->substrs->check_ix = 0;
7188 r->check_end_shift = r->anchored_end_shift;
7189 r->check_substr = r->anchored_substr;
7190 r->check_utf8 = r->anchored_utf8;
7191 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7192 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7193 r->intflags |= PREGf_NOSCAN;
7196 r->substrs->check_ix = 1;
7197 r->check_end_shift = r->float_end_shift;
7198 r->check_substr = r->float_substr;
7199 r->check_utf8 = r->float_utf8;
7200 r->check_offset_min = r->float_min_offset;
7201 r->check_offset_max = r->float_max_offset;
7203 if ((r->check_substr || r->check_utf8) ) {
7204 r->extflags |= RXf_USE_INTUIT;
7205 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7206 r->extflags |= RXf_INTUIT_TAIL;
7208 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7210 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7211 if ( (STRLEN)minlen < longest_float_length )
7212 minlen= longest_float_length;
7213 if ( (STRLEN)minlen < longest_fixed_length )
7214 minlen= longest_fixed_length;
7218 /* Several toplevels. Best we can is to set minlen. */
7220 regnode_ssc ch_class;
7221 SSize_t last_close = 0;
7223 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7225 scan = ri->program + 1;
7226 ssc_init(pRExC_state, &ch_class);
7227 data.start_class = &ch_class;
7228 data.last_closep = &last_close;
7231 minlen = study_chunk(pRExC_state,
7232 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7233 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7234 ? SCF_TRIE_DOING_RESTUDY
7238 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7240 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7241 = r->float_substr = r->float_utf8 = NULL;
7243 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7244 && is_ssc_worth_it(pRExC_state, data.start_class))
7246 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7248 ssc_finalize(pRExC_state, data.start_class);
7250 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7251 StructCopy(data.start_class,
7252 (regnode_ssc*)RExC_rxi->data->data[n],
7254 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7255 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7256 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7257 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7258 PerlIO_printf(Perl_debug_log,
7259 "synthetic stclass \"%s\".\n",
7260 SvPVX_const(sv));});
7261 data.start_class = NULL;
7265 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7266 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7267 r->maxlen = REG_INFTY;
7270 r->maxlen = RExC_maxlen;
7273 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7274 the "real" pattern. */
7276 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7277 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7279 r->minlenret = minlen;
7280 if (r->minlen < minlen)
7283 if (RExC_seen & REG_GPOS_SEEN)
7284 r->intflags |= PREGf_GPOS_SEEN;
7285 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7286 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7288 if (pRExC_state->num_code_blocks)
7289 r->extflags |= RXf_EVAL_SEEN;
7290 if (RExC_seen & REG_VERBARG_SEEN)
7292 r->intflags |= PREGf_VERBARG_SEEN;
7293 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7295 if (RExC_seen & REG_CUTGROUP_SEEN)
7296 r->intflags |= PREGf_CUTGROUP_SEEN;
7297 if (pm_flags & PMf_USE_RE_EVAL)
7298 r->intflags |= PREGf_USE_RE_EVAL;
7299 if (RExC_paren_names)
7300 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7302 RXp_PAREN_NAMES(r) = NULL;
7304 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7305 * so it can be used in pp.c */
7306 if (r->intflags & PREGf_ANCH)
7307 r->extflags |= RXf_IS_ANCHORED;
7311 /* this is used to identify "special" patterns that might result
7312 * in Perl NOT calling the regex engine and instead doing the match "itself",
7313 * particularly special cases in split//. By having the regex compiler
7314 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7315 * we avoid weird issues with equivalent patterns resulting in different behavior,
7316 * AND we allow non Perl engines to get the same optimizations by the setting the
7317 * flags appropriately - Yves */
7318 regnode *first = ri->program + 1;
7320 regnode *next = regnext(first);
7323 if (PL_regkind[fop] == NOTHING && nop == END)
7324 r->extflags |= RXf_NULL;
7325 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7326 /* when fop is SBOL first->flags will be true only when it was
7327 * produced by parsing /\A/, and not when parsing /^/. This is
7328 * very important for the split code as there we want to
7329 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7330 * See rt #122761 for more details. -- Yves */
7331 r->extflags |= RXf_START_ONLY;
7332 else if (fop == PLUS
7333 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7335 r->extflags |= RXf_WHITE;
7336 else if ( r->extflags & RXf_SPLIT
7337 && (fop == EXACT || fop == EXACTL)
7338 && STR_LEN(first) == 1
7339 && *(STRING(first)) == ' '
7341 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7345 if (RExC_contains_locale) {
7346 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7350 if (RExC_paren_names) {
7351 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7352 ri->data->data[ri->name_list_idx]
7353 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7356 ri->name_list_idx = 0;
7358 if (RExC_recurse_count) {
7359 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7360 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7361 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7364 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7365 /* assume we don't need to swap parens around before we match */
7367 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7368 (unsigned long)RExC_study_chunk_recursed_count);
7372 PerlIO_printf(Perl_debug_log,"Final program:\n");
7375 #ifdef RE_TRACK_PATTERN_OFFSETS
7376 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7377 const STRLEN len = ri->u.offsets[0];
7379 GET_RE_DEBUG_FLAGS_DECL;
7380 PerlIO_printf(Perl_debug_log,
7381 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7382 for (i = 1; i <= len; i++) {
7383 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7384 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7385 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7387 PerlIO_printf(Perl_debug_log, "\n");
7392 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7393 * by setting the regexp SV to readonly-only instead. If the
7394 * pattern's been recompiled, the USEDness should remain. */
7395 if (old_re && SvREADONLY(old_re))
7403 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7406 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7408 PERL_UNUSED_ARG(value);
7410 if (flags & RXapif_FETCH) {
7411 return reg_named_buff_fetch(rx, key, flags);
7412 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7413 Perl_croak_no_modify();
7415 } else if (flags & RXapif_EXISTS) {
7416 return reg_named_buff_exists(rx, key, flags)
7419 } else if (flags & RXapif_REGNAMES) {
7420 return reg_named_buff_all(rx, flags);
7421 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7422 return reg_named_buff_scalar(rx, flags);
7424 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7430 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7433 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7434 PERL_UNUSED_ARG(lastkey);
7436 if (flags & RXapif_FIRSTKEY)
7437 return reg_named_buff_firstkey(rx, flags);
7438 else if (flags & RXapif_NEXTKEY)
7439 return reg_named_buff_nextkey(rx, flags);
7441 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7448 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7451 AV *retarray = NULL;
7453 struct regexp *const rx = ReANY(r);
7455 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7457 if (flags & RXapif_ALL)
7460 if (rx && RXp_PAREN_NAMES(rx)) {
7461 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7464 SV* sv_dat=HeVAL(he_str);
7465 I32 *nums=(I32*)SvPVX(sv_dat);
7466 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7467 if ((I32)(rx->nparens) >= nums[i]
7468 && rx->offs[nums[i]].start != -1
7469 && rx->offs[nums[i]].end != -1)
7472 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7477 ret = newSVsv(&PL_sv_undef);
7480 av_push(retarray, ret);
7483 return newRV_noinc(MUTABLE_SV(retarray));
7490 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7493 struct regexp *const rx = ReANY(r);
7495 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7497 if (rx && RXp_PAREN_NAMES(rx)) {
7498 if (flags & RXapif_ALL) {
7499 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7501 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7503 SvREFCNT_dec_NN(sv);
7515 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7517 struct regexp *const rx = ReANY(r);
7519 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7521 if ( rx && RXp_PAREN_NAMES(rx) ) {
7522 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7524 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7531 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7533 struct regexp *const rx = ReANY(r);
7534 GET_RE_DEBUG_FLAGS_DECL;
7536 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7538 if (rx && RXp_PAREN_NAMES(rx)) {
7539 HV *hv = RXp_PAREN_NAMES(rx);
7541 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7544 SV* sv_dat = HeVAL(temphe);
7545 I32 *nums = (I32*)SvPVX(sv_dat);
7546 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7547 if ((I32)(rx->lastparen) >= nums[i] &&
7548 rx->offs[nums[i]].start != -1 &&
7549 rx->offs[nums[i]].end != -1)
7555 if (parno || flags & RXapif_ALL) {
7556 return newSVhek(HeKEY_hek(temphe));
7564 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7569 struct regexp *const rx = ReANY(r);
7571 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7573 if (rx && RXp_PAREN_NAMES(rx)) {
7574 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7575 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7576 } else if (flags & RXapif_ONE) {
7577 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7578 av = MUTABLE_AV(SvRV(ret));
7579 length = av_tindex(av);
7580 SvREFCNT_dec_NN(ret);
7581 return newSViv(length + 1);
7583 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7588 return &PL_sv_undef;
7592 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7594 struct regexp *const rx = ReANY(r);
7597 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7599 if (rx && RXp_PAREN_NAMES(rx)) {
7600 HV *hv= RXp_PAREN_NAMES(rx);
7602 (void)hv_iterinit(hv);
7603 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7606 SV* sv_dat = HeVAL(temphe);
7607 I32 *nums = (I32*)SvPVX(sv_dat);
7608 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7609 if ((I32)(rx->lastparen) >= nums[i] &&
7610 rx->offs[nums[i]].start != -1 &&
7611 rx->offs[nums[i]].end != -1)
7617 if (parno || flags & RXapif_ALL) {
7618 av_push(av, newSVhek(HeKEY_hek(temphe)));
7623 return newRV_noinc(MUTABLE_SV(av));
7627 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7630 struct regexp *const rx = ReANY(r);
7636 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7638 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7639 || n == RX_BUFF_IDX_CARET_FULLMATCH
7640 || n == RX_BUFF_IDX_CARET_POSTMATCH
7643 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7645 /* on something like
7648 * the KEEPCOPY is set on the PMOP rather than the regex */
7649 if (PL_curpm && r == PM_GETRE(PL_curpm))
7650 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7659 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7660 /* no need to distinguish between them any more */
7661 n = RX_BUFF_IDX_FULLMATCH;
7663 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7664 && rx->offs[0].start != -1)
7666 /* $`, ${^PREMATCH} */
7667 i = rx->offs[0].start;
7671 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7672 && rx->offs[0].end != -1)
7674 /* $', ${^POSTMATCH} */
7675 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7676 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7679 if ( 0 <= n && n <= (I32)rx->nparens &&
7680 (s1 = rx->offs[n].start) != -1 &&
7681 (t1 = rx->offs[n].end) != -1)
7683 /* $&, ${^MATCH}, $1 ... */
7685 s = rx->subbeg + s1 - rx->suboffset;
7690 assert(s >= rx->subbeg);
7691 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7693 #ifdef NO_TAINT_SUPPORT
7694 sv_setpvn(sv, s, i);
7696 const int oldtainted = TAINT_get;
7698 sv_setpvn(sv, s, i);
7699 TAINT_set(oldtainted);
7701 if (RXp_MATCH_UTF8(rx))
7706 if (RXp_MATCH_TAINTED(rx)) {
7707 if (SvTYPE(sv) >= SVt_PVMG) {
7708 MAGIC* const mg = SvMAGIC(sv);
7711 SvMAGIC_set(sv, mg->mg_moremagic);
7713 if ((mgt = SvMAGIC(sv))) {
7714 mg->mg_moremagic = mgt;
7715 SvMAGIC_set(sv, mg);
7726 sv_setsv(sv,&PL_sv_undef);
7732 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7733 SV const * const value)
7735 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7737 PERL_UNUSED_ARG(rx);
7738 PERL_UNUSED_ARG(paren);
7739 PERL_UNUSED_ARG(value);
7742 Perl_croak_no_modify();
7746 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7749 struct regexp *const rx = ReANY(r);
7753 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7755 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7756 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7757 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7760 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7762 /* on something like
7765 * the KEEPCOPY is set on the PMOP rather than the regex */
7766 if (PL_curpm && r == PM_GETRE(PL_curpm))
7767 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7773 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7775 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7776 case RX_BUFF_IDX_PREMATCH: /* $` */
7777 if (rx->offs[0].start != -1) {
7778 i = rx->offs[0].start;
7787 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7788 case RX_BUFF_IDX_POSTMATCH: /* $' */
7789 if (rx->offs[0].end != -1) {
7790 i = rx->sublen - rx->offs[0].end;
7792 s1 = rx->offs[0].end;
7799 default: /* $& / ${^MATCH}, $1, $2, ... */
7800 if (paren <= (I32)rx->nparens &&
7801 (s1 = rx->offs[paren].start) != -1 &&
7802 (t1 = rx->offs[paren].end) != -1)
7808 if (ckWARN(WARN_UNINITIALIZED))
7809 report_uninit((const SV *)sv);
7814 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7815 const char * const s = rx->subbeg - rx->suboffset + s1;
7820 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7827 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7829 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7830 PERL_UNUSED_ARG(rx);
7834 return newSVpvs("Regexp");
7837 /* Scans the name of a named buffer from the pattern.
7838 * If flags is REG_RSN_RETURN_NULL returns null.
7839 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7840 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7841 * to the parsed name as looked up in the RExC_paren_names hash.
7842 * If there is an error throws a vFAIL().. type exception.
7845 #define REG_RSN_RETURN_NULL 0
7846 #define REG_RSN_RETURN_NAME 1
7847 #define REG_RSN_RETURN_DATA 2
7850 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7852 char *name_start = RExC_parse;
7854 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7856 assert (RExC_parse <= RExC_end);
7857 if (RExC_parse == RExC_end) NOOP;
7858 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7859 /* skip IDFIRST by using do...while */
7862 RExC_parse += UTF8SKIP(RExC_parse);
7863 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7867 } while (isWORDCHAR(*RExC_parse));
7869 RExC_parse++; /* so the <- from the vFAIL is after the offending
7871 vFAIL("Group name must start with a non-digit word character");
7875 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7876 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7877 if ( flags == REG_RSN_RETURN_NAME)
7879 else if (flags==REG_RSN_RETURN_DATA) {
7882 if ( ! sv_name ) /* should not happen*/
7883 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7884 if (RExC_paren_names)
7885 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7887 sv_dat = HeVAL(he_str);
7889 vFAIL("Reference to nonexistent named group");
7893 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7894 (unsigned long) flags);
7896 NOT_REACHED; /* NOTREACHED */
7901 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7903 if (RExC_lastparse!=RExC_parse) { \
7904 PerlIO_printf(Perl_debug_log, "%s", \
7905 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
7906 RExC_end - RExC_parse, 16, \
7908 PERL_PV_ESCAPE_UNI_DETECT | \
7909 PERL_PV_PRETTY_ELLIPSES | \
7910 PERL_PV_PRETTY_LTGT | \
7911 PERL_PV_ESCAPE_RE | \
7912 PERL_PV_PRETTY_EXACTSIZE \
7916 PerlIO_printf(Perl_debug_log,"%16s",""); \
7919 num = RExC_size + 1; \
7921 num=REG_NODE_NUM(RExC_emit); \
7922 if (RExC_lastnum!=num) \
7923 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7925 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7926 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7927 (int)((depth*2)), "", \
7931 RExC_lastparse=RExC_parse; \
7936 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7937 DEBUG_PARSE_MSG((funcname)); \
7938 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7940 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7941 DEBUG_PARSE_MSG((funcname)); \
7942 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7945 /* This section of code defines the inversion list object and its methods. The
7946 * interfaces are highly subject to change, so as much as possible is static to
7947 * this file. An inversion list is here implemented as a malloc'd C UV array
7948 * as an SVt_INVLIST scalar.
7950 * An inversion list for Unicode is an array of code points, sorted by ordinal
7951 * number. The zeroth element is the first code point in the list. The 1th
7952 * element is the first element beyond that not in the list. In other words,
7953 * the first range is
7954 * invlist[0]..(invlist[1]-1)
7955 * The other ranges follow. Thus every element whose index is divisible by two
7956 * marks the beginning of a range that is in the list, and every element not
7957 * divisible by two marks the beginning of a range not in the list. A single
7958 * element inversion list that contains the single code point N generally
7959 * consists of two elements
7962 * (The exception is when N is the highest representable value on the
7963 * machine, in which case the list containing just it would be a single
7964 * element, itself. By extension, if the last range in the list extends to
7965 * infinity, then the first element of that range will be in the inversion list
7966 * at a position that is divisible by two, and is the final element in the
7968 * Taking the complement (inverting) an inversion list is quite simple, if the
7969 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7970 * This implementation reserves an element at the beginning of each inversion
7971 * list to always contain 0; there is an additional flag in the header which
7972 * indicates if the list begins at the 0, or is offset to begin at the next
7975 * More about inversion lists can be found in "Unicode Demystified"
7976 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7977 * More will be coming when functionality is added later.
7979 * The inversion list data structure is currently implemented as an SV pointing
7980 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7981 * array of UV whose memory management is automatically handled by the existing
7982 * facilities for SV's.
7984 * Some of the methods should always be private to the implementation, and some
7985 * should eventually be made public */
7987 /* The header definitions are in F<invlist_inline.h> */
7989 PERL_STATIC_INLINE UV*
7990 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7992 /* Returns a pointer to the first element in the inversion list's array.
7993 * This is called upon initialization of an inversion list. Where the
7994 * array begins depends on whether the list has the code point U+0000 in it
7995 * or not. The other parameter tells it whether the code that follows this
7996 * call is about to put a 0 in the inversion list or not. The first
7997 * element is either the element reserved for 0, if TRUE, or the element
7998 * after it, if FALSE */
8000 bool* offset = get_invlist_offset_addr(invlist);
8001 UV* zero_addr = (UV *) SvPVX(invlist);
8003 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8006 assert(! _invlist_len(invlist));
8010 /* 1^1 = 0; 1^0 = 1 */
8011 *offset = 1 ^ will_have_0;
8012 return zero_addr + *offset;
8015 PERL_STATIC_INLINE void
8016 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8018 /* Sets the current number of elements stored in the inversion list.
8019 * Updates SvCUR correspondingly */
8020 PERL_UNUSED_CONTEXT;
8021 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8023 assert(SvTYPE(invlist) == SVt_INVLIST);
8028 : TO_INTERNAL_SIZE(len + offset));
8029 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8032 #ifndef PERL_IN_XSUB_RE
8034 PERL_STATIC_INLINE IV*
8035 S_get_invlist_previous_index_addr(SV* invlist)
8037 /* Return the address of the IV that is reserved to hold the cached index
8039 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8041 assert(SvTYPE(invlist) == SVt_INVLIST);
8043 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8046 PERL_STATIC_INLINE IV
8047 S_invlist_previous_index(SV* const invlist)
8049 /* Returns cached index of previous search */
8051 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8053 return *get_invlist_previous_index_addr(invlist);
8056 PERL_STATIC_INLINE void
8057 S_invlist_set_previous_index(SV* const invlist, const IV index)
8059 /* Caches <index> for later retrieval */
8061 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8063 assert(index == 0 || index < (int) _invlist_len(invlist));
8065 *get_invlist_previous_index_addr(invlist) = index;
8068 PERL_STATIC_INLINE void
8069 S_invlist_trim(SV* const invlist)
8071 PERL_ARGS_ASSERT_INVLIST_TRIM;
8073 assert(SvTYPE(invlist) == SVt_INVLIST);
8075 /* Change the length of the inversion list to how many entries it currently
8077 SvPV_shrink_to_cur((SV *) invlist);
8080 PERL_STATIC_INLINE bool
8081 S_invlist_is_iterating(SV* const invlist)
8083 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8085 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8088 #endif /* ifndef PERL_IN_XSUB_RE */
8090 PERL_STATIC_INLINE UV
8091 S_invlist_max(SV* const invlist)
8093 /* Returns the maximum number of elements storable in the inversion list's
8094 * array, without having to realloc() */
8096 PERL_ARGS_ASSERT_INVLIST_MAX;
8098 assert(SvTYPE(invlist) == SVt_INVLIST);
8100 /* Assumes worst case, in which the 0 element is not counted in the
8101 * inversion list, so subtracts 1 for that */
8102 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8103 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8104 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8107 #ifndef PERL_IN_XSUB_RE
8109 Perl__new_invlist(pTHX_ IV initial_size)
8112 /* Return a pointer to a newly constructed inversion list, with enough
8113 * space to store 'initial_size' elements. If that number is negative, a
8114 * system default is used instead */
8118 if (initial_size < 0) {
8122 /* Allocate the initial space */
8123 new_list = newSV_type(SVt_INVLIST);
8125 /* First 1 is in case the zero element isn't in the list; second 1 is for
8127 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8128 invlist_set_len(new_list, 0, 0);
8130 /* Force iterinit() to be used to get iteration to work */
8131 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8133 *get_invlist_previous_index_addr(new_list) = 0;
8139 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8141 /* Return a pointer to a newly constructed inversion list, initialized to
8142 * point to <list>, which has to be in the exact correct inversion list
8143 * form, including internal fields. Thus this is a dangerous routine that
8144 * should not be used in the wrong hands. The passed in 'list' contains
8145 * several header fields at the beginning that are not part of the
8146 * inversion list body proper */
8148 const STRLEN length = (STRLEN) list[0];
8149 const UV version_id = list[1];
8150 const bool offset = cBOOL(list[2]);
8151 #define HEADER_LENGTH 3
8152 /* If any of the above changes in any way, you must change HEADER_LENGTH
8153 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8154 * perl -E 'say int(rand 2**31-1)'
8156 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8157 data structure type, so that one being
8158 passed in can be validated to be an
8159 inversion list of the correct vintage.
8162 SV* invlist = newSV_type(SVt_INVLIST);
8164 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8166 if (version_id != INVLIST_VERSION_ID) {
8167 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8170 /* The generated array passed in includes header elements that aren't part
8171 * of the list proper, so start it just after them */
8172 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8174 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8175 shouldn't touch it */
8177 *(get_invlist_offset_addr(invlist)) = offset;
8179 /* The 'length' passed to us is the physical number of elements in the
8180 * inversion list. But if there is an offset the logical number is one
8182 invlist_set_len(invlist, length - offset, offset);
8184 invlist_set_previous_index(invlist, 0);
8186 /* Initialize the iteration pointer. */
8187 invlist_iterfinish(invlist);
8189 SvREADONLY_on(invlist);
8193 #endif /* ifndef PERL_IN_XSUB_RE */
8196 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8198 /* Grow the maximum size of an inversion list */
8200 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8202 assert(SvTYPE(invlist) == SVt_INVLIST);
8204 /* Add one to account for the zero element at the beginning which may not
8205 * be counted by the calling parameters */
8206 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8210 S__append_range_to_invlist(pTHX_ SV* const invlist,
8211 const UV start, const UV end)
8213 /* Subject to change or removal. Append the range from 'start' to 'end' at
8214 * the end of the inversion list. The range must be above any existing
8218 UV max = invlist_max(invlist);
8219 UV len = _invlist_len(invlist);
8222 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8224 if (len == 0) { /* Empty lists must be initialized */
8225 offset = start != 0;
8226 array = _invlist_array_init(invlist, ! offset);
8229 /* Here, the existing list is non-empty. The current max entry in the
8230 * list is generally the first value not in the set, except when the
8231 * set extends to the end of permissible values, in which case it is
8232 * the first entry in that final set, and so this call is an attempt to
8233 * append out-of-order */
8235 UV final_element = len - 1;
8236 array = invlist_array(invlist);
8237 if (array[final_element] > start
8238 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8240 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8241 array[final_element], start,
8242 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8245 /* Here, it is a legal append. If the new range begins with the first
8246 * value not in the set, it is extending the set, so the new first
8247 * value not in the set is one greater than the newly extended range.
8249 offset = *get_invlist_offset_addr(invlist);
8250 if (array[final_element] == start) {
8251 if (end != UV_MAX) {
8252 array[final_element] = end + 1;
8255 /* But if the end is the maximum representable on the machine,
8256 * just let the range that this would extend to have no end */
8257 invlist_set_len(invlist, len - 1, offset);
8263 /* Here the new range doesn't extend any existing set. Add it */
8265 len += 2; /* Includes an element each for the start and end of range */
8267 /* If wll overflow the existing space, extend, which may cause the array to
8270 invlist_extend(invlist, len);
8272 /* Have to set len here to avoid assert failure in invlist_array() */
8273 invlist_set_len(invlist, len, offset);
8275 array = invlist_array(invlist);
8278 invlist_set_len(invlist, len, offset);
8281 /* The next item on the list starts the range, the one after that is
8282 * one past the new range. */
8283 array[len - 2] = start;
8284 if (end != UV_MAX) {
8285 array[len - 1] = end + 1;
8288 /* But if the end is the maximum representable on the machine, just let
8289 * the range have no end */
8290 invlist_set_len(invlist, len - 1, offset);
8294 #ifndef PERL_IN_XSUB_RE
8297 Perl__invlist_search(SV* const invlist, const UV cp)
8299 /* Searches the inversion list for the entry that contains the input code
8300 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8301 * return value is the index into the list's array of the range that
8306 IV high = _invlist_len(invlist);
8307 const IV highest_element = high - 1;
8310 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8312 /* If list is empty, return failure. */
8317 /* (We can't get the array unless we know the list is non-empty) */
8318 array = invlist_array(invlist);
8320 mid = invlist_previous_index(invlist);
8321 assert(mid >=0 && mid <= highest_element);
8323 /* <mid> contains the cache of the result of the previous call to this
8324 * function (0 the first time). See if this call is for the same result,
8325 * or if it is for mid-1. This is under the theory that calls to this
8326 * function will often be for related code points that are near each other.
8327 * And benchmarks show that caching gives better results. We also test
8328 * here if the code point is within the bounds of the list. These tests
8329 * replace others that would have had to be made anyway to make sure that
8330 * the array bounds were not exceeded, and these give us extra information
8331 * at the same time */
8332 if (cp >= array[mid]) {
8333 if (cp >= array[highest_element]) {
8334 return highest_element;
8337 /* Here, array[mid] <= cp < array[highest_element]. This means that
8338 * the final element is not the answer, so can exclude it; it also
8339 * means that <mid> is not the final element, so can refer to 'mid + 1'
8341 if (cp < array[mid + 1]) {
8347 else { /* cp < aray[mid] */
8348 if (cp < array[0]) { /* Fail if outside the array */
8352 if (cp >= array[mid - 1]) {
8357 /* Binary search. What we are looking for is <i> such that
8358 * array[i] <= cp < array[i+1]
8359 * The loop below converges on the i+1. Note that there may not be an
8360 * (i+1)th element in the array, and things work nonetheless */
8361 while (low < high) {
8362 mid = (low + high) / 2;
8363 assert(mid <= highest_element);
8364 if (array[mid] <= cp) { /* cp >= array[mid] */
8367 /* We could do this extra test to exit the loop early.
8368 if (cp < array[low]) {
8373 else { /* cp < array[mid] */
8380 invlist_set_previous_index(invlist, high);
8385 Perl__invlist_populate_swatch(SV* const invlist,
8386 const UV start, const UV end, U8* swatch)
8388 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8389 * but is used when the swash has an inversion list. This makes this much
8390 * faster, as it uses a binary search instead of a linear one. This is
8391 * intimately tied to that function, and perhaps should be in utf8.c,
8392 * except it is intimately tied to inversion lists as well. It assumes
8393 * that <swatch> is all 0's on input */
8396 const IV len = _invlist_len(invlist);
8400 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8402 if (len == 0) { /* Empty inversion list */
8406 array = invlist_array(invlist);
8408 /* Find which element it is */
8409 i = _invlist_search(invlist, start);
8411 /* We populate from <start> to <end> */
8412 while (current < end) {
8415 /* The inversion list gives the results for every possible code point
8416 * after the first one in the list. Only those ranges whose index is
8417 * even are ones that the inversion list matches. For the odd ones,
8418 * and if the initial code point is not in the list, we have to skip
8419 * forward to the next element */
8420 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8422 if (i >= len) { /* Finished if beyond the end of the array */
8426 if (current >= end) { /* Finished if beyond the end of what we
8428 if (LIKELY(end < UV_MAX)) {
8432 /* We get here when the upper bound is the maximum
8433 * representable on the machine, and we are looking for just
8434 * that code point. Have to special case it */
8436 goto join_end_of_list;
8439 assert(current >= start);
8441 /* The current range ends one below the next one, except don't go past
8444 upper = (i < len && array[i] < end) ? array[i] : end;
8446 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8447 * for each code point in it */
8448 for (; current < upper; current++) {
8449 const STRLEN offset = (STRLEN)(current - start);
8450 swatch[offset >> 3] |= 1 << (offset & 7);
8455 /* Quit if at the end of the list */
8458 /* But first, have to deal with the highest possible code point on
8459 * the platform. The previous code assumes that <end> is one
8460 * beyond where we want to populate, but that is impossible at the
8461 * platform's infinity, so have to handle it specially */
8462 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8464 const STRLEN offset = (STRLEN)(end - start);
8465 swatch[offset >> 3] |= 1 << (offset & 7);
8470 /* Advance to the next range, which will be for code points not in the
8479 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8480 const bool complement_b, SV** output)
8482 /* Take the union of two inversion lists and point <output> to it. *output
8483 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8484 * the reference count to that list will be decremented if not already a
8485 * temporary (mortal); otherwise *output will be made correspondingly
8486 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8487 * second list is returned. If <complement_b> is TRUE, the union is taken
8488 * of the complement (inversion) of <b> instead of b itself.
8490 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8491 * Richard Gillam, published by Addison-Wesley, and explained at some
8492 * length there. The preface says to incorporate its examples into your
8493 * code at your own risk.
8495 * The algorithm is like a merge sort.
8497 * XXX A potential performance improvement is to keep track as we go along
8498 * if only one of the inputs contributes to the result, meaning the other
8499 * is a subset of that one. In that case, we can skip the final copy and
8500 * return the larger of the input lists, but then outside code might need
8501 * to keep track of whether to free the input list or not */
8503 const UV* array_a; /* a's array */
8505 UV len_a; /* length of a's array */
8508 SV* u; /* the resulting union */
8512 UV i_a = 0; /* current index into a's array */
8516 /* running count, as explained in the algorithm source book; items are
8517 * stopped accumulating and are output when the count changes to/from 0.
8518 * The count is incremented when we start a range that's in the set, and
8519 * decremented when we start a range that's not in the set. So its range
8520 * is 0 to 2. Only when the count is zero is something not in the set.
8524 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8527 /* If either one is empty, the union is the other one */
8528 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8529 bool make_temp = FALSE; /* Should we mortalize the result? */
8533 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8539 *output = invlist_clone(b);
8541 _invlist_invert(*output);
8543 } /* else *output already = b; */
8546 sv_2mortal(*output);
8550 else if ((len_b = _invlist_len(b)) == 0) {
8551 bool make_temp = FALSE;
8553 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8558 /* The complement of an empty list is a list that has everything in it,
8559 * so the union with <a> includes everything too */
8562 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8566 *output = _new_invlist(1);
8567 _append_range_to_invlist(*output, 0, UV_MAX);
8569 else if (*output != a) {
8570 *output = invlist_clone(a);
8572 /* else *output already = a; */
8575 sv_2mortal(*output);
8580 /* Here both lists exist and are non-empty */
8581 array_a = invlist_array(a);
8582 array_b = invlist_array(b);
8584 /* If are to take the union of 'a' with the complement of b, set it
8585 * up so are looking at b's complement. */
8588 /* To complement, we invert: if the first element is 0, remove it. To
8589 * do this, we just pretend the array starts one later */
8590 if (array_b[0] == 0) {
8596 /* But if the first element is not zero, we pretend the list starts
8597 * at the 0 that is always stored immediately before the array. */
8603 /* Size the union for the worst case: that the sets are completely
8605 u = _new_invlist(len_a + len_b);
8607 /* Will contain U+0000 if either component does */
8608 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8609 || (len_b > 0 && array_b[0] == 0));
8611 /* Go through each list item by item, stopping when exhausted one of
8613 while (i_a < len_a && i_b < len_b) {
8614 UV cp; /* The element to potentially add to the union's array */
8615 bool cp_in_set; /* is it in the the input list's set or not */
8617 /* We need to take one or the other of the two inputs for the union.
8618 * Since we are merging two sorted lists, we take the smaller of the
8619 * next items. In case of a tie, we take the one that is in its set
8620 * first. If we took one not in the set first, it would decrement the
8621 * count, possibly to 0 which would cause it to be output as ending the
8622 * range, and the next time through we would take the same number, and
8623 * output it again as beginning the next range. By doing it the
8624 * opposite way, there is no possibility that the count will be
8625 * momentarily decremented to 0, and thus the two adjoining ranges will
8626 * be seamlessly merged. (In a tie and both are in the set or both not
8627 * in the set, it doesn't matter which we take first.) */
8628 if (array_a[i_a] < array_b[i_b]
8629 || (array_a[i_a] == array_b[i_b]
8630 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8632 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8636 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8637 cp = array_b[i_b++];
8640 /* Here, have chosen which of the two inputs to look at. Only output
8641 * if the running count changes to/from 0, which marks the
8642 * beginning/end of a range in that's in the set */
8645 array_u[i_u++] = cp;
8652 array_u[i_u++] = cp;
8657 /* Here, we are finished going through at least one of the lists, which
8658 * means there is something remaining in at most one. We check if the list
8659 * that hasn't been exhausted is positioned such that we are in the middle
8660 * of a range in its set or not. (i_a and i_b point to the element beyond
8661 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8662 * is potentially more to output.
8663 * There are four cases:
8664 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8665 * in the union is entirely from the non-exhausted set.
8666 * 2) Both were in their sets, count is 2. Nothing further should
8667 * be output, as everything that remains will be in the exhausted
8668 * list's set, hence in the union; decrementing to 1 but not 0 insures
8670 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8671 * Nothing further should be output because the union includes
8672 * everything from the exhausted set. Not decrementing ensures that.
8673 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8674 * decrementing to 0 insures that we look at the remainder of the
8675 * non-exhausted set */
8676 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8677 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8682 /* The final length is what we've output so far, plus what else is about to
8683 * be output. (If 'count' is non-zero, then the input list we exhausted
8684 * has everything remaining up to the machine's limit in its set, and hence
8685 * in the union, so there will be no further output. */
8688 /* At most one of the subexpressions will be non-zero */
8689 len_u += (len_a - i_a) + (len_b - i_b);
8692 /* Set result to final length, which can change the pointer to array_u, so
8694 if (len_u != _invlist_len(u)) {
8695 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8697 array_u = invlist_array(u);
8700 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8701 * the other) ended with everything above it not in its set. That means
8702 * that the remaining part of the union is precisely the same as the
8703 * non-exhausted list, so can just copy it unchanged. (If both list were
8704 * exhausted at the same time, then the operations below will be both 0.)
8707 IV copy_count; /* At most one will have a non-zero copy count */
8708 if ((copy_count = len_a - i_a) > 0) {
8709 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8711 else if ((copy_count = len_b - i_b) > 0) {
8712 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8716 /* We may be removing a reference to one of the inputs. If so, the output
8717 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8718 * count decremented) */
8719 if (a == *output || b == *output) {
8720 assert(! invlist_is_iterating(*output));
8721 if ((SvTEMP(*output))) {
8725 SvREFCNT_dec_NN(*output);
8735 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8736 const bool complement_b, SV** i)
8738 /* Take the intersection of two inversion lists and point <i> to it. *i
8739 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8740 * the reference count to that list will be decremented if not already a
8741 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8742 * The first list, <a>, may be NULL, in which case an empty list is
8743 * returned. If <complement_b> is TRUE, the result will be the
8744 * intersection of <a> and the complement (or inversion) of <b> instead of
8747 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8748 * Richard Gillam, published by Addison-Wesley, and explained at some
8749 * length there. The preface says to incorporate its examples into your
8750 * code at your own risk. In fact, it had bugs
8752 * The algorithm is like a merge sort, and is essentially the same as the
8756 const UV* array_a; /* a's array */
8758 UV len_a; /* length of a's array */
8761 SV* r; /* the resulting intersection */
8765 UV i_a = 0; /* current index into a's array */
8769 /* running count, as explained in the algorithm source book; items are
8770 * stopped accumulating and are output when the count changes to/from 2.
8771 * The count is incremented when we start a range that's in the set, and
8772 * decremented when we start a range that's not in the set. So its range
8773 * is 0 to 2. Only when the count is 2 is something in the intersection.
8777 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8780 /* Special case if either one is empty */
8781 len_a = (a == NULL) ? 0 : _invlist_len(a);
8782 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8783 bool make_temp = FALSE;
8785 if (len_a != 0 && complement_b) {
8787 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8788 * be empty. Here, also we are using 'b's complement, which hence
8789 * must be every possible code point. Thus the intersection is
8793 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8798 *i = invlist_clone(a);
8800 /* else *i is already 'a' */
8808 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8809 * intersection must be empty */
8811 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8816 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8820 *i = _new_invlist(0);
8828 /* Here both lists exist and are non-empty */
8829 array_a = invlist_array(a);
8830 array_b = invlist_array(b);
8832 /* If are to take the intersection of 'a' with the complement of b, set it
8833 * up so are looking at b's complement. */
8836 /* To complement, we invert: if the first element is 0, remove it. To
8837 * do this, we just pretend the array starts one later */
8838 if (array_b[0] == 0) {
8844 /* But if the first element is not zero, we pretend the list starts
8845 * at the 0 that is always stored immediately before the array. */
8851 /* Size the intersection for the worst case: that the intersection ends up
8852 * fragmenting everything to be completely disjoint */
8853 r= _new_invlist(len_a + len_b);
8855 /* Will contain U+0000 iff both components do */
8856 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8857 && len_b > 0 && array_b[0] == 0);
8859 /* Go through each list item by item, stopping when exhausted one of
8861 while (i_a < len_a && i_b < len_b) {
8862 UV cp; /* The element to potentially add to the intersection's
8864 bool cp_in_set; /* Is it in the input list's set or not */
8866 /* We need to take one or the other of the two inputs for the
8867 * intersection. Since we are merging two sorted lists, we take the
8868 * smaller of the next items. In case of a tie, we take the one that
8869 * is not in its set first (a difference from the union algorithm). If
8870 * we took one in the set first, it would increment the count, possibly
8871 * to 2 which would cause it to be output as starting a range in the
8872 * intersection, and the next time through we would take that same
8873 * number, and output it again as ending the set. By doing it the
8874 * opposite of this, there is no possibility that the count will be
8875 * momentarily incremented to 2. (In a tie and both are in the set or
8876 * both not in the set, it doesn't matter which we take first.) */
8877 if (array_a[i_a] < array_b[i_b]
8878 || (array_a[i_a] == array_b[i_b]
8879 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8881 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8885 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8889 /* Here, have chosen which of the two inputs to look at. Only output
8890 * if the running count changes to/from 2, which marks the
8891 * beginning/end of a range that's in the intersection */
8895 array_r[i_r++] = cp;
8900 array_r[i_r++] = cp;
8906 /* Here, we are finished going through at least one of the lists, which
8907 * means there is something remaining in at most one. We check if the list
8908 * that has been exhausted is positioned such that we are in the middle
8909 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8910 * the ones we care about.) There are four cases:
8911 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8912 * nothing left in the intersection.
8913 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8914 * above 2. What should be output is exactly that which is in the
8915 * non-exhausted set, as everything it has is also in the intersection
8916 * set, and everything it doesn't have can't be in the intersection
8917 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8918 * gets incremented to 2. Like the previous case, the intersection is
8919 * everything that remains in the non-exhausted set.
8920 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8921 * remains 1. And the intersection has nothing more. */
8922 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8923 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8928 /* The final length is what we've output so far plus what else is in the
8929 * intersection. At most one of the subexpressions below will be non-zero
8933 len_r += (len_a - i_a) + (len_b - i_b);
8936 /* Set result to final length, which can change the pointer to array_r, so
8938 if (len_r != _invlist_len(r)) {
8939 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8941 array_r = invlist_array(r);
8944 /* Finish outputting any remaining */
8945 if (count >= 2) { /* At most one will have a non-zero copy count */
8947 if ((copy_count = len_a - i_a) > 0) {
8948 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8950 else if ((copy_count = len_b - i_b) > 0) {
8951 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8955 /* We may be removing a reference to one of the inputs. If so, the output
8956 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8957 * count decremented) */
8958 if (a == *i || b == *i) {
8959 assert(! invlist_is_iterating(*i));
8964 SvREFCNT_dec_NN(*i);
8974 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8976 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8977 * set. A pointer to the inversion list is returned. This may actually be
8978 * a new list, in which case the passed in one has been destroyed. The
8979 * passed-in inversion list can be NULL, in which case a new one is created
8980 * with just the one range in it */
8985 if (invlist == NULL) {
8986 invlist = _new_invlist(2);
8990 len = _invlist_len(invlist);
8993 /* If comes after the final entry actually in the list, can just append it
8996 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8997 && start >= invlist_array(invlist)[len - 1]))
8999 _append_range_to_invlist(invlist, start, end);
9003 /* Here, can't just append things, create and return a new inversion list
9004 * which is the union of this range and the existing inversion list. (If
9005 * the new range is well-behaved wrt to the old one, we could just insert
9006 * it, doing a Move() down on the tail of the old one (potentially growing
9007 * it first). But to determine that means we would have the extra
9008 * (possibly throw-away) work of first finding where the new one goes and
9009 * whether it disrupts (splits) an existing range, so it doesn't appear to
9010 * me (khw) that it's worth it) */
9011 range_invlist = _new_invlist(2);
9012 _append_range_to_invlist(range_invlist, start, end);
9014 _invlist_union(invlist, range_invlist, &invlist);
9016 /* The temporary can be freed */
9017 SvREFCNT_dec_NN(range_invlist);
9023 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9024 UV** other_elements_ptr)
9026 /* Create and return an inversion list whose contents are to be populated
9027 * by the caller. The caller gives the number of elements (in 'size') and
9028 * the very first element ('element0'). This function will set
9029 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9032 * Obviously there is some trust involved that the caller will properly
9033 * fill in the other elements of the array.
9035 * (The first element needs to be passed in, as the underlying code does
9036 * things differently depending on whether it is zero or non-zero) */
9038 SV* invlist = _new_invlist(size);
9041 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9043 _append_range_to_invlist(invlist, element0, element0);
9044 offset = *get_invlist_offset_addr(invlist);
9046 invlist_set_len(invlist, size, offset);
9047 *other_elements_ptr = invlist_array(invlist) + 1;
9053 PERL_STATIC_INLINE SV*
9054 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9055 return _add_range_to_invlist(invlist, cp, cp);
9058 #ifndef PERL_IN_XSUB_RE
9060 Perl__invlist_invert(pTHX_ SV* const invlist)
9062 /* Complement the input inversion list. This adds a 0 if the list didn't
9063 * have a zero; removes it otherwise. As described above, the data
9064 * structure is set up so that this is very efficient */
9066 PERL_ARGS_ASSERT__INVLIST_INVERT;
9068 assert(! invlist_is_iterating(invlist));
9070 /* The inverse of matching nothing is matching everything */
9071 if (_invlist_len(invlist) == 0) {
9072 _append_range_to_invlist(invlist, 0, UV_MAX);
9076 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9081 PERL_STATIC_INLINE SV*
9082 S_invlist_clone(pTHX_ SV* const invlist)
9085 /* Return a new inversion list that is a copy of the input one, which is
9086 * unchanged. The new list will not be mortal even if the old one was. */
9088 /* Need to allocate extra space to accommodate Perl's addition of a
9089 * trailing NUL to SvPV's, since it thinks they are always strings */
9090 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9091 STRLEN physical_length = SvCUR(invlist);
9092 bool offset = *(get_invlist_offset_addr(invlist));
9094 PERL_ARGS_ASSERT_INVLIST_CLONE;
9096 *(get_invlist_offset_addr(new_invlist)) = offset;
9097 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9098 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9103 PERL_STATIC_INLINE STRLEN*
9104 S_get_invlist_iter_addr(SV* invlist)
9106 /* Return the address of the UV that contains the current iteration
9109 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9111 assert(SvTYPE(invlist) == SVt_INVLIST);
9113 return &(((XINVLIST*) SvANY(invlist))->iterator);
9116 PERL_STATIC_INLINE void
9117 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9119 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9121 *get_invlist_iter_addr(invlist) = 0;
9124 PERL_STATIC_INLINE void
9125 S_invlist_iterfinish(SV* invlist)
9127 /* Terminate iterator for invlist. This is to catch development errors.
9128 * Any iteration that is interrupted before completed should call this
9129 * function. Functions that add code points anywhere else but to the end
9130 * of an inversion list assert that they are not in the middle of an
9131 * iteration. If they were, the addition would make the iteration
9132 * problematical: if the iteration hadn't reached the place where things
9133 * were being added, it would be ok */
9135 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9137 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9141 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9143 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9144 * This call sets in <*start> and <*end>, the next range in <invlist>.
9145 * Returns <TRUE> if successful and the next call will return the next
9146 * range; <FALSE> if was already at the end of the list. If the latter,
9147 * <*start> and <*end> are unchanged, and the next call to this function
9148 * will start over at the beginning of the list */
9150 STRLEN* pos = get_invlist_iter_addr(invlist);
9151 UV len = _invlist_len(invlist);
9154 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9157 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9161 array = invlist_array(invlist);
9163 *start = array[(*pos)++];
9169 *end = array[(*pos)++] - 1;
9175 PERL_STATIC_INLINE UV
9176 S_invlist_highest(SV* const invlist)
9178 /* Returns the highest code point that matches an inversion list. This API
9179 * has an ambiguity, as it returns 0 under either the highest is actually
9180 * 0, or if the list is empty. If this distinction matters to you, check
9181 * for emptiness before calling this function */
9183 UV len = _invlist_len(invlist);
9186 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9192 array = invlist_array(invlist);
9194 /* The last element in the array in the inversion list always starts a
9195 * range that goes to infinity. That range may be for code points that are
9196 * matched in the inversion list, or it may be for ones that aren't
9197 * matched. In the latter case, the highest code point in the set is one
9198 * less than the beginning of this range; otherwise it is the final element
9199 * of this range: infinity */
9200 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9202 : array[len - 1] - 1;
9205 #ifndef PERL_IN_XSUB_RE
9207 Perl__invlist_contents(pTHX_ SV* const invlist)
9209 /* Get the contents of an inversion list into a string SV so that they can
9210 * be printed out. It uses the format traditionally done for debug tracing
9214 SV* output = newSVpvs("\n");
9216 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9218 assert(! invlist_is_iterating(invlist));
9220 invlist_iterinit(invlist);
9221 while (invlist_iternext(invlist, &start, &end)) {
9222 if (end == UV_MAX) {
9223 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9225 else if (end != start) {
9226 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9230 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9238 #ifndef PERL_IN_XSUB_RE
9240 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9241 const char * const indent, SV* const invlist)
9243 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9244 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9245 * the string 'indent'. The output looks like this:
9246 [0] 0x000A .. 0x000D
9248 [4] 0x2028 .. 0x2029
9249 [6] 0x3104 .. INFINITY
9250 * This means that the first range of code points matched by the list are
9251 * 0xA through 0xD; the second range contains only the single code point
9252 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9253 * are used to define each range (except if the final range extends to
9254 * infinity, only a single element is needed). The array index of the
9255 * first element for the corresponding range is given in brackets. */
9260 PERL_ARGS_ASSERT__INVLIST_DUMP;
9262 if (invlist_is_iterating(invlist)) {
9263 Perl_dump_indent(aTHX_ level, file,
9264 "%sCan't dump inversion list because is in middle of iterating\n",
9269 invlist_iterinit(invlist);
9270 while (invlist_iternext(invlist, &start, &end)) {
9271 if (end == UV_MAX) {
9272 Perl_dump_indent(aTHX_ level, file,
9273 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9274 indent, (UV)count, start);
9276 else if (end != start) {
9277 Perl_dump_indent(aTHX_ level, file,
9278 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9279 indent, (UV)count, start, end);
9282 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9283 indent, (UV)count, start);
9290 Perl__load_PL_utf8_foldclosures (pTHX)
9292 assert(! PL_utf8_foldclosures);
9294 /* If the folds haven't been read in, call a fold function
9296 if (! PL_utf8_tofold) {
9297 U8 dummy[UTF8_MAXBYTES_CASE+1];
9299 /* This string is just a short named one above \xff */
9300 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9301 assert(PL_utf8_tofold); /* Verify that worked */
9303 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9307 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9309 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9311 /* Return a boolean as to if the two passed in inversion lists are
9312 * identical. The final argument, if TRUE, says to take the complement of
9313 * the second inversion list before doing the comparison */
9315 const UV* array_a = invlist_array(a);
9316 const UV* array_b = invlist_array(b);
9317 UV len_a = _invlist_len(a);
9318 UV len_b = _invlist_len(b);
9320 UV i = 0; /* current index into the arrays */
9321 bool retval = TRUE; /* Assume are identical until proven otherwise */
9323 PERL_ARGS_ASSERT__INVLISTEQ;
9325 /* If are to compare 'a' with the complement of b, set it
9326 * up so are looking at b's complement. */
9329 /* The complement of nothing is everything, so <a> would have to have
9330 * just one element, starting at zero (ending at infinity) */
9332 return (len_a == 1 && array_a[0] == 0);
9334 else if (array_b[0] == 0) {
9336 /* Otherwise, to complement, we invert. Here, the first element is
9337 * 0, just remove it. To do this, we just pretend the array starts
9345 /* But if the first element is not zero, we pretend the list starts
9346 * at the 0 that is always stored immediately before the array. */
9352 /* Make sure that the lengths are the same, as well as the final element
9353 * before looping through the remainder. (Thus we test the length, final,
9354 * and first elements right off the bat) */
9355 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9358 else for (i = 0; i < len_a - 1; i++) {
9359 if (array_a[i] != array_b[i]) {
9370 * As best we can, determine the characters that can match the start of
9371 * the given EXACTF-ish node.
9373 * Returns the invlist as a new SV*; it is the caller's responsibility to
9374 * call SvREFCNT_dec() when done with it.
9377 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9379 const U8 * s = (U8*)STRING(node);
9380 SSize_t bytelen = STR_LEN(node);
9382 /* Start out big enough for 2 separate code points */
9383 SV* invlist = _new_invlist(4);
9385 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9390 /* We punt and assume can match anything if the node begins
9391 * with a multi-character fold. Things are complicated. For
9392 * example, /ffi/i could match any of:
9393 * "\N{LATIN SMALL LIGATURE FFI}"
9394 * "\N{LATIN SMALL LIGATURE FF}I"
9395 * "F\N{LATIN SMALL LIGATURE FI}"
9396 * plus several other things; and making sure we have all the
9397 * possibilities is hard. */
9398 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9399 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9402 /* Any Latin1 range character can potentially match any
9403 * other depending on the locale */
9404 if (OP(node) == EXACTFL) {
9405 _invlist_union(invlist, PL_Latin1, &invlist);
9408 /* But otherwise, it matches at least itself. We can
9409 * quickly tell if it has a distinct fold, and if so,
9410 * it matches that as well */
9411 invlist = add_cp_to_invlist(invlist, uc);
9412 if (IS_IN_SOME_FOLD_L1(uc))
9413 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9416 /* Some characters match above-Latin1 ones under /i. This
9417 * is true of EXACTFL ones when the locale is UTF-8 */
9418 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9419 && (! isASCII(uc) || (OP(node) != EXACTFA
9420 && OP(node) != EXACTFA_NO_TRIE)))
9422 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9426 else { /* Pattern is UTF-8 */
9427 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9428 STRLEN foldlen = UTF8SKIP(s);
9429 const U8* e = s + bytelen;
9432 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9434 /* The only code points that aren't folded in a UTF EXACTFish
9435 * node are are the problematic ones in EXACTFL nodes */
9436 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9437 /* We need to check for the possibility that this EXACTFL
9438 * node begins with a multi-char fold. Therefore we fold
9439 * the first few characters of it so that we can make that
9444 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9446 *(d++) = (U8) toFOLD(*s);
9451 to_utf8_fold(s, d, &len);
9457 /* And set up so the code below that looks in this folded
9458 * buffer instead of the node's string */
9460 foldlen = UTF8SKIP(folded);
9464 /* When we reach here 's' points to the fold of the first
9465 * character(s) of the node; and 'e' points to far enough along
9466 * the folded string to be just past any possible multi-char
9467 * fold. 'foldlen' is the length in bytes of the first
9470 * Unlike the non-UTF-8 case, the macro for determining if a
9471 * string is a multi-char fold requires all the characters to
9472 * already be folded. This is because of all the complications
9473 * if not. Note that they are folded anyway, except in EXACTFL
9474 * nodes. Like the non-UTF case above, we punt if the node
9475 * begins with a multi-char fold */
9477 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9478 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9480 else { /* Single char fold */
9482 /* It matches all the things that fold to it, which are
9483 * found in PL_utf8_foldclosures (including itself) */
9484 invlist = add_cp_to_invlist(invlist, uc);
9485 if (! PL_utf8_foldclosures)
9486 _load_PL_utf8_foldclosures();
9487 if ((listp = hv_fetch(PL_utf8_foldclosures,
9488 (char *) s, foldlen, FALSE)))
9490 AV* list = (AV*) *listp;
9492 for (k = 0; k <= av_tindex(list); k++) {
9493 SV** c_p = av_fetch(list, k, FALSE);
9499 /* /aa doesn't allow folds between ASCII and non- */
9500 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9501 && isASCII(c) != isASCII(uc))
9506 invlist = add_cp_to_invlist(invlist, c);
9515 #undef HEADER_LENGTH
9516 #undef TO_INTERNAL_SIZE
9517 #undef FROM_INTERNAL_SIZE
9518 #undef INVLIST_VERSION_ID
9520 /* End of inversion list object */
9523 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9525 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9526 * constructs, and updates RExC_flags with them. On input, RExC_parse
9527 * should point to the first flag; it is updated on output to point to the
9528 * final ')' or ':'. There needs to be at least one flag, or this will
9531 /* for (?g), (?gc), and (?o) warnings; warning
9532 about (?c) will warn about (?g) -- japhy */
9534 #define WASTED_O 0x01
9535 #define WASTED_G 0x02
9536 #define WASTED_C 0x04
9537 #define WASTED_GC (WASTED_G|WASTED_C)
9538 I32 wastedflags = 0x00;
9539 U32 posflags = 0, negflags = 0;
9540 U32 *flagsp = &posflags;
9541 char has_charset_modifier = '\0';
9543 bool has_use_defaults = FALSE;
9544 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9545 int x_mod_count = 0;
9547 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9549 /* '^' as an initial flag sets certain defaults */
9550 if (UCHARAT(RExC_parse) == '^') {
9552 has_use_defaults = TRUE;
9553 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9554 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9555 ? REGEX_UNICODE_CHARSET
9556 : REGEX_DEPENDS_CHARSET);
9559 cs = get_regex_charset(RExC_flags);
9560 if (cs == REGEX_DEPENDS_CHARSET
9561 && (RExC_utf8 || RExC_uni_semantics))
9563 cs = REGEX_UNICODE_CHARSET;
9566 while (*RExC_parse) {
9567 /* && strchr("iogcmsx", *RExC_parse) */
9568 /* (?g), (?gc) and (?o) are useless here
9569 and must be globally applied -- japhy */
9570 switch (*RExC_parse) {
9572 /* Code for the imsxn flags */
9573 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9575 case LOCALE_PAT_MOD:
9576 if (has_charset_modifier) {
9577 goto excess_modifier;
9579 else if (flagsp == &negflags) {
9582 cs = REGEX_LOCALE_CHARSET;
9583 has_charset_modifier = LOCALE_PAT_MOD;
9585 case UNICODE_PAT_MOD:
9586 if (has_charset_modifier) {
9587 goto excess_modifier;
9589 else if (flagsp == &negflags) {
9592 cs = REGEX_UNICODE_CHARSET;
9593 has_charset_modifier = UNICODE_PAT_MOD;
9595 case ASCII_RESTRICT_PAT_MOD:
9596 if (flagsp == &negflags) {
9599 if (has_charset_modifier) {
9600 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9601 goto excess_modifier;
9603 /* Doubled modifier implies more restricted */
9604 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9607 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9609 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9611 case DEPENDS_PAT_MOD:
9612 if (has_use_defaults) {
9613 goto fail_modifiers;
9615 else if (flagsp == &negflags) {
9618 else if (has_charset_modifier) {
9619 goto excess_modifier;
9622 /* The dual charset means unicode semantics if the
9623 * pattern (or target, not known until runtime) are
9624 * utf8, or something in the pattern indicates unicode
9626 cs = (RExC_utf8 || RExC_uni_semantics)
9627 ? REGEX_UNICODE_CHARSET
9628 : REGEX_DEPENDS_CHARSET;
9629 has_charset_modifier = DEPENDS_PAT_MOD;
9633 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9634 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9636 else if (has_charset_modifier == *(RExC_parse - 1)) {
9637 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9641 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9643 NOT_REACHED; /*NOTREACHED*/
9646 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9648 NOT_REACHED; /*NOTREACHED*/
9649 case ONCE_PAT_MOD: /* 'o' */
9650 case GLOBAL_PAT_MOD: /* 'g' */
9651 if (PASS2 && ckWARN(WARN_REGEXP)) {
9652 const I32 wflagbit = *RExC_parse == 'o'
9655 if (! (wastedflags & wflagbit) ) {
9656 wastedflags |= wflagbit;
9657 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9660 "Useless (%s%c) - %suse /%c modifier",
9661 flagsp == &negflags ? "?-" : "?",
9663 flagsp == &negflags ? "don't " : "",
9670 case CONTINUE_PAT_MOD: /* 'c' */
9671 if (PASS2 && ckWARN(WARN_REGEXP)) {
9672 if (! (wastedflags & WASTED_C) ) {
9673 wastedflags |= WASTED_GC;
9674 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9677 "Useless (%sc) - %suse /gc modifier",
9678 flagsp == &negflags ? "?-" : "?",
9679 flagsp == &negflags ? "don't " : ""
9684 case KEEPCOPY_PAT_MOD: /* 'p' */
9685 if (flagsp == &negflags) {
9687 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9689 *flagsp |= RXf_PMf_KEEPCOPY;
9693 /* A flag is a default iff it is following a minus, so
9694 * if there is a minus, it means will be trying to
9695 * re-specify a default which is an error */
9696 if (has_use_defaults || flagsp == &negflags) {
9697 goto fail_modifiers;
9700 wastedflags = 0; /* reset so (?g-c) warns twice */
9704 RExC_flags |= posflags;
9705 RExC_flags &= ~negflags;
9706 set_regex_charset(&RExC_flags, cs);
9707 if (RExC_flags & RXf_PMf_FOLD) {
9708 RExC_contains_i = 1;
9711 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9717 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9718 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9719 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9720 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9721 NOT_REACHED; /*NOTREACHED*/
9728 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9733 - reg - regular expression, i.e. main body or parenthesized thing
9735 * Caller must absorb opening parenthesis.
9737 * Combining parenthesis handling with the base level of regular expression
9738 * is a trifle forced, but the need to tie the tails of the branches to what
9739 * follows makes it hard to avoid.
9741 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9743 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9745 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9748 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9749 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9750 needs to be restarted.
9751 Otherwise would only return NULL if regbranch() returns NULL, which
9754 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9755 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9756 * 2 is like 1, but indicates that nextchar() has been called to advance
9757 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9758 * this flag alerts us to the need to check for that */
9760 regnode *ret; /* Will be the head of the group. */
9763 regnode *ender = NULL;
9766 U32 oregflags = RExC_flags;
9767 bool have_branch = 0;
9769 I32 freeze_paren = 0;
9770 I32 after_freeze = 0;
9771 I32 num; /* numeric backreferences */
9773 char * parse_start = RExC_parse; /* MJD */
9774 char * const oregcomp_parse = RExC_parse;
9776 GET_RE_DEBUG_FLAGS_DECL;
9778 PERL_ARGS_ASSERT_REG;
9779 DEBUG_PARSE("reg ");
9781 *flagp = 0; /* Tentatively. */
9784 /* Make an OPEN node, if parenthesized. */
9787 /* Under /x, space and comments can be gobbled up between the '(' and
9788 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9789 * intervening space, as the sequence is a token, and a token should be
9791 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9793 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9794 char *start_verb = RExC_parse;
9795 STRLEN verb_len = 0;
9796 char *start_arg = NULL;
9797 unsigned char op = 0;
9799 int internal_argval = 0; /* internal_argval is only useful if
9802 if (has_intervening_patws) {
9804 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9806 while ( *RExC_parse && *RExC_parse != ')' ) {
9807 if ( *RExC_parse == ':' ) {
9808 start_arg = RExC_parse + 1;
9814 verb_len = RExC_parse - start_verb;
9817 while ( *RExC_parse && *RExC_parse != ')' )
9819 if ( *RExC_parse != ')' )
9820 vFAIL("Unterminated verb pattern argument");
9821 if ( RExC_parse == start_arg )
9824 if ( *RExC_parse != ')' )
9825 vFAIL("Unterminated verb pattern");
9828 switch ( *start_verb ) {
9829 case 'A': /* (*ACCEPT) */
9830 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9832 internal_argval = RExC_nestroot;
9835 case 'C': /* (*COMMIT) */
9836 if ( memEQs(start_verb,verb_len,"COMMIT") )
9839 case 'F': /* (*FAIL) */
9840 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9845 case ':': /* (*:NAME) */
9846 case 'M': /* (*MARK:NAME) */
9847 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9852 case 'P': /* (*PRUNE) */
9853 if ( memEQs(start_verb,verb_len,"PRUNE") )
9856 case 'S': /* (*SKIP) */
9857 if ( memEQs(start_verb,verb_len,"SKIP") )
9860 case 'T': /* (*THEN) */
9861 /* [19:06] <TimToady> :: is then */
9862 if ( memEQs(start_verb,verb_len,"THEN") ) {
9864 RExC_seen |= REG_CUTGROUP_SEEN;
9869 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9871 "Unknown verb pattern '%"UTF8f"'",
9872 UTF8fARG(UTF, verb_len, start_verb));
9875 if ( start_arg && internal_argval ) {
9876 vFAIL3("Verb pattern '%.*s' may not have an argument",
9877 verb_len, start_verb);
9878 } else if ( argok < 0 && !start_arg ) {
9879 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9880 verb_len, start_verb);
9882 ret = reganode(pRExC_state, op, internal_argval);
9883 if ( ! internal_argval && ! SIZE_ONLY ) {
9885 SV *sv = newSVpvn( start_arg,
9886 RExC_parse - start_arg);
9887 ARG(ret) = add_data( pRExC_state,
9889 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9896 if (!internal_argval)
9897 RExC_seen |= REG_VERBARG_SEEN;
9898 } else if ( start_arg ) {
9899 vFAIL3("Verb pattern '%.*s' may not have an argument",
9900 verb_len, start_verb);
9902 ret = reg_node(pRExC_state, op);
9904 nextchar(pRExC_state);
9907 else if (*RExC_parse == '?') { /* (?...) */
9908 bool is_logical = 0;
9909 const char * const seqstart = RExC_parse;
9910 const char * endptr;
9911 if (has_intervening_patws) {
9913 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9917 paren = *RExC_parse++;
9918 ret = NULL; /* For look-ahead/behind. */
9921 case 'P': /* (?P...) variants for those used to PCRE/Python */
9922 paren = *RExC_parse++;
9923 if ( paren == '<') /* (?P<...>) named capture */
9925 else if (paren == '>') { /* (?P>name) named recursion */
9926 goto named_recursion;
9928 else if (paren == '=') { /* (?P=...) named backref */
9929 /* this pretty much dupes the code for \k<NAME> in
9930 * regatom(), if you change this make sure you change that
9932 char* name_start = RExC_parse;
9934 SV *sv_dat = reg_scan_name(pRExC_state,
9935 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9936 if (RExC_parse == name_start || *RExC_parse != ')')
9937 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9938 vFAIL2("Sequence %.3s... not terminated",parse_start);
9941 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9942 RExC_rxi->data->data[num]=(void*)sv_dat;
9943 SvREFCNT_inc_simple_void(sv_dat);
9946 ret = reganode(pRExC_state,
9949 : (ASCII_FOLD_RESTRICTED)
9951 : (AT_LEAST_UNI_SEMANTICS)
9959 Set_Node_Offset(ret, parse_start+1);
9960 Set_Node_Cur_Length(ret, parse_start);
9962 nextchar(pRExC_state);
9966 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9967 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9968 vFAIL3("Sequence (%.*s...) not recognized",
9969 RExC_parse-seqstart, seqstart);
9970 NOT_REACHED; /*NOTREACHED*/
9971 case '<': /* (?<...) */
9972 if (*RExC_parse == '!')
9974 else if (*RExC_parse != '=')
9980 case '\'': /* (?'...') */
9981 name_start= RExC_parse;
9982 svname = reg_scan_name(pRExC_state,
9983 SIZE_ONLY /* reverse test from the others */
9984 ? REG_RSN_RETURN_NAME
9985 : REG_RSN_RETURN_NULL);
9986 if (RExC_parse == name_start || *RExC_parse != paren)
9987 vFAIL2("Sequence (?%c... not terminated",
9988 paren=='>' ? '<' : paren);
9992 if (!svname) /* shouldn't happen */
9994 "panic: reg_scan_name returned NULL");
9995 if (!RExC_paren_names) {
9996 RExC_paren_names= newHV();
9997 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9999 RExC_paren_name_list= newAV();
10000 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10003 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10005 sv_dat = HeVAL(he_str);
10007 /* croak baby croak */
10009 "panic: paren_name hash element allocation failed");
10010 } else if ( SvPOK(sv_dat) ) {
10011 /* (?|...) can mean we have dupes so scan to check
10012 its already been stored. Maybe a flag indicating
10013 we are inside such a construct would be useful,
10014 but the arrays are likely to be quite small, so
10015 for now we punt -- dmq */
10016 IV count = SvIV(sv_dat);
10017 I32 *pv = (I32*)SvPVX(sv_dat);
10019 for ( i = 0 ; i < count ; i++ ) {
10020 if ( pv[i] == RExC_npar ) {
10026 pv = (I32*)SvGROW(sv_dat,
10027 SvCUR(sv_dat) + sizeof(I32)+1);
10028 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10029 pv[count] = RExC_npar;
10030 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10033 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10034 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10037 SvIV_set(sv_dat, 1);
10040 /* Yes this does cause a memory leak in debugging Perls
10042 if (!av_store(RExC_paren_name_list,
10043 RExC_npar, SvREFCNT_inc(svname)))
10044 SvREFCNT_dec_NN(svname);
10047 /*sv_dump(sv_dat);*/
10049 nextchar(pRExC_state);
10051 goto capturing_parens;
10053 RExC_seen |= REG_LOOKBEHIND_SEEN;
10054 RExC_in_lookbehind++;
10057 case '=': /* (?=...) */
10058 RExC_seen_zerolen++;
10060 case '!': /* (?!...) */
10061 RExC_seen_zerolen++;
10062 /* check if we're really just a "FAIL" assertion */
10064 nextchar(pRExC_state);
10065 if (*RExC_parse == ')') {
10066 ret=reg_node(pRExC_state, OPFAIL);
10067 nextchar(pRExC_state);
10071 case '|': /* (?|...) */
10072 /* branch reset, behave like a (?:...) except that
10073 buffers in alternations share the same numbers */
10075 after_freeze = freeze_paren = RExC_npar;
10077 case ':': /* (?:...) */
10078 case '>': /* (?>...) */
10080 case '$': /* (?$...) */
10081 case '@': /* (?@...) */
10082 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10084 case '0' : /* (?0) */
10085 case 'R' : /* (?R) */
10086 if (*RExC_parse != ')')
10087 FAIL("Sequence (?R) not terminated");
10088 ret = reg_node(pRExC_state, GOSTART);
10089 RExC_seen |= REG_GOSTART_SEEN;
10090 *flagp |= POSTPONED;
10091 nextchar(pRExC_state);
10094 /* named and numeric backreferences */
10095 case '&': /* (?&NAME) */
10096 parse_start = RExC_parse - 1;
10099 SV *sv_dat = reg_scan_name(pRExC_state,
10100 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10101 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10103 if (RExC_parse == RExC_end || *RExC_parse != ')')
10104 vFAIL("Sequence (?&... not terminated");
10105 goto gen_recurse_regop;
10108 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10110 vFAIL("Illegal pattern");
10112 goto parse_recursion;
10114 case '-': /* (?-1) */
10115 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10116 RExC_parse--; /* rewind to let it be handled later */
10120 case '1': case '2': case '3': case '4': /* (?1) */
10121 case '5': case '6': case '7': case '8': case '9':
10125 bool is_neg = FALSE;
10127 parse_start = RExC_parse - 1; /* MJD */
10128 if (*RExC_parse == '-') {
10132 if (grok_atoUV(RExC_parse, &unum, &endptr)
10136 RExC_parse = (char*)endptr;
10140 /* Some limit for num? */
10144 if (*RExC_parse!=')')
10145 vFAIL("Expecting close bracket");
10148 if ( paren == '-' ) {
10150 Diagram of capture buffer numbering.
10151 Top line is the normal capture buffer numbers
10152 Bottom line is the negative indexing as from
10156 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10160 num = RExC_npar + num;
10163 vFAIL("Reference to nonexistent group");
10165 } else if ( paren == '+' ) {
10166 num = RExC_npar + num - 1;
10169 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10171 if (num > (I32)RExC_rx->nparens) {
10173 vFAIL("Reference to nonexistent group");
10175 RExC_recurse_count++;
10176 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10177 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10178 22, "| |", (int)(depth * 2 + 1), "",
10179 (UV)ARG(ret), (IV)ARG2L(ret)));
10181 RExC_seen |= REG_RECURSE_SEEN;
10182 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10183 Set_Node_Offset(ret, parse_start); /* MJD */
10185 *flagp |= POSTPONED;
10186 nextchar(pRExC_state);
10191 case '?': /* (??...) */
10193 if (*RExC_parse != '{') {
10194 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10195 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10197 "Sequence (%"UTF8f"...) not recognized",
10198 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10199 NOT_REACHED; /*NOTREACHED*/
10201 *flagp |= POSTPONED;
10202 paren = *RExC_parse++;
10204 case '{': /* (?{...}) */
10207 struct reg_code_block *cb;
10209 RExC_seen_zerolen++;
10211 if ( !pRExC_state->num_code_blocks
10212 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10213 || pRExC_state->code_blocks[pRExC_state->code_index].start
10214 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10217 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10218 FAIL("panic: Sequence (?{...}): no code block found\n");
10219 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10221 /* this is a pre-compiled code block (?{...}) */
10222 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10223 RExC_parse = RExC_start + cb->end;
10226 if (cb->src_regex) {
10227 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10228 RExC_rxi->data->data[n] =
10229 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10230 RExC_rxi->data->data[n+1] = (void*)o;
10233 n = add_data(pRExC_state,
10234 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10235 RExC_rxi->data->data[n] = (void*)o;
10238 pRExC_state->code_index++;
10239 nextchar(pRExC_state);
10243 ret = reg_node(pRExC_state, LOGICAL);
10245 eval = reg2Lanode(pRExC_state, EVAL,
10248 /* for later propagation into (??{})
10250 RExC_flags & RXf_PMf_COMPILETIME
10255 REGTAIL(pRExC_state, ret, eval);
10256 /* deal with the length of this later - MJD */
10259 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10260 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10261 Set_Node_Offset(ret, parse_start);
10264 case '(': /* (?(?{...})...) and (?(?=...)...) */
10267 const int DEFINE_len = sizeof("DEFINE") - 1;
10268 if (RExC_parse[0] == '?') { /* (?(?...)) */
10269 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10270 || RExC_parse[1] == '<'
10271 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10275 ret = reg_node(pRExC_state, LOGICAL);
10279 tail = reg(pRExC_state, 1, &flag, depth+1);
10280 if (flag & RESTART_UTF8) {
10281 *flagp = RESTART_UTF8;
10284 REGTAIL(pRExC_state, ret, tail);
10287 /* Fall through to ‘Unknown switch condition’ at the
10288 end of the if/else chain. */
10290 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10291 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10293 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10294 char *name_start= RExC_parse++;
10296 SV *sv_dat=reg_scan_name(pRExC_state,
10297 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10298 if (RExC_parse == name_start || *RExC_parse != ch)
10299 vFAIL2("Sequence (?(%c... not terminated",
10300 (ch == '>' ? '<' : ch));
10303 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10304 RExC_rxi->data->data[num]=(void*)sv_dat;
10305 SvREFCNT_inc_simple_void(sv_dat);
10307 ret = reganode(pRExC_state,NGROUPP,num);
10308 goto insert_if_check_paren;
10310 else if (RExC_end - RExC_parse >= DEFINE_len
10311 && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10313 ret = reganode(pRExC_state,DEFINEP,0);
10314 RExC_parse += DEFINE_len;
10316 goto insert_if_check_paren;
10318 else if (RExC_parse[0] == 'R') {
10321 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10323 if (grok_atoUV(RExC_parse, &uv, &endptr)
10327 RExC_parse = (char*)endptr;
10329 /* else "Switch condition not recognized" below */
10330 } else if (RExC_parse[0] == '&') {
10333 sv_dat = reg_scan_name(pRExC_state,
10335 ? REG_RSN_RETURN_NULL
10336 : REG_RSN_RETURN_DATA);
10337 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10339 ret = reganode(pRExC_state,INSUBP,parno);
10340 goto insert_if_check_paren;
10342 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10347 if (grok_atoUV(RExC_parse, &uv, &endptr)
10351 RExC_parse = (char*)endptr;
10353 /* XXX else what? */
10354 ret = reganode(pRExC_state, GROUPP, parno);
10356 insert_if_check_paren:
10357 if (*(tmp = nextchar(pRExC_state)) != ')') {
10358 /* nextchar also skips comments, so undo its work
10359 * and skip over the the next character.
10362 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10363 vFAIL("Switch condition not recognized");
10366 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10367 br = regbranch(pRExC_state, &flags, 1,depth+1);
10369 if (flags & RESTART_UTF8) {
10370 *flagp = RESTART_UTF8;
10373 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10376 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10378 c = *nextchar(pRExC_state);
10379 if (flags&HASWIDTH)
10380 *flagp |= HASWIDTH;
10383 vFAIL("(?(DEFINE)....) does not allow branches");
10385 /* Fake one for optimizer. */
10386 lastbr = reganode(pRExC_state, IFTHEN, 0);
10388 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10389 if (flags & RESTART_UTF8) {
10390 *flagp = RESTART_UTF8;
10393 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10396 REGTAIL(pRExC_state, ret, lastbr);
10397 if (flags&HASWIDTH)
10398 *flagp |= HASWIDTH;
10399 c = *nextchar(pRExC_state);
10404 if (RExC_parse>RExC_end)
10405 vFAIL("Switch (?(condition)... not terminated");
10407 vFAIL("Switch (?(condition)... contains too many branches");
10409 ender = reg_node(pRExC_state, TAIL);
10410 REGTAIL(pRExC_state, br, ender);
10412 REGTAIL(pRExC_state, lastbr, ender);
10413 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10416 REGTAIL(pRExC_state, ret, ender);
10417 RExC_size++; /* XXX WHY do we need this?!!
10418 For large programs it seems to be required
10419 but I can't figure out why. -- dmq*/
10422 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10423 vFAIL("Unknown switch condition (?(...))");
10425 case '[': /* (?[ ... ]) */
10426 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10429 RExC_parse--; /* for vFAIL to print correctly */
10430 vFAIL("Sequence (? incomplete");
10432 default: /* e.g., (?i) */
10435 parse_lparen_question_flags(pRExC_state);
10436 if (UCHARAT(RExC_parse) != ':') {
10438 nextchar(pRExC_state);
10443 nextchar(pRExC_state);
10448 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
10453 ret = reganode(pRExC_state, OPEN, parno);
10455 if (!RExC_nestroot)
10456 RExC_nestroot = parno;
10457 if (RExC_seen & REG_RECURSE_SEEN
10458 && !RExC_open_parens[parno-1])
10460 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10461 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10462 22, "| |", (int)(depth * 2 + 1), "",
10463 (IV)parno, REG_NODE_NUM(ret)));
10464 RExC_open_parens[parno-1]= ret;
10467 Set_Node_Length(ret, 1); /* MJD */
10468 Set_Node_Offset(ret, RExC_parse); /* MJD */
10471 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10480 /* Pick up the branches, linking them together. */
10481 parse_start = RExC_parse; /* MJD */
10482 br = regbranch(pRExC_state, &flags, 1,depth+1);
10484 /* branch_len = (paren != 0); */
10487 if (flags & RESTART_UTF8) {
10488 *flagp = RESTART_UTF8;
10491 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10493 if (*RExC_parse == '|') {
10494 if (!SIZE_ONLY && RExC_extralen) {
10495 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10498 reginsert(pRExC_state, BRANCH, br, depth+1);
10499 Set_Node_Length(br, paren != 0);
10500 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10504 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10506 else if (paren == ':') {
10507 *flagp |= flags&SIMPLE;
10509 if (is_open) { /* Starts with OPEN. */
10510 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10512 else if (paren != '?') /* Not Conditional */
10514 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10516 while (*RExC_parse == '|') {
10517 if (!SIZE_ONLY && RExC_extralen) {
10518 ender = reganode(pRExC_state, LONGJMP,0);
10520 /* Append to the previous. */
10521 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10524 RExC_extralen += 2; /* Account for LONGJMP. */
10525 nextchar(pRExC_state);
10526 if (freeze_paren) {
10527 if (RExC_npar > after_freeze)
10528 after_freeze = RExC_npar;
10529 RExC_npar = freeze_paren;
10531 br = regbranch(pRExC_state, &flags, 0, depth+1);
10534 if (flags & RESTART_UTF8) {
10535 *flagp = RESTART_UTF8;
10538 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10540 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10542 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10545 if (have_branch || paren != ':') {
10546 /* Make a closing node, and hook it on the end. */
10549 ender = reg_node(pRExC_state, TAIL);
10552 ender = reganode(pRExC_state, CLOSE, parno);
10553 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10554 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10555 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10556 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10557 RExC_close_parens[parno-1]= ender;
10558 if (RExC_nestroot == parno)
10561 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10562 Set_Node_Length(ender,1); /* MJD */
10568 *flagp &= ~HASWIDTH;
10571 ender = reg_node(pRExC_state, SUCCEED);
10574 ender = reg_node(pRExC_state, END);
10576 assert(!RExC_opend); /* there can only be one! */
10577 RExC_opend = ender;
10581 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10582 DEBUG_PARSE_MSG("lsbr");
10583 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10584 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10585 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10586 SvPV_nolen_const(RExC_mysv1),
10587 (IV)REG_NODE_NUM(lastbr),
10588 SvPV_nolen_const(RExC_mysv2),
10589 (IV)REG_NODE_NUM(ender),
10590 (IV)(ender - lastbr)
10593 REGTAIL(pRExC_state, lastbr, ender);
10595 if (have_branch && !SIZE_ONLY) {
10596 char is_nothing= 1;
10598 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10600 /* Hook the tails of the branches to the closing node. */
10601 for (br = ret; br; br = regnext(br)) {
10602 const U8 op = PL_regkind[OP(br)];
10603 if (op == BRANCH) {
10604 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10605 if ( OP(NEXTOPER(br)) != NOTHING
10606 || regnext(NEXTOPER(br)) != ender)
10609 else if (op == BRANCHJ) {
10610 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10611 /* for now we always disable this optimisation * /
10612 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10613 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10619 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10620 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10621 DEBUG_PARSE_MSG("NADA");
10622 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10623 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10624 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10625 SvPV_nolen_const(RExC_mysv1),
10626 (IV)REG_NODE_NUM(ret),
10627 SvPV_nolen_const(RExC_mysv2),
10628 (IV)REG_NODE_NUM(ender),
10633 if (OP(ender) == TAIL) {
10638 for ( opt= br + 1; opt < ender ; opt++ )
10639 OP(opt)= OPTIMIZED;
10640 NEXT_OFF(br)= ender - br;
10648 static const char parens[] = "=!<,>";
10650 if (paren && (p = strchr(parens, paren))) {
10651 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10652 int flag = (p - parens) > 1;
10655 node = SUSPEND, flag = 0;
10656 reginsert(pRExC_state, node,ret, depth+1);
10657 Set_Node_Cur_Length(ret, parse_start);
10658 Set_Node_Offset(ret, parse_start + 1);
10660 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10664 /* Check for proper termination. */
10666 /* restore original flags, but keep (?p) */
10667 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10668 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10669 RExC_parse = oregcomp_parse;
10670 vFAIL("Unmatched (");
10673 else if (!paren && RExC_parse < RExC_end) {
10674 if (*RExC_parse == ')') {
10676 vFAIL("Unmatched )");
10679 FAIL("Junk on end of regexp"); /* "Can't happen". */
10680 NOT_REACHED; /* NOTREACHED */
10683 if (RExC_in_lookbehind) {
10684 RExC_in_lookbehind--;
10686 if (after_freeze > RExC_npar)
10687 RExC_npar = after_freeze;
10692 - regbranch - one alternative of an | operator
10694 * Implements the concatenation operator.
10696 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10700 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10703 regnode *chain = NULL;
10705 I32 flags = 0, c = 0;
10706 GET_RE_DEBUG_FLAGS_DECL;
10708 PERL_ARGS_ASSERT_REGBRANCH;
10710 DEBUG_PARSE("brnc");
10715 if (!SIZE_ONLY && RExC_extralen)
10716 ret = reganode(pRExC_state, BRANCHJ,0);
10718 ret = reg_node(pRExC_state, BRANCH);
10719 Set_Node_Length(ret, 1);
10723 if (!first && SIZE_ONLY)
10724 RExC_extralen += 1; /* BRANCHJ */
10726 *flagp = WORST; /* Tentatively. */
10729 nextchar(pRExC_state);
10730 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10731 flags &= ~TRYAGAIN;
10732 latest = regpiece(pRExC_state, &flags,depth+1);
10733 if (latest == NULL) {
10734 if (flags & TRYAGAIN)
10736 if (flags & RESTART_UTF8) {
10737 *flagp = RESTART_UTF8;
10740 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10742 else if (ret == NULL)
10744 *flagp |= flags&(HASWIDTH|POSTPONED);
10745 if (chain == NULL) /* First piece. */
10746 *flagp |= flags&SPSTART;
10748 /* FIXME adding one for every branch after the first is probably
10749 * excessive now we have TRIE support. (hv) */
10751 REGTAIL(pRExC_state, chain, latest);
10756 if (chain == NULL) { /* Loop ran zero times. */
10757 chain = reg_node(pRExC_state, NOTHING);
10762 *flagp |= flags&SIMPLE;
10769 - regpiece - something followed by possible [*+?]
10771 * Note that the branching code sequences used for ? and the general cases
10772 * of * and + are somewhat optimized: they use the same NOTHING node as
10773 * both the endmarker for their branch list and the body of the last branch.
10774 * It might seem that this node could be dispensed with entirely, but the
10775 * endmarker role is not redundant.
10777 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10779 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10783 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10789 const char * const origparse = RExC_parse;
10791 I32 max = REG_INFTY;
10792 #ifdef RE_TRACK_PATTERN_OFFSETS
10795 const char *maxpos = NULL;
10798 /* Save the original in case we change the emitted regop to a FAIL. */
10799 regnode * const orig_emit = RExC_emit;
10801 GET_RE_DEBUG_FLAGS_DECL;
10803 PERL_ARGS_ASSERT_REGPIECE;
10805 DEBUG_PARSE("piec");
10807 ret = regatom(pRExC_state, &flags,depth+1);
10809 if (flags & (TRYAGAIN|RESTART_UTF8))
10810 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10812 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10818 if (op == '{' && regcurly(RExC_parse)) {
10820 #ifdef RE_TRACK_PATTERN_OFFSETS
10821 parse_start = RExC_parse; /* MJD */
10823 next = RExC_parse + 1;
10824 while (isDIGIT(*next) || *next == ',') {
10825 if (*next == ',') {
10833 if (*next == '}') { /* got one */
10834 const char* endptr;
10838 if (isDIGIT(*RExC_parse)) {
10839 if (!grok_atoUV(RExC_parse, &uv, &endptr))
10840 vFAIL("Invalid quantifier in {,}");
10841 if (uv >= REG_INFTY)
10842 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10847 if (*maxpos == ',')
10850 maxpos = RExC_parse;
10851 if (isDIGIT(*maxpos)) {
10852 if (!grok_atoUV(maxpos, &uv, &endptr))
10853 vFAIL("Invalid quantifier in {,}");
10854 if (uv >= REG_INFTY)
10855 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10858 max = REG_INFTY; /* meaning "infinity" */
10861 nextchar(pRExC_state);
10862 if (max < min) { /* If can't match, warn and optimize to fail
10866 /* We can't back off the size because we have to reserve
10867 * enough space for all the things we are about to throw
10868 * away, but we can shrink it by the ammount we are about
10869 * to re-use here */
10870 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10873 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10874 RExC_emit = orig_emit;
10876 ret = reg_node(pRExC_state, OPFAIL);
10879 else if (min == max
10880 && RExC_parse < RExC_end
10881 && (*RExC_parse == '?' || *RExC_parse == '+'))
10884 ckWARN2reg(RExC_parse + 1,
10885 "Useless use of greediness modifier '%c'",
10888 /* Absorb the modifier, so later code doesn't see nor use
10890 nextchar(pRExC_state);
10894 if ((flags&SIMPLE)) {
10895 MARK_NAUGHTY_EXP(2, 2);
10896 reginsert(pRExC_state, CURLY, ret, depth+1);
10897 Set_Node_Offset(ret, parse_start+1); /* MJD */
10898 Set_Node_Cur_Length(ret, parse_start);
10901 regnode * const w = reg_node(pRExC_state, WHILEM);
10904 REGTAIL(pRExC_state, ret, w);
10905 if (!SIZE_ONLY && RExC_extralen) {
10906 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10907 reginsert(pRExC_state, NOTHING,ret, depth+1);
10908 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10910 reginsert(pRExC_state, CURLYX,ret, depth+1);
10912 Set_Node_Offset(ret, parse_start+1);
10913 Set_Node_Length(ret,
10914 op == '{' ? (RExC_parse - parse_start) : 1);
10916 if (!SIZE_ONLY && RExC_extralen)
10917 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10918 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10920 RExC_whilem_seen++, RExC_extralen += 3;
10921 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
10928 *flagp |= HASWIDTH;
10930 ARG1_SET(ret, (U16)min);
10931 ARG2_SET(ret, (U16)max);
10933 if (max == REG_INFTY)
10934 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10940 if (!ISMULT1(op)) {
10945 #if 0 /* Now runtime fix should be reliable. */
10947 /* if this is reinstated, don't forget to put this back into perldiag:
10949 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10951 (F) The part of the regexp subject to either the * or + quantifier
10952 could match an empty string. The {#} shows in the regular
10953 expression about where the problem was discovered.
10957 if (!(flags&HASWIDTH) && op != '?')
10958 vFAIL("Regexp *+ operand could be empty");
10961 #ifdef RE_TRACK_PATTERN_OFFSETS
10962 parse_start = RExC_parse;
10964 nextchar(pRExC_state);
10966 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10968 if (op == '*' && (flags&SIMPLE)) {
10969 reginsert(pRExC_state, STAR, ret, depth+1);
10972 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10974 else if (op == '*') {
10978 else if (op == '+' && (flags&SIMPLE)) {
10979 reginsert(pRExC_state, PLUS, ret, depth+1);
10982 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10984 else if (op == '+') {
10988 else if (op == '?') {
10993 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10994 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10995 ckWARN2reg(RExC_parse,
10996 "%"UTF8f" matches null string many times",
10997 UTF8fARG(UTF, (RExC_parse >= origparse
10998 ? RExC_parse - origparse
11001 (void)ReREFCNT_inc(RExC_rx_sv);
11004 if (RExC_parse < RExC_end && *RExC_parse == '?') {
11005 nextchar(pRExC_state);
11006 reginsert(pRExC_state, MINMOD, ret, depth+1);
11007 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11010 if (RExC_parse < RExC_end && *RExC_parse == '+') {
11012 nextchar(pRExC_state);
11013 ender = reg_node(pRExC_state, SUCCEED);
11014 REGTAIL(pRExC_state, ret, ender);
11015 reginsert(pRExC_state, SUSPEND, ret, depth+1);
11017 ender = reg_node(pRExC_state, TAIL);
11018 REGTAIL(pRExC_state, ret, ender);
11021 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11023 vFAIL("Nested quantifiers");
11030 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11038 /* This routine teases apart the various meanings of \N and returns
11039 * accordingly. The input parameters constrain which meaning(s) is/are valid
11040 * in the current context.
11042 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11044 * If <code_point_p> is not NULL, the context is expecting the result to be a
11045 * single code point. If this \N instance turns out to a single code point,
11046 * the function returns TRUE and sets *code_point_p to that code point.
11048 * If <node_p> is not NULL, the context is expecting the result to be one of
11049 * the things representable by a regnode. If this \N instance turns out to be
11050 * one such, the function generates the regnode, returns TRUE and sets *node_p
11051 * to point to that regnode.
11053 * If this instance of \N isn't legal in any context, this function will
11054 * generate a fatal error and not return.
11056 * On input, RExC_parse should point to the first char following the \N at the
11057 * time of the call. On successful return, RExC_parse will have been updated
11058 * to point to just after the sequence identified by this routine. Also
11059 * *flagp has been updated as needed.
11061 * When there is some problem with the current context and this \N instance,
11062 * the function returns FALSE, without advancing RExC_parse, nor setting
11063 * *node_p, nor *code_point_p, nor *flagp.
11065 * If <cp_count> is not NULL, the caller wants to know the length (in code
11066 * points) that this \N sequence matches. This is set even if the function
11067 * returns FALSE, as detailed below.
11069 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11071 * Probably the most common case is for the \N to specify a single code point.
11072 * *cp_count will be set to 1, and *code_point_p will be set to that code
11075 * Another possibility is for the input to be an empty \N{}, which for
11076 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
11077 * will be set to a generated NOTHING node.
11079 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11080 * set to 0. *node_p will be set to a generated REG_ANY node.
11082 * The fourth possibility is that \N resolves to a sequence of more than one
11083 * code points. *cp_count will be set to the number of code points in the
11084 * sequence. *node_p * will be set to a generated node returned by this
11085 * function calling S_reg().
11087 * The final possibility, which happens only when the fourth one would
11088 * otherwise be in effect, is that one of those code points requires the
11089 * pattern to be recompiled as UTF-8. The function returns FALSE, and sets
11090 * the RESTART_UTF8 flag in *flagp. When this happens, the caller needs to
11091 * desist from continuing parsing, and return this information to its caller.
11092 * This is not set for when there is only one code point, as this can be
11093 * called as part of an ANYOF node, and they can store above-Latin1 code
11094 * points without the pattern having to be in UTF-8.
11096 * For non-single-quoted regexes, the tokenizer has resolved character and
11097 * sequence names inside \N{...} into their Unicode values, normalizing the
11098 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11099 * hex-represented code points in the sequence. This is done there because
11100 * the names can vary based on what charnames pragma is in scope at the time,
11101 * so we need a way to take a snapshot of what they resolve to at the time of
11102 * the original parse. [perl #56444].
11104 * That parsing is skipped for single-quoted regexes, so we may here get
11105 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
11106 * parser. But if the single-quoted regex is something like '\N{U+41}', that
11107 * is legal and handled here. The code point is Unicode, and has to be
11108 * translated into the native character set for non-ASCII platforms.
11109 * the tokenizer passes the \N sequence through unchanged; this code will not
11110 * attempt to determine this nor expand those, instead raising a syntax error.
11113 char * endbrace; /* points to '}' following the name */
11114 char *endchar; /* Points to '.' or '}' ending cur char in the input
11116 char* p; /* Temporary */
11118 GET_RE_DEBUG_FLAGS_DECL;
11120 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11122 GET_RE_DEBUG_FLAGS;
11124 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
11125 assert(! (node_p && cp_count)); /* At most 1 should be set */
11127 if (cp_count) { /* Initialize return for the most common case */
11131 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11132 * modifier. The other meanings do not, so use a temporary until we find
11133 * out which we are being called with */
11134 p = (RExC_flags & RXf_PMf_EXTENDED)
11135 ? regpatws(pRExC_state, RExC_parse,
11136 TRUE) /* means recognize comments */
11139 /* Disambiguate between \N meaning a named character versus \N meaning
11140 * [^\n]. The latter is assumed when the {...} following the \N is a legal
11141 * quantifier, or there is no a '{' at all */
11142 if (*p != '{' || regcurly(p)) {
11151 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11153 nextchar(pRExC_state);
11154 *node_p = reg_node(pRExC_state, REG_ANY);
11155 *flagp |= HASWIDTH|SIMPLE;
11157 Set_Node_Length(*node_p, 1); /* MJD */
11161 /* Here, we have decided it should be a named character or sequence */
11163 /* The test above made sure that the next real character is a '{', but
11164 * under the /x modifier, it could be separated by space (or a comment and
11165 * \n) and this is not allowed (for consistency with \x{...} and the
11166 * tokenizer handling of \N{NAME}). */
11167 if (*RExC_parse != '{') {
11168 vFAIL("Missing braces on \\N{}");
11171 RExC_parse++; /* Skip past the '{' */
11173 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11174 || ! (endbrace == RExC_parse /* nothing between the {} */
11175 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11176 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11179 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11180 vFAIL("\\N{NAME} must be resolved by the lexer");
11183 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11185 if (endbrace == RExC_parse) { /* empty: \N{} */
11189 nextchar(pRExC_state);
11194 *node_p = reg_node(pRExC_state,NOTHING);
11198 RExC_parse += 2; /* Skip past the 'U+' */
11200 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11202 /* Code points are separated by dots. If none, there is only one code
11203 * point, and is terminated by the brace */
11205 if (endchar >= endbrace) {
11206 STRLEN length_of_hex;
11207 I32 grok_hex_flags;
11209 /* Here, exactly one code point. If that isn't what is wanted, fail */
11210 if (! code_point_p) {
11215 /* Convert code point from hex */
11216 length_of_hex = (STRLEN)(endchar - RExC_parse);
11217 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11218 | PERL_SCAN_DISALLOW_PREFIX
11220 /* No errors in the first pass (See [perl
11221 * #122671].) We let the code below find the
11222 * errors when there are multiple chars. */
11224 ? PERL_SCAN_SILENT_ILLDIGIT
11227 /* This routine is the one place where both single- and double-quotish
11228 * \N{U+xxxx} are evaluated. The value is a Unicode code point which
11229 * must be converted to native. */
11230 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11235 /* The tokenizer should have guaranteed validity, but it's possible to
11236 * bypass it by using single quoting, so check. Don't do the check
11237 * here when there are multiple chars; we do it below anyway. */
11238 if (length_of_hex == 0
11239 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11241 RExC_parse += length_of_hex; /* Includes all the valid */
11242 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11243 ? UTF8SKIP(RExC_parse)
11245 /* Guard against malformed utf8 */
11246 if (RExC_parse >= endchar) {
11247 RExC_parse = endchar;
11249 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11252 RExC_parse = endbrace + 1;
11255 else { /* Is a multiple character sequence */
11256 SV * substitute_parse;
11258 char *orig_end = RExC_end;
11261 /* Count the code points, if desired, in the sequence */
11264 while (RExC_parse < endbrace) {
11265 /* Point to the beginning of the next character in the sequence. */
11266 RExC_parse = endchar + 1;
11267 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11272 /* Fail if caller doesn't want to handle a multi-code-point sequence.
11273 * But don't backup up the pointer if the caller want to know how many
11274 * code points there are (they can then handle things) */
11282 /* What is done here is to convert this to a sub-pattern of the form
11283 * \x{char1}\x{char2}... and then call reg recursively to parse it
11284 * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
11285 * while not having to worry about special handling that some code
11286 * points may have. */
11288 substitute_parse = newSVpvs("?:");
11290 while (RExC_parse < endbrace) {
11292 /* Convert to notation the rest of the code understands */
11293 sv_catpv(substitute_parse, "\\x{");
11294 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11295 sv_catpv(substitute_parse, "}");
11297 /* Point to the beginning of the next character in the sequence. */
11298 RExC_parse = endchar + 1;
11299 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11302 sv_catpv(substitute_parse, ")");
11304 RExC_parse = SvPV(substitute_parse, len);
11306 /* Don't allow empty number */
11307 if (len < (STRLEN) 8) {
11308 RExC_parse = endbrace;
11309 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11311 RExC_end = RExC_parse + len;
11313 /* The values are Unicode, and therefore not subject to recoding, but
11314 * have to be converted to native on a non-Unicode (meaning non-ASCII)
11316 RExC_override_recoding = 1;
11318 RExC_recode_x_to_native = 1;
11322 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11323 if (flags & RESTART_UTF8) {
11324 *flagp = RESTART_UTF8;
11327 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11330 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11333 /* Restore the saved values */
11334 RExC_parse = endbrace;
11335 RExC_end = orig_end;
11336 RExC_override_recoding = 0;
11338 RExC_recode_x_to_native = 0;
11341 SvREFCNT_dec_NN(substitute_parse);
11342 nextchar(pRExC_state);
11352 * It returns the code point in utf8 for the value in *encp.
11353 * value: a code value in the source encoding
11354 * encp: a pointer to an Encode object
11356 * If the result from Encode is not a single character,
11357 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11360 S_reg_recode(pTHX_ const char value, SV **encp)
11363 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11364 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11365 const STRLEN newlen = SvCUR(sv);
11366 UV uv = UNICODE_REPLACEMENT;
11368 PERL_ARGS_ASSERT_REG_RECODE;
11372 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11375 if (!newlen || numlen != newlen) {
11376 uv = UNICODE_REPLACEMENT;
11382 PERL_STATIC_INLINE U8
11383 S_compute_EXACTish(RExC_state_t *pRExC_state)
11387 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11395 op = get_regex_charset(RExC_flags);
11396 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11397 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11398 been, so there is no hole */
11401 return op + EXACTF;
11404 PERL_STATIC_INLINE void
11405 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11406 regnode *node, I32* flagp, STRLEN len, UV code_point,
11409 /* This knows the details about sizing an EXACTish node, setting flags for
11410 * it (by setting <*flagp>, and potentially populating it with a single
11413 * If <len> (the length in bytes) is non-zero, this function assumes that
11414 * the node has already been populated, and just does the sizing. In this
11415 * case <code_point> should be the final code point that has already been
11416 * placed into the node. This value will be ignored except that under some
11417 * circumstances <*flagp> is set based on it.
11419 * If <len> is zero, the function assumes that the node is to contain only
11420 * the single character given by <code_point> and calculates what <len>
11421 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11422 * additionally will populate the node's STRING with <code_point> or its
11425 * In both cases <*flagp> is appropriately set
11427 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11428 * 255, must be folded (the former only when the rules indicate it can
11431 * When it does the populating, it looks at the flag 'downgradable'. If
11432 * true with a node that folds, it checks if the single code point
11433 * participates in a fold, and if not downgrades the node to an EXACT.
11434 * This helps the optimizer */
11436 bool len_passed_in = cBOOL(len != 0);
11437 U8 character[UTF8_MAXBYTES_CASE+1];
11439 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11441 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11442 * sizing difference, and is extra work that is thrown away */
11443 if (downgradable && ! PASS2) {
11444 downgradable = FALSE;
11447 if (! len_passed_in) {
11449 if (UVCHR_IS_INVARIANT(code_point)) {
11450 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11451 *character = (U8) code_point;
11453 else { /* Here is /i and not /l. (toFOLD() is defined on just
11454 ASCII, which isn't the same thing as INVARIANT on
11455 EBCDIC, but it works there, as the extra invariants
11456 fold to themselves) */
11457 *character = toFOLD((U8) code_point);
11459 /* We can downgrade to an EXACT node if this character
11460 * isn't a folding one. Note that this assumes that
11461 * nothing above Latin1 folds to some other invariant than
11462 * one of these alphabetics; otherwise we would also have
11464 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11465 * || ASCII_FOLD_RESTRICTED))
11467 if (downgradable && PL_fold[code_point] == code_point) {
11473 else if (FOLD && (! LOC
11474 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11475 { /* Folding, and ok to do so now */
11476 UV folded = _to_uni_fold_flags(
11480 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11481 ? FOLD_FLAGS_NOMIX_ASCII
11484 && folded == code_point /* This quickly rules out many
11485 cases, avoiding the
11486 _invlist_contains_cp() overhead
11488 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11495 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11497 /* Not folding this cp, and can output it directly */
11498 *character = UTF8_TWO_BYTE_HI(code_point);
11499 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11503 uvchr_to_utf8( character, code_point);
11504 len = UTF8SKIP(character);
11506 } /* Else pattern isn't UTF8. */
11508 *character = (U8) code_point;
11510 } /* Else is folded non-UTF8 */
11511 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11512 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11513 || UNICODE_DOT_DOT_VERSION > 0)
11514 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11518 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11519 * comments at join_exact()); */
11520 *character = (U8) code_point;
11523 /* Can turn into an EXACT node if we know the fold at compile time,
11524 * and it folds to itself and doesn't particpate in other folds */
11527 && PL_fold_latin1[code_point] == code_point
11528 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11529 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11533 } /* else is Sharp s. May need to fold it */
11534 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11536 *(character + 1) = 's';
11540 *character = LATIN_SMALL_LETTER_SHARP_S;
11546 RExC_size += STR_SZ(len);
11549 RExC_emit += STR_SZ(len);
11550 STR_LEN(node) = len;
11551 if (! len_passed_in) {
11552 Copy((char *) character, STRING(node), len, char);
11556 *flagp |= HASWIDTH;
11558 /* A single character node is SIMPLE, except for the special-cased SHARP S
11560 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11561 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11562 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11563 || UNICODE_DOT_DOT_VERSION > 0)
11564 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
11565 || ! FOLD || ! DEPENDS_SEMANTICS)
11571 /* The OP may not be well defined in PASS1 */
11572 if (PASS2 && OP(node) == EXACTFL) {
11573 RExC_contains_locale = 1;
11578 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11579 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11582 S_backref_value(char *p)
11584 const char* endptr;
11586 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11593 - regatom - the lowest level
11595 Try to identify anything special at the start of the pattern. If there
11596 is, then handle it as required. This may involve generating a single regop,
11597 such as for an assertion; or it may involve recursing, such as to
11598 handle a () structure.
11600 If the string doesn't start with something special then we gobble up
11601 as much literal text as we can.
11603 Once we have been able to handle whatever type of thing started the
11604 sequence, we return.
11606 Note: we have to be careful with escapes, as they can be both literal
11607 and special, and in the case of \10 and friends, context determines which.
11609 A summary of the code structure is:
11611 switch (first_byte) {
11612 cases for each special:
11613 handle this special;
11616 switch (2nd byte) {
11617 cases for each unambiguous special:
11618 handle this special;
11620 cases for each ambigous special/literal:
11622 if (special) handle here
11624 default: // unambiguously literal:
11627 default: // is a literal char
11630 create EXACTish node for literal;
11631 while (more input and node isn't full) {
11632 switch (input_byte) {
11633 cases for each special;
11634 make sure parse pointer is set so that the next call to
11635 regatom will see this special first
11636 goto loopdone; // EXACTish node terminated by prev. char
11638 append char to EXACTISH node;
11640 get next input byte;
11644 return the generated node;
11646 Specifically there are two separate switches for handling
11647 escape sequences, with the one for handling literal escapes requiring
11648 a dummy entry for all of the special escapes that are actually handled
11651 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11653 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11655 Otherwise does not return NULL.
11659 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11661 regnode *ret = NULL;
11663 char *parse_start = RExC_parse;
11668 GET_RE_DEBUG_FLAGS_DECL;
11670 *flagp = WORST; /* Tentatively. */
11672 DEBUG_PARSE("atom");
11674 PERL_ARGS_ASSERT_REGATOM;
11677 switch ((U8)*RExC_parse) {
11679 RExC_seen_zerolen++;
11680 nextchar(pRExC_state);
11681 if (RExC_flags & RXf_PMf_MULTILINE)
11682 ret = reg_node(pRExC_state, MBOL);
11684 ret = reg_node(pRExC_state, SBOL);
11685 Set_Node_Length(ret, 1); /* MJD */
11688 nextchar(pRExC_state);
11690 RExC_seen_zerolen++;
11691 if (RExC_flags & RXf_PMf_MULTILINE)
11692 ret = reg_node(pRExC_state, MEOL);
11694 ret = reg_node(pRExC_state, SEOL);
11695 Set_Node_Length(ret, 1); /* MJD */
11698 nextchar(pRExC_state);
11699 if (RExC_flags & RXf_PMf_SINGLELINE)
11700 ret = reg_node(pRExC_state, SANY);
11702 ret = reg_node(pRExC_state, REG_ANY);
11703 *flagp |= HASWIDTH|SIMPLE;
11705 Set_Node_Length(ret, 1); /* MJD */
11709 char * const oregcomp_parse = ++RExC_parse;
11710 ret = regclass(pRExC_state, flagp,depth+1,
11711 FALSE, /* means parse the whole char class */
11712 TRUE, /* allow multi-char folds */
11713 FALSE, /* don't silence non-portable warnings. */
11714 (bool) RExC_strict,
11716 if (*RExC_parse != ']') {
11717 RExC_parse = oregcomp_parse;
11718 vFAIL("Unmatched [");
11721 if (*flagp & RESTART_UTF8)
11723 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11726 nextchar(pRExC_state);
11727 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11731 nextchar(pRExC_state);
11732 ret = reg(pRExC_state, 2, &flags,depth+1);
11734 if (flags & TRYAGAIN) {
11735 if (RExC_parse == RExC_end) {
11736 /* Make parent create an empty node if needed. */
11737 *flagp |= TRYAGAIN;
11742 if (flags & RESTART_UTF8) {
11743 *flagp = RESTART_UTF8;
11746 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11749 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11753 if (flags & TRYAGAIN) {
11754 *flagp |= TRYAGAIN;
11757 vFAIL("Internal urp");
11758 /* Supposed to be caught earlier. */
11764 vFAIL("Quantifier follows nothing");
11769 This switch handles escape sequences that resolve to some kind
11770 of special regop and not to literal text. Escape sequnces that
11771 resolve to literal text are handled below in the switch marked
11774 Every entry in this switch *must* have a corresponding entry
11775 in the literal escape switch. However, the opposite is not
11776 required, as the default for this switch is to jump to the
11777 literal text handling code.
11779 switch ((U8)*++RExC_parse) {
11780 /* Special Escapes */
11782 RExC_seen_zerolen++;
11783 ret = reg_node(pRExC_state, SBOL);
11784 /* SBOL is shared with /^/ so we set the flags so we can tell
11785 * /\A/ from /^/ in split. We check ret because first pass we
11786 * have no regop struct to set the flags on. */
11790 goto finish_meta_pat;
11792 ret = reg_node(pRExC_state, GPOS);
11793 RExC_seen |= REG_GPOS_SEEN;
11795 goto finish_meta_pat;
11797 RExC_seen_zerolen++;
11798 ret = reg_node(pRExC_state, KEEPS);
11800 /* XXX:dmq : disabling in-place substitution seems to
11801 * be necessary here to avoid cases of memory corruption, as
11802 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11804 RExC_seen |= REG_LOOKBEHIND_SEEN;
11805 goto finish_meta_pat;
11807 ret = reg_node(pRExC_state, SEOL);
11809 RExC_seen_zerolen++; /* Do not optimize RE away */
11810 goto finish_meta_pat;
11812 ret = reg_node(pRExC_state, EOS);
11814 RExC_seen_zerolen++; /* Do not optimize RE away */
11815 goto finish_meta_pat;
11817 vFAIL("\\C no longer supported");
11819 ret = reg_node(pRExC_state, CLUMP);
11820 *flagp |= HASWIDTH;
11821 goto finish_meta_pat;
11827 arg = ANYOF_WORDCHAR;
11835 regex_charset charset = get_regex_charset(RExC_flags);
11837 RExC_seen_zerolen++;
11838 RExC_seen |= REG_LOOKBEHIND_SEEN;
11839 op = BOUND + charset;
11841 if (op == BOUNDL) {
11842 RExC_contains_locale = 1;
11845 ret = reg_node(pRExC_state, op);
11847 if (*(RExC_parse + 1) != '{') {
11848 FLAGS(ret) = TRADITIONAL_BOUND;
11849 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
11855 char name = *RExC_parse;
11858 endbrace = strchr(RExC_parse, '}');
11861 vFAIL2("Missing right brace on \\%c{}", name);
11863 /* XXX Need to decide whether to take spaces or not. Should be
11864 * consistent with \p{}, but that currently is SPACE, which
11865 * means vertical too, which seems wrong
11866 * while (isBLANK(*RExC_parse)) {
11869 if (endbrace == RExC_parse) {
11870 RExC_parse++; /* After the '}' */
11871 vFAIL2("Empty \\%c{}", name);
11873 length = endbrace - RExC_parse;
11874 /*while (isBLANK(*(RExC_parse + length - 1))) {
11877 switch (*RExC_parse) {
11880 && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
11882 goto bad_bound_type;
11884 FLAGS(ret) = GCB_BOUND;
11887 if (length != 2 || *(RExC_parse + 1) != 'b') {
11888 goto bad_bound_type;
11890 FLAGS(ret) = SB_BOUND;
11893 if (length != 2 || *(RExC_parse + 1) != 'b') {
11894 goto bad_bound_type;
11896 FLAGS(ret) = WB_BOUND;
11900 RExC_parse = endbrace;
11902 "'%"UTF8f"' is an unknown bound type",
11903 UTF8fARG(UTF, length, endbrace - length));
11904 NOT_REACHED; /*NOTREACHED*/
11906 RExC_parse = endbrace;
11907 RExC_uni_semantics = 1;
11909 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
11913 /* Don't have to worry about UTF-8, in this message because
11914 * to get here the contents of the \b must be ASCII */
11915 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
11916 "Using /u for '%.*s' instead of /%s",
11918 endbrace - length + 1,
11919 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
11920 ? ASCII_RESTRICT_PAT_MODS
11921 : ASCII_MORE_RESTRICT_PAT_MODS);
11925 if (PASS2 && invert) {
11926 OP(ret) += NBOUND - BOUND;
11928 goto finish_meta_pat;
11936 if (! DEPENDS_SEMANTICS) {
11940 /* \d doesn't have any matches in the upper Latin1 range, hence /d
11941 * is equivalent to /u. Changing to /u saves some branches at
11944 goto join_posix_op_known;
11947 ret = reg_node(pRExC_state, LNBREAK);
11948 *flagp |= HASWIDTH|SIMPLE;
11949 goto finish_meta_pat;
11957 goto join_posix_op_known;
11963 arg = ANYOF_VERTWS;
11965 goto join_posix_op_known;
11975 op = POSIXD + get_regex_charset(RExC_flags);
11976 if (op > POSIXA) { /* /aa is same as /a */
11979 else if (op == POSIXL) {
11980 RExC_contains_locale = 1;
11983 join_posix_op_known:
11986 op += NPOSIXD - POSIXD;
11989 ret = reg_node(pRExC_state, op);
11991 FLAGS(ret) = namedclass_to_classnum(arg);
11994 *flagp |= HASWIDTH|SIMPLE;
11998 nextchar(pRExC_state);
11999 Set_Node_Length(ret, 2); /* MJD */
12005 char* parse_start = RExC_parse - 2;
12010 ret = regclass(pRExC_state, flagp,depth+1,
12011 TRUE, /* means just parse this element */
12012 FALSE, /* don't allow multi-char folds */
12013 FALSE, /* don't silence non-portable warnings.
12014 It would be a bug if these returned
12016 (bool) RExC_strict,
12018 /* regclass() can only return RESTART_UTF8 if multi-char folds
12021 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12026 Set_Node_Offset(ret, parse_start + 2);
12027 Set_Node_Cur_Length(ret, parse_start);
12028 nextchar(pRExC_state);
12032 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12033 * \N{...} evaluates to a sequence of more than one code points).
12034 * The function call below returns a regnode, which is our result.
12035 * The parameters cause it to fail if the \N{} evaluates to a
12036 * single code point; we handle those like any other literal. The
12037 * reason that the multicharacter case is handled here and not as
12038 * part of the EXACtish code is because of quantifiers. In
12039 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12040 * this way makes that Just Happen. dmq.
12041 * join_exact() will join this up with adjacent EXACTish nodes
12042 * later on, if appropriate. */
12044 if (grok_bslash_N(pRExC_state,
12045 &ret, /* Want a regnode returned */
12046 NULL, /* Fail if evaluates to a single code
12048 NULL, /* Don't need a count of how many code
12056 if (*flagp & RESTART_UTF8)
12061 case 'k': /* Handle \k<NAME> and \k'NAME' */
12064 char ch= RExC_parse[1];
12065 if (ch != '<' && ch != '\'' && ch != '{') {
12067 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12068 vFAIL2("Sequence %.2s... not terminated",parse_start);
12070 /* this pretty much dupes the code for (?P=...) in reg(), if
12071 you change this make sure you change that */
12072 char* name_start = (RExC_parse += 2);
12074 SV *sv_dat = reg_scan_name(pRExC_state,
12075 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12076 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12077 if (RExC_parse == name_start || *RExC_parse != ch)
12078 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12079 vFAIL2("Sequence %.3s... not terminated",parse_start);
12082 num = add_data( pRExC_state, STR_WITH_LEN("S"));
12083 RExC_rxi->data->data[num]=(void*)sv_dat;
12084 SvREFCNT_inc_simple_void(sv_dat);
12088 ret = reganode(pRExC_state,
12091 : (ASCII_FOLD_RESTRICTED)
12093 : (AT_LEAST_UNI_SEMANTICS)
12099 *flagp |= HASWIDTH;
12101 /* override incorrect value set in reganode MJD */
12102 Set_Node_Offset(ret, parse_start+1);
12103 Set_Node_Cur_Length(ret, parse_start);
12104 nextchar(pRExC_state);
12110 case '1': case '2': case '3': case '4':
12111 case '5': case '6': case '7': case '8': case '9':
12116 if (*RExC_parse == 'g') {
12120 if (*RExC_parse == '{') {
12124 if (*RExC_parse == '-') {
12128 if (hasbrace && !isDIGIT(*RExC_parse)) {
12129 if (isrel) RExC_parse--;
12131 goto parse_named_seq;
12134 num = S_backref_value(RExC_parse);
12136 vFAIL("Reference to invalid group 0");
12137 else if (num == I32_MAX) {
12138 if (isDIGIT(*RExC_parse))
12139 vFAIL("Reference to nonexistent group");
12141 vFAIL("Unterminated \\g... pattern");
12145 num = RExC_npar - num;
12147 vFAIL("Reference to nonexistent or unclosed group");
12151 num = S_backref_value(RExC_parse);
12152 /* bare \NNN might be backref or octal - if it is larger
12153 * than or equal RExC_npar then it is assumed to be an
12154 * octal escape. Note RExC_npar is +1 from the actual
12155 * number of parens. */
12156 /* Note we do NOT check if num == I32_MAX here, as that is
12157 * handled by the RExC_npar check */
12160 /* any numeric escape < 10 is always a backref */
12162 /* any numeric escape < RExC_npar is a backref */
12163 && num >= RExC_npar
12164 /* cannot be an octal escape if it starts with 8 */
12165 && *RExC_parse != '8'
12166 /* cannot be an octal escape it it starts with 9 */
12167 && *RExC_parse != '9'
12170 /* Probably not a backref, instead likely to be an
12171 * octal character escape, e.g. \35 or \777.
12172 * The above logic should make it obvious why using
12173 * octal escapes in patterns is problematic. - Yves */
12178 /* At this point RExC_parse points at a numeric escape like
12179 * \12 or \88 or something similar, which we should NOT treat
12180 * as an octal escape. It may or may not be a valid backref
12181 * escape. For instance \88888888 is unlikely to be a valid
12184 #ifdef RE_TRACK_PATTERN_OFFSETS
12185 char * const parse_start = RExC_parse - 1; /* MJD */
12187 while (isDIGIT(*RExC_parse))
12190 if (*RExC_parse != '}')
12191 vFAIL("Unterminated \\g{...} pattern");
12195 if (num > (I32)RExC_rx->nparens)
12196 vFAIL("Reference to nonexistent group");
12199 ret = reganode(pRExC_state,
12202 : (ASCII_FOLD_RESTRICTED)
12204 : (AT_LEAST_UNI_SEMANTICS)
12210 *flagp |= HASWIDTH;
12212 /* override incorrect value set in reganode MJD */
12213 Set_Node_Offset(ret, parse_start+1);
12214 Set_Node_Cur_Length(ret, parse_start);
12216 nextchar(pRExC_state);
12221 if (RExC_parse >= RExC_end)
12222 FAIL("Trailing \\");
12225 /* Do not generate "unrecognized" warnings here, we fall
12226 back into the quick-grab loop below */
12233 if (RExC_flags & RXf_PMf_EXTENDED) {
12234 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12235 if (RExC_parse < RExC_end)
12242 parse_start = RExC_parse - 1;
12251 #define MAX_NODE_STRING_SIZE 127
12252 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12254 U8 upper_parse = MAX_NODE_STRING_SIZE;
12255 U8 node_type = compute_EXACTish(pRExC_state);
12256 bool next_is_quantifier;
12257 char * oldp = NULL;
12259 /* We can convert EXACTF nodes to EXACTFU if they contain only
12260 * characters that match identically regardless of the target
12261 * string's UTF8ness. The reason to do this is that EXACTF is not
12262 * trie-able, EXACTFU is.
12264 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12265 * contain only above-Latin1 characters (hence must be in UTF8),
12266 * which don't participate in folds with Latin1-range characters,
12267 * as the latter's folds aren't known until runtime. (We don't
12268 * need to figure this out until pass 2) */
12269 bool maybe_exactfu = PASS2
12270 && (node_type == EXACTF || node_type == EXACTFL);
12272 /* If a folding node contains only code points that don't
12273 * participate in folds, it can be changed into an EXACT node,
12274 * which allows the optimizer more things to look for */
12277 ret = reg_node(pRExC_state, node_type);
12279 /* In pass1, folded, we use a temporary buffer instead of the
12280 * actual node, as the node doesn't exist yet */
12281 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12287 /* We do the EXACTFish to EXACT node only if folding. (And we
12288 * don't need to figure this out until pass 2) */
12289 maybe_exact = FOLD && PASS2;
12291 /* XXX The node can hold up to 255 bytes, yet this only goes to
12292 * 127. I (khw) do not know why. Keeping it somewhat less than
12293 * 255 allows us to not have to worry about overflow due to
12294 * converting to utf8 and fold expansion, but that value is
12295 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12296 * split up by this limit into a single one using the real max of
12297 * 255. Even at 127, this breaks under rare circumstances. If
12298 * folding, we do not want to split a node at a character that is a
12299 * non-final in a multi-char fold, as an input string could just
12300 * happen to want to match across the node boundary. The join
12301 * would solve that problem if the join actually happens. But a
12302 * series of more than two nodes in a row each of 127 would cause
12303 * the first join to succeed to get to 254, but then there wouldn't
12304 * be room for the next one, which could at be one of those split
12305 * multi-char folds. I don't know of any fool-proof solution. One
12306 * could back off to end with only a code point that isn't such a
12307 * non-final, but it is possible for there not to be any in the
12309 for (p = RExC_parse - 1;
12310 len < upper_parse && p < RExC_end;
12315 if (RExC_flags & RXf_PMf_EXTENDED)
12316 p = regpatws(pRExC_state, p,
12317 TRUE); /* means recognize comments */
12328 /* Literal Escapes Switch
12330 This switch is meant to handle escape sequences that
12331 resolve to a literal character.
12333 Every escape sequence that represents something
12334 else, like an assertion or a char class, is handled
12335 in the switch marked 'Special Escapes' above in this
12336 routine, but also has an entry here as anything that
12337 isn't explicitly mentioned here will be treated as
12338 an unescaped equivalent literal.
12341 switch ((U8)*++p) {
12342 /* These are all the special escapes. */
12343 case 'A': /* Start assertion */
12344 case 'b': case 'B': /* Word-boundary assertion*/
12345 case 'C': /* Single char !DANGEROUS! */
12346 case 'd': case 'D': /* digit class */
12347 case 'g': case 'G': /* generic-backref, pos assertion */
12348 case 'h': case 'H': /* HORIZWS */
12349 case 'k': case 'K': /* named backref, keep marker */
12350 case 'p': case 'P': /* Unicode property */
12351 case 'R': /* LNBREAK */
12352 case 's': case 'S': /* space class */
12353 case 'v': case 'V': /* VERTWS */
12354 case 'w': case 'W': /* word class */
12355 case 'X': /* eXtended Unicode "combining
12356 character sequence" */
12357 case 'z': case 'Z': /* End of line/string assertion */
12361 /* Anything after here is an escape that resolves to a
12362 literal. (Except digits, which may or may not)
12368 case 'N': /* Handle a single-code point named character. */
12369 RExC_parse = p + 1;
12370 if (! grok_bslash_N(pRExC_state,
12371 NULL, /* Fail if evaluates to
12372 anything other than a
12373 single code point */
12374 &ender, /* The returned single code
12376 NULL, /* Don't need a count of
12377 how many code points */
12381 if (*flagp & RESTART_UTF8)
12382 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12384 /* Here, it wasn't a single code point. Go close
12385 * up this EXACTish node. The switch() prior to
12386 * this switch handles the other cases */
12387 RExC_parse = p = oldp;
12391 if (ender > 0xff) {
12408 ender = ESC_NATIVE;
12418 const char* error_msg;
12420 bool valid = grok_bslash_o(&p,
12423 PASS2, /* out warnings */
12424 (bool) RExC_strict,
12425 TRUE, /* Output warnings
12430 RExC_parse = p; /* going to die anyway; point
12431 to exact spot of failure */
12435 if (IN_ENCODING && ender < 0x100) {
12436 goto recode_encoding;
12438 if (ender > 0xff) {
12445 UV result = UV_MAX; /* initialize to erroneous
12447 const char* error_msg;
12449 bool valid = grok_bslash_x(&p,
12452 PASS2, /* out warnings */
12453 (bool) RExC_strict,
12454 TRUE, /* Silence warnings
12459 RExC_parse = p; /* going to die anyway; point
12460 to exact spot of failure */
12465 if (ender < 0x100) {
12467 if (RExC_recode_x_to_native) {
12468 ender = LATIN1_TO_NATIVE(ender);
12473 goto recode_encoding;
12483 ender = grok_bslash_c(*p++, PASS2);
12485 case '8': case '9': /* must be a backreference */
12487 /* we have an escape like \8 which cannot be an octal escape
12488 * so we exit the loop, and let the outer loop handle this
12489 * escape which may or may not be a legitimate backref. */
12491 case '1': case '2': case '3':case '4':
12492 case '5': case '6': case '7':
12493 /* When we parse backslash escapes there is ambiguity
12494 * between backreferences and octal escapes. Any escape
12495 * from \1 - \9 is a backreference, any multi-digit
12496 * escape which does not start with 0 and which when
12497 * evaluated as decimal could refer to an already
12498 * parsed capture buffer is a back reference. Anything
12501 * Note this implies that \118 could be interpreted as
12502 * 118 OR as "\11" . "8" depending on whether there
12503 * were 118 capture buffers defined already in the
12506 /* NOTE, RExC_npar is 1 more than the actual number of
12507 * parens we have seen so far, hence the < RExC_npar below. */
12509 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12510 { /* Not to be treated as an octal constant, go
12518 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12520 ender = grok_oct(p, &numlen, &flags, NULL);
12521 if (ender > 0xff) {
12525 if (PASS2 /* like \08, \178 */
12528 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12530 reg_warn_non_literal_string(
12532 form_short_octal_warning(p, numlen));
12535 if (IN_ENCODING && ender < 0x100)
12536 goto recode_encoding;
12539 if (! RExC_override_recoding) {
12540 SV* enc = _get_encoding();
12541 ender = reg_recode((const char)(U8)ender, &enc);
12543 ckWARNreg(p, "Invalid escape in the specified encoding");
12549 FAIL("Trailing \\");
12552 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12553 /* Include any { following the alpha to emphasize
12554 * that it could be part of an escape at some point
12556 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12557 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12559 goto normal_default;
12560 } /* End of switch on '\' */
12563 /* Currently we don't warn when the lbrace is at the start
12564 * of a construct. This catches it in the middle of a
12565 * literal string, or when its the first thing after
12566 * something like "\b" */
12568 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12570 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12573 default: /* A literal character */
12575 if (UTF8_IS_START(*p) && UTF) {
12577 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12578 &numlen, UTF8_ALLOW_DEFAULT);
12584 } /* End of switch on the literal */
12586 /* Here, have looked at the literal character and <ender>
12587 * contains its ordinal, <p> points to the character after it
12590 if ( RExC_flags & RXf_PMf_EXTENDED)
12591 p = regpatws(pRExC_state, p,
12592 TRUE); /* means recognize comments */
12594 /* If the next thing is a quantifier, it applies to this
12595 * character only, which means that this character has to be in
12596 * its own node and can't just be appended to the string in an
12597 * existing node, so if there are already other characters in
12598 * the node, close the node with just them, and set up to do
12599 * this character again next time through, when it will be the
12600 * only thing in its new node */
12601 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12607 if (! FOLD) { /* The simple case, just append the literal */
12609 /* In the sizing pass, we need only the size of the
12610 * character we are appending, hence we can delay getting
12611 * its representation until PASS2. */
12614 const STRLEN unilen = UNISKIP(ender);
12617 /* We have to subtract 1 just below (and again in
12618 * the corresponding PASS2 code) because the loop
12619 * increments <len> each time, as all but this path
12620 * (and one other) through it add a single byte to
12621 * the EXACTish node. But these paths would change
12622 * len to be the correct final value, so cancel out
12623 * the increment that follows */
12629 } else { /* PASS2 */
12632 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12633 len += (char *) new_s - s - 1;
12634 s = (char *) new_s;
12637 *(s++) = (char) ender;
12641 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12643 /* Here are folding under /l, and the code point is
12644 * problematic. First, we know we can't simplify things */
12645 maybe_exact = FALSE;
12646 maybe_exactfu = FALSE;
12648 /* A problematic code point in this context means that its
12649 * fold isn't known until runtime, so we can't fold it now.
12650 * (The non-problematic code points are the above-Latin1
12651 * ones that fold to also all above-Latin1. Their folds
12652 * don't vary no matter what the locale is.) But here we
12653 * have characters whose fold depends on the locale.
12654 * Unlike the non-folding case above, we have to keep track
12655 * of these in the sizing pass, so that we can make sure we
12656 * don't split too-long nodes in the middle of a potential
12657 * multi-char fold. And unlike the regular fold case
12658 * handled in the else clauses below, we don't actually
12659 * fold and don't have special cases to consider. What we
12660 * do for both passes is the PASS2 code for non-folding */
12661 goto not_fold_common;
12663 else /* A regular FOLD code point */
12665 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12666 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12667 || UNICODE_DOT_DOT_VERSION > 0)
12668 /* See comments for join_exact() as to why we fold this
12669 * non-UTF at compile time */
12670 || (node_type == EXACTFU
12671 && ender == LATIN_SMALL_LETTER_SHARP_S)
12674 /* Here, are folding and are not UTF-8 encoded; therefore
12675 * the character must be in the range 0-255, and is not /l
12676 * (Not /l because we already handled these under /l in
12677 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12678 if (IS_IN_SOME_FOLD_L1(ender)) {
12679 maybe_exact = FALSE;
12681 /* See if the character's fold differs between /d and
12682 * /u. This includes the multi-char fold SHARP S to
12685 && (PL_fold[ender] != PL_fold_latin1[ender]
12686 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12687 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12688 || UNICODE_DOT_DOT_VERSION > 0)
12689 || ender == LATIN_SMALL_LETTER_SHARP_S
12691 && isALPHA_FOLD_EQ(ender, 's')
12692 && isALPHA_FOLD_EQ(*(s-1), 's'))
12695 maybe_exactfu = FALSE;
12699 /* Even when folding, we store just the input character, as
12700 * we have an array that finds its fold quickly */
12701 *(s++) = (char) ender;
12703 else { /* FOLD and UTF */
12704 /* Unlike the non-fold case, we do actually have to
12705 * calculate the results here in pass 1. This is for two
12706 * reasons, the folded length may be longer than the
12707 * unfolded, and we have to calculate how many EXACTish
12708 * nodes it will take; and we may run out of room in a node
12709 * in the middle of a potential multi-char fold, and have
12710 * to back off accordingly. */
12713 if (isASCII_uni(ender)) {
12714 folded = toFOLD(ender);
12715 *(s)++ = (U8) folded;
12720 folded = _to_uni_fold_flags(
12724 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12725 ? FOLD_FLAGS_NOMIX_ASCII
12729 /* The loop increments <len> each time, as all but this
12730 * path (and one other) through it add a single byte to
12731 * the EXACTish node. But this one has changed len to
12732 * be the correct final value, so subtract one to
12733 * cancel out the increment that follows */
12734 len += foldlen - 1;
12736 /* If this node only contains non-folding code points so
12737 * far, see if this new one is also non-folding */
12739 if (folded != ender) {
12740 maybe_exact = FALSE;
12743 /* Here the fold is the original; we have to check
12744 * further to see if anything folds to it */
12745 if (_invlist_contains_cp(PL_utf8_foldable,
12748 maybe_exact = FALSE;
12755 if (next_is_quantifier) {
12757 /* Here, the next input is a quantifier, and to get here,
12758 * the current character is the only one in the node.
12759 * Also, here <len> doesn't include the final byte for this
12765 } /* End of loop through literal characters */
12767 /* Here we have either exhausted the input or ran out of room in
12768 * the node. (If we encountered a character that can't be in the
12769 * node, transfer is made directly to <loopdone>, and so we
12770 * wouldn't have fallen off the end of the loop.) In the latter
12771 * case, we artificially have to split the node into two, because
12772 * we just don't have enough space to hold everything. This
12773 * creates a problem if the final character participates in a
12774 * multi-character fold in the non-final position, as a match that
12775 * should have occurred won't, due to the way nodes are matched,
12776 * and our artificial boundary. So back off until we find a non-
12777 * problematic character -- one that isn't at the beginning or
12778 * middle of such a fold. (Either it doesn't participate in any
12779 * folds, or appears only in the final position of all the folds it
12780 * does participate in.) A better solution with far fewer false
12781 * positives, and that would fill the nodes more completely, would
12782 * be to actually have available all the multi-character folds to
12783 * test against, and to back-off only far enough to be sure that
12784 * this node isn't ending with a partial one. <upper_parse> is set
12785 * further below (if we need to reparse the node) to include just
12786 * up through that final non-problematic character that this code
12787 * identifies, so when it is set to less than the full node, we can
12788 * skip the rest of this */
12789 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12791 const STRLEN full_len = len;
12793 assert(len >= MAX_NODE_STRING_SIZE);
12795 /* Here, <s> points to the final byte of the final character.
12796 * Look backwards through the string until find a non-
12797 * problematic character */
12801 /* This has no multi-char folds to non-UTF characters */
12802 if (ASCII_FOLD_RESTRICTED) {
12806 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12810 if (! PL_NonL1NonFinalFold) {
12811 PL_NonL1NonFinalFold = _new_invlist_C_array(
12812 NonL1_Perl_Non_Final_Folds_invlist);
12815 /* Point to the first byte of the final character */
12816 s = (char *) utf8_hop((U8 *) s, -1);
12818 while (s >= s0) { /* Search backwards until find
12819 non-problematic char */
12820 if (UTF8_IS_INVARIANT(*s)) {
12822 /* There are no ascii characters that participate
12823 * in multi-char folds under /aa. In EBCDIC, the
12824 * non-ascii invariants are all control characters,
12825 * so don't ever participate in any folds. */
12826 if (ASCII_FOLD_RESTRICTED
12827 || ! IS_NON_FINAL_FOLD(*s))
12832 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12833 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12839 else if (! _invlist_contains_cp(
12840 PL_NonL1NonFinalFold,
12841 valid_utf8_to_uvchr((U8 *) s, NULL)))
12846 /* Here, the current character is problematic in that
12847 * it does occur in the non-final position of some
12848 * fold, so try the character before it, but have to
12849 * special case the very first byte in the string, so
12850 * we don't read outside the string */
12851 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12852 } /* End of loop backwards through the string */
12854 /* If there were only problematic characters in the string,
12855 * <s> will point to before s0, in which case the length
12856 * should be 0, otherwise include the length of the
12857 * non-problematic character just found */
12858 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12861 /* Here, have found the final character, if any, that is
12862 * non-problematic as far as ending the node without splitting
12863 * it across a potential multi-char fold. <len> contains the
12864 * number of bytes in the node up-to and including that
12865 * character, or is 0 if there is no such character, meaning
12866 * the whole node contains only problematic characters. In
12867 * this case, give up and just take the node as-is. We can't
12872 /* If the node ends in an 's' we make sure it stays EXACTF,
12873 * as if it turns into an EXACTFU, it could later get
12874 * joined with another 's' that would then wrongly match
12876 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12878 maybe_exactfu = FALSE;
12882 /* Here, the node does contain some characters that aren't
12883 * problematic. If one such is the final character in the
12884 * node, we are done */
12885 if (len == full_len) {
12888 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12890 /* If the final character is problematic, but the
12891 * penultimate is not, back-off that last character to
12892 * later start a new node with it */
12897 /* Here, the final non-problematic character is earlier
12898 * in the input than the penultimate character. What we do
12899 * is reparse from the beginning, going up only as far as
12900 * this final ok one, thus guaranteeing that the node ends
12901 * in an acceptable character. The reason we reparse is
12902 * that we know how far in the character is, but we don't
12903 * know how to correlate its position with the input parse.
12904 * An alternate implementation would be to build that
12905 * correlation as we go along during the original parse,
12906 * but that would entail extra work for every node, whereas
12907 * this code gets executed only when the string is too
12908 * large for the node, and the final two characters are
12909 * problematic, an infrequent occurrence. Yet another
12910 * possible strategy would be to save the tail of the
12911 * string, and the next time regatom is called, initialize
12912 * with that. The problem with this is that unless you
12913 * back off one more character, you won't be guaranteed
12914 * regatom will get called again, unless regbranch,
12915 * regpiece ... are also changed. If you do back off that
12916 * extra character, so that there is input guaranteed to
12917 * force calling regatom, you can't handle the case where
12918 * just the first character in the node is acceptable. I
12919 * (khw) decided to try this method which doesn't have that
12920 * pitfall; if performance issues are found, we can do a
12921 * combination of the current approach plus that one */
12927 } /* End of verifying node ends with an appropriate char */
12929 loopdone: /* Jumped to when encounters something that shouldn't be
12932 /* I (khw) don't know if you can get here with zero length, but the
12933 * old code handled this situation by creating a zero-length EXACT
12934 * node. Might as well be NOTHING instead */
12940 /* If 'maybe_exact' is still set here, means there are no
12941 * code points in the node that participate in folds;
12942 * similarly for 'maybe_exactfu' and code points that match
12943 * differently depending on UTF8ness of the target string
12944 * (for /u), or depending on locale for /l */
12950 else if (maybe_exactfu) {
12956 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12957 FALSE /* Don't look to see if could
12958 be turned into an EXACT
12959 node, as we have already
12964 RExC_parse = p - 1;
12965 Set_Node_Cur_Length(ret, parse_start);
12966 nextchar(pRExC_state);
12968 /* len is STRLEN which is unsigned, need to copy to signed */
12971 vFAIL("Internal disaster");
12974 } /* End of label 'defchar:' */
12976 } /* End of giant switch on input character */
12982 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12984 /* Returns the next non-pattern-white space, non-comment character (the
12985 * latter only if 'recognize_comment is true) in the string p, which is
12986 * ended by RExC_end. See also reg_skipcomment */
12987 const char *e = RExC_end;
12989 PERL_ARGS_ASSERT_REGPATWS;
12993 if ((len = is_PATWS_safe(p, e, UTF))) {
12996 else if (recognize_comment && *p == '#') {
12997 p = reg_skipcomment(pRExC_state, p);
13006 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13008 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
13009 * sets up the bitmap and any flags, removing those code points from the
13010 * inversion list, setting it to NULL should it become completely empty */
13012 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13013 assert(PL_regkind[OP(node)] == ANYOF);
13015 ANYOF_BITMAP_ZERO(node);
13016 if (*invlist_ptr) {
13018 /* This gets set if we actually need to modify things */
13019 bool change_invlist = FALSE;
13023 /* Start looking through *invlist_ptr */
13024 invlist_iterinit(*invlist_ptr);
13025 while (invlist_iternext(*invlist_ptr, &start, &end)) {
13029 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13030 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13032 else if (end >= NUM_ANYOF_CODE_POINTS) {
13033 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
13036 /* Quit if are above what we should change */
13037 if (start >= NUM_ANYOF_CODE_POINTS) {
13041 change_invlist = TRUE;
13043 /* Set all the bits in the range, up to the max that we are doing */
13044 high = (end < NUM_ANYOF_CODE_POINTS - 1)
13046 : NUM_ANYOF_CODE_POINTS - 1;
13047 for (i = start; i <= (int) high; i++) {
13048 if (! ANYOF_BITMAP_TEST(node, i)) {
13049 ANYOF_BITMAP_SET(node, i);
13053 invlist_iterfinish(*invlist_ptr);
13055 /* Done with loop; remove any code points that are in the bitmap from
13056 * *invlist_ptr; similarly for code points above the bitmap if we have
13057 * a flag to match all of them anyways */
13058 if (change_invlist) {
13059 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13061 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13062 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13065 /* If have completely emptied it, remove it completely */
13066 if (_invlist_len(*invlist_ptr) == 0) {
13067 SvREFCNT_dec_NN(*invlist_ptr);
13068 *invlist_ptr = NULL;
13073 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13074 Character classes ([:foo:]) can also be negated ([:^foo:]).
13075 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13076 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13077 but trigger failures because they are currently unimplemented. */
13079 #define POSIXCC_DONE(c) ((c) == ':')
13080 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13081 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13083 PERL_STATIC_INLINE I32
13084 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13086 I32 namedclass = OOB_NAMEDCLASS;
13088 PERL_ARGS_ASSERT_REGPPOSIXCC;
13090 if (value == '[' && RExC_parse + 1 < RExC_end &&
13091 /* I smell either [: or [= or [. -- POSIX has been here, right? */
13092 POSIXCC(UCHARAT(RExC_parse)))
13094 const char c = UCHARAT(RExC_parse);
13095 char* const s = RExC_parse++;
13097 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13099 if (RExC_parse == RExC_end) {
13102 /* Try to give a better location for the error (than the end of
13103 * the string) by looking for the matching ']' */
13105 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13108 vFAIL2("Unmatched '%c' in POSIX class", c);
13110 /* Grandfather lone [:, [=, [. */
13114 const char* const t = RExC_parse++; /* skip over the c */
13117 if (UCHARAT(RExC_parse) == ']') {
13118 const char *posixcc = s + 1;
13119 RExC_parse++; /* skip over the ending ] */
13122 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13123 const I32 skip = t - posixcc;
13125 /* Initially switch on the length of the name. */
13128 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13129 this is the Perl \w
13131 namedclass = ANYOF_WORDCHAR;
13134 /* Names all of length 5. */
13135 /* alnum alpha ascii blank cntrl digit graph lower
13136 print punct space upper */
13137 /* Offset 4 gives the best switch position. */
13138 switch (posixcc[4]) {
13140 if (memEQ(posixcc, "alph", 4)) /* alpha */
13141 namedclass = ANYOF_ALPHA;
13144 if (memEQ(posixcc, "spac", 4)) /* space */
13145 namedclass = ANYOF_SPACE;
13148 if (memEQ(posixcc, "grap", 4)) /* graph */
13149 namedclass = ANYOF_GRAPH;
13152 if (memEQ(posixcc, "asci", 4)) /* ascii */
13153 namedclass = ANYOF_ASCII;
13156 if (memEQ(posixcc, "blan", 4)) /* blank */
13157 namedclass = ANYOF_BLANK;
13160 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13161 namedclass = ANYOF_CNTRL;
13164 if (memEQ(posixcc, "alnu", 4)) /* alnum */
13165 namedclass = ANYOF_ALPHANUMERIC;
13168 if (memEQ(posixcc, "lowe", 4)) /* lower */
13169 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13170 else if (memEQ(posixcc, "uppe", 4)) /* upper */
13171 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13174 if (memEQ(posixcc, "digi", 4)) /* digit */
13175 namedclass = ANYOF_DIGIT;
13176 else if (memEQ(posixcc, "prin", 4)) /* print */
13177 namedclass = ANYOF_PRINT;
13178 else if (memEQ(posixcc, "punc", 4)) /* punct */
13179 namedclass = ANYOF_PUNCT;
13184 if (memEQ(posixcc, "xdigit", 6))
13185 namedclass = ANYOF_XDIGIT;
13189 if (namedclass == OOB_NAMEDCLASS)
13191 "POSIX class [:%"UTF8f":] unknown",
13192 UTF8fARG(UTF, t - s - 1, s + 1));
13194 /* The #defines are structured so each complement is +1 to
13195 * the normal one */
13199 assert (posixcc[skip] == ':');
13200 assert (posixcc[skip+1] == ']');
13201 } else if (!SIZE_ONLY) {
13202 /* [[=foo=]] and [[.foo.]] are still future. */
13204 /* adjust RExC_parse so the warning shows after
13205 the class closes */
13206 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13208 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13211 /* Maternal grandfather:
13212 * "[:" ending in ":" but not in ":]" */
13214 vFAIL("Unmatched '[' in POSIX class");
13217 /* Grandfather lone [:, [=, [. */
13227 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13229 /* This applies some heuristics at the current parse position (which should
13230 * be at a '[') to see if what follows might be intended to be a [:posix:]
13231 * class. It returns true if it really is a posix class, of course, but it
13232 * also can return true if it thinks that what was intended was a posix
13233 * class that didn't quite make it.
13235 * It will return true for
13237 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13238 * ')' indicating the end of the (?[
13239 * [:any garbage including %^&$ punctuation:]
13241 * This is designed to be called only from S_handle_regex_sets; it could be
13242 * easily adapted to be called from the spot at the beginning of regclass()
13243 * that checks to see in a normal bracketed class if the surrounding []
13244 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13245 * change long-standing behavior, so I (khw) didn't do that */
13246 char* p = RExC_parse + 1;
13247 char first_char = *p;
13249 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13251 assert(*(p - 1) == '[');
13253 if (! POSIXCC(first_char)) {
13258 while (p < RExC_end && isWORDCHAR(*p)) p++;
13260 if (p >= RExC_end) {
13264 if (p - RExC_parse > 2 /* Got at least 1 word character */
13265 && (*p == first_char
13266 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13271 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13274 && p - RExC_parse > 2 /* [:] evaluates to colon;
13275 [::] is a bad posix class. */
13276 && first_char == *(p - 1));
13279 STATIC unsigned int
13280 S_regex_set_precedence(const U8 my_operator) {
13282 /* Returns the precedence in the (?[...]) construct of the input operator,
13283 * specified by its character representation. The precedence follows
13284 * general Perl rules, but it extends this so that ')' and ']' have (low)
13285 * precedence even though they aren't really operators */
13287 switch (my_operator) {
13303 NOT_REACHED; /* NOTREACHED */
13304 return 0; /* Silence compiler warning */
13308 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13309 I32 *flagp, U32 depth,
13310 char * const oregcomp_parse)
13312 /* Handle the (?[...]) construct to do set operations */
13314 U8 curchar; /* Current character being parsed */
13315 UV start, end; /* End points of code point ranges */
13316 SV* final = NULL; /* The end result inversion list */
13317 SV* result_string; /* 'final' stringified */
13318 AV* stack; /* stack of operators and operands not yet
13320 AV* fence_stack = NULL; /* A stack containing the positions in
13321 'stack' of where the undealt-with left
13322 parens would be if they were actually
13324 IV fence = 0; /* Position of where most recent undealt-
13325 with left paren in stack is; -1 if none.
13327 STRLEN len; /* Temporary */
13328 regnode* node; /* Temporary, and final regnode returned by
13330 const bool save_fold = FOLD; /* Temporary */
13331 char *save_end, *save_parse; /* Temporaries */
13333 GET_RE_DEBUG_FLAGS_DECL;
13335 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13337 if (LOC) { /* XXX could make valid in UTF-8 locales */
13338 vFAIL("(?[...]) not valid in locale");
13340 RExC_uni_semantics = 1; /* The use of this operator implies /u. This
13341 is required so that the compile time values
13342 are valid in all runtime cases */
13344 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13345 * (such as EXACT). Thus we can skip most everything if just sizing. We
13346 * call regclass to handle '[]' so as to not have to reinvent its parsing
13347 * rules here (throwing away the size it computes each time). And, we exit
13348 * upon an unescaped ']' that isn't one ending a regclass. To do both
13349 * these things, we need to realize that something preceded by a backslash
13350 * is escaped, so we have to keep track of backslashes */
13352 UV depth = 0; /* how many nested (?[...]) constructs */
13354 while (RExC_parse < RExC_end) {
13355 SV* current = NULL;
13356 RExC_parse = regpatws(pRExC_state, RExC_parse,
13357 TRUE); /* means recognize comments */
13358 switch (*RExC_parse) {
13360 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13365 /* Skip the next byte (which could cause us to end up in
13366 * the middle of a UTF-8 character, but since none of those
13367 * are confusable with anything we currently handle in this
13368 * switch (invariants all), it's safe. We'll just hit the
13369 * default: case next time and keep on incrementing until
13370 * we find one of the invariants we do handle. */
13375 /* If this looks like it is a [:posix:] class, leave the
13376 * parse pointer at the '[' to fool regclass() into
13377 * thinking it is part of a '[[:posix:]]'. That function
13378 * will use strict checking to force a syntax error if it
13379 * doesn't work out to a legitimate class */
13380 bool is_posix_class
13381 = could_it_be_a_POSIX_class(pRExC_state);
13382 if (! is_posix_class) {
13386 /* regclass() can only return RESTART_UTF8 if multi-char
13387 folds are allowed. */
13388 if (!regclass(pRExC_state, flagp,depth+1,
13389 is_posix_class, /* parse the whole char
13390 class only if not a
13392 FALSE, /* don't allow multi-char folds */
13393 TRUE, /* silence non-portable warnings. */
13397 FAIL2("panic: regclass returned NULL to handle_sets, "
13398 "flags=%#"UVxf"", (UV) *flagp);
13400 /* function call leaves parse pointing to the ']', except
13401 * if we faked it */
13402 if (is_posix_class) {
13406 SvREFCNT_dec(current); /* In case it returned something */
13411 if (depth--) break;
13413 if (RExC_parse < RExC_end
13414 && *RExC_parse == ')')
13416 node = reganode(pRExC_state, ANYOF, 0);
13417 RExC_size += ANYOF_SKIP;
13418 nextchar(pRExC_state);
13419 Set_Node_Length(node,
13420 RExC_parse - oregcomp_parse + 1); /* MJD */
13429 FAIL("Syntax error in (?[...])");
13432 /* Pass 2 only after this. */
13433 Perl_ck_warner_d(aTHX_
13434 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13435 "The regex_sets feature is experimental" REPORT_LOCATION,
13436 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13438 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13439 RExC_precomp + (RExC_parse - RExC_precomp)));
13441 /* Everything in this construct is a metacharacter. Operands begin with
13442 * either a '\' (for an escape sequence), or a '[' for a bracketed
13443 * character class. Any other character should be an operator, or
13444 * parenthesis for grouping. Both types of operands are handled by calling
13445 * regclass() to parse them. It is called with a parameter to indicate to
13446 * return the computed inversion list. The parsing here is implemented via
13447 * a stack. Each entry on the stack is a single character representing one
13448 * of the operators; or else a pointer to an operand inversion list. */
13450 #define IS_OPERAND(a) (! SvIOK(a))
13452 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
13453 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13454 * with prounouncing it called it Reverse Polish instead, but now that YOU
13455 * know how to prounounce it you can use the correct term, thus giving due
13456 * credit to the person who invented it, and impressing your geek friends.
13457 * Wikipedia says that the pronounciation of "Ł" has been changing so that
13458 * it is now more like an English initial W (as in wonk) than an L.)
13460 * This means that, for example, 'a | b & c' is stored on the stack as
13468 * where the numbers in brackets give the stack [array] element number.
13469 * In this implementation, parentheses are not stored on the stack.
13470 * Instead a '(' creates a "fence" so that the part of the stack below the
13471 * fence is invisible except to the corresponding ')' (this allows us to
13472 * replace testing for parens, by using instead subtraction of the fence
13473 * position). As new operands are processed they are pushed onto the stack
13474 * (except as noted in the next paragraph). New operators of higher
13475 * precedence than the current final one are inserted on the stack before
13476 * the lhs operand (so that when the rhs is pushed next, everything will be
13477 * in the correct positions shown above. When an operator of equal or
13478 * lower precedence is encountered in parsing, all the stacked operations
13479 * of equal or higher precedence are evaluated, leaving the result as the
13480 * top entry on the stack. This makes higher precedence operations
13481 * evaluate before lower precedence ones, and causes operations of equal
13482 * precedence to left associate.
13484 * The only unary operator '!' is immediately pushed onto the stack when
13485 * encountered. When an operand is encountered, if the top of the stack is
13486 * a '!", the complement is immediately performed, and the '!' popped. The
13487 * resulting value is treated as a new operand, and the logic in the
13488 * previous paragraph is executed. Thus in the expression
13490 * the stack looks like
13496 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13503 * A ')' is treated as an operator with lower precedence than all the
13504 * aforementioned ones, which causes all operations on the stack above the
13505 * corresponding '(' to be evaluated down to a single resultant operand.
13506 * Then the fence for the '(' is removed, and the operand goes through the
13507 * algorithm above, without the fence.
13509 * A separate stack is kept of the fence positions, so that the position of
13510 * the latest so-far unbalanced '(' is at the top of it.
13512 * The ']' ending the construct is treated as the lowest operator of all,
13513 * so that everything gets evaluated down to a single operand, which is the
13516 sv_2mortal((SV *)(stack = newAV()));
13517 sv_2mortal((SV *)(fence_stack = newAV()));
13519 while (RExC_parse < RExC_end) {
13520 I32 top_index; /* Index of top-most element in 'stack' */
13521 SV** top_ptr; /* Pointer to top 'stack' element */
13522 SV* current = NULL; /* To contain the current inversion list
13524 SV* only_to_avoid_leaks;
13526 /* Skip white space */
13527 RExC_parse = regpatws(pRExC_state, RExC_parse,
13528 TRUE /* means recognize comments */ );
13529 if (RExC_parse >= RExC_end) {
13530 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13533 curchar = UCHARAT(RExC_parse);
13537 top_index = av_tindex(stack);
13540 SV** stacked_ptr; /* Ptr to something already on 'stack' */
13541 char stacked_operator; /* The topmost operator on the 'stack'. */
13542 SV* lhs; /* Operand to the left of the operator */
13543 SV* rhs; /* Operand to the right of the operator */
13544 SV* fence_ptr; /* Pointer to top element of the fence
13549 if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13551 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13552 * This happens when we have some thing like
13554 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13556 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13558 * Here we would be handling the interpolated
13559 * '$thai_or_lao'. We handle this by a recursive call to
13560 * ourselves which returns the inversion list the
13561 * interpolated expression evaluates to. We use the flags
13562 * from the interpolated pattern. */
13563 U32 save_flags = RExC_flags;
13564 const char * save_parse;
13566 RExC_parse += 2; /* Skip past the '(?' */
13567 save_parse = RExC_parse;
13569 /* Parse any flags for the '(?' */
13570 parse_lparen_question_flags(pRExC_state);
13572 if (RExC_parse == save_parse /* Makes sure there was at
13573 least one flag (or else
13574 this embedding wasn't
13576 || RExC_parse >= RExC_end - 4
13577 || UCHARAT(RExC_parse) != ':'
13578 || UCHARAT(++RExC_parse) != '('
13579 || UCHARAT(++RExC_parse) != '?'
13580 || UCHARAT(++RExC_parse) != '[')
13583 /* In combination with the above, this moves the
13584 * pointer to the point just after the first erroneous
13585 * character (or if there are no flags, to where they
13586 * should have been) */
13587 if (RExC_parse >= RExC_end - 4) {
13588 RExC_parse = RExC_end;
13590 else if (RExC_parse != save_parse) {
13591 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13593 vFAIL("Expecting '(?flags:(?[...'");
13596 /* Recurse, with the meat of the embedded expression */
13598 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13599 depth+1, oregcomp_parse);
13601 /* Here, 'current' contains the embedded expression's
13602 * inversion list, and RExC_parse points to the trailing
13603 * ']'; the next character should be the ')' */
13605 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13607 /* Then the ')' matching the original '(' handled by this
13608 * case: statement */
13610 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13613 RExC_flags = save_flags;
13614 goto handle_operand;
13617 /* A regular '('. Look behind for illegal syntax */
13618 if (top_index - fence >= 0) {
13619 /* If the top entry on the stack is an operator, it had
13620 * better be a '!', otherwise the entry below the top
13621 * operand should be an operator */
13622 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
13623 || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
13624 || top_index - fence < 1
13625 || ! (stacked_ptr = av_fetch(stack,
13628 || IS_OPERAND(*stacked_ptr))
13631 vFAIL("Unexpected '(' with no preceding operator");
13635 /* Stack the position of this undealt-with left paren */
13636 fence = top_index + 1;
13637 av_push(fence_stack, newSViv(fence));
13641 /* regclass() can only return RESTART_UTF8 if multi-char
13642 folds are allowed. */
13643 if (!regclass(pRExC_state, flagp,depth+1,
13644 TRUE, /* means parse just the next thing */
13645 FALSE, /* don't allow multi-char folds */
13646 FALSE, /* don't silence non-portable warnings. */
13650 FAIL2("panic: regclass returned NULL to handle_sets, "
13651 "flags=%#"UVxf"", (UV) *flagp);
13654 /* regclass() will return with parsing just the \ sequence,
13655 * leaving the parse pointer at the next thing to parse */
13657 goto handle_operand;
13659 case '[': /* Is a bracketed character class */
13661 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13663 if (! is_posix_class) {
13667 /* regclass() can only return RESTART_UTF8 if multi-char
13668 folds are allowed. */
13669 if(!regclass(pRExC_state, flagp,depth+1,
13670 is_posix_class, /* parse the whole char class
13671 only if not a posix class */
13672 FALSE, /* don't allow multi-char folds */
13673 FALSE, /* don't silence non-portable warnings. */
13678 FAIL2("panic: regclass returned NULL to handle_sets, "
13679 "flags=%#"UVxf"", (UV) *flagp);
13682 /* function call leaves parse pointing to the ']', except if we
13684 if (is_posix_class) {
13688 goto handle_operand;
13692 if (top_index >= 1) {
13693 goto join_operators;
13696 /* Only a single operand on the stack: are done */
13700 if (av_tindex(fence_stack) < 0) {
13702 vFAIL("Unexpected ')'");
13705 /* If at least two thing on the stack, treat this as an
13707 if (top_index - fence >= 1) {
13708 goto join_operators;
13711 /* Here only a single thing on the fenced stack, and there is a
13712 * fence. Get rid of it */
13713 fence_ptr = av_pop(fence_stack);
13715 fence = SvIV(fence_ptr) - 1;
13716 SvREFCNT_dec_NN(fence_ptr);
13723 /* Having gotten rid of the fence, we pop the operand at the
13724 * stack top and process it as a newly encountered operand */
13725 current = av_pop(stack);
13726 assert(IS_OPERAND(current));
13727 goto handle_operand;
13735 /* These binary operators should have a left operand already
13737 if ( top_index - fence < 0
13738 || top_index - fence == 1
13739 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13740 || ! IS_OPERAND(*top_ptr))
13742 goto unexpected_binary;
13745 /* If only the one operand is on the part of the stack visible
13746 * to us, we just place this operator in the proper position */
13747 if (top_index - fence < 2) {
13749 /* Place the operator before the operand */
13751 SV* lhs = av_pop(stack);
13752 av_push(stack, newSVuv(curchar));
13753 av_push(stack, lhs);
13757 /* But if there is something else on the stack, we need to
13758 * process it before this new operator if and only if the
13759 * stacked operation has equal or higher precedence than the
13764 /* The operator on the stack is supposed to be below both its
13766 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13767 || IS_OPERAND(*stacked_ptr))
13769 /* But if not, it's legal and indicates we are completely
13770 * done if and only if we're currently processing a ']',
13771 * which should be the final thing in the expression */
13772 if (curchar == ']') {
13778 vFAIL2("Unexpected binary operator '%c' with no "
13779 "preceding operand", curchar);
13781 stacked_operator = (char) SvUV(*stacked_ptr);
13783 if (regex_set_precedence(curchar)
13784 > regex_set_precedence(stacked_operator))
13786 /* Here, the new operator has higher precedence than the
13787 * stacked one. This means we need to add the new one to
13788 * the stack to await its rhs operand (and maybe more
13789 * stuff). We put it before the lhs operand, leaving
13790 * untouched the stacked operator and everything below it
13792 lhs = av_pop(stack);
13793 assert(IS_OPERAND(lhs));
13795 av_push(stack, newSVuv(curchar));
13796 av_push(stack, lhs);
13800 /* Here, the new operator has equal or lower precedence than
13801 * what's already there. This means the operation already
13802 * there should be performed now, before the new one. */
13803 rhs = av_pop(stack);
13804 lhs = av_pop(stack);
13806 assert(IS_OPERAND(rhs));
13807 assert(IS_OPERAND(lhs));
13809 switch (stacked_operator) {
13811 _invlist_intersection(lhs, rhs, &rhs);
13816 _invlist_union(lhs, rhs, &rhs);
13820 _invlist_subtract(lhs, rhs, &rhs);
13823 case '^': /* The union minus the intersection */
13829 _invlist_union(lhs, rhs, &u);
13830 _invlist_intersection(lhs, rhs, &i);
13831 /* _invlist_subtract will overwrite rhs
13832 without freeing what it already contains */
13834 _invlist_subtract(u, i, &rhs);
13835 SvREFCNT_dec_NN(i);
13836 SvREFCNT_dec_NN(u);
13837 SvREFCNT_dec_NN(element);
13843 /* Here, the higher precedence operation has been done, and the
13844 * result is in 'rhs'. We overwrite the stacked operator with
13845 * the result. Then we redo this code to either push the new
13846 * operator onto the stack or perform any higher precedence
13847 * stacked operation */
13848 only_to_avoid_leaks = av_pop(stack);
13849 SvREFCNT_dec(only_to_avoid_leaks);
13850 av_push(stack, rhs);
13853 case '!': /* Highest priority, right associative, so just push
13855 av_push(stack, newSVuv(curchar));
13859 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13860 vFAIL("Unexpected character");
13864 /* Here 'current' is the operand. If something is already on the
13865 * stack, we have to check if it is a !. */
13866 top_index = av_tindex(stack); /* Code above may have altered the
13867 * stack in the time since we
13868 * earlier set 'top_index'. */
13869 if (top_index - fence >= 0) {
13870 /* If the top entry on the stack is an operator, it had better
13871 * be a '!', otherwise the entry below the top operand should
13872 * be an operator */
13873 top_ptr = av_fetch(stack, top_index, FALSE);
13875 if (! IS_OPERAND(*top_ptr)) {
13877 /* The only permissible operator at the top of the stack is
13878 * '!', which is applied immediately to this operand. */
13879 curchar = (char) SvUV(*top_ptr);
13880 if (curchar != '!') {
13881 SvREFCNT_dec(current);
13882 vFAIL2("Unexpected binary operator '%c' with no "
13883 "preceding operand", curchar);
13886 _invlist_invert(current);
13888 only_to_avoid_leaks = av_pop(stack);
13889 SvREFCNT_dec(only_to_avoid_leaks);
13890 top_index = av_tindex(stack);
13892 /* And we redo with the inverted operand. This allows
13893 * handling multiple ! in a row */
13894 goto handle_operand;
13896 /* Single operand is ok only for the non-binary ')'
13898 else if ((top_index - fence == 0 && curchar != ')')
13899 || (top_index - fence > 0
13900 && (! (stacked_ptr = av_fetch(stack,
13903 || IS_OPERAND(*stacked_ptr))))
13905 SvREFCNT_dec(current);
13906 vFAIL("Operand with no preceding operator");
13910 /* Here there was nothing on the stack or the top element was
13911 * another operand. Just add this new one */
13912 av_push(stack, current);
13914 } /* End of switch on next parse token */
13916 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13917 } /* End of loop parsing through the construct */
13920 if (av_tindex(fence_stack) >= 0) {
13921 vFAIL("Unmatched (");
13924 if (av_tindex(stack) < 0 /* Was empty */
13925 || ((final = av_pop(stack)) == NULL)
13926 || ! IS_OPERAND(final)
13927 || av_tindex(stack) >= 0) /* More left on stack */
13929 SvREFCNT_dec(final);
13930 vFAIL("Incomplete expression within '(?[ ])'");
13933 /* Here, 'final' is the resultant inversion list from evaluating the
13934 * expression. Return it if so requested */
13935 if (return_invlist) {
13936 *return_invlist = final;
13940 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13941 * expecting a string of ranges and individual code points */
13942 invlist_iterinit(final);
13943 result_string = newSVpvs("");
13944 while (invlist_iternext(final, &start, &end)) {
13945 if (start == end) {
13946 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13949 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13954 /* About to generate an ANYOF (or similar) node from the inversion list we
13955 * have calculated */
13956 save_parse = RExC_parse;
13957 RExC_parse = SvPV(result_string, len);
13958 save_end = RExC_end;
13959 RExC_end = RExC_parse + len;
13961 /* We turn off folding around the call, as the class we have constructed
13962 * already has all folding taken into consideration, and we don't want
13963 * regclass() to add to that */
13964 RExC_flags &= ~RXf_PMf_FOLD;
13965 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13967 node = regclass(pRExC_state, flagp,depth+1,
13968 FALSE, /* means parse the whole char class */
13969 FALSE, /* don't allow multi-char folds */
13970 TRUE, /* silence non-portable warnings. The above may very
13971 well have generated non-portable code points, but
13972 they're valid on this machine */
13973 FALSE, /* similarly, no need for strict */
13977 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13980 RExC_flags |= RXf_PMf_FOLD;
13982 RExC_parse = save_parse + 1;
13983 RExC_end = save_end;
13984 SvREFCNT_dec_NN(final);
13985 SvREFCNT_dec_NN(result_string);
13987 nextchar(pRExC_state);
13988 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13994 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13996 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13997 * innocent-looking character class, like /[ks]/i won't have to go out to
13998 * disk to find the possible matches.
14000 * This should be called only for a Latin1-range code points, cp, which is
14001 * known to be involved in a simple fold with other code points above
14002 * Latin1. It would give false results if /aa has been specified.
14003 * Multi-char folds are outside the scope of this, and must be handled
14006 * XXX It would be better to generate these via regen, in case a new
14007 * version of the Unicode standard adds new mappings, though that is not
14008 * really likely, and may be caught by the default: case of the switch
14011 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
14013 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
14019 add_cp_to_invlist(*invlist, KELVIN_SIGN);
14023 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14026 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14027 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14029 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14030 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14031 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14033 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14034 *invlist = add_cp_to_invlist(*invlist,
14035 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14038 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
14040 case LATIN_SMALL_LETTER_SHARP_S:
14041 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14046 #if UNICODE_MAJOR_VERSION < 3 \
14047 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
14049 /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
14054 add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
14055 # if UNICODE_DOT_DOT_VERSION == 1
14056 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
14062 /* Use deprecated warning to increase the chances of this being
14065 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14072 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14074 /* This adds the string scalar <multi_string> to the array
14075 * <multi_char_matches>. <multi_string> is known to have exactly
14076 * <cp_count> code points in it. This is used when constructing a
14077 * bracketed character class and we find something that needs to match more
14078 * than a single character.
14080 * <multi_char_matches> is actually an array of arrays. Each top-level
14081 * element is an array that contains all the strings known so far that are
14082 * the same length. And that length (in number of code points) is the same
14083 * as the index of the top-level array. Hence, the [2] element is an
14084 * array, each element thereof is a string containing TWO code points;
14085 * while element [3] is for strings of THREE characters, and so on. Since
14086 * this is for multi-char strings there can never be a [0] nor [1] element.
14088 * When we rewrite the character class below, we will do so such that the
14089 * longest strings are written first, so that it prefers the longest
14090 * matching strings first. This is done even if it turns out that any
14091 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
14092 * Christiansen has agreed that this is ok. This makes the test for the
14093 * ligature 'ffi' come before the test for 'ff', for example */
14096 AV** this_array_ptr;
14098 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14100 if (! multi_char_matches) {
14101 multi_char_matches = newAV();
14104 if (av_exists(multi_char_matches, cp_count)) {
14105 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14106 this_array = *this_array_ptr;
14109 this_array = newAV();
14110 av_store(multi_char_matches, cp_count,
14113 av_push(this_array, multi_string);
14115 return multi_char_matches;
14118 /* The names of properties whose definitions are not known at compile time are
14119 * stored in this SV, after a constant heading. So if the length has been
14120 * changed since initialization, then there is a run-time definition. */
14121 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
14122 (SvCUR(listsv) != initial_listsv_len)
14125 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14126 const bool stop_at_1, /* Just parse the next thing, don't
14127 look for a full character class */
14128 bool allow_multi_folds,
14129 const bool silence_non_portable, /* Don't output warnings
14133 SV** ret_invlist /* Return an inversion list, not a node */
14136 /* parse a bracketed class specification. Most of these will produce an
14137 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14138 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
14139 * under /i with multi-character folds: it will be rewritten following the
14140 * paradigm of this example, where the <multi-fold>s are characters which
14141 * fold to multiple character sequences:
14142 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14143 * gets effectively rewritten as:
14144 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14145 * reg() gets called (recursively) on the rewritten version, and this
14146 * function will return what it constructs. (Actually the <multi-fold>s
14147 * aren't physically removed from the [abcdefghi], it's just that they are
14148 * ignored in the recursion by means of a flag:
14149 * <RExC_in_multi_char_class>.)
14151 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14152 * characters, with the corresponding bit set if that character is in the
14153 * list. For characters above this, a range list or swash is used. There
14154 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14155 * determinable at compile time
14157 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
14158 * to be restarted. This can only happen if ret_invlist is non-NULL.
14161 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14163 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14166 IV namedclass = OOB_NAMEDCLASS;
14167 char *rangebegin = NULL;
14168 bool need_class = 0;
14170 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14171 than just initialized. */
14172 SV* properties = NULL; /* Code points that match \p{} \P{} */
14173 SV* posixes = NULL; /* Code points that match classes like [:word:],
14174 extended beyond the Latin1 range. These have to
14175 be kept separate from other code points for much
14176 of this function because their handling is
14177 different under /i, and for most classes under
14179 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
14180 separate for a while from the non-complemented
14181 versions because of complications with /d
14183 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14184 treated more simply than the general case,
14185 leading to less compilation and execution
14187 UV element_count = 0; /* Number of distinct elements in the class.
14188 Optimizations may be possible if this is tiny */
14189 AV * multi_char_matches = NULL; /* Code points that fold to more than one
14190 character; used under /i */
14192 char * stop_ptr = RExC_end; /* where to stop parsing */
14193 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14196 /* Unicode properties are stored in a swash; this holds the current one
14197 * being parsed. If this swash is the only above-latin1 component of the
14198 * character class, an optimization is to pass it directly on to the
14199 * execution engine. Otherwise, it is set to NULL to indicate that there
14200 * are other things in the class that have to be dealt with at execution
14202 SV* swash = NULL; /* Code points that match \p{} \P{} */
14204 /* Set if a component of this character class is user-defined; just passed
14205 * on to the engine */
14206 bool has_user_defined_property = FALSE;
14208 /* inversion list of code points this node matches only when the target
14209 * string is in UTF-8. (Because is under /d) */
14210 SV* depends_list = NULL;
14212 /* Inversion list of code points this node matches regardless of things
14213 * like locale, folding, utf8ness of the target string */
14214 SV* cp_list = NULL;
14216 /* Like cp_list, but code points on this list need to be checked for things
14217 * that fold to/from them under /i */
14218 SV* cp_foldable_list = NULL;
14220 /* Like cp_list, but code points on this list are valid only when the
14221 * runtime locale is UTF-8 */
14222 SV* only_utf8_locale_list = NULL;
14224 /* In a range, if one of the endpoints is non-character-set portable,
14225 * meaning that it hard-codes a code point that may mean a different
14226 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14227 * mnemonic '\t' which each mean the same character no matter which
14228 * character set the platform is on. */
14229 unsigned int non_portable_endpoint = 0;
14231 /* Is the range unicode? which means on a platform that isn't 1-1 native
14232 * to Unicode (i.e. non-ASCII), each code point in it should be considered
14233 * to be a Unicode value. */
14234 bool unicode_range = FALSE;
14235 bool invert = FALSE; /* Is this class to be complemented */
14237 bool warn_super = ALWAYS_WARN_SUPER;
14239 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14240 case we need to change the emitted regop to an EXACT. */
14241 const char * orig_parse = RExC_parse;
14242 const SSize_t orig_size = RExC_size;
14243 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14244 GET_RE_DEBUG_FLAGS_DECL;
14246 PERL_ARGS_ASSERT_REGCLASS;
14248 PERL_UNUSED_ARG(depth);
14251 DEBUG_PARSE("clas");
14253 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
14254 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
14255 && UNICODE_DOT_DOT_VERSION == 0)
14256 allow_multi_folds = FALSE;
14259 /* Assume we are going to generate an ANYOF node. */
14260 ret = reganode(pRExC_state,
14267 RExC_size += ANYOF_SKIP;
14268 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14271 ANYOF_FLAGS(ret) = 0;
14273 RExC_emit += ANYOF_SKIP;
14274 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14275 initial_listsv_len = SvCUR(listsv);
14276 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
14280 RExC_parse = regpatws(pRExC_state, RExC_parse,
14281 FALSE /* means don't recognize comments */ );
14284 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
14287 allow_multi_folds = FALSE;
14290 RExC_parse = regpatws(pRExC_state, RExC_parse,
14291 FALSE /* means don't recognize comments */ );
14295 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14296 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14297 const char *s = RExC_parse;
14298 const char c = *s++;
14303 while (isWORDCHAR(*s))
14305 if (*s && c == *s && s[1] == ']') {
14306 SAVEFREESV(RExC_rx_sv);
14308 "POSIX syntax [%c %c] belongs inside character classes",
14310 (void)ReREFCNT_inc(RExC_rx_sv);
14314 /* If the caller wants us to just parse a single element, accomplish this
14315 * by faking the loop ending condition */
14316 if (stop_at_1 && RExC_end > RExC_parse) {
14317 stop_ptr = RExC_parse + 1;
14320 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14321 if (UCHARAT(RExC_parse) == ']')
14322 goto charclassloop;
14325 if (RExC_parse >= stop_ptr) {
14330 RExC_parse = regpatws(pRExC_state, RExC_parse,
14331 FALSE /* means don't recognize comments */ );
14334 if (UCHARAT(RExC_parse) == ']') {
14340 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14341 save_value = value;
14342 save_prevvalue = prevvalue;
14345 rangebegin = RExC_parse;
14347 non_portable_endpoint = 0;
14350 value = utf8n_to_uvchr((U8*)RExC_parse,
14351 RExC_end - RExC_parse,
14352 &numlen, UTF8_ALLOW_DEFAULT);
14353 RExC_parse += numlen;
14356 value = UCHARAT(RExC_parse++);
14359 && RExC_parse < RExC_end
14360 && POSIXCC(UCHARAT(RExC_parse)))
14362 namedclass = regpposixcc(pRExC_state, value, strict);
14364 else if (value == '\\') {
14365 /* Is a backslash; get the code point of the char after it */
14366 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14367 value = utf8n_to_uvchr((U8*)RExC_parse,
14368 RExC_end - RExC_parse,
14369 &numlen, UTF8_ALLOW_DEFAULT);
14370 RExC_parse += numlen;
14373 value = UCHARAT(RExC_parse++);
14375 /* Some compilers cannot handle switching on 64-bit integer
14376 * values, therefore value cannot be an UV. Yes, this will
14377 * be a problem later if we want switch on Unicode.
14378 * A similar issue a little bit later when switching on
14379 * namedclass. --jhi */
14381 /* If the \ is escaping white space when white space is being
14382 * skipped, it means that that white space is wanted literally, and
14383 * is already in 'value'. Otherwise, need to translate the escape
14384 * into what it signifies. */
14385 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
14387 case 'w': namedclass = ANYOF_WORDCHAR; break;
14388 case 'W': namedclass = ANYOF_NWORDCHAR; break;
14389 case 's': namedclass = ANYOF_SPACE; break;
14390 case 'S': namedclass = ANYOF_NSPACE; break;
14391 case 'd': namedclass = ANYOF_DIGIT; break;
14392 case 'D': namedclass = ANYOF_NDIGIT; break;
14393 case 'v': namedclass = ANYOF_VERTWS; break;
14394 case 'V': namedclass = ANYOF_NVERTWS; break;
14395 case 'h': namedclass = ANYOF_HORIZWS; break;
14396 case 'H': namedclass = ANYOF_NHORIZWS; break;
14397 case 'N': /* Handle \N{NAME} in class */
14399 const char * const backslash_N_beg = RExC_parse - 2;
14402 if (! grok_bslash_N(pRExC_state,
14403 NULL, /* No regnode */
14404 &value, /* Yes single value */
14405 &cp_count, /* Multiple code pt count */
14410 if (*flagp & RESTART_UTF8)
14411 FAIL("panic: grok_bslash_N set RESTART_UTF8");
14413 if (cp_count < 0) {
14414 vFAIL("\\N in a character class must be a named character: \\N{...}");
14416 else if (cp_count == 0) {
14418 RExC_parse++; /* Position after the "}" */
14419 vFAIL("Zero length \\N{}");
14422 ckWARNreg(RExC_parse,
14423 "Ignoring zero length \\N{} in character class");
14426 else { /* cp_count > 1 */
14427 if (! RExC_in_multi_char_class) {
14428 if (invert || range || *RExC_parse == '-') {
14431 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14434 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14436 break; /* <value> contains the first code
14437 point. Drop out of the switch to
14441 SV * multi_char_N = newSVpvn(backslash_N_beg,
14442 RExC_parse - backslash_N_beg);
14444 = add_multi_match(multi_char_matches,
14449 } /* End of cp_count != 1 */
14451 /* This element should not be processed further in this
14454 value = save_value;
14455 prevvalue = save_prevvalue;
14456 continue; /* Back to top of loop to get next char */
14459 /* Here, is a single code point, and <value> contains it */
14460 unicode_range = TRUE; /* \N{} are Unicode */
14468 /* We will handle any undefined properties ourselves */
14469 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14470 /* And we actually would prefer to get
14471 * the straight inversion list of the
14472 * swash, since we will be accessing it
14473 * anyway, to save a little time */
14474 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14476 if (RExC_parse >= RExC_end)
14477 vFAIL2("Empty \\%c{}", (U8)value);
14478 if (*RExC_parse == '{') {
14479 const U8 c = (U8)value;
14480 e = strchr(RExC_parse++, '}');
14482 vFAIL2("Missing right brace on \\%c{}", c);
14483 while (isSPACE(*RExC_parse))
14485 if (e == RExC_parse)
14486 vFAIL2("Empty \\%c{}", c);
14487 n = e - RExC_parse;
14488 while (isSPACE(*(RExC_parse + n - 1)))
14499 if (UCHARAT(RExC_parse) == '^') {
14502 /* toggle. (The rhs xor gets the single bit that
14503 * differs between P and p; the other xor inverts just
14505 value ^= 'P' ^ 'p';
14507 while (isSPACE(*RExC_parse)) {
14512 /* Try to get the definition of the property into
14513 * <invlist>. If /i is in effect, the effective property
14514 * will have its name be <__NAME_i>. The design is
14515 * discussed in commit
14516 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14517 name = savepv(Perl_form(aTHX_
14519 (FOLD) ? "__" : "",
14525 /* Look up the property name, and get its swash and
14526 * inversion list, if the property is found */
14528 SvREFCNT_dec_NN(swash);
14530 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14533 NULL, /* No inversion list */
14536 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14537 HV* curpkg = (IN_PERL_COMPILETIME)
14539 : CopSTASH(PL_curcop);
14541 SvREFCNT_dec_NN(swash);
14545 /* Here didn't find it. It could be a user-defined
14546 * property that will be available at run-time. If we
14547 * accept only compile-time properties, is an error;
14548 * otherwise add it to the list for run-time look up */
14550 RExC_parse = e + 1;
14552 "Property '%"UTF8f"' is unknown",
14553 UTF8fARG(UTF, n, name));
14556 /* If the property name doesn't already have a package
14557 * name, add the current one to it so that it can be
14558 * referred to outside it. [perl #121777] */
14559 if (curpkg && ! instr(name, "::")) {
14560 char* pkgname = HvNAME(curpkg);
14561 if (strNE(pkgname, "main")) {
14562 char* full_name = Perl_form(aTHX_
14566 n = strlen(full_name);
14568 name = savepvn(full_name, n);
14571 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14572 (value == 'p' ? '+' : '!'),
14573 UTF8fARG(UTF, n, name));
14574 has_user_defined_property = TRUE;
14576 /* We don't know yet, so have to assume that the
14577 * property could match something in the Latin1 range,
14578 * hence something that isn't utf8. Note that this
14579 * would cause things in <depends_list> to match
14580 * inappropriately, except that any \p{}, including
14581 * this one forces Unicode semantics, which means there
14582 * is no <depends_list> */
14584 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14588 /* Here, did get the swash and its inversion list. If
14589 * the swash is from a user-defined property, then this
14590 * whole character class should be regarded as such */
14591 if (swash_init_flags
14592 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14594 has_user_defined_property = TRUE;
14597 /* We warn on matching an above-Unicode code point
14598 * if the match would return true, except don't
14599 * warn for \p{All}, which has exactly one element
14601 (_invlist_contains_cp(invlist, 0x110000)
14602 && (! (_invlist_len(invlist) == 1
14603 && *invlist_array(invlist) == 0)))
14609 /* Invert if asking for the complement */
14610 if (value == 'P') {
14611 _invlist_union_complement_2nd(properties,
14615 /* The swash can't be used as-is, because we've
14616 * inverted things; delay removing it to here after
14617 * have copied its invlist above */
14618 SvREFCNT_dec_NN(swash);
14622 _invlist_union(properties, invlist, &properties);
14627 RExC_parse = e + 1;
14628 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14631 /* \p means they want Unicode semantics */
14632 RExC_uni_semantics = 1;
14635 case 'n': value = '\n'; break;
14636 case 'r': value = '\r'; break;
14637 case 't': value = '\t'; break;
14638 case 'f': value = '\f'; break;
14639 case 'b': value = '\b'; break;
14640 case 'e': value = ESC_NATIVE; break;
14641 case 'a': value = '\a'; break;
14643 RExC_parse--; /* function expects to be pointed at the 'o' */
14645 const char* error_msg;
14646 bool valid = grok_bslash_o(&RExC_parse,
14649 PASS2, /* warnings only in
14652 silence_non_portable,
14658 non_portable_endpoint++;
14659 if (IN_ENCODING && value < 0x100) {
14660 goto recode_encoding;
14664 RExC_parse--; /* function expects to be pointed at the 'x' */
14666 const char* error_msg;
14667 bool valid = grok_bslash_x(&RExC_parse,
14670 PASS2, /* Output warnings */
14672 silence_non_portable,
14678 non_portable_endpoint++;
14679 if (IN_ENCODING && value < 0x100)
14680 goto recode_encoding;
14683 value = grok_bslash_c(*RExC_parse++, PASS2);
14684 non_portable_endpoint++;
14686 case '0': case '1': case '2': case '3': case '4':
14687 case '5': case '6': case '7':
14689 /* Take 1-3 octal digits */
14690 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14691 numlen = (strict) ? 4 : 3;
14692 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14693 RExC_parse += numlen;
14696 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14697 vFAIL("Need exactly 3 octal digits");
14699 else if (! SIZE_ONLY /* like \08, \178 */
14701 && RExC_parse < RExC_end
14702 && isDIGIT(*RExC_parse)
14703 && ckWARN(WARN_REGEXP))
14705 SAVEFREESV(RExC_rx_sv);
14706 reg_warn_non_literal_string(
14708 form_short_octal_warning(RExC_parse, numlen));
14709 (void)ReREFCNT_inc(RExC_rx_sv);
14712 non_portable_endpoint++;
14713 if (IN_ENCODING && value < 0x100)
14714 goto recode_encoding;
14718 if (! RExC_override_recoding) {
14719 SV* enc = _get_encoding();
14720 value = reg_recode((const char)(U8)value, &enc);
14723 vFAIL("Invalid escape in the specified encoding");
14726 ckWARNreg(RExC_parse,
14727 "Invalid escape in the specified encoding");
14733 /* Allow \_ to not give an error */
14734 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14736 vFAIL2("Unrecognized escape \\%c in character class",
14740 SAVEFREESV(RExC_rx_sv);
14741 ckWARN2reg(RExC_parse,
14742 "Unrecognized escape \\%c in character class passed through",
14744 (void)ReREFCNT_inc(RExC_rx_sv);
14748 } /* End of switch on char following backslash */
14749 } /* end of handling backslash escape sequences */
14751 /* Here, we have the current token in 'value' */
14753 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14756 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14757 * literal, as is the character that began the false range, i.e.
14758 * the 'a' in the examples */
14761 const int w = (RExC_parse >= rangebegin)
14762 ? RExC_parse - rangebegin
14766 "False [] range \"%"UTF8f"\"",
14767 UTF8fARG(UTF, w, rangebegin));
14770 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14771 ckWARN2reg(RExC_parse,
14772 "False [] range \"%"UTF8f"\"",
14773 UTF8fARG(UTF, w, rangebegin));
14774 (void)ReREFCNT_inc(RExC_rx_sv);
14775 cp_list = add_cp_to_invlist(cp_list, '-');
14776 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14781 range = 0; /* this was not a true range */
14782 element_count += 2; /* So counts for three values */
14785 classnum = namedclass_to_classnum(namedclass);
14787 if (LOC && namedclass < ANYOF_POSIXL_MAX
14788 #ifndef HAS_ISASCII
14789 && classnum != _CC_ASCII
14792 /* What the Posix classes (like \w, [:space:]) match in locale
14793 * isn't knowable under locale until actual match time. Room
14794 * must be reserved (one time per outer bracketed class) to
14795 * store such classes. The space will contain a bit for each
14796 * named class that is to be matched against. This isn't
14797 * needed for \p{} and pseudo-classes, as they are not affected
14798 * by locale, and hence are dealt with separately */
14799 if (! need_class) {
14802 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14805 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14807 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14808 ANYOF_POSIXL_ZERO(ret);
14811 /* Coverity thinks it is possible for this to be negative; both
14812 * jhi and khw think it's not, but be safer */
14813 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14814 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14816 /* See if it already matches the complement of this POSIX
14818 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14819 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14823 posixl_matches_all = TRUE;
14824 break; /* No need to continue. Since it matches both
14825 e.g., \w and \W, it matches everything, and the
14826 bracketed class can be optimized into qr/./s */
14829 /* Add this class to those that should be checked at runtime */
14830 ANYOF_POSIXL_SET(ret, namedclass);
14832 /* The above-Latin1 characters are not subject to locale rules.
14833 * Just add them, in the second pass, to the
14834 * unconditionally-matched list */
14836 SV* scratch_list = NULL;
14838 /* Get the list of the above-Latin1 code points this
14840 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14841 PL_XPosix_ptrs[classnum],
14843 /* Odd numbers are complements, like
14844 * NDIGIT, NASCII, ... */
14845 namedclass % 2 != 0,
14847 /* Checking if 'cp_list' is NULL first saves an extra
14848 * clone. Its reference count will be decremented at the
14849 * next union, etc, or if this is the only instance, at the
14850 * end of the routine */
14852 cp_list = scratch_list;
14855 _invlist_union(cp_list, scratch_list, &cp_list);
14856 SvREFCNT_dec_NN(scratch_list);
14858 continue; /* Go get next character */
14861 else if (! SIZE_ONLY) {
14863 /* Here, not in pass1 (in that pass we skip calculating the
14864 * contents of this class), and is /l, or is a POSIX class for
14865 * which /l doesn't matter (or is a Unicode property, which is
14866 * skipped here). */
14867 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14868 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14870 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14871 * nor /l make a difference in what these match,
14872 * therefore we just add what they match to cp_list. */
14873 if (classnum != _CC_VERTSPACE) {
14874 assert( namedclass == ANYOF_HORIZWS
14875 || namedclass == ANYOF_NHORIZWS);
14877 /* It turns out that \h is just a synonym for
14879 classnum = _CC_BLANK;
14882 _invlist_union_maybe_complement_2nd(
14884 PL_XPosix_ptrs[classnum],
14885 namedclass % 2 != 0, /* Complement if odd
14886 (NHORIZWS, NVERTWS)
14891 else if (UNI_SEMANTICS
14892 || classnum == _CC_ASCII
14893 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
14894 || classnum == _CC_XDIGIT)))
14896 /* We usually have to worry about /d and /a affecting what
14897 * POSIX classes match, with special code needed for /d
14898 * because we won't know until runtime what all matches.
14899 * But there is no extra work needed under /u, and
14900 * [:ascii:] is unaffected by /a and /d; and :digit: and
14901 * :xdigit: don't have runtime differences under /d. So we
14902 * can special case these, and avoid some extra work below,
14903 * and at runtime. */
14904 _invlist_union_maybe_complement_2nd(
14906 PL_XPosix_ptrs[classnum],
14907 namedclass % 2 != 0,
14910 else { /* Garden variety class. If is NUPPER, NALPHA, ...
14911 complement and use nposixes */
14912 SV** posixes_ptr = namedclass % 2 == 0
14915 _invlist_union_maybe_complement_2nd(
14917 PL_XPosix_ptrs[classnum],
14918 namedclass % 2 != 0,
14922 } /* end of namedclass \blah */
14925 RExC_parse = regpatws(pRExC_state, RExC_parse,
14926 FALSE /* means don't recognize comments */ );
14929 /* If 'range' is set, 'value' is the ending of a range--check its
14930 * validity. (If value isn't a single code point in the case of a
14931 * range, we should have figured that out above in the code that
14932 * catches false ranges). Later, we will handle each individual code
14933 * point in the range. If 'range' isn't set, this could be the
14934 * beginning of a range, so check for that by looking ahead to see if
14935 * the next real character to be processed is the range indicator--the
14940 /* For unicode ranges, we have to test that the Unicode as opposed
14941 * to the native values are not decreasing. (Above 255, there is
14942 * no difference between native and Unicode) */
14943 if (unicode_range && prevvalue < 255 && value < 255) {
14944 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14945 goto backwards_range;
14950 if (prevvalue > value) /* b-a */ {
14955 w = RExC_parse - rangebegin;
14957 "Invalid [] range \"%"UTF8f"\"",
14958 UTF8fARG(UTF, w, rangebegin));
14959 NOT_REACHED; /* NOTREACHED */
14963 prevvalue = value; /* save the beginning of the potential range */
14964 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14965 && *RExC_parse == '-')
14967 char* next_char_ptr = RExC_parse + 1;
14968 if (skip_white) { /* Get the next real char after the '-' */
14969 next_char_ptr = regpatws(pRExC_state,
14971 FALSE); /* means don't recognize
14975 /* If the '-' is at the end of the class (just before the ']',
14976 * it is a literal minus; otherwise it is a range */
14977 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14978 RExC_parse = next_char_ptr;
14980 /* a bad range like \w-, [:word:]- ? */
14981 if (namedclass > OOB_NAMEDCLASS) {
14982 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14983 const int w = RExC_parse >= rangebegin
14984 ? RExC_parse - rangebegin
14987 vFAIL4("False [] range \"%*.*s\"",
14992 "False [] range \"%*.*s\"",
14997 cp_list = add_cp_to_invlist(cp_list, '-');
15001 range = 1; /* yeah, it's a range! */
15002 continue; /* but do it the next time */
15007 if (namedclass > OOB_NAMEDCLASS) {
15011 /* Here, we have a single value this time through the loop, and
15012 * <prevvalue> is the beginning of the range, if any; or <value> if
15015 /* non-Latin1 code point implies unicode semantics. Must be set in
15016 * pass1 so is there for the whole of pass 2 */
15018 RExC_uni_semantics = 1;
15021 /* Ready to process either the single value, or the completed range.
15022 * For single-valued non-inverted ranges, we consider the possibility
15023 * of multi-char folds. (We made a conscious decision to not do this
15024 * for the other cases because it can often lead to non-intuitive
15025 * results. For example, you have the peculiar case that:
15026 * "s s" =~ /^[^\xDF]+$/i => Y
15027 * "ss" =~ /^[^\xDF]+$/i => N
15029 * See [perl #89750] */
15030 if (FOLD && allow_multi_folds && value == prevvalue) {
15031 if (value == LATIN_SMALL_LETTER_SHARP_S
15032 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
15035 /* Here <value> is indeed a multi-char fold. Get what it is */
15037 U8 foldbuf[UTF8_MAXBYTES_CASE];
15040 UV folded = _to_uni_fold_flags(
15044 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
15045 ? FOLD_FLAGS_NOMIX_ASCII
15049 /* Here, <folded> should be the first character of the
15050 * multi-char fold of <value>, with <foldbuf> containing the
15051 * whole thing. But, if this fold is not allowed (because of
15052 * the flags), <fold> will be the same as <value>, and should
15053 * be processed like any other character, so skip the special
15055 if (folded != value) {
15057 /* Skip if we are recursed, currently parsing the class
15058 * again. Otherwise add this character to the list of
15059 * multi-char folds. */
15060 if (! RExC_in_multi_char_class) {
15061 STRLEN cp_count = utf8_length(foldbuf,
15062 foldbuf + foldlen);
15063 SV* multi_fold = sv_2mortal(newSVpvs(""));
15065 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15068 = add_multi_match(multi_char_matches,
15074 /* This element should not be processed further in this
15077 value = save_value;
15078 prevvalue = save_prevvalue;
15084 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15087 /* If the range starts above 255, everything is portable and
15088 * likely to be so for any forseeable character set, so don't
15090 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15091 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15093 else if (prevvalue != value) {
15095 /* Under strict, ranges that stop and/or end in an ASCII
15096 * printable should have each end point be a portable value
15097 * for it (preferably like 'A', but we don't warn if it is
15098 * a (portable) Unicode name or code point), and the range
15099 * must be be all digits or all letters of the same case.
15100 * Otherwise, the range is non-portable and unclear as to
15101 * what it contains */
15102 if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15103 && (non_portable_endpoint
15104 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15105 || (isLOWER_A(prevvalue) && isLOWER_A(value))
15106 || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15108 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15110 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15112 /* But the nature of Unicode and languages mean we
15113 * can't do the same checks for above-ASCII ranges,
15114 * except in the case of digit ones. These should
15115 * contain only digits from the same group of 10. The
15116 * ASCII case is handled just above. 0x660 is the
15117 * first digit character beyond ASCII. Hence here, the
15118 * range could be a range of digits. Find out. */
15119 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15121 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15124 /* If the range start and final points are in the same
15125 * inversion list element, it means that either both
15126 * are not digits, or both are digits in a consecutive
15127 * sequence of digits. (So far, Unicode has kept all
15128 * such sequences as distinct groups of 10, but assert
15129 * to make sure). If the end points are not in the
15130 * same element, neither should be a digit. */
15131 if (index_start == index_final) {
15132 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15133 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15134 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15136 /* But actually Unicode did have one group of 11
15137 * 'digits' in 5.2, so in case we are operating
15138 * on that version, let that pass */
15139 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15140 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15142 && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15146 else if ((index_start >= 0
15147 && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15148 || (index_final >= 0
15149 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15151 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15156 if ((! range || prevvalue == value) && non_portable_endpoint) {
15157 if (isPRINT_A(value)) {
15160 if (isBACKSLASHED_PUNCT(value)) {
15161 literal[d++] = '\\';
15163 literal[d++] = (char) value;
15164 literal[d++] = '\0';
15167 "\"%.*s\" is more clearly written simply as \"%s\"",
15168 (int) (RExC_parse - rangebegin),
15173 else if isMNEMONIC_CNTRL(value) {
15175 "\"%.*s\" is more clearly written simply as \"%s\"",
15176 (int) (RExC_parse - rangebegin),
15178 cntrl_to_mnemonic((char) value)
15184 /* Deal with this element of the class */
15188 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15191 /* On non-ASCII platforms, for ranges that span all of 0..255, and
15192 * ones that don't require special handling, we can just add the
15193 * range like we do for ASCII platforms */
15194 if ((UNLIKELY(prevvalue == 0) && value >= 255)
15195 || ! (prevvalue < 256
15197 || (! non_portable_endpoint
15198 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15199 || (isUPPER_A(prevvalue)
15200 && isUPPER_A(value)))))))
15202 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15206 /* Here, requires special handling. This can be because it is
15207 * a range whose code points are considered to be Unicode, and
15208 * so must be individually translated into native, or because
15209 * its a subrange of 'A-Z' or 'a-z' which each aren't
15210 * contiguous in EBCDIC, but we have defined them to include
15211 * only the "expected" upper or lower case ASCII alphabetics.
15212 * Subranges above 255 are the same in native and Unicode, so
15213 * can be added as a range */
15214 U8 start = NATIVE_TO_LATIN1(prevvalue);
15216 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15217 for (j = start; j <= end; j++) {
15218 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15221 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15228 range = 0; /* this range (if it was one) is done now */
15229 } /* End of loop through all the text within the brackets */
15231 /* If anything in the class expands to more than one character, we have to
15232 * deal with them by building up a substitute parse string, and recursively
15233 * calling reg() on it, instead of proceeding */
15234 if (multi_char_matches) {
15235 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15238 char *save_end = RExC_end;
15239 char *save_parse = RExC_parse;
15240 bool first_time = TRUE; /* First multi-char occurrence doesn't get
15245 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
15246 because too confusing */
15248 sv_catpv(substitute_parse, "(?:");
15252 /* Look at the longest folds first */
15253 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15255 if (av_exists(multi_char_matches, cp_count)) {
15256 AV** this_array_ptr;
15259 this_array_ptr = (AV**) av_fetch(multi_char_matches,
15261 while ((this_sequence = av_pop(*this_array_ptr)) !=
15264 if (! first_time) {
15265 sv_catpv(substitute_parse, "|");
15267 first_time = FALSE;
15269 sv_catpv(substitute_parse, SvPVX(this_sequence));
15274 /* If the character class contains anything else besides these
15275 * multi-character folds, have to include it in recursive parsing */
15276 if (element_count) {
15277 sv_catpv(substitute_parse, "|[");
15278 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15279 sv_catpv(substitute_parse, "]");
15282 sv_catpv(substitute_parse, ")");
15285 /* This is a way to get the parse to skip forward a whole named
15286 * sequence instead of matching the 2nd character when it fails the
15288 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15292 RExC_parse = SvPV(substitute_parse, len);
15293 RExC_end = RExC_parse + len;
15294 RExC_in_multi_char_class = 1;
15295 RExC_override_recoding = 1;
15296 RExC_emit = (regnode *)orig_emit;
15298 ret = reg(pRExC_state, 1, ®_flags, depth+1);
15300 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
15302 RExC_parse = save_parse;
15303 RExC_end = save_end;
15304 RExC_in_multi_char_class = 0;
15305 RExC_override_recoding = 0;
15306 SvREFCNT_dec_NN(multi_char_matches);
15310 /* Here, we've gone through the entire class and dealt with multi-char
15311 * folds. We are now in a position that we can do some checks to see if we
15312 * can optimize this ANYOF node into a simpler one, even in Pass 1.
15313 * Currently we only do two checks:
15314 * 1) is in the unlikely event that the user has specified both, eg. \w and
15315 * \W under /l, then the class matches everything. (This optimization
15316 * is done only to make the optimizer code run later work.)
15317 * 2) if the character class contains only a single element (including a
15318 * single range), we see if there is an equivalent node for it.
15319 * Other checks are possible */
15320 if (! ret_invlist /* Can't optimize if returning the constructed
15322 && (UNLIKELY(posixl_matches_all) || element_count == 1))
15327 if (UNLIKELY(posixl_matches_all)) {
15330 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15331 \w or [:digit:] or \p{foo}
15334 /* All named classes are mapped into POSIXish nodes, with its FLAG
15335 * argument giving which class it is */
15336 switch ((I32)namedclass) {
15337 case ANYOF_UNIPROP:
15340 /* These don't depend on the charset modifiers. They always
15341 * match under /u rules */
15342 case ANYOF_NHORIZWS:
15343 case ANYOF_HORIZWS:
15344 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15347 case ANYOF_NVERTWS:
15352 /* The actual POSIXish node for all the rest depends on the
15353 * charset modifier. The ones in the first set depend only on
15354 * ASCII or, if available on this platform, also locale */
15358 op = (LOC) ? POSIXL : POSIXA;
15364 /* The following don't have any matches in the upper Latin1
15365 * range, hence /d is equivalent to /u for them. Making it /u
15366 * saves some branches at runtime */
15370 case ANYOF_NXDIGIT:
15371 if (! DEPENDS_SEMANTICS) {
15372 goto treat_as_default;
15378 /* The following change to CASED under /i */
15384 namedclass = ANYOF_CASED + (namedclass % 2);
15388 /* The rest have more possibilities depending on the charset.
15389 * We take advantage of the enum ordering of the charset
15390 * modifiers to get the exact node type, */
15393 op = POSIXD + get_regex_charset(RExC_flags);
15394 if (op > POSIXA) { /* /aa is same as /a */
15399 /* The odd numbered ones are the complements of the
15400 * next-lower even number one */
15401 if (namedclass % 2 == 1) {
15405 arg = namedclass_to_classnum(namedclass);
15409 else if (value == prevvalue) {
15411 /* Here, the class consists of just a single code point */
15414 if (! LOC && value == '\n') {
15415 op = REG_ANY; /* Optimize [^\n] */
15416 *flagp |= HASWIDTH|SIMPLE;
15420 else if (value < 256 || UTF) {
15422 /* Optimize a single value into an EXACTish node, but not if it
15423 * would require converting the pattern to UTF-8. */
15424 op = compute_EXACTish(pRExC_state);
15426 } /* Otherwise is a range */
15427 else if (! LOC) { /* locale could vary these */
15428 if (prevvalue == '0') {
15429 if (value == '9') {
15434 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15435 /* We can optimize A-Z or a-z, but not if they could match
15436 * something like the KELVIN SIGN under /i. */
15437 if (prevvalue == 'A') {
15440 && ! non_portable_endpoint
15443 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15447 else if (prevvalue == 'a') {
15450 && ! non_portable_endpoint
15453 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15460 /* Here, we have changed <op> away from its initial value iff we found
15461 * an optimization */
15464 /* Throw away this ANYOF regnode, and emit the calculated one,
15465 * which should correspond to the beginning, not current, state of
15467 const char * cur_parse = RExC_parse;
15468 RExC_parse = (char *)orig_parse;
15472 /* To get locale nodes to not use the full ANYOF size would
15473 * require moving the code above that writes the portions
15474 * of it that aren't in other nodes to after this point.
15475 * e.g. ANYOF_POSIXL_SET */
15476 RExC_size = orig_size;
15480 RExC_emit = (regnode *)orig_emit;
15481 if (PL_regkind[op] == POSIXD) {
15482 if (op == POSIXL) {
15483 RExC_contains_locale = 1;
15486 op += NPOSIXD - POSIXD;
15491 ret = reg_node(pRExC_state, op);
15493 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15497 *flagp |= HASWIDTH|SIMPLE;
15499 else if (PL_regkind[op] == EXACT) {
15500 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15501 TRUE /* downgradable to EXACT */
15505 RExC_parse = (char *) cur_parse;
15507 SvREFCNT_dec(posixes);
15508 SvREFCNT_dec(nposixes);
15509 SvREFCNT_dec(simple_posixes);
15510 SvREFCNT_dec(cp_list);
15511 SvREFCNT_dec(cp_foldable_list);
15518 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15520 /* If folding, we calculate all characters that could fold to or from the
15521 * ones already on the list */
15522 if (cp_foldable_list) {
15524 UV start, end; /* End points of code point ranges */
15526 SV* fold_intersection = NULL;
15529 /* Our calculated list will be for Unicode rules. For locale
15530 * matching, we have to keep a separate list that is consulted at
15531 * runtime only when the locale indicates Unicode rules. For
15532 * non-locale, we just use to the general list */
15534 use_list = &only_utf8_locale_list;
15537 use_list = &cp_list;
15540 /* Only the characters in this class that participate in folds need
15541 * be checked. Get the intersection of this class and all the
15542 * possible characters that are foldable. This can quickly narrow
15543 * down a large class */
15544 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15545 &fold_intersection);
15547 /* The folds for all the Latin1 characters are hard-coded into this
15548 * program, but we have to go out to disk to get the others. */
15549 if (invlist_highest(cp_foldable_list) >= 256) {
15551 /* This is a hash that for a particular fold gives all
15552 * characters that are involved in it */
15553 if (! PL_utf8_foldclosures) {
15554 _load_PL_utf8_foldclosures();
15558 /* Now look at the foldable characters in this class individually */
15559 invlist_iterinit(fold_intersection);
15560 while (invlist_iternext(fold_intersection, &start, &end)) {
15563 /* Look at every character in the range */
15564 for (j = start; j <= end; j++) {
15565 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15571 if (IS_IN_SOME_FOLD_L1(j)) {
15573 /* ASCII is always matched; non-ASCII is matched
15574 * only under Unicode rules (which could happen
15575 * under /l if the locale is a UTF-8 one */
15576 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15577 *use_list = add_cp_to_invlist(*use_list,
15578 PL_fold_latin1[j]);
15582 add_cp_to_invlist(depends_list,
15583 PL_fold_latin1[j]);
15587 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15588 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15590 add_above_Latin1_folds(pRExC_state,
15597 /* Here is an above Latin1 character. We don't have the
15598 * rules hard-coded for it. First, get its fold. This is
15599 * the simple fold, as the multi-character folds have been
15600 * handled earlier and separated out */
15601 _to_uni_fold_flags(j, foldbuf, &foldlen,
15602 (ASCII_FOLD_RESTRICTED)
15603 ? FOLD_FLAGS_NOMIX_ASCII
15606 /* Single character fold of above Latin1. Add everything in
15607 * its fold closure to the list that this node should match.
15608 * The fold closures data structure is a hash with the keys
15609 * being the UTF-8 of every character that is folded to, like
15610 * 'k', and the values each an array of all code points that
15611 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
15612 * Multi-character folds are not included */
15613 if ((listp = hv_fetch(PL_utf8_foldclosures,
15614 (char *) foldbuf, foldlen, FALSE)))
15616 AV* list = (AV*) *listp;
15618 for (k = 0; k <= av_tindex(list); k++) {
15619 SV** c_p = av_fetch(list, k, FALSE);
15625 /* /aa doesn't allow folds between ASCII and non- */
15626 if ((ASCII_FOLD_RESTRICTED
15627 && (isASCII(c) != isASCII(j))))
15632 /* Folds under /l which cross the 255/256 boundary
15633 * are added to a separate list. (These are valid
15634 * only when the locale is UTF-8.) */
15635 if (c < 256 && LOC) {
15636 *use_list = add_cp_to_invlist(*use_list, c);
15640 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15642 cp_list = add_cp_to_invlist(cp_list, c);
15645 /* Similarly folds involving non-ascii Latin1
15646 * characters under /d are added to their list */
15647 depends_list = add_cp_to_invlist(depends_list,
15654 SvREFCNT_dec_NN(fold_intersection);
15657 /* Now that we have finished adding all the folds, there is no reason
15658 * to keep the foldable list separate */
15659 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15660 SvREFCNT_dec_NN(cp_foldable_list);
15663 /* And combine the result (if any) with any inversion list from posix
15664 * classes. The lists are kept separate up to now because we don't want to
15665 * fold the classes (folding of those is automatically handled by the swash
15666 * fetching code) */
15667 if (simple_posixes) {
15668 _invlist_union(cp_list, simple_posixes, &cp_list);
15669 SvREFCNT_dec_NN(simple_posixes);
15671 if (posixes || nposixes) {
15672 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15673 /* Under /a and /aa, nothing above ASCII matches these */
15674 _invlist_intersection(posixes,
15675 PL_XPosix_ptrs[_CC_ASCII],
15679 if (DEPENDS_SEMANTICS) {
15680 /* Under /d, everything in the upper half of the Latin1 range
15681 * matches these complements */
15682 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15684 else if (AT_LEAST_ASCII_RESTRICTED) {
15685 /* Under /a and /aa, everything above ASCII matches these
15687 _invlist_union_complement_2nd(nposixes,
15688 PL_XPosix_ptrs[_CC_ASCII],
15692 _invlist_union(posixes, nposixes, &posixes);
15693 SvREFCNT_dec_NN(nposixes);
15696 posixes = nposixes;
15699 if (! DEPENDS_SEMANTICS) {
15701 _invlist_union(cp_list, posixes, &cp_list);
15702 SvREFCNT_dec_NN(posixes);
15709 /* Under /d, we put into a separate list the Latin1 things that
15710 * match only when the target string is utf8 */
15711 SV* nonascii_but_latin1_properties = NULL;
15712 _invlist_intersection(posixes, PL_UpperLatin1,
15713 &nonascii_but_latin1_properties);
15714 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15717 _invlist_union(cp_list, posixes, &cp_list);
15718 SvREFCNT_dec_NN(posixes);
15724 if (depends_list) {
15725 _invlist_union(depends_list, nonascii_but_latin1_properties,
15727 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15730 depends_list = nonascii_but_latin1_properties;
15735 /* And combine the result (if any) with any inversion list from properties.
15736 * The lists are kept separate up to now so that we can distinguish the two
15737 * in regards to matching above-Unicode. A run-time warning is generated
15738 * if a Unicode property is matched against a non-Unicode code point. But,
15739 * we allow user-defined properties to match anything, without any warning,
15740 * and we also suppress the warning if there is a portion of the character
15741 * class that isn't a Unicode property, and which matches above Unicode, \W
15742 * or [\x{110000}] for example.
15743 * (Note that in this case, unlike the Posix one above, there is no
15744 * <depends_list>, because having a Unicode property forces Unicode
15749 /* If it matters to the final outcome, see if a non-property
15750 * component of the class matches above Unicode. If so, the
15751 * warning gets suppressed. This is true even if just a single
15752 * such code point is specified, as though not strictly correct if
15753 * another such code point is matched against, the fact that they
15754 * are using above-Unicode code points indicates they should know
15755 * the issues involved */
15757 warn_super = ! (invert
15758 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15761 _invlist_union(properties, cp_list, &cp_list);
15762 SvREFCNT_dec_NN(properties);
15765 cp_list = properties;
15769 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15773 /* Here, we have calculated what code points should be in the character
15776 * Now we can see about various optimizations. Fold calculation (which we
15777 * did above) needs to take place before inversion. Otherwise /[^k]/i
15778 * would invert to include K, which under /i would match k, which it
15779 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15780 * folded until runtime */
15782 /* If we didn't do folding, it's because some information isn't available
15783 * until runtime; set the run-time fold flag for these. (We don't have to
15784 * worry about properties folding, as that is taken care of by the swash
15785 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15786 * locales, or the class matches at least one 0-255 range code point */
15788 if (only_utf8_locale_list) {
15789 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15791 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15794 invlist_iterinit(cp_list);
15795 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15796 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15798 invlist_iterfinish(cp_list);
15802 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15803 * at compile time. Besides not inverting folded locale now, we can't
15804 * invert if there are things such as \w, which aren't known until runtime
15808 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15810 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15812 _invlist_invert(cp_list);
15814 /* Any swash can't be used as-is, because we've inverted things */
15816 SvREFCNT_dec_NN(swash);
15820 /* Clear the invert flag since have just done it here */
15827 *ret_invlist = cp_list;
15828 SvREFCNT_dec(swash);
15830 /* Discard the generated node */
15832 RExC_size = orig_size;
15835 RExC_emit = orig_emit;
15840 /* Some character classes are equivalent to other nodes. Such nodes take
15841 * up less room and generally fewer operations to execute than ANYOF nodes.
15842 * Above, we checked for and optimized into some such equivalents for
15843 * certain common classes that are easy to test. Getting to this point in
15844 * the code means that the class didn't get optimized there. Since this
15845 * code is only executed in Pass 2, it is too late to save space--it has
15846 * been allocated in Pass 1, and currently isn't given back. But turning
15847 * things into an EXACTish node can allow the optimizer to join it to any
15848 * adjacent such nodes. And if the class is equivalent to things like /./,
15849 * expensive run-time swashes can be avoided. Now that we have more
15850 * complete information, we can find things necessarily missed by the
15851 * earlier code. I (khw) am not sure how much to look for here. It would
15852 * be easy, but perhaps too slow, to check any candidates against all the
15853 * node types they could possibly match using _invlistEQ(). */
15858 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15859 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15861 /* We don't optimize if we are supposed to make sure all non-Unicode
15862 * code points raise a warning, as only ANYOF nodes have this check.
15864 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15867 U8 op = END; /* The optimzation node-type */
15868 const char * cur_parse= RExC_parse;
15870 invlist_iterinit(cp_list);
15871 if (! invlist_iternext(cp_list, &start, &end)) {
15873 /* Here, the list is empty. This happens, for example, when a
15874 * Unicode property is the only thing in the character class, and
15875 * it doesn't match anything. (perluniprops.pod notes such
15878 *flagp |= HASWIDTH|SIMPLE;
15880 else if (start == end) { /* The range is a single code point */
15881 if (! invlist_iternext(cp_list, &start, &end)
15883 /* Don't do this optimization if it would require changing
15884 * the pattern to UTF-8 */
15885 && (start < 256 || UTF))
15887 /* Here, the list contains a single code point. Can optimize
15888 * into an EXACTish node */
15899 /* A locale node under folding with one code point can be
15900 * an EXACTFL, as its fold won't be calculated until
15906 /* Here, we are generally folding, but there is only one
15907 * code point to match. If we have to, we use an EXACT
15908 * node, but it would be better for joining with adjacent
15909 * nodes in the optimization pass if we used the same
15910 * EXACTFish node that any such are likely to be. We can
15911 * do this iff the code point doesn't participate in any
15912 * folds. For example, an EXACTF of a colon is the same as
15913 * an EXACT one, since nothing folds to or from a colon. */
15915 if (IS_IN_SOME_FOLD_L1(value)) {
15920 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15925 /* If we haven't found the node type, above, it means we
15926 * can use the prevailing one */
15928 op = compute_EXACTish(pRExC_state);
15933 else if (start == 0) {
15934 if (end == UV_MAX) {
15936 *flagp |= HASWIDTH|SIMPLE;
15939 else if (end == '\n' - 1
15940 && invlist_iternext(cp_list, &start, &end)
15941 && start == '\n' + 1 && end == UV_MAX)
15944 *flagp |= HASWIDTH|SIMPLE;
15948 invlist_iterfinish(cp_list);
15951 RExC_parse = (char *)orig_parse;
15952 RExC_emit = (regnode *)orig_emit;
15954 ret = reg_node(pRExC_state, op);
15956 RExC_parse = (char *)cur_parse;
15958 if (PL_regkind[op] == EXACT) {
15959 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15960 TRUE /* downgradable to EXACT */
15964 SvREFCNT_dec_NN(cp_list);
15969 /* Here, <cp_list> contains all the code points we can determine at
15970 * compile time that match under all conditions. Go through it, and
15971 * for things that belong in the bitmap, put them there, and delete from
15972 * <cp_list>. While we are at it, see if everything above 255 is in the
15973 * list, and if so, set a flag to speed up execution */
15975 populate_ANYOF_from_invlist(ret, &cp_list);
15978 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15981 /* Here, the bitmap has been populated with all the Latin1 code points that
15982 * always match. Can now add to the overall list those that match only
15983 * when the target string is UTF-8 (<depends_list>). */
15984 if (depends_list) {
15986 _invlist_union(cp_list, depends_list, &cp_list);
15987 SvREFCNT_dec_NN(depends_list);
15990 cp_list = depends_list;
15992 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15995 /* If there is a swash and more than one element, we can't use the swash in
15996 * the optimization below. */
15997 if (swash && element_count > 1) {
15998 SvREFCNT_dec_NN(swash);
16002 /* Note that the optimization of using 'swash' if it is the only thing in
16003 * the class doesn't have us change swash at all, so it can include things
16004 * that are also in the bitmap; otherwise we have purposely deleted that
16005 * duplicate information */
16006 set_ANYOF_arg(pRExC_state, ret, cp_list,
16007 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16009 only_utf8_locale_list,
16010 swash, has_user_defined_property);
16012 *flagp |= HASWIDTH|SIMPLE;
16014 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
16015 RExC_contains_locale = 1;
16021 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
16024 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
16025 regnode* const node,
16027 SV* const runtime_defns,
16028 SV* const only_utf8_locale_list,
16030 const bool has_user_defined_property)
16032 /* Sets the arg field of an ANYOF-type node 'node', using information about
16033 * the node passed-in. If there is nothing outside the node's bitmap, the
16034 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
16035 * the count returned by add_data(), having allocated and stored an array,
16036 * av, that that count references, as follows:
16037 * av[0] stores the character class description in its textual form.
16038 * This is used later (regexec.c:Perl_regclass_swash()) to
16039 * initialize the appropriate swash, and is also useful for dumping
16040 * the regnode. This is set to &PL_sv_undef if the textual
16041 * description is not needed at run-time (as happens if the other
16042 * elements completely define the class)
16043 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
16044 * computed from av[0]. But if no further computation need be done,
16045 * the swash is stored here now (and av[0] is &PL_sv_undef).
16046 * av[2] stores the inversion list of code points that match only if the
16047 * current locale is UTF-8
16048 * av[3] stores the cp_list inversion list for use in addition or instead
16049 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16050 * (Otherwise everything needed is already in av[0] and av[1])
16051 * av[4] is set if any component of the class is from a user-defined
16052 * property; used only if av[3] exists */
16056 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16058 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16059 assert(! (ANYOF_FLAGS(node)
16060 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16061 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
16062 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16065 AV * const av = newAV();
16068 assert(ANYOF_FLAGS(node)
16069 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16070 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16072 av_store(av, 0, (runtime_defns)
16073 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16076 av_store(av, 1, swash);
16077 SvREFCNT_dec_NN(cp_list);
16080 av_store(av, 1, &PL_sv_undef);
16082 av_store(av, 3, cp_list);
16083 av_store(av, 4, newSVuv(has_user_defined_property));
16087 if (only_utf8_locale_list) {
16088 av_store(av, 2, only_utf8_locale_list);
16091 av_store(av, 2, &PL_sv_undef);
16094 rv = newRV_noinc(MUTABLE_SV(av));
16095 n = add_data(pRExC_state, STR_WITH_LEN("s"));
16096 RExC_rxi->data->data[n] = (void*)rv;
16101 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16103 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16104 const regnode* node,
16107 SV** only_utf8_locale_ptr,
16111 /* For internal core use only.
16112 * Returns the swash for the input 'node' in the regex 'prog'.
16113 * If <doinit> is 'true', will attempt to create the swash if not already
16115 * If <listsvp> is non-null, will return the printable contents of the
16116 * swash. This can be used to get debugging information even before the
16117 * swash exists, by calling this function with 'doinit' set to false, in
16118 * which case the components that will be used to eventually create the
16119 * swash are returned (in a printable form).
16120 * If <exclude_list> is not NULL, it is an inversion list of things to
16121 * exclude from what's returned in <listsvp>.
16122 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
16123 * that, in spite of this function's name, the swash it returns may include
16124 * the bitmap data as well */
16127 SV *si = NULL; /* Input swash initialization string */
16128 SV* invlist = NULL;
16130 RXi_GET_DECL(prog,progi);
16131 const struct reg_data * const data = prog ? progi->data : NULL;
16133 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16135 assert(ANYOF_FLAGS(node)
16136 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16137 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16139 if (data && data->count) {
16140 const U32 n = ARG(node);
16142 if (data->what[n] == 's') {
16143 SV * const rv = MUTABLE_SV(data->data[n]);
16144 AV * const av = MUTABLE_AV(SvRV(rv));
16145 SV **const ary = AvARRAY(av);
16146 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16148 si = *ary; /* ary[0] = the string to initialize the swash with */
16150 /* Elements 3 and 4 are either both present or both absent. [3] is
16151 * any inversion list generated at compile time; [4] indicates if
16152 * that inversion list has any user-defined properties in it. */
16153 if (av_tindex(av) >= 2) {
16154 if (only_utf8_locale_ptr
16156 && ary[2] != &PL_sv_undef)
16158 *only_utf8_locale_ptr = ary[2];
16161 assert(only_utf8_locale_ptr);
16162 *only_utf8_locale_ptr = NULL;
16165 if (av_tindex(av) >= 3) {
16167 if (SvUV(ary[4])) {
16168 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16176 /* Element [1] is reserved for the set-up swash. If already there,
16177 * return it; if not, create it and store it there */
16178 if (ary[1] && SvROK(ary[1])) {
16181 else if (doinit && ((si && si != &PL_sv_undef)
16182 || (invlist && invlist != &PL_sv_undef))) {
16184 sw = _core_swash_init("utf8", /* the utf8 package */
16188 0, /* not from tr/// */
16190 &swash_init_flags);
16191 (void)av_store(av, 1, sw);
16196 /* If requested, return a printable version of what this swash matches */
16198 SV* matches_string = newSVpvs("");
16200 /* The swash should be used, if possible, to get the data, as it
16201 * contains the resolved data. But this function can be called at
16202 * compile-time, before everything gets resolved, in which case we
16203 * return the currently best available information, which is the string
16204 * that will eventually be used to do that resolving, 'si' */
16205 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16206 && (si && si != &PL_sv_undef))
16208 sv_catsv(matches_string, si);
16211 /* Add the inversion list to whatever we have. This may have come from
16212 * the swash, or from an input parameter */
16214 if (exclude_list) {
16215 SV* clone = invlist_clone(invlist);
16216 _invlist_subtract(clone, exclude_list, &clone);
16217 sv_catsv(matches_string, _invlist_contents(clone));
16218 SvREFCNT_dec_NN(clone);
16221 sv_catsv(matches_string, _invlist_contents(invlist));
16224 *listsvp = matches_string;
16229 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16231 /* reg_skipcomment()
16233 Absorbs an /x style # comment from the input stream,
16234 returning a pointer to the first character beyond the comment, or if the
16235 comment terminates the pattern without anything following it, this returns
16236 one past the final character of the pattern (in other words, RExC_end) and
16237 sets the REG_RUN_ON_COMMENT_SEEN flag.
16239 Note it's the callers responsibility to ensure that we are
16240 actually in /x mode
16244 PERL_STATIC_INLINE char*
16245 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16247 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16251 while (p < RExC_end) {
16252 if (*(++p) == '\n') {
16257 /* we ran off the end of the pattern without ending the comment, so we have
16258 * to add an \n when wrapping */
16259 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16265 Advances the parse position, and optionally absorbs
16266 "whitespace" from the inputstream.
16268 Without /x "whitespace" means (?#...) style comments only,
16269 with /x this means (?#...) and # comments and whitespace proper.
16271 Returns the RExC_parse point from BEFORE the scan occurs.
16273 This is the /x friendly way of saying RExC_parse++.
16277 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16279 char* const retval = RExC_parse++;
16281 PERL_ARGS_ASSERT_NEXTCHAR;
16284 if (RExC_end - RExC_parse >= 3
16285 && *RExC_parse == '('
16286 && RExC_parse[1] == '?'
16287 && RExC_parse[2] == '#')
16289 while (*RExC_parse != ')') {
16290 if (RExC_parse == RExC_end)
16291 FAIL("Sequence (?#... not terminated");
16297 if (RExC_flags & RXf_PMf_EXTENDED) {
16298 char * p = regpatws(pRExC_state, RExC_parse,
16299 TRUE); /* means recognize comments */
16300 if (p != RExC_parse) {
16310 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16312 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16313 * space. In pass1, it aligns and increments RExC_size; in pass2,
16316 regnode * const ret = RExC_emit;
16317 GET_RE_DEBUG_FLAGS_DECL;
16319 PERL_ARGS_ASSERT_REGNODE_GUTS;
16321 assert(extra_size >= regarglen[op]);
16324 SIZE_ALIGN(RExC_size);
16325 RExC_size += 1 + extra_size;
16328 if (RExC_emit >= RExC_emit_bound)
16329 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16330 op, (void*)RExC_emit, (void*)RExC_emit_bound);
16332 NODE_ALIGN_FILL(ret);
16333 #ifndef RE_TRACK_PATTERN_OFFSETS
16334 PERL_UNUSED_ARG(name);
16336 if (RExC_offsets) { /* MJD */
16338 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16341 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16342 ? "Overwriting end of array!\n" : "OK",
16343 (UV)(RExC_emit - RExC_emit_start),
16344 (UV)(RExC_parse - RExC_start),
16345 (UV)RExC_offsets[0]));
16346 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16353 - reg_node - emit a node
16355 STATIC regnode * /* Location. */
16356 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16358 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16360 PERL_ARGS_ASSERT_REG_NODE;
16362 assert(regarglen[op] == 0);
16365 regnode *ptr = ret;
16366 FILL_ADVANCE_NODE(ptr, op);
16373 - reganode - emit a node with an argument
16375 STATIC regnode * /* Location. */
16376 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16378 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16380 PERL_ARGS_ASSERT_REGANODE;
16382 assert(regarglen[op] == 1);
16385 regnode *ptr = ret;
16386 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16393 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16395 /* emit a node with U32 and I32 arguments */
16397 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16399 PERL_ARGS_ASSERT_REG2LANODE;
16401 assert(regarglen[op] == 2);
16404 regnode *ptr = ret;
16405 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16412 - reginsert - insert an operator in front of already-emitted operand
16414 * Means relocating the operand.
16417 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16422 const int offset = regarglen[(U8)op];
16423 const int size = NODE_STEP_REGNODE + offset;
16424 GET_RE_DEBUG_FLAGS_DECL;
16426 PERL_ARGS_ASSERT_REGINSERT;
16427 PERL_UNUSED_CONTEXT;
16428 PERL_UNUSED_ARG(depth);
16429 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16430 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16439 if (RExC_open_parens) {
16441 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16442 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16443 if ( RExC_open_parens[paren] >= opnd ) {
16444 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16445 RExC_open_parens[paren] += size;
16447 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16449 if ( RExC_close_parens[paren] >= opnd ) {
16450 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16451 RExC_close_parens[paren] += size;
16453 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16458 while (src > opnd) {
16459 StructCopy(--src, --dst, regnode);
16460 #ifdef RE_TRACK_PATTERN_OFFSETS
16461 if (RExC_offsets) { /* MJD 20010112 */
16463 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16467 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16468 ? "Overwriting end of array!\n" : "OK",
16469 (UV)(src - RExC_emit_start),
16470 (UV)(dst - RExC_emit_start),
16471 (UV)RExC_offsets[0]));
16472 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16473 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16479 place = opnd; /* Op node, where operand used to be. */
16480 #ifdef RE_TRACK_PATTERN_OFFSETS
16481 if (RExC_offsets) { /* MJD */
16483 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16487 (UV)(place - RExC_emit_start) > RExC_offsets[0]
16488 ? "Overwriting end of array!\n" : "OK",
16489 (UV)(place - RExC_emit_start),
16490 (UV)(RExC_parse - RExC_start),
16491 (UV)RExC_offsets[0]));
16492 Set_Node_Offset(place, RExC_parse);
16493 Set_Node_Length(place, 1);
16496 src = NEXTOPER(place);
16497 FILL_ADVANCE_NODE(place, op);
16498 Zero(src, offset, regnode);
16502 - regtail - set the next-pointer at the end of a node chain of p to val.
16503 - SEE ALSO: regtail_study
16505 /* TODO: All three parms should be const */
16507 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16508 const regnode *val,U32 depth)
16511 GET_RE_DEBUG_FLAGS_DECL;
16513 PERL_ARGS_ASSERT_REGTAIL;
16515 PERL_UNUSED_ARG(depth);
16521 /* Find last node. */
16524 regnode * const temp = regnext(scan);
16526 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
16527 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16528 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
16529 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
16530 (temp == NULL ? "->" : ""),
16531 (temp == NULL ? PL_reg_name[OP(val)] : "")
16539 if (reg_off_by_arg[OP(scan)]) {
16540 ARG_SET(scan, val - scan);
16543 NEXT_OFF(scan) = val - scan;
16549 - regtail_study - set the next-pointer at the end of a node chain of p to val.
16550 - Look for optimizable sequences at the same time.
16551 - currently only looks for EXACT chains.
16553 This is experimental code. The idea is to use this routine to perform
16554 in place optimizations on branches and groups as they are constructed,
16555 with the long term intention of removing optimization from study_chunk so
16556 that it is purely analytical.
16558 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
16559 to control which is which.
16562 /* TODO: All four parms should be const */
16565 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16566 const regnode *val,U32 depth)
16570 #ifdef EXPERIMENTAL_INPLACESCAN
16573 GET_RE_DEBUG_FLAGS_DECL;
16575 PERL_ARGS_ASSERT_REGTAIL_STUDY;
16581 /* Find last node. */
16585 regnode * const temp = regnext(scan);
16586 #ifdef EXPERIMENTAL_INPLACESCAN
16587 if (PL_regkind[OP(scan)] == EXACT) {
16588 bool unfolded_multi_char; /* Unexamined in this routine */
16589 if (join_exact(pRExC_state, scan, &min,
16590 &unfolded_multi_char, 1, val, depth+1))
16595 switch (OP(scan)) {
16599 case EXACTFA_NO_TRIE:
16605 if( exact == PSEUDO )
16607 else if ( exact != OP(scan) )
16616 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16617 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16618 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16619 SvPV_nolen_const(RExC_mysv),
16620 REG_NODE_NUM(scan),
16621 PL_reg_name[exact]);
16628 DEBUG_PARSE_MSG("");
16629 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16630 PerlIO_printf(Perl_debug_log,
16631 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16632 SvPV_nolen_const(RExC_mysv),
16633 (IV)REG_NODE_NUM(val),
16637 if (reg_off_by_arg[OP(scan)]) {
16638 ARG_SET(scan, val - scan);
16641 NEXT_OFF(scan) = val - scan;
16649 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16654 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16659 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16661 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16662 if (flags & (1<<bit)) {
16663 if (!set++ && lead)
16664 PerlIO_printf(Perl_debug_log, "%s",lead);
16665 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16670 PerlIO_printf(Perl_debug_log, "\n");
16672 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16677 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16683 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16685 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16686 if (flags & (1<<bit)) {
16687 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16690 if (!set++ && lead)
16691 PerlIO_printf(Perl_debug_log, "%s",lead);
16692 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16695 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16696 if (!set++ && lead) {
16697 PerlIO_printf(Perl_debug_log, "%s",lead);
16700 case REGEX_UNICODE_CHARSET:
16701 PerlIO_printf(Perl_debug_log, "UNICODE");
16703 case REGEX_LOCALE_CHARSET:
16704 PerlIO_printf(Perl_debug_log, "LOCALE");
16706 case REGEX_ASCII_RESTRICTED_CHARSET:
16707 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16709 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16710 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16713 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16719 PerlIO_printf(Perl_debug_log, "\n");
16721 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16727 Perl_regdump(pTHX_ const regexp *r)
16730 SV * const sv = sv_newmortal();
16731 SV *dsv= sv_newmortal();
16732 RXi_GET_DECL(r,ri);
16733 GET_RE_DEBUG_FLAGS_DECL;
16735 PERL_ARGS_ASSERT_REGDUMP;
16737 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16739 /* Header fields of interest. */
16740 if (r->anchored_substr) {
16741 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16742 RE_SV_DUMPLEN(r->anchored_substr), 30);
16743 PerlIO_printf(Perl_debug_log,
16744 "anchored %s%s at %"IVdf" ",
16745 s, RE_SV_TAIL(r->anchored_substr),
16746 (IV)r->anchored_offset);
16747 } else if (r->anchored_utf8) {
16748 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16749 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16750 PerlIO_printf(Perl_debug_log,
16751 "anchored utf8 %s%s at %"IVdf" ",
16752 s, RE_SV_TAIL(r->anchored_utf8),
16753 (IV)r->anchored_offset);
16755 if (r->float_substr) {
16756 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16757 RE_SV_DUMPLEN(r->float_substr), 30);
16758 PerlIO_printf(Perl_debug_log,
16759 "floating %s%s at %"IVdf"..%"UVuf" ",
16760 s, RE_SV_TAIL(r->float_substr),
16761 (IV)r->float_min_offset, (UV)r->float_max_offset);
16762 } else if (r->float_utf8) {
16763 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16764 RE_SV_DUMPLEN(r->float_utf8), 30);
16765 PerlIO_printf(Perl_debug_log,
16766 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16767 s, RE_SV_TAIL(r->float_utf8),
16768 (IV)r->float_min_offset, (UV)r->float_max_offset);
16770 if (r->check_substr || r->check_utf8)
16771 PerlIO_printf(Perl_debug_log,
16773 (r->check_substr == r->float_substr
16774 && r->check_utf8 == r->float_utf8
16775 ? "(checking floating" : "(checking anchored"));
16776 if (r->intflags & PREGf_NOSCAN)
16777 PerlIO_printf(Perl_debug_log, " noscan");
16778 if (r->extflags & RXf_CHECK_ALL)
16779 PerlIO_printf(Perl_debug_log, " isall");
16780 if (r->check_substr || r->check_utf8)
16781 PerlIO_printf(Perl_debug_log, ") ");
16783 if (ri->regstclass) {
16784 regprop(r, sv, ri->regstclass, NULL, NULL);
16785 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16787 if (r->intflags & PREGf_ANCH) {
16788 PerlIO_printf(Perl_debug_log, "anchored");
16789 if (r->intflags & PREGf_ANCH_MBOL)
16790 PerlIO_printf(Perl_debug_log, "(MBOL)");
16791 if (r->intflags & PREGf_ANCH_SBOL)
16792 PerlIO_printf(Perl_debug_log, "(SBOL)");
16793 if (r->intflags & PREGf_ANCH_GPOS)
16794 PerlIO_printf(Perl_debug_log, "(GPOS)");
16795 (void)PerlIO_putc(Perl_debug_log, ' ');
16797 if (r->intflags & PREGf_GPOS_SEEN)
16798 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16799 if (r->intflags & PREGf_SKIP)
16800 PerlIO_printf(Perl_debug_log, "plus ");
16801 if (r->intflags & PREGf_IMPLICIT)
16802 PerlIO_printf(Perl_debug_log, "implicit ");
16803 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16804 if (r->extflags & RXf_EVAL_SEEN)
16805 PerlIO_printf(Perl_debug_log, "with eval ");
16806 PerlIO_printf(Perl_debug_log, "\n");
16808 regdump_extflags("r->extflags: ",r->extflags);
16809 regdump_intflags("r->intflags: ",r->intflags);
16812 PERL_ARGS_ASSERT_REGDUMP;
16813 PERL_UNUSED_CONTEXT;
16814 PERL_UNUSED_ARG(r);
16815 #endif /* DEBUGGING */
16819 - regprop - printable representation of opcode, with run time support
16823 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16828 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16829 static const char * const anyofs[] = {
16830 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16831 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16832 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16833 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16834 || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
16835 #error Need to adjust order of anyofs[]
16870 RXi_GET_DECL(prog,progi);
16871 GET_RE_DEBUG_FLAGS_DECL;
16873 PERL_ARGS_ASSERT_REGPROP;
16875 sv_setpvn(sv, "", 0);
16877 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16878 /* It would be nice to FAIL() here, but this may be called from
16879 regexec.c, and it would be hard to supply pRExC_state. */
16880 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16881 (int)OP(o), (int)REGNODE_MAX);
16882 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16884 k = PL_regkind[OP(o)];
16887 sv_catpvs(sv, " ");
16888 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16889 * is a crude hack but it may be the best for now since
16890 * we have no flag "this EXACTish node was UTF-8"
16892 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16893 PERL_PV_ESCAPE_UNI_DETECT |
16894 PERL_PV_ESCAPE_NONASCII |
16895 PERL_PV_PRETTY_ELLIPSES |
16896 PERL_PV_PRETTY_LTGT |
16897 PERL_PV_PRETTY_NOCLEAR
16899 } else if (k == TRIE) {
16900 /* print the details of the trie in dumpuntil instead, as
16901 * progi->data isn't available here */
16902 const char op = OP(o);
16903 const U32 n = ARG(o);
16904 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16905 (reg_ac_data *)progi->data->data[n] :
16907 const reg_trie_data * const trie
16908 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16910 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16911 DEBUG_TRIE_COMPILE_r(
16912 Perl_sv_catpvf(aTHX_ sv,
16913 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16914 (UV)trie->startstate,
16915 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16916 (UV)trie->wordcount,
16919 (UV)TRIE_CHARCOUNT(trie),
16920 (UV)trie->uniquecharcount
16923 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16924 sv_catpvs(sv, "[");
16925 (void) put_charclass_bitmap_innards(sv,
16926 (IS_ANYOF_TRIE(op))
16928 : TRIE_BITMAP(trie),
16930 sv_catpvs(sv, "]");
16933 } else if (k == CURLY) {
16934 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16935 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16936 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16938 else if (k == WHILEM && o->flags) /* Ordinal/of */
16939 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16940 else if (k == REF || k == OPEN || k == CLOSE
16941 || k == GROUPP || OP(o)==ACCEPT)
16943 AV *name_list= NULL;
16944 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16945 if ( RXp_PAREN_NAMES(prog) ) {
16946 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16947 } else if ( pRExC_state ) {
16948 name_list= RExC_paren_name_list;
16951 if ( k != REF || (OP(o) < NREF)) {
16952 SV **name= av_fetch(name_list, ARG(o), 0 );
16954 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16957 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16958 I32 *nums=(I32*)SvPVX(sv_dat);
16959 SV **name= av_fetch(name_list, nums[0], 0 );
16962 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16963 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16964 (n ? "," : ""), (IV)nums[n]);
16966 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16970 if ( k == REF && reginfo) {
16971 U32 n = ARG(o); /* which paren pair */
16972 I32 ln = prog->offs[n].start;
16973 if (prog->lastparen < n || ln == -1)
16974 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16975 else if (ln == prog->offs[n].end)
16976 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16978 const char *s = reginfo->strbeg + ln;
16979 Perl_sv_catpvf(aTHX_ sv, ": ");
16980 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16981 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16984 } else if (k == GOSUB) {
16985 AV *name_list= NULL;
16986 if ( RXp_PAREN_NAMES(prog) ) {
16987 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16988 } else if ( pRExC_state ) {
16989 name_list= RExC_paren_name_list;
16992 /* Paren and offset */
16993 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16995 SV **name= av_fetch(name_list, ARG(o), 0 );
16997 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17000 else if (k == VERB) {
17002 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
17003 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
17004 } else if (k == LOGICAL)
17005 /* 2: embedded, otherwise 1 */
17006 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
17007 else if (k == ANYOF) {
17008 const U8 flags = ANYOF_FLAGS(o);
17010 SV* bitmap_invlist; /* Will hold what the bit map contains */
17013 if (OP(o) == ANYOFL)
17014 sv_catpvs(sv, "{loc}");
17015 if (flags & ANYOF_LOC_FOLD)
17016 sv_catpvs(sv, "{i}");
17017 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
17018 if (flags & ANYOF_INVERT)
17019 sv_catpvs(sv, "^");
17021 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
17023 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
17026 /* output any special charclass tests (used entirely under use
17028 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
17030 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
17031 if (ANYOF_POSIXL_TEST(o,i)) {
17032 sv_catpv(sv, anyofs[i]);
17038 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
17039 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
17040 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
17044 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17045 if (flags & ANYOF_INVERT)
17046 /*make sure the invert info is in each */
17047 sv_catpvs(sv, "^");
17050 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
17051 sv_catpvs(sv, "{non-utf8-latin1-all}");
17054 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17055 sv_catpvs(sv, "{above_bitmap_all}");
17057 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17058 SV *lv; /* Set if there is something outside the bit map. */
17059 bool byte_output = FALSE; /* If something has been output */
17060 SV *only_utf8_locale;
17062 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
17063 * is used to guarantee that nothing in the bitmap gets
17065 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17066 &lv, &only_utf8_locale,
17068 if (lv && lv != &PL_sv_undef) {
17069 char *s = savesvpv(lv);
17070 char * const origs = s;
17072 while (*s && *s != '\n')
17076 const char * const t = ++s;
17078 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
17079 sv_catpvs(sv, "{outside bitmap}");
17082 sv_catpvs(sv, "{utf8}");
17086 sv_catpvs(sv, " ");
17092 /* Truncate very long output */
17093 if (s - origs > 256) {
17094 Perl_sv_catpvf(aTHX_ sv,
17096 (int) (s - origs - 1),
17102 else if (*s == '\t') {
17116 SvREFCNT_dec_NN(lv);
17119 if ((flags & ANYOF_LOC_FOLD)
17120 && only_utf8_locale
17121 && only_utf8_locale != &PL_sv_undef)
17124 int max_entries = 256;
17126 sv_catpvs(sv, "{utf8 locale}");
17127 invlist_iterinit(only_utf8_locale);
17128 while (invlist_iternext(only_utf8_locale,
17130 put_range(sv, start, end, FALSE);
17132 if (max_entries < 0) {
17133 sv_catpvs(sv, "...");
17137 invlist_iterfinish(only_utf8_locale);
17141 SvREFCNT_dec(bitmap_invlist);
17144 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17146 else if (k == POSIXD || k == NPOSIXD) {
17147 U8 index = FLAGS(o) * 2;
17148 if (index < C_ARRAY_LENGTH(anyofs)) {
17149 if (*anyofs[index] != '[') {
17152 sv_catpv(sv, anyofs[index]);
17153 if (*anyofs[index] != '[') {
17158 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17161 else if (k == BOUND || k == NBOUND) {
17162 /* Must be synced with order of 'bound_type' in regcomp.h */
17163 const char * const bounds[] = {
17164 "", /* Traditional */
17169 sv_catpv(sv, bounds[FLAGS(o)]);
17171 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17172 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17173 else if (OP(o) == SBOL)
17174 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17176 PERL_UNUSED_CONTEXT;
17177 PERL_UNUSED_ARG(sv);
17178 PERL_UNUSED_ARG(o);
17179 PERL_UNUSED_ARG(prog);
17180 PERL_UNUSED_ARG(reginfo);
17181 PERL_UNUSED_ARG(pRExC_state);
17182 #endif /* DEBUGGING */
17188 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17189 { /* Assume that RE_INTUIT is set */
17190 struct regexp *const prog = ReANY(r);
17191 GET_RE_DEBUG_FLAGS_DECL;
17193 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17194 PERL_UNUSED_CONTEXT;
17198 const char * const s = SvPV_nolen_const(RX_UTF8(r)
17199 ? prog->check_utf8 : prog->check_substr);
17201 if (!PL_colorset) reginitcolors();
17202 PerlIO_printf(Perl_debug_log,
17203 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17205 RX_UTF8(r) ? "utf8 " : "",
17206 PL_colors[5],PL_colors[0],
17209 (strlen(s) > 60 ? "..." : ""));
17212 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17213 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17219 handles refcounting and freeing the perl core regexp structure. When
17220 it is necessary to actually free the structure the first thing it
17221 does is call the 'free' method of the regexp_engine associated to
17222 the regexp, allowing the handling of the void *pprivate; member
17223 first. (This routine is not overridable by extensions, which is why
17224 the extensions free is called first.)
17226 See regdupe and regdupe_internal if you change anything here.
17228 #ifndef PERL_IN_XSUB_RE
17230 Perl_pregfree(pTHX_ REGEXP *r)
17236 Perl_pregfree2(pTHX_ REGEXP *rx)
17238 struct regexp *const r = ReANY(rx);
17239 GET_RE_DEBUG_FLAGS_DECL;
17241 PERL_ARGS_ASSERT_PREGFREE2;
17243 if (r->mother_re) {
17244 ReREFCNT_dec(r->mother_re);
17246 CALLREGFREE_PVT(rx); /* free the private data */
17247 SvREFCNT_dec(RXp_PAREN_NAMES(r));
17248 Safefree(r->xpv_len_u.xpvlenu_pv);
17251 SvREFCNT_dec(r->anchored_substr);
17252 SvREFCNT_dec(r->anchored_utf8);
17253 SvREFCNT_dec(r->float_substr);
17254 SvREFCNT_dec(r->float_utf8);
17255 Safefree(r->substrs);
17257 RX_MATCH_COPY_FREE(rx);
17258 #ifdef PERL_ANY_COW
17259 SvREFCNT_dec(r->saved_copy);
17262 SvREFCNT_dec(r->qr_anoncv);
17263 rx->sv_u.svu_rx = 0;
17268 This is a hacky workaround to the structural issue of match results
17269 being stored in the regexp structure which is in turn stored in
17270 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17271 could be PL_curpm in multiple contexts, and could require multiple
17272 result sets being associated with the pattern simultaneously, such
17273 as when doing a recursive match with (??{$qr})
17275 The solution is to make a lightweight copy of the regexp structure
17276 when a qr// is returned from the code executed by (??{$qr}) this
17277 lightweight copy doesn't actually own any of its data except for
17278 the starp/end and the actual regexp structure itself.
17284 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17286 struct regexp *ret;
17287 struct regexp *const r = ReANY(rx);
17288 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17290 PERL_ARGS_ASSERT_REG_TEMP_COPY;
17293 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17295 SvOK_off((SV *)ret_x);
17297 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17298 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
17299 made both spots point to the same regexp body.) */
17300 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17301 assert(!SvPVX(ret_x));
17302 ret_x->sv_u.svu_rx = temp->sv_any;
17303 temp->sv_any = NULL;
17304 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17305 SvREFCNT_dec_NN(temp);
17306 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17307 ing below will not set it. */
17308 SvCUR_set(ret_x, SvCUR(rx));
17311 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17312 sv_force_normal(sv) is called. */
17314 ret = ReANY(ret_x);
17316 SvFLAGS(ret_x) |= SvUTF8(rx);
17317 /* We share the same string buffer as the original regexp, on which we
17318 hold a reference count, incremented when mother_re is set below.
17319 The string pointer is copied here, being part of the regexp struct.
17321 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17322 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17324 const I32 npar = r->nparens+1;
17325 Newx(ret->offs, npar, regexp_paren_pair);
17326 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17329 Newx(ret->substrs, 1, struct reg_substr_data);
17330 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17332 SvREFCNT_inc_void(ret->anchored_substr);
17333 SvREFCNT_inc_void(ret->anchored_utf8);
17334 SvREFCNT_inc_void(ret->float_substr);
17335 SvREFCNT_inc_void(ret->float_utf8);
17337 /* check_substr and check_utf8, if non-NULL, point to either their
17338 anchored or float namesakes, and don't hold a second reference. */
17340 RX_MATCH_COPIED_off(ret_x);
17341 #ifdef PERL_ANY_COW
17342 ret->saved_copy = NULL;
17344 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17345 SvREFCNT_inc_void(ret->qr_anoncv);
17351 /* regfree_internal()
17353 Free the private data in a regexp. This is overloadable by
17354 extensions. Perl takes care of the regexp structure in pregfree(),
17355 this covers the *pprivate pointer which technically perl doesn't
17356 know about, however of course we have to handle the
17357 regexp_internal structure when no extension is in use.
17359 Note this is called before freeing anything in the regexp
17364 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17366 struct regexp *const r = ReANY(rx);
17367 RXi_GET_DECL(r,ri);
17368 GET_RE_DEBUG_FLAGS_DECL;
17370 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17376 SV *dsv= sv_newmortal();
17377 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17378 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17379 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17380 PL_colors[4],PL_colors[5],s);
17383 #ifdef RE_TRACK_PATTERN_OFFSETS
17385 Safefree(ri->u.offsets); /* 20010421 MJD */
17387 if (ri->code_blocks) {
17389 for (n = 0; n < ri->num_code_blocks; n++)
17390 SvREFCNT_dec(ri->code_blocks[n].src_regex);
17391 Safefree(ri->code_blocks);
17395 int n = ri->data->count;
17398 /* If you add a ->what type here, update the comment in regcomp.h */
17399 switch (ri->data->what[n]) {
17405 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17408 Safefree(ri->data->data[n]);
17414 { /* Aho Corasick add-on structure for a trie node.
17415 Used in stclass optimization only */
17417 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17418 #ifdef USE_ITHREADS
17422 refcount = --aho->refcount;
17425 PerlMemShared_free(aho->states);
17426 PerlMemShared_free(aho->fail);
17427 /* do this last!!!! */
17428 PerlMemShared_free(ri->data->data[n]);
17429 /* we should only ever get called once, so
17430 * assert as much, and also guard the free
17431 * which /might/ happen twice. At the least
17432 * it will make code anlyzers happy and it
17433 * doesn't cost much. - Yves */
17434 assert(ri->regstclass);
17435 if (ri->regstclass) {
17436 PerlMemShared_free(ri->regstclass);
17437 ri->regstclass = 0;
17444 /* trie structure. */
17446 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17447 #ifdef USE_ITHREADS
17451 refcount = --trie->refcount;
17454 PerlMemShared_free(trie->charmap);
17455 PerlMemShared_free(trie->states);
17456 PerlMemShared_free(trie->trans);
17458 PerlMemShared_free(trie->bitmap);
17460 PerlMemShared_free(trie->jump);
17461 PerlMemShared_free(trie->wordinfo);
17462 /* do this last!!!! */
17463 PerlMemShared_free(ri->data->data[n]);
17468 Perl_croak(aTHX_ "panic: regfree data code '%c'",
17469 ri->data->what[n]);
17472 Safefree(ri->data->what);
17473 Safefree(ri->data);
17479 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17480 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17481 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17484 re_dup - duplicate a regexp.
17486 This routine is expected to clone a given regexp structure. It is only
17487 compiled under USE_ITHREADS.
17489 After all of the core data stored in struct regexp is duplicated
17490 the regexp_engine.dupe method is used to copy any private data
17491 stored in the *pprivate pointer. This allows extensions to handle
17492 any duplication it needs to do.
17494 See pregfree() and regfree_internal() if you change anything here.
17496 #if defined(USE_ITHREADS)
17497 #ifndef PERL_IN_XSUB_RE
17499 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17503 const struct regexp *r = ReANY(sstr);
17504 struct regexp *ret = ReANY(dstr);
17506 PERL_ARGS_ASSERT_RE_DUP_GUTS;
17508 npar = r->nparens+1;
17509 Newx(ret->offs, npar, regexp_paren_pair);
17510 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17512 if (ret->substrs) {
17513 /* Do it this way to avoid reading from *r after the StructCopy().
17514 That way, if any of the sv_dup_inc()s dislodge *r from the L1
17515 cache, it doesn't matter. */
17516 const bool anchored = r->check_substr
17517 ? r->check_substr == r->anchored_substr
17518 : r->check_utf8 == r->anchored_utf8;
17519 Newx(ret->substrs, 1, struct reg_substr_data);
17520 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17522 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
17523 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
17524 ret->float_substr = sv_dup_inc(ret->float_substr, param);
17525 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
17527 /* check_substr and check_utf8, if non-NULL, point to either their
17528 anchored or float namesakes, and don't hold a second reference. */
17530 if (ret->check_substr) {
17532 assert(r->check_utf8 == r->anchored_utf8);
17533 ret->check_substr = ret->anchored_substr;
17534 ret->check_utf8 = ret->anchored_utf8;
17536 assert(r->check_substr == r->float_substr);
17537 assert(r->check_utf8 == r->float_utf8);
17538 ret->check_substr = ret->float_substr;
17539 ret->check_utf8 = ret->float_utf8;
17541 } else if (ret->check_utf8) {
17543 ret->check_utf8 = ret->anchored_utf8;
17545 ret->check_utf8 = ret->float_utf8;
17550 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
17551 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
17554 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
17556 if (RX_MATCH_COPIED(dstr))
17557 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
17559 ret->subbeg = NULL;
17560 #ifdef PERL_ANY_COW
17561 ret->saved_copy = NULL;
17564 /* Whether mother_re be set or no, we need to copy the string. We
17565 cannot refrain from copying it when the storage points directly to
17566 our mother regexp, because that's
17567 1: a buffer in a different thread
17568 2: something we no longer hold a reference on
17569 so we need to copy it locally. */
17570 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
17571 ret->mother_re = NULL;
17573 #endif /* PERL_IN_XSUB_RE */
17578 This is the internal complement to regdupe() which is used to copy
17579 the structure pointed to by the *pprivate pointer in the regexp.
17580 This is the core version of the extension overridable cloning hook.
17581 The regexp structure being duplicated will be copied by perl prior
17582 to this and will be provided as the regexp *r argument, however
17583 with the /old/ structures pprivate pointer value. Thus this routine
17584 may override any copying normally done by perl.
17586 It returns a pointer to the new regexp_internal structure.
17590 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17593 struct regexp *const r = ReANY(rx);
17594 regexp_internal *reti;
17596 RXi_GET_DECL(r,ri);
17598 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17602 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17603 char, regexp_internal);
17604 Copy(ri->program, reti->program, len+1, regnode);
17606 reti->num_code_blocks = ri->num_code_blocks;
17607 if (ri->code_blocks) {
17609 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17610 struct reg_code_block);
17611 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17612 struct reg_code_block);
17613 for (n = 0; n < ri->num_code_blocks; n++)
17614 reti->code_blocks[n].src_regex = (REGEXP*)
17615 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17618 reti->code_blocks = NULL;
17620 reti->regstclass = NULL;
17623 struct reg_data *d;
17624 const int count = ri->data->count;
17627 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17628 char, struct reg_data);
17629 Newx(d->what, count, U8);
17632 for (i = 0; i < count; i++) {
17633 d->what[i] = ri->data->what[i];
17634 switch (d->what[i]) {
17635 /* see also regcomp.h and regfree_internal() */
17636 case 'a': /* actually an AV, but the dup function is identical. */
17640 case 'u': /* actually an HV, but the dup function is identical. */
17641 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17644 /* This is cheating. */
17645 Newx(d->data[i], 1, regnode_ssc);
17646 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17647 reti->regstclass = (regnode*)d->data[i];
17650 /* Trie stclasses are readonly and can thus be shared
17651 * without duplication. We free the stclass in pregfree
17652 * when the corresponding reg_ac_data struct is freed.
17654 reti->regstclass= ri->regstclass;
17658 ((reg_trie_data*)ri->data->data[i])->refcount++;
17663 d->data[i] = ri->data->data[i];
17666 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17667 ri->data->what[i]);
17676 reti->name_list_idx = ri->name_list_idx;
17678 #ifdef RE_TRACK_PATTERN_OFFSETS
17679 if (ri->u.offsets) {
17680 Newx(reti->u.offsets, 2*len+1, U32);
17681 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17684 SetProgLen(reti,len);
17687 return (void*)reti;
17690 #endif /* USE_ITHREADS */
17692 #ifndef PERL_IN_XSUB_RE
17695 - regnext - dig the "next" pointer out of a node
17698 Perl_regnext(pTHX_ regnode *p)
17705 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17706 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17707 (int)OP(p), (int)REGNODE_MAX);
17710 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17719 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17722 STRLEN l1 = strlen(pat1);
17723 STRLEN l2 = strlen(pat2);
17726 const char *message;
17728 PERL_ARGS_ASSERT_RE_CROAK2;
17734 Copy(pat1, buf, l1 , char);
17735 Copy(pat2, buf + l1, l2 , char);
17736 buf[l1 + l2] = '\n';
17737 buf[l1 + l2 + 1] = '\0';
17738 va_start(args, pat2);
17739 msv = vmess(buf, &args);
17741 message = SvPV_const(msv,l1);
17744 Copy(message, buf, l1 , char);
17745 /* l1-1 to avoid \n */
17746 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17749 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
17751 #ifndef PERL_IN_XSUB_RE
17753 Perl_save_re_context(pTHX)
17758 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
17761 const REGEXP * const rx = PM_GETRE(PL_curpm);
17763 nparens = RX_NPARENS(rx);
17766 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
17767 * that PL_curpm will be null, but that utf8.pm and the modules it
17768 * loads will only use $1..$3.
17769 * The t/porting/re_context.t test file checks this assumption.
17774 for (i = 1; i <= nparens; i++) {
17775 char digits[TYPE_CHARS(long)];
17776 const STRLEN len = my_snprintf(digits, sizeof(digits),
17778 GV *const *const gvp
17779 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
17782 GV * const gv = *gvp;
17783 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
17793 S_put_code_point(pTHX_ SV *sv, UV c)
17795 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17798 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17800 else if (isPRINT(c)) {
17801 const char string = (char) c;
17802 if (isBACKSLASHED_PUNCT(c))
17803 sv_catpvs(sv, "\\");
17804 sv_catpvn(sv, &string, 1);
17807 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17809 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17812 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17817 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17820 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17822 /* Appends to 'sv' a displayable version of the range of code points from
17823 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17824 * as-is (though some of these will be escaped by put_code_point()). */
17826 const unsigned int min_range_count = 3;
17828 assert(start <= end);
17830 PERL_ARGS_ASSERT_PUT_RANGE;
17832 while (start <= end) {
17834 const char * format;
17836 if (end - start < min_range_count) {
17838 /* Individual chars in short ranges */
17839 for (; start <= end; start++) {
17840 put_code_point(sv, start);
17845 /* If permitted by the input options, and there is a possibility that
17846 * this range contains a printable literal, look to see if there is
17848 if (allow_literals && start <= MAX_PRINT_A) {
17850 /* If the range begin isn't an ASCII printable, effectively split
17851 * the range into two parts:
17852 * 1) the portion before the first such printable,
17854 * and output them separately. */
17855 if (! isPRINT_A(start)) {
17856 UV temp_end = start + 1;
17858 /* There is no point looking beyond the final possible
17859 * printable, in MAX_PRINT_A */
17860 UV max = MIN(end, MAX_PRINT_A);
17862 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17866 /* Here, temp_end points to one beyond the first printable if
17867 * found, or to one beyond 'max' if not. If none found, make
17868 * sure that we use the entire range */
17869 if (temp_end > MAX_PRINT_A) {
17870 temp_end = end + 1;
17873 /* Output the first part of the split range, the part that
17874 * doesn't have printables, with no looking for literals
17875 * (otherwise we would infinitely recurse) */
17876 put_range(sv, start, temp_end - 1, FALSE);
17878 /* The 2nd part of the range (if any) starts here. */
17881 /* We continue instead of dropping down because even if the 2nd
17882 * part is non-empty, it could be so short that we want to
17883 * output it specially, as tested for at the top of this loop.
17888 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17889 * output a sub-range of just the digits or letters, then process
17890 * the remaining portion as usual. */
17891 if (isALPHANUMERIC_A(start)) {
17892 UV mask = (isDIGIT_A(start))
17897 UV temp_end = start + 1;
17899 /* Find the end of the sub-range that includes just the
17900 * characters in the same class as the first character in it */
17901 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17906 /* For short ranges, don't duplicate the code above to output
17907 * them; just call recursively */
17908 if (temp_end - start < min_range_count) {
17909 put_range(sv, start, temp_end, FALSE);
17911 else { /* Output as a range */
17912 put_code_point(sv, start);
17913 sv_catpvs(sv, "-");
17914 put_code_point(sv, temp_end);
17916 start = temp_end + 1;
17920 /* We output any other printables as individual characters */
17921 if (isPUNCT_A(start) || isSPACE_A(start)) {
17922 while (start <= end && (isPUNCT_A(start)
17923 || isSPACE_A(start)))
17925 put_code_point(sv, start);
17930 } /* End of looking for literals */
17932 /* Here is not to output as a literal. Some control characters have
17933 * mnemonic names. Split off any of those at the beginning and end of
17934 * the range to print mnemonically. It isn't possible for many of
17935 * these to be in a row, so this won't overwhelm with output */
17936 while (isMNEMONIC_CNTRL(start) && start <= end) {
17937 put_code_point(sv, start);
17940 if (start < end && isMNEMONIC_CNTRL(end)) {
17942 /* Here, the final character in the range has a mnemonic name.
17943 * Work backwards from the end to find the final non-mnemonic */
17944 UV temp_end = end - 1;
17945 while (isMNEMONIC_CNTRL(temp_end)) {
17949 /* And separately output the range that doesn't have mnemonics */
17950 put_range(sv, start, temp_end, FALSE);
17952 /* Then output the mnemonic trailing controls */
17953 start = temp_end + 1;
17954 while (start <= end) {
17955 put_code_point(sv, start);
17961 /* As a final resort, output the range or subrange as hex. */
17963 this_end = (end < NUM_ANYOF_CODE_POINTS)
17965 : NUM_ANYOF_CODE_POINTS - 1;
17966 format = (this_end < 256)
17967 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17968 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17969 GCC_DIAG_IGNORE(-Wformat-nonliteral);
17970 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17977 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17979 /* Appends to 'sv' a displayable version of the innards of the bracketed
17980 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17981 * output anything, and bitmap_invlist, if not NULL, will point to an
17982 * inversion list of what is in the bit map */
17986 unsigned int punct_count = 0;
17987 SV* invlist = NULL;
17988 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17989 bool allow_literals = TRUE;
17991 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17993 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17995 /* Worst case is exactly every-other code point is in the list */
17996 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17998 /* Convert the bit map to an inversion list, keeping track of how many
17999 * ASCII puncts are set, including an extra amount for the backslashed
18001 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
18002 if (BITMAP_TEST(bitmap, i)) {
18003 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
18004 if (isPUNCT_A(i)) {
18006 if isBACKSLASHED_PUNCT(i) {
18013 /* Nothing to output */
18014 if (_invlist_len(*invlist_ptr) == 0) {
18015 SvREFCNT_dec(invlist);
18019 /* Generally, it is more readable if printable characters are output as
18020 * literals, but if a range (nearly) spans all of them, it's best to output
18021 * it as a single range. This code will use a single range if all but 2
18022 * printables are in it */
18023 invlist_iterinit(*invlist_ptr);
18024 while (invlist_iternext(*invlist_ptr, &start, &end)) {
18026 /* If range starts beyond final printable, it doesn't have any in it */
18027 if (start > MAX_PRINT_A) {
18031 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
18032 * all but two, the range must start and end no later than 2 from
18034 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
18035 if (end > MAX_PRINT_A) {
18041 if (end - start >= MAX_PRINT_A - ' ' - 2) {
18042 allow_literals = FALSE;
18047 invlist_iterfinish(*invlist_ptr);
18049 /* The legibility of the output depends mostly on how many punctuation
18050 * characters are output. There are 32 possible ASCII ones, and some have
18051 * an additional backslash, bringing it to currently 36, so if any more
18052 * than 18 are to be output, we can instead output it as its complement,
18053 * yielding fewer puncts, and making it more legible. But give some weight
18054 * to the fact that outputting it as a complement is less legible than a
18055 * straight output, so don't complement unless we are somewhat over the 18
18057 if (allow_literals && punct_count > 22) {
18058 sv_catpvs(sv, "^");
18060 /* Add everything remaining to the list, so when we invert it just
18061 * below, it will be excluded */
18062 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
18063 _invlist_invert(*invlist_ptr);
18066 /* Here we have figured things out. Output each range */
18067 invlist_iterinit(*invlist_ptr);
18068 while (invlist_iternext(*invlist_ptr, &start, &end)) {
18069 if (start >= NUM_ANYOF_CODE_POINTS) {
18072 put_range(sv, start, end, allow_literals);
18074 invlist_iterfinish(*invlist_ptr);
18079 #define CLEAR_OPTSTART \
18080 if (optstart) STMT_START { \
18081 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
18082 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18086 #define DUMPUNTIL(b,e) \
18088 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18090 STATIC const regnode *
18091 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18092 const regnode *last, const regnode *plast,
18093 SV* sv, I32 indent, U32 depth)
18095 U8 op = PSEUDO; /* Arbitrary non-END op. */
18096 const regnode *next;
18097 const regnode *optstart= NULL;
18099 RXi_GET_DECL(r,ri);
18100 GET_RE_DEBUG_FLAGS_DECL;
18102 PERL_ARGS_ASSERT_DUMPUNTIL;
18104 #ifdef DEBUG_DUMPUNTIL
18105 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18106 last ? last-start : 0,plast ? plast-start : 0);
18109 if (plast && plast < last)
18112 while (PL_regkind[op] != END && (!last || node < last)) {
18114 /* While that wasn't END last time... */
18117 if (op == CLOSE || op == WHILEM)
18119 next = regnext((regnode *)node);
18122 if (OP(node) == OPTIMIZED) {
18123 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18130 regprop(r, sv, node, NULL, NULL);
18131 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18132 (int)(2*indent + 1), "", SvPVX_const(sv));
18134 if (OP(node) != OPTIMIZED) {
18135 if (next == NULL) /* Next ptr. */
18136 PerlIO_printf(Perl_debug_log, " (0)");
18137 else if (PL_regkind[(U8)op] == BRANCH
18138 && PL_regkind[OP(next)] != BRANCH )
18139 PerlIO_printf(Perl_debug_log, " (FAIL)");
18141 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18142 (void)PerlIO_putc(Perl_debug_log, '\n');
18146 if (PL_regkind[(U8)op] == BRANCHJ) {
18149 const regnode *nnode = (OP(next) == LONGJMP
18150 ? regnext((regnode *)next)
18152 if (last && nnode > last)
18154 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18157 else if (PL_regkind[(U8)op] == BRANCH) {
18159 DUMPUNTIL(NEXTOPER(node), next);
18161 else if ( PL_regkind[(U8)op] == TRIE ) {
18162 const regnode *this_trie = node;
18163 const char op = OP(node);
18164 const U32 n = ARG(node);
18165 const reg_ac_data * const ac = op>=AHOCORASICK ?
18166 (reg_ac_data *)ri->data->data[n] :
18168 const reg_trie_data * const trie =
18169 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18171 AV *const trie_words
18172 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18174 const regnode *nextbranch= NULL;
18177 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18178 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18180 PerlIO_printf(Perl_debug_log, "%*s%s ",
18181 (int)(2*(indent+3)), "",
18183 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18184 SvCUR(*elem_ptr), 60,
18185 PL_colors[0], PL_colors[1],
18187 ? PERL_PV_ESCAPE_UNI
18189 | PERL_PV_PRETTY_ELLIPSES
18190 | PERL_PV_PRETTY_LTGT
18195 U16 dist= trie->jump[word_idx+1];
18196 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18197 (UV)((dist ? this_trie + dist : next) - start));
18200 nextbranch= this_trie + trie->jump[0];
18201 DUMPUNTIL(this_trie + dist, nextbranch);
18203 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18204 nextbranch= regnext((regnode *)nextbranch);
18206 PerlIO_printf(Perl_debug_log, "\n");
18209 if (last && next > last)
18214 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
18215 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18216 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18218 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18220 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18222 else if ( op == PLUS || op == STAR) {
18223 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18225 else if (PL_regkind[(U8)op] == ANYOF) {
18226 /* arglen 1 + class block */
18227 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18228 ? ANYOF_POSIXL_SKIP
18230 node = NEXTOPER(node);
18232 else if (PL_regkind[(U8)op] == EXACT) {
18233 /* Literal string, where present. */
18234 node += NODE_SZ_STR(node) - 1;
18235 node = NEXTOPER(node);
18238 node = NEXTOPER(node);
18239 node += regarglen[(U8)op];
18241 if (op == CURLYX || op == OPEN)
18245 #ifdef DEBUG_DUMPUNTIL
18246 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18251 #endif /* DEBUGGING */
18254 * ex: set ts=8 sts=4 sw=4 et: