5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 /* this is a chain of data about sub patterns we are processing that
110 need to be handled separately/specially in study_chunk. Its so
111 we can simulate recursion without losing state. */
113 typedef struct scan_frame {
114 regnode *last_regnode; /* last node to process in this frame */
115 regnode *next_regnode; /* next node to process when last is reached */
116 U32 prev_recursed_depth;
117 I32 stopparen; /* what stopparen do we use */
118 U32 is_top_frame; /* what flags do we use? */
120 struct scan_frame *this_prev_frame; /* this previous frame */
121 struct scan_frame *prev_frame; /* previous frame */
122 struct scan_frame *next_frame; /* next frame */
125 struct RExC_state_t {
126 U32 flags; /* RXf_* are we folding, multilining? */
127 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
128 char *precomp; /* uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 SSize_t whilem_seen; /* number of WHILEM in this expr */
137 regnode *emit_start; /* Start of emitted-code area */
138 regnode *emit_bound; /* First regnode outside of the
140 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
141 implies compiling, so don't emit */
142 regnode_ssc emit_dummy; /* placeholder for emit to point to;
143 large enough for the largest
144 non-EXACTish node, so can use it as
146 I32 naughty; /* How bad is this pattern? */
147 I32 sawback; /* Did we see \1, ...? */
149 SSize_t size; /* Code size. */
150 I32 npar; /* Capture buffer count, (OPEN) plus
151 one. ("par" 0 is the whole
153 I32 nestroot; /* root parens we are in - used by
157 regnode **open_parens; /* pointers to open parens */
158 regnode **close_parens; /* pointers to close parens */
159 regnode *opend; /* END node in program */
160 I32 utf8; /* whether the pattern is utf8 or not */
161 I32 orig_utf8; /* whether the pattern was originally in utf8 */
162 /* XXX use this for future optimisation of case
163 * where pattern must be upgraded to utf8. */
164 I32 uni_semantics; /* If a d charset modifier should use unicode
165 rules, even if the pattern is not in
167 HV *paren_names; /* Paren names */
169 regnode **recurse; /* Recurse regops */
170 I32 recurse_count; /* Number of recurse regops */
171 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
173 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
177 I32 override_recoding;
178 I32 in_multi_char_class;
179 struct reg_code_block *code_blocks; /* positions of literal (?{})
181 int num_code_blocks; /* size of code_blocks[] */
182 int code_index; /* next code_blocks[] slot */
183 SSize_t maxlen; /* mininum possible number of chars in string to match */
184 scan_frame *frame_head;
185 scan_frame *frame_last;
187 #ifdef ADD_TO_REGEXEC
188 char *starttry; /* -Dr: where regtry was called. */
189 #define RExC_starttry (pRExC_state->starttry)
191 SV *runtime_code_qr; /* qr with the runtime code blocks */
193 const char *lastparse;
195 AV *paren_name_list; /* idx -> name */
196 U32 study_chunk_recursed_count;
199 #define RExC_lastparse (pRExC_state->lastparse)
200 #define RExC_lastnum (pRExC_state->lastnum)
201 #define RExC_paren_name_list (pRExC_state->paren_name_list)
202 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
203 #define RExC_mysv (pRExC_state->mysv1)
204 #define RExC_mysv1 (pRExC_state->mysv1)
205 #define RExC_mysv2 (pRExC_state->mysv2)
210 #define RExC_flags (pRExC_state->flags)
211 #define RExC_pm_flags (pRExC_state->pm_flags)
212 #define RExC_precomp (pRExC_state->precomp)
213 #define RExC_rx_sv (pRExC_state->rx_sv)
214 #define RExC_rx (pRExC_state->rx)
215 #define RExC_rxi (pRExC_state->rxi)
216 #define RExC_start (pRExC_state->start)
217 #define RExC_end (pRExC_state->end)
218 #define RExC_parse (pRExC_state->parse)
219 #define RExC_whilem_seen (pRExC_state->whilem_seen)
220 #ifdef RE_TRACK_PATTERN_OFFSETS
221 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
224 #define RExC_emit (pRExC_state->emit)
225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
226 #define RExC_emit_start (pRExC_state->emit_start)
227 #define RExC_emit_bound (pRExC_state->emit_bound)
228 #define RExC_sawback (pRExC_state->sawback)
229 #define RExC_seen (pRExC_state->seen)
230 #define RExC_size (pRExC_state->size)
231 #define RExC_maxlen (pRExC_state->maxlen)
232 #define RExC_npar (pRExC_state->npar)
233 #define RExC_nestroot (pRExC_state->nestroot)
234 #define RExC_extralen (pRExC_state->extralen)
235 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
236 #define RExC_utf8 (pRExC_state->utf8)
237 #define RExC_uni_semantics (pRExC_state->uni_semantics)
238 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
239 #define RExC_open_parens (pRExC_state->open_parens)
240 #define RExC_close_parens (pRExC_state->close_parens)
241 #define RExC_opend (pRExC_state->opend)
242 #define RExC_paren_names (pRExC_state->paren_names)
243 #define RExC_recurse (pRExC_state->recurse)
244 #define RExC_recurse_count (pRExC_state->recurse_count)
245 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
246 #define RExC_study_chunk_recursed_bytes \
247 (pRExC_state->study_chunk_recursed_bytes)
248 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
249 #define RExC_contains_locale (pRExC_state->contains_locale)
250 #define RExC_contains_i (pRExC_state->contains_i)
251 #define RExC_override_recoding (pRExC_state->override_recoding)
252 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
253 #define RExC_frame_head (pRExC_state->frame_head)
254 #define RExC_frame_last (pRExC_state->frame_last)
255 #define RExC_frame_count (pRExC_state->frame_count)
257 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
258 * a flag to disable back-off on the fixed/floating substrings - if it's
259 * a high complexity pattern we assume the benefit of avoiding a full match
260 * is worth the cost of checking for the substrings even if they rarely help.
262 #define RExC_naughty (pRExC_state->naughty)
263 #define TOO_NAUGHTY (10)
264 #define MARK_NAUGHTY(add) \
265 if (RExC_naughty < TOO_NAUGHTY) \
266 RExC_naughty += (add)
267 #define MARK_NAUGHTY_EXP(exp, add) \
268 if (RExC_naughty < TOO_NAUGHTY) \
269 RExC_naughty += RExC_naughty / (exp) + (add)
271 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
272 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
273 ((*s) == '{' && regcurly(s)))
276 * Flags to be passed up and down.
278 #define WORST 0 /* Worst case. */
279 #define HASWIDTH 0x01 /* Known to match non-null strings. */
281 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
282 * character. (There needs to be a case: in the switch statement in regexec.c
283 * for any node marked SIMPLE.) Note that this is not the same thing as
286 #define SPSTART 0x04 /* Starts with * or + */
287 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
288 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
289 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
291 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
293 /* whether trie related optimizations are enabled */
294 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
295 #define TRIE_STUDY_OPT
296 #define FULL_TRIE_STUDY
302 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
303 #define PBITVAL(paren) (1 << ((paren) & 7))
304 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
305 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
306 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
308 #define REQUIRE_UTF8 STMT_START { \
310 *flagp = RESTART_UTF8; \
315 /* This converts the named class defined in regcomp.h to its equivalent class
316 * number defined in handy.h. */
317 #define namedclass_to_classnum(class) ((int) ((class) / 2))
318 #define classnum_to_namedclass(classnum) ((classnum) * 2)
320 #define _invlist_union_complement_2nd(a, b, output) \
321 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
322 #define _invlist_intersection_complement_2nd(a, b, output) \
323 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
325 /* About scan_data_t.
327 During optimisation we recurse through the regexp program performing
328 various inplace (keyhole style) optimisations. In addition study_chunk
329 and scan_commit populate this data structure with information about
330 what strings MUST appear in the pattern. We look for the longest
331 string that must appear at a fixed location, and we look for the
332 longest string that may appear at a floating location. So for instance
337 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
338 strings (because they follow a .* construct). study_chunk will identify
339 both FOO and BAR as being the longest fixed and floating strings respectively.
341 The strings can be composites, for instance
345 will result in a composite fixed substring 'foo'.
347 For each string some basic information is maintained:
349 - offset or min_offset
350 This is the position the string must appear at, or not before.
351 It also implicitly (when combined with minlenp) tells us how many
352 characters must match before the string we are searching for.
353 Likewise when combined with minlenp and the length of the string it
354 tells us how many characters must appear after the string we have
358 Only used for floating strings. This is the rightmost point that
359 the string can appear at. If set to SSize_t_MAX it indicates that the
360 string can occur infinitely far to the right.
363 A pointer to the minimum number of characters of the pattern that the
364 string was found inside. This is important as in the case of positive
365 lookahead or positive lookbehind we can have multiple patterns
370 The minimum length of the pattern overall is 3, the minimum length
371 of the lookahead part is 3, but the minimum length of the part that
372 will actually match is 1. So 'FOO's minimum length is 3, but the
373 minimum length for the F is 1. This is important as the minimum length
374 is used to determine offsets in front of and behind the string being
375 looked for. Since strings can be composites this is the length of the
376 pattern at the time it was committed with a scan_commit. Note that
377 the length is calculated by study_chunk, so that the minimum lengths
378 are not known until the full pattern has been compiled, thus the
379 pointer to the value.
383 In the case of lookbehind the string being searched for can be
384 offset past the start point of the final matching string.
385 If this value was just blithely removed from the min_offset it would
386 invalidate some of the calculations for how many chars must match
387 before or after (as they are derived from min_offset and minlen and
388 the length of the string being searched for).
389 When the final pattern is compiled and the data is moved from the
390 scan_data_t structure into the regexp structure the information
391 about lookbehind is factored in, with the information that would
392 have been lost precalculated in the end_shift field for the
395 The fields pos_min and pos_delta are used to store the minimum offset
396 and the delta to the maximum offset at the current point in the pattern.
400 typedef struct scan_data_t {
401 /*I32 len_min; unused */
402 /*I32 len_delta; unused */
406 SSize_t last_end; /* min value, <0 unless valid. */
407 SSize_t last_start_min;
408 SSize_t last_start_max;
409 SV **longest; /* Either &l_fixed, or &l_float. */
410 SV *longest_fixed; /* longest fixed string found in pattern */
411 SSize_t offset_fixed; /* offset where it starts */
412 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
413 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
414 SV *longest_float; /* longest floating string found in pattern */
415 SSize_t offset_float_min; /* earliest point in string it can appear */
416 SSize_t offset_float_max; /* latest point in string it can appear */
417 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
418 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
421 SSize_t *last_closep;
422 regnode_ssc *start_class;
426 * Forward declarations for pregcomp()'s friends.
429 static const scan_data_t zero_scan_data =
430 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
432 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
433 #define SF_BEFORE_SEOL 0x0001
434 #define SF_BEFORE_MEOL 0x0002
435 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
436 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
438 #define SF_FIX_SHIFT_EOL (+2)
439 #define SF_FL_SHIFT_EOL (+4)
441 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
442 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
444 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
445 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
446 #define SF_IS_INF 0x0040
447 #define SF_HAS_PAR 0x0080
448 #define SF_IN_PAR 0x0100
449 #define SF_HAS_EVAL 0x0200
450 #define SCF_DO_SUBSTR 0x0400
451 #define SCF_DO_STCLASS_AND 0x0800
452 #define SCF_DO_STCLASS_OR 0x1000
453 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
454 #define SCF_WHILEM_VISITED_POS 0x2000
456 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
457 #define SCF_SEEN_ACCEPT 0x8000
458 #define SCF_TRIE_DOING_RESTUDY 0x10000
459 #define SCF_IN_DEFINE 0x20000
464 #define UTF cBOOL(RExC_utf8)
466 /* The enums for all these are ordered so things work out correctly */
467 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
468 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
469 == REGEX_DEPENDS_CHARSET)
470 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
471 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
472 >= REGEX_UNICODE_CHARSET)
473 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
474 == REGEX_ASCII_RESTRICTED_CHARSET)
475 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
476 >= REGEX_ASCII_RESTRICTED_CHARSET)
477 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
478 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
480 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
482 /* For programs that want to be strictly Unicode compatible by dying if any
483 * attempt is made to match a non-Unicode code point against a Unicode
485 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
487 #define OOB_NAMEDCLASS -1
489 /* There is no code point that is out-of-bounds, so this is problematic. But
490 * its only current use is to initialize a variable that is always set before
492 #define OOB_UNICODE 0xDEADBEEF
494 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
495 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
498 /* length of regex to show in messages that don't mark a position within */
499 #define RegexLengthToShowInErrorMessages 127
502 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
503 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
504 * op/pragma/warn/regcomp.
506 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
507 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
509 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
510 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
512 #define REPORT_LOCATION_ARGS(offset) \
513 UTF8fARG(UTF, offset, RExC_precomp), \
514 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
517 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
518 * arg. Show regex, up to a maximum length. If it's too long, chop and add
521 #define _FAIL(code) STMT_START { \
522 const char *ellipses = ""; \
523 IV len = RExC_end - RExC_precomp; \
526 SAVEFREESV(RExC_rx_sv); \
527 if (len > RegexLengthToShowInErrorMessages) { \
528 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
529 len = RegexLengthToShowInErrorMessages - 10; \
535 #define FAIL(msg) _FAIL( \
536 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
537 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
539 #define FAIL2(msg,arg) _FAIL( \
540 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
541 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
544 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
546 #define Simple_vFAIL(m) STMT_START { \
548 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
549 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
550 m, REPORT_LOCATION_ARGS(offset)); \
554 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
556 #define vFAIL(m) STMT_START { \
558 SAVEFREESV(RExC_rx_sv); \
563 * Like Simple_vFAIL(), but accepts two arguments.
565 #define Simple_vFAIL2(m,a1) STMT_START { \
566 const IV offset = RExC_parse - RExC_precomp; \
567 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
568 REPORT_LOCATION_ARGS(offset)); \
572 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
574 #define vFAIL2(m,a1) STMT_START { \
576 SAVEFREESV(RExC_rx_sv); \
577 Simple_vFAIL2(m, a1); \
582 * Like Simple_vFAIL(), but accepts three arguments.
584 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
585 const IV offset = RExC_parse - RExC_precomp; \
586 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
587 REPORT_LOCATION_ARGS(offset)); \
591 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
593 #define vFAIL3(m,a1,a2) STMT_START { \
595 SAVEFREESV(RExC_rx_sv); \
596 Simple_vFAIL3(m, a1, a2); \
600 * Like Simple_vFAIL(), but accepts four arguments.
602 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
603 const IV offset = RExC_parse - RExC_precomp; \
604 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
605 REPORT_LOCATION_ARGS(offset)); \
608 #define vFAIL4(m,a1,a2,a3) STMT_START { \
610 SAVEFREESV(RExC_rx_sv); \
611 Simple_vFAIL4(m, a1, a2, a3); \
614 /* A specialized version of vFAIL2 that works with UTF8f */
615 #define vFAIL2utf8f(m, a1) STMT_START { \
616 const IV offset = RExC_parse - RExC_precomp; \
618 SAVEFREESV(RExC_rx_sv); \
619 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
620 REPORT_LOCATION_ARGS(offset)); \
623 /* These have asserts in them because of [perl #122671] Many warnings in
624 * regcomp.c can occur twice. If they get output in pass1 and later in that
625 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
626 * would get output again. So they should be output in pass2, and these
627 * asserts make sure new warnings follow that paradigm. */
629 /* m is not necessarily a "literal string", in this macro */
630 #define reg_warn_non_literal_string(loc, m) STMT_START { \
631 const IV offset = loc - RExC_precomp; \
632 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
633 m, REPORT_LOCATION_ARGS(offset)); \
636 #define ckWARNreg(loc,m) STMT_START { \
637 const IV offset = loc - RExC_precomp; \
638 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
639 REPORT_LOCATION_ARGS(offset)); \
642 #define vWARN_dep(loc, m) STMT_START { \
643 const IV offset = loc - RExC_precomp; \
644 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
645 REPORT_LOCATION_ARGS(offset)); \
648 #define ckWARNdep(loc,m) STMT_START { \
649 const IV offset = loc - RExC_precomp; \
650 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
652 REPORT_LOCATION_ARGS(offset)); \
655 #define ckWARNregdep(loc,m) STMT_START { \
656 const IV offset = loc - RExC_precomp; \
657 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
659 REPORT_LOCATION_ARGS(offset)); \
662 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
663 const IV offset = loc - RExC_precomp; \
664 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
666 a1, REPORT_LOCATION_ARGS(offset)); \
669 #define ckWARN2reg(loc, m, a1) STMT_START { \
670 const IV offset = loc - RExC_precomp; \
671 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
672 a1, REPORT_LOCATION_ARGS(offset)); \
675 #define vWARN3(loc, m, a1, a2) STMT_START { \
676 const IV offset = loc - RExC_precomp; \
677 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
678 a1, a2, REPORT_LOCATION_ARGS(offset)); \
681 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
682 const IV offset = loc - RExC_precomp; \
683 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
684 a1, a2, REPORT_LOCATION_ARGS(offset)); \
687 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
688 const IV offset = loc - RExC_precomp; \
689 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
690 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
693 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
694 const IV offset = loc - RExC_precomp; \
695 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
696 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
699 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
700 const IV offset = loc - RExC_precomp; \
701 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
702 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
706 /* Allow for side effects in s */
707 #define REGC(c,s) STMT_START { \
708 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
711 /* Macros for recording node offsets. 20001227 mjd@plover.com
712 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
713 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
714 * Element 0 holds the number n.
715 * Position is 1 indexed.
717 #ifndef RE_TRACK_PATTERN_OFFSETS
718 #define Set_Node_Offset_To_R(node,byte)
719 #define Set_Node_Offset(node,byte)
720 #define Set_Cur_Node_Offset
721 #define Set_Node_Length_To_R(node,len)
722 #define Set_Node_Length(node,len)
723 #define Set_Node_Cur_Length(node,start)
724 #define Node_Offset(n)
725 #define Node_Length(n)
726 #define Set_Node_Offset_Length(node,offset,len)
727 #define ProgLen(ri) ri->u.proglen
728 #define SetProgLen(ri,x) ri->u.proglen = x
730 #define ProgLen(ri) ri->u.offsets[0]
731 #define SetProgLen(ri,x) ri->u.offsets[0] = x
732 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
734 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
735 __LINE__, (int)(node), (int)(byte))); \
737 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
740 RExC_offsets[2*(node)-1] = (byte); \
745 #define Set_Node_Offset(node,byte) \
746 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
747 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
749 #define Set_Node_Length_To_R(node,len) STMT_START { \
751 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
752 __LINE__, (int)(node), (int)(len))); \
754 Perl_croak(aTHX_ "value of node is %d in Length macro", \
757 RExC_offsets[2*(node)] = (len); \
762 #define Set_Node_Length(node,len) \
763 Set_Node_Length_To_R((node)-RExC_emit_start, len)
764 #define Set_Node_Cur_Length(node, start) \
765 Set_Node_Length(node, RExC_parse - start)
767 /* Get offsets and lengths */
768 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
769 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
771 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
772 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
773 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
777 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
778 #define EXPERIMENTAL_INPLACESCAN
779 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
781 #define DEBUG_RExC_seen() \
782 DEBUG_OPTIMISE_MORE_r({ \
783 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
785 if (RExC_seen & REG_ZERO_LEN_SEEN) \
786 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
788 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
789 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
791 if (RExC_seen & REG_GPOS_SEEN) \
792 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
794 if (RExC_seen & REG_CANY_SEEN) \
795 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
797 if (RExC_seen & REG_RECURSE_SEEN) \
798 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
800 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
801 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
803 if (RExC_seen & REG_VERBARG_SEEN) \
804 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
806 if (RExC_seen & REG_CUTGROUP_SEEN) \
807 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
809 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
810 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
812 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
813 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
815 if (RExC_seen & REG_GOSTART_SEEN) \
816 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
818 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
819 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
821 PerlIO_printf(Perl_debug_log,"\n"); \
824 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
825 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
827 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
829 PerlIO_printf(Perl_debug_log, "%s", open_str); \
830 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
831 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
832 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
833 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
834 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
835 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
836 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
837 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
838 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
839 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
840 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
841 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
842 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
843 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
844 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
845 PerlIO_printf(Perl_debug_log, "%s", close_str); \
849 #define DEBUG_STUDYDATA(str,data,depth) \
850 DEBUG_OPTIMISE_MORE_r(if(data){ \
851 PerlIO_printf(Perl_debug_log, \
852 "%*s" str "Pos:%"IVdf"/%"IVdf \
854 (int)(depth)*2, "", \
855 (IV)((data)->pos_min), \
856 (IV)((data)->pos_delta), \
857 (UV)((data)->flags) \
859 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
860 PerlIO_printf(Perl_debug_log, \
861 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
862 (IV)((data)->whilem_c), \
863 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
864 is_inf ? "INF " : "" \
866 if ((data)->last_found) \
867 PerlIO_printf(Perl_debug_log, \
868 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
869 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
870 SvPVX_const((data)->last_found), \
871 (IV)((data)->last_end), \
872 (IV)((data)->last_start_min), \
873 (IV)((data)->last_start_max), \
874 ((data)->longest && \
875 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
876 SvPVX_const((data)->longest_fixed), \
877 (IV)((data)->offset_fixed), \
878 ((data)->longest && \
879 (data)->longest==&((data)->longest_float)) ? "*" : "", \
880 SvPVX_const((data)->longest_float), \
881 (IV)((data)->offset_float_min), \
882 (IV)((data)->offset_float_max) \
884 PerlIO_printf(Perl_debug_log,"\n"); \
889 /* is c a control character for which we have a mnemonic? */
890 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
893 S_cntrl_to_mnemonic(const U8 c)
895 /* Returns the mnemonic string that represents character 'c', if one
896 * exists; NULL otherwise. The only ones that exist for the purposes of
897 * this routine are a few control characters */
900 case '\a': return "\\a";
901 case '\b': return "\\b";
902 case ESC_NATIVE: return "\\e";
903 case '\f': return "\\f";
904 case '\n': return "\\n";
905 case '\r': return "\\r";
906 case '\t': return "\\t";
914 /* Mark that we cannot extend a found fixed substring at this point.
915 Update the longest found anchored substring and the longest found
916 floating substrings if needed. */
919 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
920 SSize_t *minlenp, int is_inf)
922 const STRLEN l = CHR_SVLEN(data->last_found);
923 const STRLEN old_l = CHR_SVLEN(*data->longest);
924 GET_RE_DEBUG_FLAGS_DECL;
926 PERL_ARGS_ASSERT_SCAN_COMMIT;
928 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
929 SvSetMagicSV(*data->longest, data->last_found);
930 if (*data->longest == data->longest_fixed) {
931 data->offset_fixed = l ? data->last_start_min : data->pos_min;
932 if (data->flags & SF_BEFORE_EOL)
934 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
936 data->flags &= ~SF_FIX_BEFORE_EOL;
937 data->minlen_fixed=minlenp;
938 data->lookbehind_fixed=0;
940 else { /* *data->longest == data->longest_float */
941 data->offset_float_min = l ? data->last_start_min : data->pos_min;
942 data->offset_float_max = (l
943 ? data->last_start_max
944 : (data->pos_delta > SSize_t_MAX - data->pos_min
946 : data->pos_min + data->pos_delta));
948 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
949 data->offset_float_max = SSize_t_MAX;
950 if (data->flags & SF_BEFORE_EOL)
952 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
954 data->flags &= ~SF_FL_BEFORE_EOL;
955 data->minlen_float=minlenp;
956 data->lookbehind_float=0;
959 SvCUR_set(data->last_found, 0);
961 SV * const sv = data->last_found;
962 if (SvUTF8(sv) && SvMAGICAL(sv)) {
963 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
969 data->flags &= ~SF_BEFORE_EOL;
970 DEBUG_STUDYDATA("commit: ",data,0);
973 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
974 * list that describes which code points it matches */
977 S_ssc_anything(pTHX_ regnode_ssc *ssc)
979 /* Set the SSC 'ssc' to match an empty string or any code point */
981 PERL_ARGS_ASSERT_SSC_ANYTHING;
983 assert(is_ANYOF_SYNTHETIC(ssc));
985 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
986 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
987 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
991 S_ssc_is_anything(const regnode_ssc *ssc)
993 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
994 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
995 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
996 * in any way, so there's no point in using it */
1001 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1003 assert(is_ANYOF_SYNTHETIC(ssc));
1005 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1009 /* See if the list consists solely of the range 0 - Infinity */
1010 invlist_iterinit(ssc->invlist);
1011 ret = invlist_iternext(ssc->invlist, &start, &end)
1015 invlist_iterfinish(ssc->invlist);
1021 /* If e.g., both \w and \W are set, matches everything */
1022 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1024 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1025 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1035 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1037 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1038 * string, any code point, or any posix class under locale */
1040 PERL_ARGS_ASSERT_SSC_INIT;
1042 Zero(ssc, 1, regnode_ssc);
1043 set_ANYOF_SYNTHETIC(ssc);
1044 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1047 /* If any portion of the regex is to operate under locale rules that aren't
1048 * fully known at compile time, initialization includes it. The reason
1049 * this isn't done for all regexes is that the optimizer was written under
1050 * the assumption that locale was all-or-nothing. Given the complexity and
1051 * lack of documentation in the optimizer, and that there are inadequate
1052 * test cases for locale, many parts of it may not work properly, it is
1053 * safest to avoid locale unless necessary. */
1054 if (RExC_contains_locale) {
1055 ANYOF_POSIXL_SETALL(ssc);
1058 ANYOF_POSIXL_ZERO(ssc);
1063 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1064 const regnode_ssc *ssc)
1066 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1067 * to the list of code points matched, and locale posix classes; hence does
1068 * not check its flags) */
1073 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1075 assert(is_ANYOF_SYNTHETIC(ssc));
1077 invlist_iterinit(ssc->invlist);
1078 ret = invlist_iternext(ssc->invlist, &start, &end)
1082 invlist_iterfinish(ssc->invlist);
1088 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1096 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1097 const regnode_charclass* const node)
1099 /* Returns a mortal inversion list defining which code points are matched
1100 * by 'node', which is of type ANYOF. Handles complementing the result if
1101 * appropriate. If some code points aren't knowable at this time, the
1102 * returned list must, and will, contain every code point that is a
1105 SV* invlist = sv_2mortal(_new_invlist(0));
1106 SV* only_utf8_locale_invlist = NULL;
1108 const U32 n = ARG(node);
1109 bool new_node_has_latin1 = FALSE;
1111 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1113 /* Look at the data structure created by S_set_ANYOF_arg() */
1114 if (n != ANYOF_ONLY_HAS_BITMAP) {
1115 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1116 AV * const av = MUTABLE_AV(SvRV(rv));
1117 SV **const ary = AvARRAY(av);
1118 assert(RExC_rxi->data->what[n] == 's');
1120 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1121 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1123 else if (ary[0] && ary[0] != &PL_sv_undef) {
1125 /* Here, no compile-time swash, and there are things that won't be
1126 * known until runtime -- we have to assume it could be anything */
1127 return _add_range_to_invlist(invlist, 0, UV_MAX);
1129 else if (ary[3] && ary[3] != &PL_sv_undef) {
1131 /* Here no compile-time swash, and no run-time only data. Use the
1132 * node's inversion list */
1133 invlist = sv_2mortal(invlist_clone(ary[3]));
1136 /* Get the code points valid only under UTF-8 locales */
1137 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1138 && ary[2] && ary[2] != &PL_sv_undef)
1140 only_utf8_locale_invlist = ary[2];
1144 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1145 * code points, and an inversion list for the others, but if there are code
1146 * points that should match only conditionally on the target string being
1147 * UTF-8, those are placed in the inversion list, and not the bitmap.
1148 * Since there are circumstances under which they could match, they are
1149 * included in the SSC. But if the ANYOF node is to be inverted, we have
1150 * to exclude them here, so that when we invert below, the end result
1151 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1152 * have to do this here before we add the unconditionally matched code
1154 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1155 _invlist_intersection_complement_2nd(invlist,
1160 /* Add in the points from the bit map */
1161 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1162 if (ANYOF_BITMAP_TEST(node, i)) {
1163 invlist = add_cp_to_invlist(invlist, i);
1164 new_node_has_latin1 = TRUE;
1168 /* If this can match all upper Latin1 code points, have to add them
1170 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1171 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1174 /* Similarly for these */
1175 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1176 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1179 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1180 _invlist_invert(invlist);
1182 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1184 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1185 * locale. We can skip this if there are no 0-255 at all. */
1186 _invlist_union(invlist, PL_Latin1, &invlist);
1189 /* Similarly add the UTF-8 locale possible matches. These have to be
1190 * deferred until after the non-UTF-8 locale ones are taken care of just
1191 * above, or it leads to wrong results under ANYOF_INVERT */
1192 if (only_utf8_locale_invlist) {
1193 _invlist_union_maybe_complement_2nd(invlist,
1194 only_utf8_locale_invlist,
1195 ANYOF_FLAGS(node) & ANYOF_INVERT,
1202 /* These two functions currently do the exact same thing */
1203 #define ssc_init_zero ssc_init
1205 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1206 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1208 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1209 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1210 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1213 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1214 const regnode_charclass *and_with)
1216 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1217 * another SSC or a regular ANYOF class. Can create false positives. */
1222 PERL_ARGS_ASSERT_SSC_AND;
1224 assert(is_ANYOF_SYNTHETIC(ssc));
1226 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1227 * the code point inversion list and just the relevant flags */
1228 if (is_ANYOF_SYNTHETIC(and_with)) {
1229 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1230 anded_flags = ANYOF_FLAGS(and_with);
1232 /* XXX This is a kludge around what appears to be deficiencies in the
1233 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1234 * there are paths through the optimizer where it doesn't get weeded
1235 * out when it should. And if we don't make some extra provision for
1236 * it like the code just below, it doesn't get added when it should.
1237 * This solution is to add it only when AND'ing, which is here, and
1238 * only when what is being AND'ed is the pristine, original node
1239 * matching anything. Thus it is like adding it to ssc_anything() but
1240 * only when the result is to be AND'ed. Probably the same solution
1241 * could be adopted for the same problem we have with /l matching,
1242 * which is solved differently in S_ssc_init(), and that would lead to
1243 * fewer false positives than that solution has. But if this solution
1244 * creates bugs, the consequences are only that a warning isn't raised
1245 * that should be; while the consequences for having /l bugs is
1246 * incorrect matches */
1247 if (ssc_is_anything((regnode_ssc *)and_with)) {
1248 anded_flags |= ANYOF_WARN_SUPER;
1252 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1253 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1256 ANYOF_FLAGS(ssc) &= anded_flags;
1258 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1259 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1260 * 'and_with' may be inverted. When not inverted, we have the situation of
1262 * (C1 | P1) & (C2 | P2)
1263 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1264 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1265 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1266 * <= ((C1 & C2) | P1 | P2)
1267 * Alternatively, the last few steps could be:
1268 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1269 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1270 * <= (C1 | C2 | (P1 & P2))
1271 * We favor the second approach if either P1 or P2 is non-empty. This is
1272 * because these components are a barrier to doing optimizations, as what
1273 * they match cannot be known until the moment of matching as they are
1274 * dependent on the current locale, 'AND"ing them likely will reduce or
1276 * But we can do better if we know that C1,P1 are in their initial state (a
1277 * frequent occurrence), each matching everything:
1278 * (<everything>) & (C2 | P2) = C2 | P2
1279 * Similarly, if C2,P2 are in their initial state (again a frequent
1280 * occurrence), the result is a no-op
1281 * (C1 | P1) & (<everything>) = C1 | P1
1284 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1285 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1286 * <= (C1 & ~C2) | (P1 & ~P2)
1289 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1290 && ! is_ANYOF_SYNTHETIC(and_with))
1294 ssc_intersection(ssc,
1296 FALSE /* Has already been inverted */
1299 /* If either P1 or P2 is empty, the intersection will be also; can skip
1301 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1302 ANYOF_POSIXL_ZERO(ssc);
1304 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1306 /* Note that the Posix class component P from 'and_with' actually
1308 * P = Pa | Pb | ... | Pn
1309 * where each component is one posix class, such as in [\w\s].
1311 * ~P = ~(Pa | Pb | ... | Pn)
1312 * = ~Pa & ~Pb & ... & ~Pn
1313 * <= ~Pa | ~Pb | ... | ~Pn
1314 * The last is something we can easily calculate, but unfortunately
1315 * is likely to have many false positives. We could do better
1316 * in some (but certainly not all) instances if two classes in
1317 * P have known relationships. For example
1318 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1320 * :lower: & :print: = :lower:
1321 * And similarly for classes that must be disjoint. For example,
1322 * since \s and \w can have no elements in common based on rules in
1323 * the POSIX standard,
1324 * \w & ^\S = nothing
1325 * Unfortunately, some vendor locales do not meet the Posix
1326 * standard, in particular almost everything by Microsoft.
1327 * The loop below just changes e.g., \w into \W and vice versa */
1329 regnode_charclass_posixl temp;
1330 int add = 1; /* To calculate the index of the complement */
1332 ANYOF_POSIXL_ZERO(&temp);
1333 for (i = 0; i < ANYOF_MAX; i++) {
1335 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1336 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1338 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1339 ANYOF_POSIXL_SET(&temp, i + add);
1341 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1343 ANYOF_POSIXL_AND(&temp, ssc);
1345 } /* else ssc already has no posixes */
1346 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1347 in its initial state */
1348 else if (! is_ANYOF_SYNTHETIC(and_with)
1349 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1351 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1352 * copy it over 'ssc' */
1353 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1354 if (is_ANYOF_SYNTHETIC(and_with)) {
1355 StructCopy(and_with, ssc, regnode_ssc);
1358 ssc->invlist = anded_cp_list;
1359 ANYOF_POSIXL_ZERO(ssc);
1360 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1361 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1365 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1366 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1368 /* One or the other of P1, P2 is non-empty. */
1369 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1370 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1372 ssc_union(ssc, anded_cp_list, FALSE);
1374 else { /* P1 = P2 = empty */
1375 ssc_intersection(ssc, anded_cp_list, FALSE);
1381 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1382 const regnode_charclass *or_with)
1384 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1385 * another SSC or a regular ANYOF class. Can create false positives if
1386 * 'or_with' is to be inverted. */
1391 PERL_ARGS_ASSERT_SSC_OR;
1393 assert(is_ANYOF_SYNTHETIC(ssc));
1395 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1396 * the code point inversion list and just the relevant flags */
1397 if (is_ANYOF_SYNTHETIC(or_with)) {
1398 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1399 ored_flags = ANYOF_FLAGS(or_with);
1402 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1403 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1406 ANYOF_FLAGS(ssc) |= ored_flags;
1408 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1409 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1410 * 'or_with' may be inverted. When not inverted, we have the simple
1411 * situation of computing:
1412 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1413 * If P1|P2 yields a situation with both a class and its complement are
1414 * set, like having both \w and \W, this matches all code points, and we
1415 * can delete these from the P component of the ssc going forward. XXX We
1416 * might be able to delete all the P components, but I (khw) am not certain
1417 * about this, and it is better to be safe.
1420 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1421 * <= (C1 | P1) | ~C2
1422 * <= (C1 | ~C2) | P1
1423 * (which results in actually simpler code than the non-inverted case)
1426 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1427 && ! is_ANYOF_SYNTHETIC(or_with))
1429 /* We ignore P2, leaving P1 going forward */
1430 } /* else Not inverted */
1431 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1432 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1433 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1435 for (i = 0; i < ANYOF_MAX; i += 2) {
1436 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1438 ssc_match_all_cp(ssc);
1439 ANYOF_POSIXL_CLEAR(ssc, i);
1440 ANYOF_POSIXL_CLEAR(ssc, i+1);
1448 FALSE /* Already has been inverted */
1452 PERL_STATIC_INLINE void
1453 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1455 PERL_ARGS_ASSERT_SSC_UNION;
1457 assert(is_ANYOF_SYNTHETIC(ssc));
1459 _invlist_union_maybe_complement_2nd(ssc->invlist,
1465 PERL_STATIC_INLINE void
1466 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1468 const bool invert2nd)
1470 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1472 assert(is_ANYOF_SYNTHETIC(ssc));
1474 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1480 PERL_STATIC_INLINE void
1481 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1483 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1485 assert(is_ANYOF_SYNTHETIC(ssc));
1487 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1490 PERL_STATIC_INLINE void
1491 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1493 /* AND just the single code point 'cp' into the SSC 'ssc' */
1495 SV* cp_list = _new_invlist(2);
1497 PERL_ARGS_ASSERT_SSC_CP_AND;
1499 assert(is_ANYOF_SYNTHETIC(ssc));
1501 cp_list = add_cp_to_invlist(cp_list, cp);
1502 ssc_intersection(ssc, cp_list,
1503 FALSE /* Not inverted */
1505 SvREFCNT_dec_NN(cp_list);
1508 PERL_STATIC_INLINE void
1509 S_ssc_clear_locale(regnode_ssc *ssc)
1511 /* Set the SSC 'ssc' to not match any locale things */
1512 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1514 assert(is_ANYOF_SYNTHETIC(ssc));
1516 ANYOF_POSIXL_ZERO(ssc);
1517 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1520 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1523 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1525 /* The synthetic start class is used to hopefully quickly winnow down
1526 * places where a pattern could start a match in the target string. If it
1527 * doesn't really narrow things down that much, there isn't much point to
1528 * having the overhead of using it. This function uses some very crude
1529 * heuristics to decide if to use the ssc or not.
1531 * It returns TRUE if 'ssc' rules out more than half what it considers to
1532 * be the "likely" possible matches, but of course it doesn't know what the
1533 * actual things being matched are going to be; these are only guesses
1535 * For /l matches, it assumes that the only likely matches are going to be
1536 * in the 0-255 range, uniformly distributed, so half of that is 127
1537 * For /a and /d matches, it assumes that the likely matches will be just
1538 * the ASCII range, so half of that is 63
1539 * For /u and there isn't anything matching above the Latin1 range, it
1540 * assumes that that is the only range likely to be matched, and uses
1541 * half that as the cut-off: 127. If anything matches above Latin1,
1542 * it assumes that all of Unicode could match (uniformly), except for
1543 * non-Unicode code points and things in the General Category "Other"
1544 * (unassigned, private use, surrogates, controls and formats). This
1545 * is a much large number. */
1547 const U32 max_match = (LOC)
1551 : (invlist_highest(ssc->invlist) < 256)
1553 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1554 U32 count = 0; /* Running total of number of code points matched by
1556 UV start, end; /* Start and end points of current range in inversion
1559 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1561 invlist_iterinit(ssc->invlist);
1562 while (invlist_iternext(ssc->invlist, &start, &end)) {
1564 /* /u is the only thing that we expect to match above 255; so if not /u
1565 * and even if there are matches above 255, ignore them. This catches
1566 * things like \d under /d which does match the digits above 255, but
1567 * since the pattern is /d, it is not likely to be expecting them */
1568 if (! UNI_SEMANTICS) {
1572 end = MIN(end, 255);
1574 count += end - start + 1;
1575 if (count > max_match) {
1576 invlist_iterfinish(ssc->invlist);
1586 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1588 /* The inversion list in the SSC is marked mortal; now we need a more
1589 * permanent copy, which is stored the same way that is done in a regular
1590 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1593 SV* invlist = invlist_clone(ssc->invlist);
1595 PERL_ARGS_ASSERT_SSC_FINALIZE;
1597 assert(is_ANYOF_SYNTHETIC(ssc));
1599 /* The code in this file assumes that all but these flags aren't relevant
1600 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1601 * by the time we reach here */
1602 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1604 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1606 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1607 NULL, NULL, NULL, FALSE);
1609 /* Make sure is clone-safe */
1610 ssc->invlist = NULL;
1612 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1613 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1616 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1619 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1620 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1621 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1622 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1623 ? (TRIE_LIST_CUR( idx ) - 1) \
1629 dump_trie(trie,widecharmap,revcharmap)
1630 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1631 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1633 These routines dump out a trie in a somewhat readable format.
1634 The _interim_ variants are used for debugging the interim
1635 tables that are used to generate the final compressed
1636 representation which is what dump_trie expects.
1638 Part of the reason for their existence is to provide a form
1639 of documentation as to how the different representations function.
1644 Dumps the final compressed table form of the trie to Perl_debug_log.
1645 Used for debugging make_trie().
1649 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1650 AV *revcharmap, U32 depth)
1653 SV *sv=sv_newmortal();
1654 int colwidth= widecharmap ? 6 : 4;
1656 GET_RE_DEBUG_FLAGS_DECL;
1658 PERL_ARGS_ASSERT_DUMP_TRIE;
1660 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1661 (int)depth * 2 + 2,"",
1662 "Match","Base","Ofs" );
1664 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1665 SV ** const tmp = av_fetch( revcharmap, state, 0);
1667 PerlIO_printf( Perl_debug_log, "%*s",
1669 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670 PL_colors[0], PL_colors[1],
1671 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672 PERL_PV_ESCAPE_FIRSTCHAR
1677 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1678 (int)depth * 2 + 2,"");
1680 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1681 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1682 PerlIO_printf( Perl_debug_log, "\n");
1684 for( state = 1 ; state < trie->statecount ; state++ ) {
1685 const U32 base = trie->states[ state ].trans.base;
1687 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1688 (int)depth * 2 + 2,"", (UV)state);
1690 if ( trie->states[ state ].wordnum ) {
1691 PerlIO_printf( Perl_debug_log, " W%4X",
1692 trie->states[ state ].wordnum );
1694 PerlIO_printf( Perl_debug_log, "%6s", "" );
1697 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1702 while( ( base + ofs < trie->uniquecharcount ) ||
1703 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1704 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1708 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1710 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1711 if ( ( base + ofs >= trie->uniquecharcount )
1712 && ( base + ofs - trie->uniquecharcount
1714 && trie->trans[ base + ofs
1715 - trie->uniquecharcount ].check == state )
1717 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1719 (UV)trie->trans[ base + ofs
1720 - trie->uniquecharcount ].next );
1722 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1726 PerlIO_printf( Perl_debug_log, "]");
1729 PerlIO_printf( Perl_debug_log, "\n" );
1731 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1733 for (word=1; word <= trie->wordcount; word++) {
1734 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1735 (int)word, (int)(trie->wordinfo[word].prev),
1736 (int)(trie->wordinfo[word].len));
1738 PerlIO_printf(Perl_debug_log, "\n" );
1741 Dumps a fully constructed but uncompressed trie in list form.
1742 List tries normally only are used for construction when the number of
1743 possible chars (trie->uniquecharcount) is very high.
1744 Used for debugging make_trie().
1747 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1748 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1752 SV *sv=sv_newmortal();
1753 int colwidth= widecharmap ? 6 : 4;
1754 GET_RE_DEBUG_FLAGS_DECL;
1756 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1758 /* print out the table precompression. */
1759 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1760 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1761 "------:-----+-----------------\n" );
1763 for( state=1 ; state < next_alloc ; state ++ ) {
1766 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1767 (int)depth * 2 + 2,"", (UV)state );
1768 if ( ! trie->states[ state ].wordnum ) {
1769 PerlIO_printf( Perl_debug_log, "%5s| ","");
1771 PerlIO_printf( Perl_debug_log, "W%4x| ",
1772 trie->states[ state ].wordnum
1775 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1776 SV ** const tmp = av_fetch( revcharmap,
1777 TRIE_LIST_ITEM(state,charid).forid, 0);
1779 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1781 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1783 PL_colors[0], PL_colors[1],
1784 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1785 | PERL_PV_ESCAPE_FIRSTCHAR
1787 TRIE_LIST_ITEM(state,charid).forid,
1788 (UV)TRIE_LIST_ITEM(state,charid).newstate
1791 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1792 (int)((depth * 2) + 14), "");
1795 PerlIO_printf( Perl_debug_log, "\n");
1800 Dumps a fully constructed but uncompressed trie in table form.
1801 This is the normal DFA style state transition table, with a few
1802 twists to facilitate compression later.
1803 Used for debugging make_trie().
1806 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1807 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1812 SV *sv=sv_newmortal();
1813 int colwidth= widecharmap ? 6 : 4;
1814 GET_RE_DEBUG_FLAGS_DECL;
1816 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1819 print out the table precompression so that we can do a visual check
1820 that they are identical.
1823 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1825 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1826 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1828 PerlIO_printf( Perl_debug_log, "%*s",
1830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1831 PL_colors[0], PL_colors[1],
1832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1833 PERL_PV_ESCAPE_FIRSTCHAR
1839 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1841 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1845 PerlIO_printf( Perl_debug_log, "\n" );
1847 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1849 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1850 (int)depth * 2 + 2,"",
1851 (UV)TRIE_NODENUM( state ) );
1853 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1854 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1856 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1858 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1860 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1861 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1862 (UV)trie->trans[ state ].check );
1864 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1865 (UV)trie->trans[ state ].check,
1866 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1874 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1875 startbranch: the first branch in the whole branch sequence
1876 first : start branch of sequence of branch-exact nodes.
1877 May be the same as startbranch
1878 last : Thing following the last branch.
1879 May be the same as tail.
1880 tail : item following the branch sequence
1881 count : words in the sequence
1882 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1883 depth : indent depth
1885 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1887 A trie is an N'ary tree where the branches are determined by digital
1888 decomposition of the key. IE, at the root node you look up the 1st character and
1889 follow that branch repeat until you find the end of the branches. Nodes can be
1890 marked as "accepting" meaning they represent a complete word. Eg:
1894 would convert into the following structure. Numbers represent states, letters
1895 following numbers represent valid transitions on the letter from that state, if
1896 the number is in square brackets it represents an accepting state, otherwise it
1897 will be in parenthesis.
1899 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1903 (1) +-i->(6)-+-s->[7]
1905 +-s->(3)-+-h->(4)-+-e->[5]
1907 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1909 This shows that when matching against the string 'hers' we will begin at state 1
1910 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1911 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1912 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1913 single traverse. We store a mapping from accepting to state to which word was
1914 matched, and then when we have multiple possibilities we try to complete the
1915 rest of the regex in the order in which they occured in the alternation.
1917 The only prior NFA like behaviour that would be changed by the TRIE support is
1918 the silent ignoring of duplicate alternations which are of the form:
1920 / (DUPE|DUPE) X? (?{ ... }) Y /x
1922 Thus EVAL blocks following a trie may be called a different number of times with
1923 and without the optimisation. With the optimisations dupes will be silently
1924 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1925 the following demonstrates:
1927 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1929 which prints out 'word' three times, but
1931 'words'=~/(word|word|word)(?{ print $1 })S/
1933 which doesnt print it out at all. This is due to other optimisations kicking in.
1935 Example of what happens on a structural level:
1937 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1939 1: CURLYM[1] {1,32767}(18)
1950 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1951 and should turn into:
1953 1: CURLYM[1] {1,32767}(18)
1955 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1963 Cases where tail != last would be like /(?foo|bar)baz/:
1973 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1974 and would end up looking like:
1977 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1984 d = uvchr_to_utf8_flags(d, uv, 0);
1986 is the recommended Unicode-aware way of saying
1991 #define TRIE_STORE_REVCHAR(val) \
1994 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1995 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1996 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1997 SvCUR_set(zlopp, kapow - flrbbbbb); \
2000 av_push(revcharmap, zlopp); \
2002 char ooooff = (char)val; \
2003 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2007 /* This gets the next character from the input, folding it if not already
2009 #define TRIE_READ_CHAR STMT_START { \
2012 /* if it is UTF then it is either already folded, or does not need \
2014 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2016 else if (folder == PL_fold_latin1) { \
2017 /* This folder implies Unicode rules, which in the range expressible \
2018 * by not UTF is the lower case, with the two exceptions, one of \
2019 * which should have been taken care of before calling this */ \
2020 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2021 uvc = toLOWER_L1(*uc); \
2022 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2025 /* raw data, will be folded later if needed */ \
2033 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2034 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2035 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2036 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2038 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2039 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2040 TRIE_LIST_CUR( state )++; \
2043 #define TRIE_LIST_NEW(state) STMT_START { \
2044 Newxz( trie->states[ state ].trans.list, \
2045 4, reg_trie_trans_le ); \
2046 TRIE_LIST_CUR( state ) = 1; \
2047 TRIE_LIST_LEN( state ) = 4; \
2050 #define TRIE_HANDLE_WORD(state) STMT_START { \
2051 U16 dupe= trie->states[ state ].wordnum; \
2052 regnode * const noper_next = regnext( noper ); \
2055 /* store the word for dumping */ \
2057 if (OP(noper) != NOTHING) \
2058 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2060 tmp = newSVpvn_utf8( "", 0, UTF ); \
2061 av_push( trie_words, tmp ); \
2065 trie->wordinfo[curword].prev = 0; \
2066 trie->wordinfo[curword].len = wordlen; \
2067 trie->wordinfo[curword].accept = state; \
2069 if ( noper_next < tail ) { \
2071 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2073 trie->jump[curword] = (U16)(noper_next - convert); \
2075 jumper = noper_next; \
2077 nextbranch= regnext(cur); \
2081 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2082 /* chain, so that when the bits of chain are later */\
2083 /* linked together, the dups appear in the chain */\
2084 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2085 trie->wordinfo[dupe].prev = curword; \
2087 /* we haven't inserted this word yet. */ \
2088 trie->states[ state ].wordnum = curword; \
2093 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2094 ( ( base + charid >= ucharcount \
2095 && base + charid < ubound \
2096 && state == trie->trans[ base - ucharcount + charid ].check \
2097 && trie->trans[ base - ucharcount + charid ].next ) \
2098 ? trie->trans[ base - ucharcount + charid ].next \
2099 : ( state==1 ? special : 0 ) \
2103 #define MADE_JUMP_TRIE 2
2104 #define MADE_EXACT_TRIE 4
2107 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2108 regnode *first, regnode *last, regnode *tail,
2109 U32 word_count, U32 flags, U32 depth)
2111 /* first pass, loop through and scan words */
2112 reg_trie_data *trie;
2113 HV *widecharmap = NULL;
2114 AV *revcharmap = newAV();
2120 regnode *jumper = NULL;
2121 regnode *nextbranch = NULL;
2122 regnode *convert = NULL;
2123 U32 *prev_states; /* temp array mapping each state to previous one */
2124 /* we just use folder as a flag in utf8 */
2125 const U8 * folder = NULL;
2128 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2129 AV *trie_words = NULL;
2130 /* along with revcharmap, this only used during construction but both are
2131 * useful during debugging so we store them in the struct when debugging.
2134 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2135 STRLEN trie_charcount=0;
2137 SV *re_trie_maxbuff;
2138 GET_RE_DEBUG_FLAGS_DECL;
2140 PERL_ARGS_ASSERT_MAKE_TRIE;
2142 PERL_UNUSED_ARG(depth);
2146 case EXACT: case EXACTL: break;
2150 case EXACTFLU8: folder = PL_fold_latin1; break;
2151 case EXACTF: folder = PL_fold; break;
2152 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2155 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2157 trie->startstate = 1;
2158 trie->wordcount = word_count;
2159 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2160 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2161 if (flags == EXACT || flags == EXACTL)
2162 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2163 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2164 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2167 trie_words = newAV();
2170 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2171 assert(re_trie_maxbuff);
2172 if (!SvIOK(re_trie_maxbuff)) {
2173 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2175 DEBUG_TRIE_COMPILE_r({
2176 PerlIO_printf( Perl_debug_log,
2177 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2178 (int)depth * 2 + 2, "",
2179 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2180 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2183 /* Find the node we are going to overwrite */
2184 if ( first == startbranch && OP( last ) != BRANCH ) {
2185 /* whole branch chain */
2188 /* branch sub-chain */
2189 convert = NEXTOPER( first );
2192 /* -- First loop and Setup --
2194 We first traverse the branches and scan each word to determine if it
2195 contains widechars, and how many unique chars there are, this is
2196 important as we have to build a table with at least as many columns as we
2199 We use an array of integers to represent the character codes 0..255
2200 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2201 the native representation of the character value as the key and IV's for
2204 *TODO* If we keep track of how many times each character is used we can
2205 remap the columns so that the table compression later on is more
2206 efficient in terms of memory by ensuring the most common value is in the
2207 middle and the least common are on the outside. IMO this would be better
2208 than a most to least common mapping as theres a decent chance the most
2209 common letter will share a node with the least common, meaning the node
2210 will not be compressible. With a middle is most common approach the worst
2211 case is when we have the least common nodes twice.
2215 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2216 regnode *noper = NEXTOPER( cur );
2217 const U8 *uc = (U8*)STRING( noper );
2218 const U8 *e = uc + STR_LEN( noper );
2220 U32 wordlen = 0; /* required init */
2221 STRLEN minchars = 0;
2222 STRLEN maxchars = 0;
2223 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2226 if (OP(noper) == NOTHING) {
2227 regnode *noper_next= regnext(noper);
2228 if (noper_next != tail && OP(noper_next) == flags) {
2230 uc= (U8*)STRING(noper);
2231 e= uc + STR_LEN(noper);
2232 trie->minlen= STR_LEN(noper);
2239 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2240 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2241 regardless of encoding */
2242 if (OP( noper ) == EXACTFU_SS) {
2243 /* false positives are ok, so just set this */
2244 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2247 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2249 TRIE_CHARCOUNT(trie)++;
2252 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2253 * is in effect. Under /i, this character can match itself, or
2254 * anything that folds to it. If not under /i, it can match just
2255 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2256 * all fold to k, and all are single characters. But some folds
2257 * expand to more than one character, so for example LATIN SMALL
2258 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2259 * the string beginning at 'uc' is 'ffi', it could be matched by
2260 * three characters, or just by the one ligature character. (It
2261 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2262 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2263 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2264 * match.) The trie needs to know the minimum and maximum number
2265 * of characters that could match so that it can use size alone to
2266 * quickly reject many match attempts. The max is simple: it is
2267 * the number of folded characters in this branch (since a fold is
2268 * never shorter than what folds to it. */
2272 /* And the min is equal to the max if not under /i (indicated by
2273 * 'folder' being NULL), or there are no multi-character folds. If
2274 * there is a multi-character fold, the min is incremented just
2275 * once, for the character that folds to the sequence. Each
2276 * character in the sequence needs to be added to the list below of
2277 * characters in the trie, but we count only the first towards the
2278 * min number of characters needed. This is done through the
2279 * variable 'foldlen', which is returned by the macros that look
2280 * for these sequences as the number of bytes the sequence
2281 * occupies. Each time through the loop, we decrement 'foldlen' by
2282 * how many bytes the current char occupies. Only when it reaches
2283 * 0 do we increment 'minchars' or look for another multi-character
2285 if (folder == NULL) {
2288 else if (foldlen > 0) {
2289 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2294 /* See if *uc is the beginning of a multi-character fold. If
2295 * so, we decrement the length remaining to look at, to account
2296 * for the current character this iteration. (We can use 'uc'
2297 * instead of the fold returned by TRIE_READ_CHAR because for
2298 * non-UTF, the latin1_safe macro is smart enough to account
2299 * for all the unfolded characters, and because for UTF, the
2300 * string will already have been folded earlier in the
2301 * compilation process */
2303 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2304 foldlen -= UTF8SKIP(uc);
2307 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2312 /* The current character (and any potential folds) should be added
2313 * to the possible matching characters for this position in this
2317 U8 folded= folder[ (U8) uvc ];
2318 if ( !trie->charmap[ folded ] ) {
2319 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2320 TRIE_STORE_REVCHAR( folded );
2323 if ( !trie->charmap[ uvc ] ) {
2324 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2325 TRIE_STORE_REVCHAR( uvc );
2328 /* store the codepoint in the bitmap, and its folded
2330 TRIE_BITMAP_SET(trie, uvc);
2332 /* store the folded codepoint */
2333 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2336 /* store first byte of utf8 representation of
2337 variant codepoints */
2338 if (! UVCHR_IS_INVARIANT(uvc)) {
2339 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2342 set_bit = 0; /* We've done our bit :-) */
2346 /* XXX We could come up with the list of code points that fold
2347 * to this using PL_utf8_foldclosures, except not for
2348 * multi-char folds, as there may be multiple combinations
2349 * there that could work, which needs to wait until runtime to
2350 * resolve (The comment about LIGATURE FFI above is such an
2355 widecharmap = newHV();
2357 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2360 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2362 if ( !SvTRUE( *svpp ) ) {
2363 sv_setiv( *svpp, ++trie->uniquecharcount );
2364 TRIE_STORE_REVCHAR(uvc);
2367 } /* end loop through characters in this branch of the trie */
2369 /* We take the min and max for this branch and combine to find the min
2370 * and max for all branches processed so far */
2371 if( cur == first ) {
2372 trie->minlen = minchars;
2373 trie->maxlen = maxchars;
2374 } else if (minchars < trie->minlen) {
2375 trie->minlen = minchars;
2376 } else if (maxchars > trie->maxlen) {
2377 trie->maxlen = maxchars;
2379 } /* end first pass */
2380 DEBUG_TRIE_COMPILE_r(
2381 PerlIO_printf( Perl_debug_log,
2382 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2383 (int)depth * 2 + 2,"",
2384 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2385 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2386 (int)trie->minlen, (int)trie->maxlen )
2390 We now know what we are dealing with in terms of unique chars and
2391 string sizes so we can calculate how much memory a naive
2392 representation using a flat table will take. If it's over a reasonable
2393 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2394 conservative but potentially much slower representation using an array
2397 At the end we convert both representations into the same compressed
2398 form that will be used in regexec.c for matching with. The latter
2399 is a form that cannot be used to construct with but has memory
2400 properties similar to the list form and access properties similar
2401 to the table form making it both suitable for fast searches and
2402 small enough that its feasable to store for the duration of a program.
2404 See the comment in the code where the compressed table is produced
2405 inplace from the flat tabe representation for an explanation of how
2406 the compression works.
2411 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2414 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2415 > SvIV(re_trie_maxbuff) )
2418 Second Pass -- Array Of Lists Representation
2420 Each state will be represented by a list of charid:state records
2421 (reg_trie_trans_le) the first such element holds the CUR and LEN
2422 points of the allocated array. (See defines above).
2424 We build the initial structure using the lists, and then convert
2425 it into the compressed table form which allows faster lookups
2426 (but cant be modified once converted).
2429 STRLEN transcount = 1;
2431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2432 "%*sCompiling trie using list compiler\n",
2433 (int)depth * 2 + 2, ""));
2435 trie->states = (reg_trie_state *)
2436 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2437 sizeof(reg_trie_state) );
2441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2443 regnode *noper = NEXTOPER( cur );
2444 U8 *uc = (U8*)STRING( noper );
2445 const U8 *e = uc + STR_LEN( noper );
2446 U32 state = 1; /* required init */
2447 U16 charid = 0; /* sanity init */
2448 U32 wordlen = 0; /* required init */
2450 if (OP(noper) == NOTHING) {
2451 regnode *noper_next= regnext(noper);
2452 if (noper_next != tail && OP(noper_next) == flags) {
2454 uc= (U8*)STRING(noper);
2455 e= uc + STR_LEN(noper);
2459 if (OP(noper) != NOTHING) {
2460 for ( ; uc < e ; uc += len ) {
2465 charid = trie->charmap[ uvc ];
2467 SV** const svpp = hv_fetch( widecharmap,
2474 charid=(U16)SvIV( *svpp );
2477 /* charid is now 0 if we dont know the char read, or
2478 * nonzero if we do */
2485 if ( !trie->states[ state ].trans.list ) {
2486 TRIE_LIST_NEW( state );
2489 check <= TRIE_LIST_USED( state );
2492 if ( TRIE_LIST_ITEM( state, check ).forid
2495 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2500 newstate = next_alloc++;
2501 prev_states[newstate] = state;
2502 TRIE_LIST_PUSH( state, charid, newstate );
2507 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2511 TRIE_HANDLE_WORD(state);
2513 } /* end second pass */
2515 /* next alloc is the NEXT state to be allocated */
2516 trie->statecount = next_alloc;
2517 trie->states = (reg_trie_state *)
2518 PerlMemShared_realloc( trie->states,
2520 * sizeof(reg_trie_state) );
2522 /* and now dump it out before we compress it */
2523 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2524 revcharmap, next_alloc,
2528 trie->trans = (reg_trie_trans *)
2529 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2536 for( state=1 ; state < next_alloc ; state ++ ) {
2540 DEBUG_TRIE_COMPILE_MORE_r(
2541 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2545 if (trie->states[state].trans.list) {
2546 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2550 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2551 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2552 if ( forid < minid ) {
2554 } else if ( forid > maxid ) {
2558 if ( transcount < tp + maxid - minid + 1) {
2560 trie->trans = (reg_trie_trans *)
2561 PerlMemShared_realloc( trie->trans,
2563 * sizeof(reg_trie_trans) );
2564 Zero( trie->trans + (transcount / 2),
2568 base = trie->uniquecharcount + tp - minid;
2569 if ( maxid == minid ) {
2571 for ( ; zp < tp ; zp++ ) {
2572 if ( ! trie->trans[ zp ].next ) {
2573 base = trie->uniquecharcount + zp - minid;
2574 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2576 trie->trans[ zp ].check = state;
2582 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2584 trie->trans[ tp ].check = state;
2589 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2590 const U32 tid = base
2591 - trie->uniquecharcount
2592 + TRIE_LIST_ITEM( state, idx ).forid;
2593 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2595 trie->trans[ tid ].check = state;
2597 tp += ( maxid - minid + 1 );
2599 Safefree(trie->states[ state ].trans.list);
2602 DEBUG_TRIE_COMPILE_MORE_r(
2603 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2606 trie->states[ state ].trans.base=base;
2608 trie->lasttrans = tp + 1;
2612 Second Pass -- Flat Table Representation.
2614 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2615 each. We know that we will need Charcount+1 trans at most to store
2616 the data (one row per char at worst case) So we preallocate both
2617 structures assuming worst case.
2619 We then construct the trie using only the .next slots of the entry
2622 We use the .check field of the first entry of the node temporarily
2623 to make compression both faster and easier by keeping track of how
2624 many non zero fields are in the node.
2626 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2629 There are two terms at use here: state as a TRIE_NODEIDX() which is
2630 a number representing the first entry of the node, and state as a
2631 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2632 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2633 if there are 2 entrys per node. eg:
2641 The table is internally in the right hand, idx form. However as we
2642 also have to deal with the states array which is indexed by nodenum
2643 we have to use TRIE_NODENUM() to convert.
2646 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2647 "%*sCompiling trie using table compiler\n",
2648 (int)depth * 2 + 2, ""));
2650 trie->trans = (reg_trie_trans *)
2651 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2652 * trie->uniquecharcount + 1,
2653 sizeof(reg_trie_trans) );
2654 trie->states = (reg_trie_state *)
2655 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2656 sizeof(reg_trie_state) );
2657 next_alloc = trie->uniquecharcount + 1;
2660 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2662 regnode *noper = NEXTOPER( cur );
2663 const U8 *uc = (U8*)STRING( noper );
2664 const U8 *e = uc + STR_LEN( noper );
2666 U32 state = 1; /* required init */
2668 U16 charid = 0; /* sanity init */
2669 U32 accept_state = 0; /* sanity init */
2671 U32 wordlen = 0; /* required init */
2673 if (OP(noper) == NOTHING) {
2674 regnode *noper_next= regnext(noper);
2675 if (noper_next != tail && OP(noper_next) == flags) {
2677 uc= (U8*)STRING(noper);
2678 e= uc + STR_LEN(noper);
2682 if ( OP(noper) != NOTHING ) {
2683 for ( ; uc < e ; uc += len ) {
2688 charid = trie->charmap[ uvc ];
2690 SV* const * const svpp = hv_fetch( widecharmap,
2694 charid = svpp ? (U16)SvIV(*svpp) : 0;
2698 if ( !trie->trans[ state + charid ].next ) {
2699 trie->trans[ state + charid ].next = next_alloc;
2700 trie->trans[ state ].check++;
2701 prev_states[TRIE_NODENUM(next_alloc)]
2702 = TRIE_NODENUM(state);
2703 next_alloc += trie->uniquecharcount;
2705 state = trie->trans[ state + charid ].next;
2707 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2709 /* charid is now 0 if we dont know the char read, or
2710 * nonzero if we do */
2713 accept_state = TRIE_NODENUM( state );
2714 TRIE_HANDLE_WORD(accept_state);
2716 } /* end second pass */
2718 /* and now dump it out before we compress it */
2719 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2721 next_alloc, depth+1));
2725 * Inplace compress the table.*
2727 For sparse data sets the table constructed by the trie algorithm will
2728 be mostly 0/FAIL transitions or to put it another way mostly empty.
2729 (Note that leaf nodes will not contain any transitions.)
2731 This algorithm compresses the tables by eliminating most such
2732 transitions, at the cost of a modest bit of extra work during lookup:
2734 - Each states[] entry contains a .base field which indicates the
2735 index in the state[] array wheres its transition data is stored.
2737 - If .base is 0 there are no valid transitions from that node.
2739 - If .base is nonzero then charid is added to it to find an entry in
2742 -If trans[states[state].base+charid].check!=state then the
2743 transition is taken to be a 0/Fail transition. Thus if there are fail
2744 transitions at the front of the node then the .base offset will point
2745 somewhere inside the previous nodes data (or maybe even into a node
2746 even earlier), but the .check field determines if the transition is
2750 The following process inplace converts the table to the compressed
2751 table: We first do not compress the root node 1,and mark all its
2752 .check pointers as 1 and set its .base pointer as 1 as well. This
2753 allows us to do a DFA construction from the compressed table later,
2754 and ensures that any .base pointers we calculate later are greater
2757 - We set 'pos' to indicate the first entry of the second node.
2759 - We then iterate over the columns of the node, finding the first and
2760 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2761 and set the .check pointers accordingly, and advance pos
2762 appropriately and repreat for the next node. Note that when we copy
2763 the next pointers we have to convert them from the original
2764 NODEIDX form to NODENUM form as the former is not valid post
2767 - If a node has no transitions used we mark its base as 0 and do not
2768 advance the pos pointer.
2770 - If a node only has one transition we use a second pointer into the
2771 structure to fill in allocated fail transitions from other states.
2772 This pointer is independent of the main pointer and scans forward
2773 looking for null transitions that are allocated to a state. When it
2774 finds one it writes the single transition into the "hole". If the
2775 pointer doesnt find one the single transition is appended as normal.
2777 - Once compressed we can Renew/realloc the structures to release the
2780 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2781 specifically Fig 3.47 and the associated pseudocode.
2785 const U32 laststate = TRIE_NODENUM( next_alloc );
2788 trie->statecount = laststate;
2790 for ( state = 1 ; state < laststate ; state++ ) {
2792 const U32 stateidx = TRIE_NODEIDX( state );
2793 const U32 o_used = trie->trans[ stateidx ].check;
2794 U32 used = trie->trans[ stateidx ].check;
2795 trie->trans[ stateidx ].check = 0;
2798 used && charid < trie->uniquecharcount;
2801 if ( flag || trie->trans[ stateidx + charid ].next ) {
2802 if ( trie->trans[ stateidx + charid ].next ) {
2804 for ( ; zp < pos ; zp++ ) {
2805 if ( ! trie->trans[ zp ].next ) {
2809 trie->states[ state ].trans.base
2811 + trie->uniquecharcount
2813 trie->trans[ zp ].next
2814 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2816 trie->trans[ zp ].check = state;
2817 if ( ++zp > pos ) pos = zp;
2824 trie->states[ state ].trans.base
2825 = pos + trie->uniquecharcount - charid ;
2827 trie->trans[ pos ].next
2828 = SAFE_TRIE_NODENUM(
2829 trie->trans[ stateidx + charid ].next );
2830 trie->trans[ pos ].check = state;
2835 trie->lasttrans = pos + 1;
2836 trie->states = (reg_trie_state *)
2837 PerlMemShared_realloc( trie->states, laststate
2838 * sizeof(reg_trie_state) );
2839 DEBUG_TRIE_COMPILE_MORE_r(
2840 PerlIO_printf( Perl_debug_log,
2841 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2842 (int)depth * 2 + 2,"",
2843 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2847 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2850 } /* end table compress */
2852 DEBUG_TRIE_COMPILE_MORE_r(
2853 PerlIO_printf(Perl_debug_log,
2854 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2855 (int)depth * 2 + 2, "",
2856 (UV)trie->statecount,
2857 (UV)trie->lasttrans)
2859 /* resize the trans array to remove unused space */
2860 trie->trans = (reg_trie_trans *)
2861 PerlMemShared_realloc( trie->trans, trie->lasttrans
2862 * sizeof(reg_trie_trans) );
2864 { /* Modify the program and insert the new TRIE node */
2865 U8 nodetype =(U8)(flags & 0xFF);
2869 regnode *optimize = NULL;
2870 #ifdef RE_TRACK_PATTERN_OFFSETS
2873 U32 mjd_nodelen = 0;
2874 #endif /* RE_TRACK_PATTERN_OFFSETS */
2875 #endif /* DEBUGGING */
2877 This means we convert either the first branch or the first Exact,
2878 depending on whether the thing following (in 'last') is a branch
2879 or not and whther first is the startbranch (ie is it a sub part of
2880 the alternation or is it the whole thing.)
2881 Assuming its a sub part we convert the EXACT otherwise we convert
2882 the whole branch sequence, including the first.
2884 /* Find the node we are going to overwrite */
2885 if ( first != startbranch || OP( last ) == BRANCH ) {
2886 /* branch sub-chain */
2887 NEXT_OFF( first ) = (U16)(last - first);
2888 #ifdef RE_TRACK_PATTERN_OFFSETS
2890 mjd_offset= Node_Offset((convert));
2891 mjd_nodelen= Node_Length((convert));
2894 /* whole branch chain */
2896 #ifdef RE_TRACK_PATTERN_OFFSETS
2899 const regnode *nop = NEXTOPER( convert );
2900 mjd_offset= Node_Offset((nop));
2901 mjd_nodelen= Node_Length((nop));
2905 PerlIO_printf(Perl_debug_log,
2906 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2907 (int)depth * 2 + 2, "",
2908 (UV)mjd_offset, (UV)mjd_nodelen)
2911 /* But first we check to see if there is a common prefix we can
2912 split out as an EXACT and put in front of the TRIE node. */
2913 trie->startstate= 1;
2914 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2916 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2920 const U32 base = trie->states[ state ].trans.base;
2922 if ( trie->states[state].wordnum )
2925 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2926 if ( ( base + ofs >= trie->uniquecharcount ) &&
2927 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2928 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2930 if ( ++count > 1 ) {
2931 SV **tmp = av_fetch( revcharmap, ofs, 0);
2932 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2933 if ( state == 1 ) break;
2935 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2937 PerlIO_printf(Perl_debug_log,
2938 "%*sNew Start State=%"UVuf" Class: [",
2939 (int)depth * 2 + 2, "",
2942 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2943 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2945 TRIE_BITMAP_SET(trie,*ch);
2947 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2949 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2953 TRIE_BITMAP_SET(trie,*ch);
2955 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2956 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2962 SV **tmp = av_fetch( revcharmap, idx, 0);
2964 char *ch = SvPV( *tmp, len );
2966 SV *sv=sv_newmortal();
2967 PerlIO_printf( Perl_debug_log,
2968 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2969 (int)depth * 2 + 2, "",
2971 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2972 PL_colors[0], PL_colors[1],
2973 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2974 PERL_PV_ESCAPE_FIRSTCHAR
2979 OP( convert ) = nodetype;
2980 str=STRING(convert);
2983 STR_LEN(convert) += len;
2989 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2994 trie->prefixlen = (state-1);
2996 regnode *n = convert+NODE_SZ_STR(convert);
2997 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2998 trie->startstate = state;
2999 trie->minlen -= (state - 1);
3000 trie->maxlen -= (state - 1);
3002 /* At least the UNICOS C compiler choked on this
3003 * being argument to DEBUG_r(), so let's just have
3006 #ifdef PERL_EXT_RE_BUILD
3012 regnode *fix = convert;
3013 U32 word = trie->wordcount;
3015 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3016 while( ++fix < n ) {
3017 Set_Node_Offset_Length(fix, 0, 0);
3020 SV ** const tmp = av_fetch( trie_words, word, 0 );
3022 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3023 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3025 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3033 NEXT_OFF(convert) = (U16)(tail - convert);
3034 DEBUG_r(optimize= n);
3040 if ( trie->maxlen ) {
3041 NEXT_OFF( convert ) = (U16)(tail - convert);
3042 ARG_SET( convert, data_slot );
3043 /* Store the offset to the first unabsorbed branch in
3044 jump[0], which is otherwise unused by the jump logic.
3045 We use this when dumping a trie and during optimisation. */
3047 trie->jump[0] = (U16)(nextbranch - convert);
3049 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3050 * and there is a bitmap
3051 * and the first "jump target" node we found leaves enough room
3052 * then convert the TRIE node into a TRIEC node, with the bitmap
3053 * embedded inline in the opcode - this is hypothetically faster.
3055 if ( !trie->states[trie->startstate].wordnum
3057 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3059 OP( convert ) = TRIEC;
3060 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3061 PerlMemShared_free(trie->bitmap);
3064 OP( convert ) = TRIE;
3066 /* store the type in the flags */
3067 convert->flags = nodetype;
3071 + regarglen[ OP( convert ) ];
3073 /* XXX We really should free up the resource in trie now,
3074 as we won't use them - (which resources?) dmq */
3076 /* needed for dumping*/
3077 DEBUG_r(if (optimize) {
3078 regnode *opt = convert;
3080 while ( ++opt < optimize) {
3081 Set_Node_Offset_Length(opt,0,0);
3084 Try to clean up some of the debris left after the
3087 while( optimize < jumper ) {
3088 mjd_nodelen += Node_Length((optimize));
3089 OP( optimize ) = OPTIMIZED;
3090 Set_Node_Offset_Length(optimize,0,0);
3093 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3095 } /* end node insert */
3097 /* Finish populating the prev field of the wordinfo array. Walk back
3098 * from each accept state until we find another accept state, and if
3099 * so, point the first word's .prev field at the second word. If the
3100 * second already has a .prev field set, stop now. This will be the
3101 * case either if we've already processed that word's accept state,
3102 * or that state had multiple words, and the overspill words were
3103 * already linked up earlier.
3110 for (word=1; word <= trie->wordcount; word++) {
3112 if (trie->wordinfo[word].prev)
3114 state = trie->wordinfo[word].accept;
3116 state = prev_states[state];
3119 prev = trie->states[state].wordnum;
3123 trie->wordinfo[word].prev = prev;
3125 Safefree(prev_states);
3129 /* and now dump out the compressed format */
3130 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3132 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3134 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3135 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3137 SvREFCNT_dec_NN(revcharmap);
3141 : trie->startstate>1
3147 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3149 /* The Trie is constructed and compressed now so we can build a fail array if
3152 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3154 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3158 We find the fail state for each state in the trie, this state is the longest
3159 proper suffix of the current state's 'word' that is also a proper prefix of
3160 another word in our trie. State 1 represents the word '' and is thus the
3161 default fail state. This allows the DFA not to have to restart after its
3162 tried and failed a word at a given point, it simply continues as though it
3163 had been matching the other word in the first place.
3165 'abcdgu'=~/abcdefg|cdgu/
3166 When we get to 'd' we are still matching the first word, we would encounter
3167 'g' which would fail, which would bring us to the state representing 'd' in
3168 the second word where we would try 'g' and succeed, proceeding to match
3171 /* add a fail transition */
3172 const U32 trie_offset = ARG(source);
3173 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3175 const U32 ucharcount = trie->uniquecharcount;
3176 const U32 numstates = trie->statecount;
3177 const U32 ubound = trie->lasttrans + ucharcount;
3181 U32 base = trie->states[ 1 ].trans.base;
3184 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3186 GET_RE_DEBUG_FLAGS_DECL;
3188 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3189 PERL_UNUSED_CONTEXT;
3191 PERL_UNUSED_ARG(depth);
3194 if ( OP(source) == TRIE ) {
3195 struct regnode_1 *op = (struct regnode_1 *)
3196 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3197 StructCopy(source,op,struct regnode_1);
3198 stclass = (regnode *)op;
3200 struct regnode_charclass *op = (struct regnode_charclass *)
3201 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3202 StructCopy(source,op,struct regnode_charclass);
3203 stclass = (regnode *)op;
3205 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3207 ARG_SET( stclass, data_slot );
3208 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3209 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3210 aho->trie=trie_offset;
3211 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3212 Copy( trie->states, aho->states, numstates, reg_trie_state );
3213 Newxz( q, numstates, U32);
3214 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3217 /* initialize fail[0..1] to be 1 so that we always have
3218 a valid final fail state */
3219 fail[ 0 ] = fail[ 1 ] = 1;
3221 for ( charid = 0; charid < ucharcount ; charid++ ) {
3222 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3224 q[ q_write ] = newstate;
3225 /* set to point at the root */
3226 fail[ q[ q_write++ ] ]=1;
3229 while ( q_read < q_write) {
3230 const U32 cur = q[ q_read++ % numstates ];
3231 base = trie->states[ cur ].trans.base;
3233 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3234 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3236 U32 fail_state = cur;
3239 fail_state = fail[ fail_state ];
3240 fail_base = aho->states[ fail_state ].trans.base;
3241 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3243 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3244 fail[ ch_state ] = fail_state;
3245 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3247 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3249 q[ q_write++ % numstates] = ch_state;
3253 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3254 when we fail in state 1, this allows us to use the
3255 charclass scan to find a valid start char. This is based on the principle
3256 that theres a good chance the string being searched contains lots of stuff
3257 that cant be a start char.
3259 fail[ 0 ] = fail[ 1 ] = 0;
3260 DEBUG_TRIE_COMPILE_r({
3261 PerlIO_printf(Perl_debug_log,
3262 "%*sStclass Failtable (%"UVuf" states): 0",
3263 (int)(depth * 2), "", (UV)numstates
3265 for( q_read=1; q_read<numstates; q_read++ ) {
3266 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3268 PerlIO_printf(Perl_debug_log, "\n");
3271 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3276 #define DEBUG_PEEP(str,scan,depth) \
3277 DEBUG_OPTIMISE_r({if (scan){ \
3278 regnode *Next = regnext(scan); \
3279 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3280 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3281 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3282 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3283 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3284 PerlIO_printf(Perl_debug_log, "\n"); \
3287 /* The below joins as many adjacent EXACTish nodes as possible into a single
3288 * one. The regop may be changed if the node(s) contain certain sequences that
3289 * require special handling. The joining is only done if:
3290 * 1) there is room in the current conglomerated node to entirely contain the
3292 * 2) they are the exact same node type
3294 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3295 * these get optimized out
3297 * If a node is to match under /i (folded), the number of characters it matches
3298 * can be different than its character length if it contains a multi-character
3299 * fold. *min_subtract is set to the total delta number of characters of the
3302 * And *unfolded_multi_char is set to indicate whether or not the node contains
3303 * an unfolded multi-char fold. This happens when whether the fold is valid or
3304 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3305 * SMALL LETTER SHARP S, as only if the target string being matched against
3306 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3307 * folding rules depend on the locale in force at runtime. (Multi-char folds
3308 * whose components are all above the Latin1 range are not run-time locale
3309 * dependent, and have already been folded by the time this function is
3312 * This is as good a place as any to discuss the design of handling these
3313 * multi-character fold sequences. It's been wrong in Perl for a very long
3314 * time. There are three code points in Unicode whose multi-character folds
3315 * were long ago discovered to mess things up. The previous designs for
3316 * dealing with these involved assigning a special node for them. This
3317 * approach doesn't always work, as evidenced by this example:
3318 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3319 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3320 * would match just the \xDF, it won't be able to handle the case where a
3321 * successful match would have to cross the node's boundary. The new approach
3322 * that hopefully generally solves the problem generates an EXACTFU_SS node
3323 * that is "sss" in this case.
3325 * It turns out that there are problems with all multi-character folds, and not
3326 * just these three. Now the code is general, for all such cases. The
3327 * approach taken is:
3328 * 1) This routine examines each EXACTFish node that could contain multi-
3329 * character folded sequences. Since a single character can fold into
3330 * such a sequence, the minimum match length for this node is less than
3331 * the number of characters in the node. This routine returns in
3332 * *min_subtract how many characters to subtract from the the actual
3333 * length of the string to get a real minimum match length; it is 0 if
3334 * there are no multi-char foldeds. This delta is used by the caller to
3335 * adjust the min length of the match, and the delta between min and max,
3336 * so that the optimizer doesn't reject these possibilities based on size
3338 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3339 * is used for an EXACTFU node that contains at least one "ss" sequence in
3340 * it. For non-UTF-8 patterns and strings, this is the only case where
3341 * there is a possible fold length change. That means that a regular
3342 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3343 * with length changes, and so can be processed faster. regexec.c takes
3344 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3345 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3346 * known until runtime). This saves effort in regex matching. However,
3347 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3348 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3349 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3350 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3351 * possibilities for the non-UTF8 patterns are quite simple, except for
3352 * the sharp s. All the ones that don't involve a UTF-8 target string are
3353 * members of a fold-pair, and arrays are set up for all of them so that
3354 * the other member of the pair can be found quickly. Code elsewhere in
3355 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3356 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3357 * described in the next item.
3358 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3359 * validity of the fold won't be known until runtime, and so must remain
3360 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3361 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3362 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3363 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3364 * The reason this is a problem is that the optimizer part of regexec.c
3365 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3366 * that a character in the pattern corresponds to at most a single
3367 * character in the target string. (And I do mean character, and not byte
3368 * here, unlike other parts of the documentation that have never been
3369 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3370 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3371 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3372 * nodes, violate the assumption, and they are the only instances where it
3373 * is violated. I'm reluctant to try to change the assumption, as the
3374 * code involved is impenetrable to me (khw), so instead the code here
3375 * punts. This routine examines EXACTFL nodes, and (when the pattern
3376 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3377 * boolean indicating whether or not the node contains such a fold. When
3378 * it is true, the caller sets a flag that later causes the optimizer in
3379 * this file to not set values for the floating and fixed string lengths,
3380 * and thus avoids the optimizer code in regexec.c that makes the invalid
3381 * assumption. Thus, there is no optimization based on string lengths for
3382 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3383 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3384 * assumption is wrong only in these cases is that all other non-UTF-8
3385 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3386 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3387 * EXACTF nodes because we don't know at compile time if it actually
3388 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3389 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3390 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3391 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3392 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3393 * string would require the pattern to be forced into UTF-8, the overhead
3394 * of which we want to avoid. Similarly the unfolded multi-char folds in
3395 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3398 * Similarly, the code that generates tries doesn't currently handle
3399 * not-already-folded multi-char folds, and it looks like a pain to change
3400 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3401 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3402 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3403 * using /iaa matching will be doing so almost entirely with ASCII
3404 * strings, so this should rarely be encountered in practice */
3406 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3407 if (PL_regkind[OP(scan)] == EXACT) \
3408 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3411 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3412 UV *min_subtract, bool *unfolded_multi_char,
3413 U32 flags,regnode *val, U32 depth)
3415 /* Merge several consecutive EXACTish nodes into one. */
3416 regnode *n = regnext(scan);
3418 regnode *next = scan + NODE_SZ_STR(scan);
3422 regnode *stop = scan;
3423 GET_RE_DEBUG_FLAGS_DECL;
3425 PERL_UNUSED_ARG(depth);
3428 PERL_ARGS_ASSERT_JOIN_EXACT;
3429 #ifndef EXPERIMENTAL_INPLACESCAN
3430 PERL_UNUSED_ARG(flags);
3431 PERL_UNUSED_ARG(val);
3433 DEBUG_PEEP("join",scan,depth);
3435 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3436 * EXACT ones that are mergeable to the current one. */
3438 && (PL_regkind[OP(n)] == NOTHING
3439 || (stringok && OP(n) == OP(scan)))
3441 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3444 if (OP(n) == TAIL || n > next)
3446 if (PL_regkind[OP(n)] == NOTHING) {
3447 DEBUG_PEEP("skip:",n,depth);
3448 NEXT_OFF(scan) += NEXT_OFF(n);
3449 next = n + NODE_STEP_REGNODE;
3456 else if (stringok) {
3457 const unsigned int oldl = STR_LEN(scan);
3458 regnode * const nnext = regnext(n);
3460 /* XXX I (khw) kind of doubt that this works on platforms (should
3461 * Perl ever run on one) where U8_MAX is above 255 because of lots
3462 * of other assumptions */
3463 /* Don't join if the sum can't fit into a single node */
3464 if (oldl + STR_LEN(n) > U8_MAX)
3467 DEBUG_PEEP("merg",n,depth);
3470 NEXT_OFF(scan) += NEXT_OFF(n);
3471 STR_LEN(scan) += STR_LEN(n);
3472 next = n + NODE_SZ_STR(n);
3473 /* Now we can overwrite *n : */
3474 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3482 #ifdef EXPERIMENTAL_INPLACESCAN
3483 if (flags && !NEXT_OFF(n)) {
3484 DEBUG_PEEP("atch", val, depth);
3485 if (reg_off_by_arg[OP(n)]) {
3486 ARG_SET(n, val - n);
3489 NEXT_OFF(n) = val - n;
3497 *unfolded_multi_char = FALSE;
3499 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3500 * can now analyze for sequences of problematic code points. (Prior to
3501 * this final joining, sequences could have been split over boundaries, and
3502 * hence missed). The sequences only happen in folding, hence for any
3503 * non-EXACT EXACTish node */
3504 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3505 U8* s0 = (U8*) STRING(scan);
3507 U8* s_end = s0 + STR_LEN(scan);
3509 int total_count_delta = 0; /* Total delta number of characters that
3510 multi-char folds expand to */
3512 /* One pass is made over the node's string looking for all the
3513 * possibilities. To avoid some tests in the loop, there are two main
3514 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3519 if (OP(scan) == EXACTFL) {
3522 /* An EXACTFL node would already have been changed to another
3523 * node type unless there is at least one character in it that
3524 * is problematic; likely a character whose fold definition
3525 * won't be known until runtime, and so has yet to be folded.
3526 * For all but the UTF-8 locale, folds are 1-1 in length, but
3527 * to handle the UTF-8 case, we need to create a temporary
3528 * folded copy using UTF-8 locale rules in order to analyze it.
3529 * This is because our macros that look to see if a sequence is
3530 * a multi-char fold assume everything is folded (otherwise the
3531 * tests in those macros would be too complicated and slow).
3532 * Note that here, the non-problematic folds will have already
3533 * been done, so we can just copy such characters. We actually
3534 * don't completely fold the EXACTFL string. We skip the
3535 * unfolded multi-char folds, as that would just create work
3536 * below to figure out the size they already are */
3538 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3541 STRLEN s_len = UTF8SKIP(s);
3542 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3543 Copy(s, d, s_len, U8);
3546 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3547 *unfolded_multi_char = TRUE;
3548 Copy(s, d, s_len, U8);
3551 else if (isASCII(*s)) {
3552 *(d++) = toFOLD(*s);
3556 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3562 /* Point the remainder of the routine to look at our temporary
3566 } /* End of creating folded copy of EXACTFL string */
3568 /* Examine the string for a multi-character fold sequence. UTF-8
3569 * patterns have all characters pre-folded by the time this code is
3571 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3572 length sequence we are looking for is 2 */
3574 int count = 0; /* How many characters in a multi-char fold */
3575 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3576 if (! len) { /* Not a multi-char fold: get next char */
3581 /* Nodes with 'ss' require special handling, except for
3582 * EXACTFA-ish for which there is no multi-char fold to this */
3583 if (len == 2 && *s == 's' && *(s+1) == 's'
3584 && OP(scan) != EXACTFA
3585 && OP(scan) != EXACTFA_NO_TRIE)
3588 if (OP(scan) != EXACTFL) {
3589 OP(scan) = EXACTFU_SS;
3593 else { /* Here is a generic multi-char fold. */
3594 U8* multi_end = s + len;
3596 /* Count how many characters are in it. In the case of
3597 * /aa, no folds which contain ASCII code points are
3598 * allowed, so check for those, and skip if found. */
3599 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3600 count = utf8_length(s, multi_end);
3604 while (s < multi_end) {
3607 goto next_iteration;
3617 /* The delta is how long the sequence is minus 1 (1 is how long
3618 * the character that folds to the sequence is) */
3619 total_count_delta += count - 1;
3623 /* We created a temporary folded copy of the string in EXACTFL
3624 * nodes. Therefore we need to be sure it doesn't go below zero,
3625 * as the real string could be shorter */
3626 if (OP(scan) == EXACTFL) {
3627 int total_chars = utf8_length((U8*) STRING(scan),
3628 (U8*) STRING(scan) + STR_LEN(scan));
3629 if (total_count_delta > total_chars) {
3630 total_count_delta = total_chars;
3634 *min_subtract += total_count_delta;
3637 else if (OP(scan) == EXACTFA) {
3639 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3640 * fold to the ASCII range (and there are no existing ones in the
3641 * upper latin1 range). But, as outlined in the comments preceding
3642 * this function, we need to flag any occurrences of the sharp s.
3643 * This character forbids trie formation (because of added
3646 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3647 OP(scan) = EXACTFA_NO_TRIE;
3648 *unfolded_multi_char = TRUE;
3657 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3658 * folds that are all Latin1. As explained in the comments
3659 * preceding this function, we look also for the sharp s in EXACTF
3660 * and EXACTFL nodes; it can be in the final position. Otherwise
3661 * we can stop looking 1 byte earlier because have to find at least
3662 * two characters for a multi-fold */
3663 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3668 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3669 if (! len) { /* Not a multi-char fold. */
3670 if (*s == LATIN_SMALL_LETTER_SHARP_S
3671 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3673 *unfolded_multi_char = TRUE;
3680 && isALPHA_FOLD_EQ(*s, 's')
3681 && isALPHA_FOLD_EQ(*(s+1), 's'))
3684 /* EXACTF nodes need to know that the minimum length
3685 * changed so that a sharp s in the string can match this
3686 * ss in the pattern, but they remain EXACTF nodes, as they
3687 * won't match this unless the target string is is UTF-8,
3688 * which we don't know until runtime. EXACTFL nodes can't
3689 * transform into EXACTFU nodes */
3690 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3691 OP(scan) = EXACTFU_SS;
3695 *min_subtract += len - 1;
3702 /* Allow dumping but overwriting the collection of skipped
3703 * ops and/or strings with fake optimized ops */
3704 n = scan + NODE_SZ_STR(scan);
3712 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3716 /* REx optimizer. Converts nodes into quicker variants "in place".
3717 Finds fixed substrings. */
3719 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3720 to the position after last scanned or to NULL. */
3722 #define INIT_AND_WITHP \
3723 assert(!and_withp); \
3724 Newx(and_withp,1, regnode_ssc); \
3725 SAVEFREEPV(and_withp)
3729 S_unwind_scan_frames(pTHX_ const void *p)
3731 scan_frame *f= (scan_frame *)p;
3733 scan_frame *n= f->next_frame;
3741 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3742 SSize_t *minlenp, SSize_t *deltap,
3747 regnode_ssc *and_withp,
3748 U32 flags, U32 depth)
3749 /* scanp: Start here (read-write). */
3750 /* deltap: Write maxlen-minlen here. */
3751 /* last: Stop before this one. */
3752 /* data: string data about the pattern */
3753 /* stopparen: treat close N as END */
3754 /* recursed: which subroutines have we recursed into */
3755 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3757 /* There must be at least this number of characters to match */
3760 regnode *scan = *scanp, *next;
3762 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3763 int is_inf_internal = 0; /* The studied chunk is infinite */
3764 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3765 scan_data_t data_fake;
3766 SV *re_trie_maxbuff = NULL;
3767 regnode *first_non_open = scan;
3768 SSize_t stopmin = SSize_t_MAX;
3769 scan_frame *frame = NULL;
3770 GET_RE_DEBUG_FLAGS_DECL;
3772 PERL_ARGS_ASSERT_STUDY_CHUNK;
3776 while (first_non_open && OP(first_non_open) == OPEN)
3777 first_non_open=regnext(first_non_open);
3783 RExC_study_chunk_recursed_count++;
3785 DEBUG_OPTIMISE_MORE_r(
3787 PerlIO_printf(Perl_debug_log,
3788 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3789 (int)(depth*2), "", (long)stopparen,
3790 (unsigned long)RExC_study_chunk_recursed_count,
3791 (unsigned long)depth, (unsigned long)recursed_depth,
3794 if (recursed_depth) {
3797 for ( j = 0 ; j < recursed_depth ; j++ ) {
3798 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3800 PAREN_TEST(RExC_study_chunk_recursed +
3801 ( j * RExC_study_chunk_recursed_bytes), i )
3804 !PAREN_TEST(RExC_study_chunk_recursed +
3805 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3808 PerlIO_printf(Perl_debug_log," %d",(int)i);
3812 if ( j + 1 < recursed_depth ) {
3813 PerlIO_printf(Perl_debug_log, ",");
3817 PerlIO_printf(Perl_debug_log,"\n");
3820 while ( scan && OP(scan) != END && scan < last ){
3821 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3822 node length to get a real minimum (because
3823 the folded version may be shorter) */
3824 bool unfolded_multi_char = FALSE;
3825 /* Peephole optimizer: */
3826 DEBUG_STUDYDATA("Peep:", data, depth);
3827 DEBUG_PEEP("Peep", scan, depth);
3830 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3831 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3832 * by a different invocation of reg() -- Yves
3834 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3836 /* Follow the next-chain of the current node and optimize
3837 away all the NOTHINGs from it. */
3838 if (OP(scan) != CURLYX) {
3839 const int max = (reg_off_by_arg[OP(scan)]
3841 /* I32 may be smaller than U16 on CRAYs! */
3842 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3843 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3847 /* Skip NOTHING and LONGJMP. */
3848 while ((n = regnext(n))
3849 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3850 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3851 && off + noff < max)
3853 if (reg_off_by_arg[OP(scan)])
3856 NEXT_OFF(scan) = off;
3859 /* The principal pseudo-switch. Cannot be a switch, since we
3860 look into several different things. */
3861 if ( OP(scan) == DEFINEP ) {
3863 SSize_t deltanext = 0;
3864 SSize_t fake_last_close = 0;
3865 I32 f = SCF_IN_DEFINE;
3867 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3868 scan = regnext(scan);
3869 assert( OP(scan) == IFTHEN );
3870 DEBUG_PEEP("expect IFTHEN", scan, depth);
3872 data_fake.last_closep= &fake_last_close;
3874 next = regnext(scan);
3875 scan = NEXTOPER(NEXTOPER(scan));
3876 DEBUG_PEEP("scan", scan, depth);
3877 DEBUG_PEEP("next", next, depth);
3879 /* we suppose the run is continuous, last=next...
3880 * NOTE we dont use the return here! */
3881 (void)study_chunk(pRExC_state, &scan, &minlen,
3882 &deltanext, next, &data_fake, stopparen,
3883 recursed_depth, NULL, f, depth+1);
3888 OP(scan) == BRANCH ||
3889 OP(scan) == BRANCHJ ||
3892 next = regnext(scan);
3895 /* The op(next)==code check below is to see if we
3896 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3897 * IFTHEN is special as it might not appear in pairs.
3898 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3899 * we dont handle it cleanly. */
3900 if (OP(next) == code || code == IFTHEN) {
3901 /* NOTE - There is similar code to this block below for
3902 * handling TRIE nodes on a re-study. If you change stuff here
3903 * check there too. */
3904 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3906 regnode * const startbranch=scan;
3908 if (flags & SCF_DO_SUBSTR) {
3909 /* Cannot merge strings after this. */
3910 scan_commit(pRExC_state, data, minlenp, is_inf);
3913 if (flags & SCF_DO_STCLASS)
3914 ssc_init_zero(pRExC_state, &accum);
3916 while (OP(scan) == code) {
3917 SSize_t deltanext, minnext, fake;
3919 regnode_ssc this_class;
3921 DEBUG_PEEP("Branch", scan, depth);
3924 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3926 data_fake.whilem_c = data->whilem_c;
3927 data_fake.last_closep = data->last_closep;
3930 data_fake.last_closep = &fake;
3932 data_fake.pos_delta = delta;
3933 next = regnext(scan);
3935 scan = NEXTOPER(scan); /* everything */
3936 if (code != BRANCH) /* everything but BRANCH */
3937 scan = NEXTOPER(scan);
3939 if (flags & SCF_DO_STCLASS) {
3940 ssc_init(pRExC_state, &this_class);
3941 data_fake.start_class = &this_class;
3942 f = SCF_DO_STCLASS_AND;
3944 if (flags & SCF_WHILEM_VISITED_POS)
3945 f |= SCF_WHILEM_VISITED_POS;
3947 /* we suppose the run is continuous, last=next...*/
3948 minnext = study_chunk(pRExC_state, &scan, minlenp,
3949 &deltanext, next, &data_fake, stopparen,
3950 recursed_depth, NULL, f,depth+1);
3954 if (deltanext == SSize_t_MAX) {
3955 is_inf = is_inf_internal = 1;
3957 } else if (max1 < minnext + deltanext)
3958 max1 = minnext + deltanext;
3960 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3962 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3963 if ( stopmin > minnext)
3964 stopmin = min + min1;
3965 flags &= ~SCF_DO_SUBSTR;
3967 data->flags |= SCF_SEEN_ACCEPT;
3970 if (data_fake.flags & SF_HAS_EVAL)
3971 data->flags |= SF_HAS_EVAL;
3972 data->whilem_c = data_fake.whilem_c;
3974 if (flags & SCF_DO_STCLASS)
3975 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3977 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3979 if (flags & SCF_DO_SUBSTR) {
3980 data->pos_min += min1;
3981 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3982 data->pos_delta = SSize_t_MAX;
3984 data->pos_delta += max1 - min1;
3985 if (max1 != min1 || is_inf)
3986 data->longest = &(data->longest_float);
3989 if (delta == SSize_t_MAX
3990 || SSize_t_MAX - delta - (max1 - min1) < 0)
3991 delta = SSize_t_MAX;
3993 delta += max1 - min1;
3994 if (flags & SCF_DO_STCLASS_OR) {
3995 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3997 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3998 flags &= ~SCF_DO_STCLASS;
4001 else if (flags & SCF_DO_STCLASS_AND) {
4003 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4004 flags &= ~SCF_DO_STCLASS;
4007 /* Switch to OR mode: cache the old value of
4008 * data->start_class */
4010 StructCopy(data->start_class, and_withp, regnode_ssc);
4011 flags &= ~SCF_DO_STCLASS_AND;
4012 StructCopy(&accum, data->start_class, regnode_ssc);
4013 flags |= SCF_DO_STCLASS_OR;
4017 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4018 OP( startbranch ) == BRANCH )
4022 Assuming this was/is a branch we are dealing with: 'scan'
4023 now points at the item that follows the branch sequence,
4024 whatever it is. We now start at the beginning of the
4025 sequence and look for subsequences of
4031 which would be constructed from a pattern like
4034 If we can find such a subsequence we need to turn the first
4035 element into a trie and then add the subsequent branch exact
4036 strings to the trie.
4040 1. patterns where the whole set of branches can be
4043 2. patterns where only a subset can be converted.
4045 In case 1 we can replace the whole set with a single regop
4046 for the trie. In case 2 we need to keep the start and end
4049 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4050 becomes BRANCH TRIE; BRANCH X;
4052 There is an additional case, that being where there is a
4053 common prefix, which gets split out into an EXACT like node
4054 preceding the TRIE node.
4056 If x(1..n)==tail then we can do a simple trie, if not we make
4057 a "jump" trie, such that when we match the appropriate word
4058 we "jump" to the appropriate tail node. Essentially we turn
4059 a nested if into a case structure of sorts.
4064 if (!re_trie_maxbuff) {
4065 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4066 if (!SvIOK(re_trie_maxbuff))
4067 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4069 if ( SvIV(re_trie_maxbuff)>=0 ) {
4071 regnode *first = (regnode *)NULL;
4072 regnode *last = (regnode *)NULL;
4073 regnode *tail = scan;
4077 /* var tail is used because there may be a TAIL
4078 regop in the way. Ie, the exacts will point to the
4079 thing following the TAIL, but the last branch will
4080 point at the TAIL. So we advance tail. If we
4081 have nested (?:) we may have to move through several
4085 while ( OP( tail ) == TAIL ) {
4086 /* this is the TAIL generated by (?:) */
4087 tail = regnext( tail );
4091 DEBUG_TRIE_COMPILE_r({
4092 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4093 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4094 (int)depth * 2 + 2, "",
4095 "Looking for TRIE'able sequences. Tail node is: ",
4096 SvPV_nolen_const( RExC_mysv )
4102 Step through the branches
4103 cur represents each branch,
4104 noper is the first thing to be matched as part
4106 noper_next is the regnext() of that node.
4108 We normally handle a case like this
4109 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4110 support building with NOJUMPTRIE, which restricts
4111 the trie logic to structures like /FOO|BAR/.
4113 If noper is a trieable nodetype then the branch is
4114 a possible optimization target. If we are building
4115 under NOJUMPTRIE then we require that noper_next is
4116 the same as scan (our current position in the regex
4119 Once we have two or more consecutive such branches
4120 we can create a trie of the EXACT's contents and
4121 stitch it in place into the program.
4123 If the sequence represents all of the branches in
4124 the alternation we replace the entire thing with a
4127 Otherwise when it is a subsequence we need to
4128 stitch it in place and replace only the relevant
4129 branches. This means the first branch has to remain
4130 as it is used by the alternation logic, and its
4131 next pointer, and needs to be repointed at the item
4132 on the branch chain following the last branch we
4133 have optimized away.
4135 This could be either a BRANCH, in which case the
4136 subsequence is internal, or it could be the item
4137 following the branch sequence in which case the
4138 subsequence is at the end (which does not
4139 necessarily mean the first node is the start of the
4142 TRIE_TYPE(X) is a define which maps the optype to a
4146 ----------------+-----------
4150 EXACTFU_SS | EXACTFU
4153 EXACTFLU8 | EXACTFLU8
4157 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4159 : ( EXACT == (X) ) \
4161 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4163 : ( EXACTFA == (X) ) \
4165 : ( EXACTL == (X) ) \
4167 : ( EXACTFLU8 == (X) ) \
4171 /* dont use tail as the end marker for this traverse */
4172 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4173 regnode * const noper = NEXTOPER( cur );
4174 U8 noper_type = OP( noper );
4175 U8 noper_trietype = TRIE_TYPE( noper_type );
4176 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4177 regnode * const noper_next = regnext( noper );
4178 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4179 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4182 DEBUG_TRIE_COMPILE_r({
4183 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4184 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4185 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4187 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4188 PerlIO_printf( Perl_debug_log, " -> %s",
4189 SvPV_nolen_const(RExC_mysv));
4192 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4193 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4194 SvPV_nolen_const(RExC_mysv));
4196 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4197 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4198 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4202 /* Is noper a trieable nodetype that can be merged
4203 * with the current trie (if there is one)? */
4207 ( noper_trietype == NOTHING)
4208 || ( trietype == NOTHING )
4209 || ( trietype == noper_trietype )
4212 && noper_next == tail
4216 /* Handle mergable triable node Either we are
4217 * the first node in a new trieable sequence,
4218 * in which case we do some bookkeeping,
4219 * otherwise we update the end pointer. */
4222 if ( noper_trietype == NOTHING ) {
4223 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4224 regnode * const noper_next = regnext( noper );
4225 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4226 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4229 if ( noper_next_trietype ) {
4230 trietype = noper_next_trietype;
4231 } else if (noper_next_type) {
4232 /* a NOTHING regop is 1 regop wide.
4233 * We need at least two for a trie
4234 * so we can't merge this in */
4238 trietype = noper_trietype;
4241 if ( trietype == NOTHING )
4242 trietype = noper_trietype;
4247 } /* end handle mergable triable node */
4249 /* handle unmergable node -
4250 * noper may either be a triable node which can
4251 * not be tried together with the current trie,
4252 * or a non triable node */
4254 /* If last is set and trietype is not
4255 * NOTHING then we have found at least two
4256 * triable branch sequences in a row of a
4257 * similar trietype so we can turn them
4258 * into a trie. If/when we allow NOTHING to
4259 * start a trie sequence this condition
4260 * will be required, and it isn't expensive
4261 * so we leave it in for now. */
4262 if ( trietype && trietype != NOTHING )
4263 make_trie( pRExC_state,
4264 startbranch, first, cur, tail,
4265 count, trietype, depth+1 );
4266 last = NULL; /* note: we clear/update
4267 first, trietype etc below,
4268 so we dont do it here */
4272 && noper_next == tail
4275 /* noper is triable, so we can start a new
4279 trietype = noper_trietype;
4281 /* if we already saw a first but the
4282 * current node is not triable then we have
4283 * to reset the first information. */
4288 } /* end handle unmergable node */
4289 } /* loop over branches */
4290 DEBUG_TRIE_COMPILE_r({
4291 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4292 PerlIO_printf( Perl_debug_log,
4293 "%*s- %s (%d) <SCAN FINISHED>\n",
4295 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4298 if ( last && trietype ) {
4299 if ( trietype != NOTHING ) {
4300 /* the last branch of the sequence was part of
4301 * a trie, so we have to construct it here
4302 * outside of the loop */
4303 made= make_trie( pRExC_state, startbranch,
4304 first, scan, tail, count,
4305 trietype, depth+1 );
4306 #ifdef TRIE_STUDY_OPT
4307 if ( ((made == MADE_EXACT_TRIE &&
4308 startbranch == first)
4309 || ( first_non_open == first )) &&
4311 flags |= SCF_TRIE_RESTUDY;
4312 if ( startbranch == first
4315 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4320 /* at this point we know whatever we have is a
4321 * NOTHING sequence/branch AND if 'startbranch'
4322 * is 'first' then we can turn the whole thing
4325 if ( startbranch == first ) {
4327 /* the entire thing is a NOTHING sequence,
4328 * something like this: (?:|) So we can
4329 * turn it into a plain NOTHING op. */
4330 DEBUG_TRIE_COMPILE_r({
4331 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4332 PerlIO_printf( Perl_debug_log,
4333 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4334 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4337 OP(startbranch)= NOTHING;
4338 NEXT_OFF(startbranch)= tail - startbranch;
4339 for ( opt= startbranch + 1; opt < tail ; opt++ )
4343 } /* end if ( last) */
4344 } /* TRIE_MAXBUF is non zero */
4349 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4350 scan = NEXTOPER(NEXTOPER(scan));
4351 } else /* single branch is optimized. */
4352 scan = NEXTOPER(scan);
4354 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4356 regnode *start = NULL;
4357 regnode *end = NULL;
4358 U32 my_recursed_depth= recursed_depth;
4361 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4362 /* Do setup, note this code has side effects beyond
4363 * the rest of this block. Specifically setting
4364 * RExC_recurse[] must happen at least once during
4366 if (OP(scan) == GOSUB) {
4368 RExC_recurse[ARG2L(scan)] = scan;
4369 start = RExC_open_parens[paren-1];
4370 end = RExC_close_parens[paren-1];
4372 start = RExC_rxi->program + 1;
4375 /* NOTE we MUST always execute the above code, even
4376 * if we do nothing with a GOSUB/GOSTART */
4378 ( flags & SCF_IN_DEFINE )
4381 (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4383 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4386 /* no need to do anything here if we are in a define. */
4387 /* or we are after some kind of infinite construct
4388 * so we can skip recursing into this item.
4389 * Since it is infinite we will not change the maxlen
4390 * or delta, and if we miss something that might raise
4391 * the minlen it will merely pessimise a little.
4393 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4394 * might result in a minlen of 1 and not of 4,
4395 * but this doesn't make us mismatch, just try a bit
4396 * harder than we should.
4398 scan= regnext(scan);
4405 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4407 /* it is quite possible that there are more efficient ways
4408 * to do this. We maintain a bitmap per level of recursion
4409 * of which patterns we have entered so we can detect if a
4410 * pattern creates a possible infinite loop. When we
4411 * recurse down a level we copy the previous levels bitmap
4412 * down. When we are at recursion level 0 we zero the top
4413 * level bitmap. It would be nice to implement a different
4414 * more efficient way of doing this. In particular the top
4415 * level bitmap may be unnecessary.
4417 if (!recursed_depth) {
4418 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4420 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4421 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4422 RExC_study_chunk_recursed_bytes, U8);
4424 /* we havent recursed into this paren yet, so recurse into it */
4425 DEBUG_STUDYDATA("set:", data,depth);
4426 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4427 my_recursed_depth= recursed_depth + 1;
4429 DEBUG_STUDYDATA("inf:", data,depth);
4430 /* some form of infinite recursion, assume infinite length
4432 if (flags & SCF_DO_SUBSTR) {
4433 scan_commit(pRExC_state, data, minlenp, is_inf);
4434 data->longest = &(data->longest_float);
4436 is_inf = is_inf_internal = 1;
4437 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4438 ssc_anything(data->start_class);
4439 flags &= ~SCF_DO_STCLASS;
4441 start= NULL; /* reset start so we dont recurse later on. */
4446 end = regnext(scan);
4449 scan_frame *newframe;
4451 if (!RExC_frame_last) {
4452 Newxz(newframe, 1, scan_frame);
4453 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4454 RExC_frame_head= newframe;
4456 } else if (!RExC_frame_last->next_frame) {
4457 Newxz(newframe,1,scan_frame);
4458 RExC_frame_last->next_frame= newframe;
4459 newframe->prev_frame= RExC_frame_last;
4462 newframe= RExC_frame_last->next_frame;
4464 RExC_frame_last= newframe;
4466 newframe->next_regnode = regnext(scan);
4467 newframe->last_regnode = last;
4468 newframe->stopparen = stopparen;
4469 newframe->prev_recursed_depth = recursed_depth;
4470 newframe->this_prev_frame= frame;
4472 DEBUG_STUDYDATA("frame-new:",data,depth);
4473 DEBUG_PEEP("fnew", scan, depth);
4480 recursed_depth= my_recursed_depth;
4485 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4486 SSize_t l = STR_LEN(scan);
4489 const U8 * const s = (U8*)STRING(scan);
4490 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4491 l = utf8_length(s, s + l);
4493 uc = *((U8*)STRING(scan));
4496 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4497 /* The code below prefers earlier match for fixed
4498 offset, later match for variable offset. */
4499 if (data->last_end == -1) { /* Update the start info. */
4500 data->last_start_min = data->pos_min;
4501 data->last_start_max = is_inf
4502 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4504 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4506 SvUTF8_on(data->last_found);
4508 SV * const sv = data->last_found;
4509 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4510 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4511 if (mg && mg->mg_len >= 0)
4512 mg->mg_len += utf8_length((U8*)STRING(scan),
4513 (U8*)STRING(scan)+STR_LEN(scan));
4515 data->last_end = data->pos_min + l;
4516 data->pos_min += l; /* As in the first entry. */
4517 data->flags &= ~SF_BEFORE_EOL;
4520 /* ANDing the code point leaves at most it, and not in locale, and
4521 * can't match null string */
4522 if (flags & SCF_DO_STCLASS_AND) {
4523 ssc_cp_and(data->start_class, uc);
4524 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4525 ssc_clear_locale(data->start_class);
4527 else if (flags & SCF_DO_STCLASS_OR) {
4528 ssc_add_cp(data->start_class, uc);
4529 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4531 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4532 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4534 flags &= ~SCF_DO_STCLASS;
4536 else if (PL_regkind[OP(scan)] == EXACT) {
4537 /* But OP != EXACT!, so is EXACTFish */
4538 SSize_t l = STR_LEN(scan);
4539 const U8 * s = (U8*)STRING(scan);
4541 /* Search for fixed substrings supports EXACT only. */
4542 if (flags & SCF_DO_SUBSTR) {
4544 scan_commit(pRExC_state, data, minlenp, is_inf);
4547 l = utf8_length(s, s + l);
4549 if (unfolded_multi_char) {
4550 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4552 min += l - min_subtract;
4554 delta += min_subtract;
4555 if (flags & SCF_DO_SUBSTR) {
4556 data->pos_min += l - min_subtract;
4557 if (data->pos_min < 0) {
4560 data->pos_delta += min_subtract;
4562 data->longest = &(data->longest_float);
4566 if (flags & SCF_DO_STCLASS) {
4567 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4569 assert(EXACTF_invlist);
4570 if (flags & SCF_DO_STCLASS_AND) {
4571 if (OP(scan) != EXACTFL)
4572 ssc_clear_locale(data->start_class);
4573 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4574 ANYOF_POSIXL_ZERO(data->start_class);
4575 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4577 else { /* SCF_DO_STCLASS_OR */
4578 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4579 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4581 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4582 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4584 flags &= ~SCF_DO_STCLASS;
4585 SvREFCNT_dec(EXACTF_invlist);
4588 else if (REGNODE_VARIES(OP(scan))) {
4589 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4590 I32 fl = 0, f = flags;
4591 regnode * const oscan = scan;
4592 regnode_ssc this_class;
4593 regnode_ssc *oclass = NULL;
4594 I32 next_is_eval = 0;
4596 switch (PL_regkind[OP(scan)]) {
4597 case WHILEM: /* End of (?:...)* . */
4598 scan = NEXTOPER(scan);
4601 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4602 next = NEXTOPER(scan);
4603 if (OP(next) == EXACT
4604 || OP(next) == EXACTL
4605 || (flags & SCF_DO_STCLASS))
4608 maxcount = REG_INFTY;
4609 next = regnext(scan);
4610 scan = NEXTOPER(scan);
4614 if (flags & SCF_DO_SUBSTR)
4619 if (flags & SCF_DO_STCLASS) {
4621 maxcount = REG_INFTY;
4622 next = regnext(scan);
4623 scan = NEXTOPER(scan);
4626 if (flags & SCF_DO_SUBSTR) {
4627 scan_commit(pRExC_state, data, minlenp, is_inf);
4628 /* Cannot extend fixed substrings */
4629 data->longest = &(data->longest_float);
4631 is_inf = is_inf_internal = 1;
4632 scan = regnext(scan);
4633 goto optimize_curly_tail;
4635 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4636 && (scan->flags == stopparen))
4641 mincount = ARG1(scan);
4642 maxcount = ARG2(scan);
4644 next = regnext(scan);
4645 if (OP(scan) == CURLYX) {
4646 I32 lp = (data ? *(data->last_closep) : 0);
4647 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4649 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4650 next_is_eval = (OP(scan) == EVAL);
4652 if (flags & SCF_DO_SUBSTR) {
4654 scan_commit(pRExC_state, data, minlenp, is_inf);
4655 /* Cannot extend fixed substrings */
4656 pos_before = data->pos_min;
4660 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4662 data->flags |= SF_IS_INF;
4664 if (flags & SCF_DO_STCLASS) {
4665 ssc_init(pRExC_state, &this_class);
4666 oclass = data->start_class;
4667 data->start_class = &this_class;
4668 f |= SCF_DO_STCLASS_AND;
4669 f &= ~SCF_DO_STCLASS_OR;
4671 /* Exclude from super-linear cache processing any {n,m}
4672 regops for which the combination of input pos and regex
4673 pos is not enough information to determine if a match
4676 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4677 regex pos at the \s*, the prospects for a match depend not
4678 only on the input position but also on how many (bar\s*)
4679 repeats into the {4,8} we are. */
4680 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4681 f &= ~SCF_WHILEM_VISITED_POS;
4683 /* This will finish on WHILEM, setting scan, or on NULL: */
4684 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4685 last, data, stopparen, recursed_depth, NULL,
4687 ? (f & ~SCF_DO_SUBSTR)
4691 if (flags & SCF_DO_STCLASS)
4692 data->start_class = oclass;
4693 if (mincount == 0 || minnext == 0) {
4694 if (flags & SCF_DO_STCLASS_OR) {
4695 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4697 else if (flags & SCF_DO_STCLASS_AND) {
4698 /* Switch to OR mode: cache the old value of
4699 * data->start_class */
4701 StructCopy(data->start_class, and_withp, regnode_ssc);
4702 flags &= ~SCF_DO_STCLASS_AND;
4703 StructCopy(&this_class, data->start_class, regnode_ssc);
4704 flags |= SCF_DO_STCLASS_OR;
4705 ANYOF_FLAGS(data->start_class)
4706 |= SSC_MATCHES_EMPTY_STRING;
4708 } else { /* Non-zero len */
4709 if (flags & SCF_DO_STCLASS_OR) {
4710 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4711 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4713 else if (flags & SCF_DO_STCLASS_AND)
4714 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4715 flags &= ~SCF_DO_STCLASS;
4717 if (!scan) /* It was not CURLYX, but CURLY. */
4719 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4720 /* ? quantifier ok, except for (?{ ... }) */
4721 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4722 && (minnext == 0) && (deltanext == 0)
4723 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4724 && maxcount <= REG_INFTY/3) /* Complement check for big
4727 /* Fatal warnings may leak the regexp without this: */
4728 SAVEFREESV(RExC_rx_sv);
4729 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4730 "Quantifier unexpected on zero-length expression "
4731 "in regex m/%"UTF8f"/",
4732 UTF8fARG(UTF, RExC_end - RExC_precomp,
4734 (void)ReREFCNT_inc(RExC_rx_sv);
4737 min += minnext * mincount;
4738 is_inf_internal |= deltanext == SSize_t_MAX
4739 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4740 is_inf |= is_inf_internal;
4742 delta = SSize_t_MAX;
4744 delta += (minnext + deltanext) * maxcount
4745 - minnext * mincount;
4747 /* Try powerful optimization CURLYX => CURLYN. */
4748 if ( OP(oscan) == CURLYX && data
4749 && data->flags & SF_IN_PAR
4750 && !(data->flags & SF_HAS_EVAL)
4751 && !deltanext && minnext == 1 ) {
4752 /* Try to optimize to CURLYN. */
4753 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4754 regnode * const nxt1 = nxt;
4761 if (!REGNODE_SIMPLE(OP(nxt))
4762 && !(PL_regkind[OP(nxt)] == EXACT
4763 && STR_LEN(nxt) == 1))
4769 if (OP(nxt) != CLOSE)
4771 if (RExC_open_parens) {
4772 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4773 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4775 /* Now we know that nxt2 is the only contents: */
4776 oscan->flags = (U8)ARG(nxt);
4778 OP(nxt1) = NOTHING; /* was OPEN. */
4781 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4782 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4783 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4784 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4785 OP(nxt + 1) = OPTIMIZED; /* was count. */
4786 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4791 /* Try optimization CURLYX => CURLYM. */
4792 if ( OP(oscan) == CURLYX && data
4793 && !(data->flags & SF_HAS_PAR)
4794 && !(data->flags & SF_HAS_EVAL)
4795 && !deltanext /* atom is fixed width */
4796 && minnext != 0 /* CURLYM can't handle zero width */
4798 /* Nor characters whose fold at run-time may be
4799 * multi-character */
4800 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4802 /* XXXX How to optimize if data == 0? */
4803 /* Optimize to a simpler form. */
4804 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4808 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4809 && (OP(nxt2) != WHILEM))
4811 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4812 /* Need to optimize away parenths. */
4813 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4814 /* Set the parenth number. */
4815 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4817 oscan->flags = (U8)ARG(nxt);
4818 if (RExC_open_parens) {
4819 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4820 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4822 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4823 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4826 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4827 OP(nxt + 1) = OPTIMIZED; /* was count. */
4828 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4829 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4832 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4833 regnode *nnxt = regnext(nxt1);
4835 if (reg_off_by_arg[OP(nxt1)])
4836 ARG_SET(nxt1, nxt2 - nxt1);
4837 else if (nxt2 - nxt1 < U16_MAX)
4838 NEXT_OFF(nxt1) = nxt2 - nxt1;
4840 OP(nxt) = NOTHING; /* Cannot beautify */
4845 /* Optimize again: */
4846 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4847 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4852 else if ((OP(oscan) == CURLYX)
4853 && (flags & SCF_WHILEM_VISITED_POS)
4854 /* See the comment on a similar expression above.
4855 However, this time it's not a subexpression
4856 we care about, but the expression itself. */
4857 && (maxcount == REG_INFTY)
4858 && data && ++data->whilem_c < 16) {
4859 /* This stays as CURLYX, we can put the count/of pair. */
4860 /* Find WHILEM (as in regexec.c) */
4861 regnode *nxt = oscan + NEXT_OFF(oscan);
4863 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4865 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4866 | (RExC_whilem_seen << 4)); /* On WHILEM */
4868 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4870 if (flags & SCF_DO_SUBSTR) {
4871 SV *last_str = NULL;
4872 STRLEN last_chrs = 0;
4873 int counted = mincount != 0;
4875 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4877 SSize_t b = pos_before >= data->last_start_min
4878 ? pos_before : data->last_start_min;
4880 const char * const s = SvPV_const(data->last_found, l);
4881 SSize_t old = b - data->last_start_min;
4884 old = utf8_hop((U8*)s, old) - (U8*)s;
4886 /* Get the added string: */
4887 last_str = newSVpvn_utf8(s + old, l, UTF);
4888 last_chrs = UTF ? utf8_length((U8*)(s + old),
4889 (U8*)(s + old + l)) : l;
4890 if (deltanext == 0 && pos_before == b) {
4891 /* What was added is a constant string */
4894 SvGROW(last_str, (mincount * l) + 1);
4895 repeatcpy(SvPVX(last_str) + l,
4896 SvPVX_const(last_str), l,
4898 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4899 /* Add additional parts. */
4900 SvCUR_set(data->last_found,
4901 SvCUR(data->last_found) - l);
4902 sv_catsv(data->last_found, last_str);
4904 SV * sv = data->last_found;
4906 SvUTF8(sv) && SvMAGICAL(sv) ?
4907 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4908 if (mg && mg->mg_len >= 0)
4909 mg->mg_len += last_chrs * (mincount-1);
4911 last_chrs *= mincount;
4912 data->last_end += l * (mincount - 1);
4915 /* start offset must point into the last copy */
4916 data->last_start_min += minnext * (mincount - 1);
4917 data->last_start_max =
4920 : data->last_start_max +
4921 (maxcount - 1) * (minnext + data->pos_delta);
4924 /* It is counted once already... */
4925 data->pos_min += minnext * (mincount - counted);
4927 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4928 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4929 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4930 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4932 if (deltanext != SSize_t_MAX)
4933 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4934 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4935 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4937 if (deltanext == SSize_t_MAX
4938 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4939 data->pos_delta = SSize_t_MAX;
4941 data->pos_delta += - counted * deltanext +
4942 (minnext + deltanext) * maxcount - minnext * mincount;
4943 if (mincount != maxcount) {
4944 /* Cannot extend fixed substrings found inside
4946 scan_commit(pRExC_state, data, minlenp, is_inf);
4947 if (mincount && last_str) {
4948 SV * const sv = data->last_found;
4949 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4950 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4954 sv_setsv(sv, last_str);
4955 data->last_end = data->pos_min;
4956 data->last_start_min = data->pos_min - last_chrs;
4957 data->last_start_max = is_inf
4959 : data->pos_min + data->pos_delta - last_chrs;
4961 data->longest = &(data->longest_float);
4963 SvREFCNT_dec(last_str);
4965 if (data && (fl & SF_HAS_EVAL))
4966 data->flags |= SF_HAS_EVAL;
4967 optimize_curly_tail:
4968 if (OP(oscan) != CURLYX) {
4969 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4971 NEXT_OFF(oscan) += NEXT_OFF(next);
4977 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4982 if (flags & SCF_DO_SUBSTR) {
4983 /* Cannot expect anything... */
4984 scan_commit(pRExC_state, data, minlenp, is_inf);
4985 data->longest = &(data->longest_float);
4987 is_inf = is_inf_internal = 1;
4988 if (flags & SCF_DO_STCLASS_OR) {
4989 if (OP(scan) == CLUMP) {
4990 /* Actually is any start char, but very few code points
4991 * aren't start characters */
4992 ssc_match_all_cp(data->start_class);
4995 ssc_anything(data->start_class);
4998 flags &= ~SCF_DO_STCLASS;
5002 else if (OP(scan) == LNBREAK) {
5003 if (flags & SCF_DO_STCLASS) {
5004 if (flags & SCF_DO_STCLASS_AND) {
5005 ssc_intersection(data->start_class,
5006 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5007 ssc_clear_locale(data->start_class);
5008 ANYOF_FLAGS(data->start_class)
5009 &= ~SSC_MATCHES_EMPTY_STRING;
5011 else if (flags & SCF_DO_STCLASS_OR) {
5012 ssc_union(data->start_class,
5013 PL_XPosix_ptrs[_CC_VERTSPACE],
5015 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5017 /* See commit msg for
5018 * 749e076fceedeb708a624933726e7989f2302f6a */
5019 ANYOF_FLAGS(data->start_class)
5020 &= ~SSC_MATCHES_EMPTY_STRING;
5022 flags &= ~SCF_DO_STCLASS;
5025 if (delta != SSize_t_MAX)
5026 delta++; /* Because of the 2 char string cr-lf */
5027 if (flags & SCF_DO_SUBSTR) {
5028 /* Cannot expect anything... */
5029 scan_commit(pRExC_state, data, minlenp, is_inf);
5031 data->pos_delta += 1;
5032 data->longest = &(data->longest_float);
5035 else if (REGNODE_SIMPLE(OP(scan))) {
5037 if (flags & SCF_DO_SUBSTR) {
5038 scan_commit(pRExC_state, data, minlenp, is_inf);
5042 if (flags & SCF_DO_STCLASS) {
5044 SV* my_invlist = NULL;
5047 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5048 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5050 /* Some of the logic below assumes that switching
5051 locale on will only add false positives. */
5056 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5061 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5062 ssc_match_all_cp(data->start_class);
5067 SV* REG_ANY_invlist = _new_invlist(2);
5068 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5070 if (flags & SCF_DO_STCLASS_OR) {
5071 ssc_union(data->start_class,
5073 TRUE /* TRUE => invert, hence all but \n
5077 else if (flags & SCF_DO_STCLASS_AND) {
5078 ssc_intersection(data->start_class,
5080 TRUE /* TRUE => invert */
5082 ssc_clear_locale(data->start_class);
5084 SvREFCNT_dec_NN(REG_ANY_invlist);
5090 if (flags & SCF_DO_STCLASS_AND)
5091 ssc_and(pRExC_state, data->start_class,
5092 (regnode_charclass *) scan);
5094 ssc_or(pRExC_state, data->start_class,
5095 (regnode_charclass *) scan);
5103 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5104 if (flags & SCF_DO_STCLASS_AND) {
5105 bool was_there = cBOOL(
5106 ANYOF_POSIXL_TEST(data->start_class,
5108 ANYOF_POSIXL_ZERO(data->start_class);
5109 if (was_there) { /* Do an AND */
5110 ANYOF_POSIXL_SET(data->start_class, namedclass);
5112 /* No individual code points can now match */
5113 data->start_class->invlist
5114 = sv_2mortal(_new_invlist(0));
5117 int complement = namedclass + ((invert) ? -1 : 1);
5119 assert(flags & SCF_DO_STCLASS_OR);
5121 /* If the complement of this class was already there,
5122 * the result is that they match all code points,
5123 * (\d + \D == everything). Remove the classes from
5124 * future consideration. Locale is not relevant in
5126 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5127 ssc_match_all_cp(data->start_class);
5128 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5129 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5131 else { /* The usual case; just add this class to the
5133 ANYOF_POSIXL_SET(data->start_class, namedclass);
5138 case NPOSIXA: /* For these, we always know the exact set of
5143 if (FLAGS(scan) == _CC_ASCII) {
5144 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5147 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5148 PL_XPosix_ptrs[_CC_ASCII],
5159 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5161 /* NPOSIXD matches all upper Latin1 code points unless the
5162 * target string being matched is UTF-8, which is
5163 * unknowable until match time. Since we are going to
5164 * invert, we want to get rid of all of them so that the
5165 * inversion will match all */
5166 if (OP(scan) == NPOSIXD) {
5167 _invlist_subtract(my_invlist, PL_UpperLatin1,
5173 if (flags & SCF_DO_STCLASS_AND) {
5174 ssc_intersection(data->start_class, my_invlist, invert);
5175 ssc_clear_locale(data->start_class);
5178 assert(flags & SCF_DO_STCLASS_OR);
5179 ssc_union(data->start_class, my_invlist, invert);
5181 SvREFCNT_dec(my_invlist);
5183 if (flags & SCF_DO_STCLASS_OR)
5184 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5185 flags &= ~SCF_DO_STCLASS;
5188 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5189 data->flags |= (OP(scan) == MEOL
5192 scan_commit(pRExC_state, data, minlenp, is_inf);
5195 else if ( PL_regkind[OP(scan)] == BRANCHJ
5196 /* Lookbehind, or need to calculate parens/evals/stclass: */
5197 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5198 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5200 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5201 || OP(scan) == UNLESSM )
5203 /* Negative Lookahead/lookbehind
5204 In this case we can't do fixed string optimisation.
5207 SSize_t deltanext, minnext, fake = 0;
5212 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5214 data_fake.whilem_c = data->whilem_c;
5215 data_fake.last_closep = data->last_closep;
5218 data_fake.last_closep = &fake;
5219 data_fake.pos_delta = delta;
5220 if ( flags & SCF_DO_STCLASS && !scan->flags
5221 && OP(scan) == IFMATCH ) { /* Lookahead */
5222 ssc_init(pRExC_state, &intrnl);
5223 data_fake.start_class = &intrnl;
5224 f |= SCF_DO_STCLASS_AND;
5226 if (flags & SCF_WHILEM_VISITED_POS)
5227 f |= SCF_WHILEM_VISITED_POS;
5228 next = regnext(scan);
5229 nscan = NEXTOPER(NEXTOPER(scan));
5230 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5231 last, &data_fake, stopparen,
5232 recursed_depth, NULL, f, depth+1);
5235 FAIL("Variable length lookbehind not implemented");
5237 else if (minnext > (I32)U8_MAX) {
5238 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5241 scan->flags = (U8)minnext;
5244 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5246 if (data_fake.flags & SF_HAS_EVAL)
5247 data->flags |= SF_HAS_EVAL;
5248 data->whilem_c = data_fake.whilem_c;
5250 if (f & SCF_DO_STCLASS_AND) {
5251 if (flags & SCF_DO_STCLASS_OR) {
5252 /* OR before, AND after: ideally we would recurse with
5253 * data_fake to get the AND applied by study of the
5254 * remainder of the pattern, and then derecurse;
5255 * *** HACK *** for now just treat as "no information".
5256 * See [perl #56690].
5258 ssc_init(pRExC_state, data->start_class);
5260 /* AND before and after: combine and continue. These
5261 * assertions are zero-length, so can match an EMPTY
5263 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5264 ANYOF_FLAGS(data->start_class)
5265 |= SSC_MATCHES_EMPTY_STRING;
5269 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5271 /* Positive Lookahead/lookbehind
5272 In this case we can do fixed string optimisation,
5273 but we must be careful about it. Note in the case of
5274 lookbehind the positions will be offset by the minimum
5275 length of the pattern, something we won't know about
5276 until after the recurse.
5278 SSize_t deltanext, fake = 0;
5282 /* We use SAVEFREEPV so that when the full compile
5283 is finished perl will clean up the allocated
5284 minlens when it's all done. This way we don't
5285 have to worry about freeing them when we know
5286 they wont be used, which would be a pain.
5289 Newx( minnextp, 1, SSize_t );
5290 SAVEFREEPV(minnextp);
5293 StructCopy(data, &data_fake, scan_data_t);
5294 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5297 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5298 data_fake.last_found=newSVsv(data->last_found);
5302 data_fake.last_closep = &fake;
5303 data_fake.flags = 0;
5304 data_fake.pos_delta = delta;
5306 data_fake.flags |= SF_IS_INF;
5307 if ( flags & SCF_DO_STCLASS && !scan->flags
5308 && OP(scan) == IFMATCH ) { /* Lookahead */
5309 ssc_init(pRExC_state, &intrnl);
5310 data_fake.start_class = &intrnl;
5311 f |= SCF_DO_STCLASS_AND;
5313 if (flags & SCF_WHILEM_VISITED_POS)
5314 f |= SCF_WHILEM_VISITED_POS;
5315 next = regnext(scan);
5316 nscan = NEXTOPER(NEXTOPER(scan));
5318 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5319 &deltanext, last, &data_fake,
5320 stopparen, recursed_depth, NULL,
5324 FAIL("Variable length lookbehind not implemented");
5326 else if (*minnextp > (I32)U8_MAX) {
5327 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5330 scan->flags = (U8)*minnextp;
5335 if (f & SCF_DO_STCLASS_AND) {
5336 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5337 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5340 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5342 if (data_fake.flags & SF_HAS_EVAL)
5343 data->flags |= SF_HAS_EVAL;
5344 data->whilem_c = data_fake.whilem_c;
5345 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5346 if (RExC_rx->minlen<*minnextp)
5347 RExC_rx->minlen=*minnextp;
5348 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5349 SvREFCNT_dec_NN(data_fake.last_found);
5351 if ( data_fake.minlen_fixed != minlenp )
5353 data->offset_fixed= data_fake.offset_fixed;
5354 data->minlen_fixed= data_fake.minlen_fixed;
5355 data->lookbehind_fixed+= scan->flags;
5357 if ( data_fake.minlen_float != minlenp )
5359 data->minlen_float= data_fake.minlen_float;
5360 data->offset_float_min=data_fake.offset_float_min;
5361 data->offset_float_max=data_fake.offset_float_max;
5362 data->lookbehind_float+= scan->flags;
5369 else if (OP(scan) == OPEN) {
5370 if (stopparen != (I32)ARG(scan))
5373 else if (OP(scan) == CLOSE) {
5374 if (stopparen == (I32)ARG(scan)) {
5377 if ((I32)ARG(scan) == is_par) {
5378 next = regnext(scan);
5380 if ( next && (OP(next) != WHILEM) && next < last)
5381 is_par = 0; /* Disable optimization */
5384 *(data->last_closep) = ARG(scan);
5386 else if (OP(scan) == EVAL) {
5388 data->flags |= SF_HAS_EVAL;
5390 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5391 if (flags & SCF_DO_SUBSTR) {
5392 scan_commit(pRExC_state, data, minlenp, is_inf);
5393 flags &= ~SCF_DO_SUBSTR;
5395 if (data && OP(scan)==ACCEPT) {
5396 data->flags |= SCF_SEEN_ACCEPT;
5401 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5403 if (flags & SCF_DO_SUBSTR) {
5404 scan_commit(pRExC_state, data, minlenp, is_inf);
5405 data->longest = &(data->longest_float);
5407 is_inf = is_inf_internal = 1;
5408 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5409 ssc_anything(data->start_class);
5410 flags &= ~SCF_DO_STCLASS;
5412 else if (OP(scan) == GPOS) {
5413 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5414 !(delta || is_inf || (data && data->pos_delta)))
5416 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5417 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5418 if (RExC_rx->gofs < (STRLEN)min)
5419 RExC_rx->gofs = min;
5421 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5425 #ifdef TRIE_STUDY_OPT
5426 #ifdef FULL_TRIE_STUDY
5427 else if (PL_regkind[OP(scan)] == TRIE) {
5428 /* NOTE - There is similar code to this block above for handling
5429 BRANCH nodes on the initial study. If you change stuff here
5431 regnode *trie_node= scan;
5432 regnode *tail= regnext(scan);
5433 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5434 SSize_t max1 = 0, min1 = SSize_t_MAX;
5437 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5438 /* Cannot merge strings after this. */
5439 scan_commit(pRExC_state, data, minlenp, is_inf);
5441 if (flags & SCF_DO_STCLASS)
5442 ssc_init_zero(pRExC_state, &accum);
5448 const regnode *nextbranch= NULL;
5451 for ( word=1 ; word <= trie->wordcount ; word++)
5453 SSize_t deltanext=0, minnext=0, f = 0, fake;
5454 regnode_ssc this_class;
5456 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5458 data_fake.whilem_c = data->whilem_c;
5459 data_fake.last_closep = data->last_closep;
5462 data_fake.last_closep = &fake;
5463 data_fake.pos_delta = delta;
5464 if (flags & SCF_DO_STCLASS) {
5465 ssc_init(pRExC_state, &this_class);
5466 data_fake.start_class = &this_class;
5467 f = SCF_DO_STCLASS_AND;
5469 if (flags & SCF_WHILEM_VISITED_POS)
5470 f |= SCF_WHILEM_VISITED_POS;
5472 if (trie->jump[word]) {
5474 nextbranch = trie_node + trie->jump[0];
5475 scan= trie_node + trie->jump[word];
5476 /* We go from the jump point to the branch that follows
5477 it. Note this means we need the vestigal unused
5478 branches even though they arent otherwise used. */
5479 minnext = study_chunk(pRExC_state, &scan, minlenp,
5480 &deltanext, (regnode *)nextbranch, &data_fake,
5481 stopparen, recursed_depth, NULL, f,depth+1);
5483 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5484 nextbranch= regnext((regnode*)nextbranch);
5486 if (min1 > (SSize_t)(minnext + trie->minlen))
5487 min1 = minnext + trie->minlen;
5488 if (deltanext == SSize_t_MAX) {
5489 is_inf = is_inf_internal = 1;
5491 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5492 max1 = minnext + deltanext + trie->maxlen;
5494 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5496 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5497 if ( stopmin > min + min1)
5498 stopmin = min + min1;
5499 flags &= ~SCF_DO_SUBSTR;
5501 data->flags |= SCF_SEEN_ACCEPT;
5504 if (data_fake.flags & SF_HAS_EVAL)
5505 data->flags |= SF_HAS_EVAL;
5506 data->whilem_c = data_fake.whilem_c;
5508 if (flags & SCF_DO_STCLASS)
5509 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5512 if (flags & SCF_DO_SUBSTR) {
5513 data->pos_min += min1;
5514 data->pos_delta += max1 - min1;
5515 if (max1 != min1 || is_inf)
5516 data->longest = &(data->longest_float);
5519 if (delta != SSize_t_MAX)
5520 delta += max1 - min1;
5521 if (flags & SCF_DO_STCLASS_OR) {
5522 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5524 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5525 flags &= ~SCF_DO_STCLASS;
5528 else if (flags & SCF_DO_STCLASS_AND) {
5530 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5531 flags &= ~SCF_DO_STCLASS;
5534 /* Switch to OR mode: cache the old value of
5535 * data->start_class */
5537 StructCopy(data->start_class, and_withp, regnode_ssc);
5538 flags &= ~SCF_DO_STCLASS_AND;
5539 StructCopy(&accum, data->start_class, regnode_ssc);
5540 flags |= SCF_DO_STCLASS_OR;
5547 else if (PL_regkind[OP(scan)] == TRIE) {
5548 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5551 min += trie->minlen;
5552 delta += (trie->maxlen - trie->minlen);
5553 flags &= ~SCF_DO_STCLASS; /* xxx */
5554 if (flags & SCF_DO_SUBSTR) {
5555 /* Cannot expect anything... */
5556 scan_commit(pRExC_state, data, minlenp, is_inf);
5557 data->pos_min += trie->minlen;
5558 data->pos_delta += (trie->maxlen - trie->minlen);
5559 if (trie->maxlen != trie->minlen)
5560 data->longest = &(data->longest_float);
5562 if (trie->jump) /* no more substrings -- for now /grr*/
5563 flags &= ~SCF_DO_SUBSTR;
5565 #endif /* old or new */
5566 #endif /* TRIE_STUDY_OPT */
5568 /* Else: zero-length, ignore. */
5569 scan = regnext(scan);
5571 /* If we are exiting a recursion we can unset its recursed bit
5572 * and allow ourselves to enter it again - no danger of an
5573 * infinite loop there.
5574 if (stopparen > -1 && recursed) {
5575 DEBUG_STUDYDATA("unset:", data,depth);
5576 PAREN_UNSET( recursed, stopparen);
5582 DEBUG_STUDYDATA("frame-end:",data,depth);
5583 DEBUG_PEEP("fend", scan, depth);
5585 /* restore previous context */
5586 last = frame->last_regnode;
5587 scan = frame->next_regnode;
5588 stopparen = frame->stopparen;
5589 recursed_depth = frame->prev_recursed_depth;
5591 RExC_frame_last = frame->prev_frame;
5592 frame = frame->this_prev_frame;
5593 goto fake_study_recurse;
5598 DEBUG_STUDYDATA("pre-fin:",data,depth);
5601 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5603 if (flags & SCF_DO_SUBSTR && is_inf)
5604 data->pos_delta = SSize_t_MAX - data->pos_min;
5605 if (is_par > (I32)U8_MAX)
5607 if (is_par && pars==1 && data) {
5608 data->flags |= SF_IN_PAR;
5609 data->flags &= ~SF_HAS_PAR;
5611 else if (pars && data) {
5612 data->flags |= SF_HAS_PAR;
5613 data->flags &= ~SF_IN_PAR;
5615 if (flags & SCF_DO_STCLASS_OR)
5616 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5617 if (flags & SCF_TRIE_RESTUDY)
5618 data->flags |= SCF_TRIE_RESTUDY;
5620 DEBUG_STUDYDATA("post-fin:",data,depth);
5623 SSize_t final_minlen= min < stopmin ? min : stopmin;
5625 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5626 if (final_minlen > SSize_t_MAX - delta)
5627 RExC_maxlen = SSize_t_MAX;
5628 else if (RExC_maxlen < final_minlen + delta)
5629 RExC_maxlen = final_minlen + delta;
5631 return final_minlen;
5637 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5639 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5641 PERL_ARGS_ASSERT_ADD_DATA;
5643 Renewc(RExC_rxi->data,
5644 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5645 char, struct reg_data);
5647 Renew(RExC_rxi->data->what, count + n, U8);
5649 Newx(RExC_rxi->data->what, n, U8);
5650 RExC_rxi->data->count = count + n;
5651 Copy(s, RExC_rxi->data->what + count, n, U8);
5655 /*XXX: todo make this not included in a non debugging perl, but appears to be
5656 * used anyway there, in 'use re' */
5657 #ifndef PERL_IN_XSUB_RE
5659 Perl_reginitcolors(pTHX)
5661 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5663 char *t = savepv(s);
5667 t = strchr(t, '\t');
5673 PL_colors[i] = t = (char *)"";
5678 PL_colors[i++] = (char *)"";
5685 #ifdef TRIE_STUDY_OPT
5686 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5689 (data.flags & SCF_TRIE_RESTUDY) \
5697 #define CHECK_RESTUDY_GOTO_butfirst
5701 * pregcomp - compile a regular expression into internal code
5703 * Decides which engine's compiler to call based on the hint currently in
5707 #ifndef PERL_IN_XSUB_RE
5709 /* return the currently in-scope regex engine (or the default if none) */
5711 regexp_engine const *
5712 Perl_current_re_engine(pTHX)
5714 if (IN_PERL_COMPILETIME) {
5715 HV * const table = GvHV(PL_hintgv);
5718 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5719 return &PL_core_reg_engine;
5720 ptr = hv_fetchs(table, "regcomp", FALSE);
5721 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5722 return &PL_core_reg_engine;
5723 return INT2PTR(regexp_engine*,SvIV(*ptr));
5727 if (!PL_curcop->cop_hints_hash)
5728 return &PL_core_reg_engine;
5729 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5730 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5731 return &PL_core_reg_engine;
5732 return INT2PTR(regexp_engine*,SvIV(ptr));
5738 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5740 regexp_engine const *eng = current_re_engine();
5741 GET_RE_DEBUG_FLAGS_DECL;
5743 PERL_ARGS_ASSERT_PREGCOMP;
5745 /* Dispatch a request to compile a regexp to correct regexp engine. */
5747 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5750 return CALLREGCOMP_ENG(eng, pattern, flags);
5754 /* public(ish) entry point for the perl core's own regex compiling code.
5755 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5756 * pattern rather than a list of OPs, and uses the internal engine rather
5757 * than the current one */
5760 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5762 SV *pat = pattern; /* defeat constness! */
5763 PERL_ARGS_ASSERT_RE_COMPILE;
5764 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5765 #ifdef PERL_IN_XSUB_RE
5768 &PL_core_reg_engine,
5770 NULL, NULL, rx_flags, 0);
5774 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5775 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5776 * point to the realloced string and length.
5778 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5782 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5783 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5785 U8 *const src = (U8*)*pat_p;
5790 GET_RE_DEBUG_FLAGS_DECL;
5792 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5793 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5795 Newx(dst, *plen_p * 2 + 1, U8);
5798 while (s < *plen_p) {
5799 append_utf8_from_native_byte(src[s], &d);
5800 if (n < num_code_blocks) {
5801 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5802 pRExC_state->code_blocks[n].start = d - dst - 1;
5803 assert(*(d - 1) == '(');
5806 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5807 pRExC_state->code_blocks[n].end = d - dst - 1;
5808 assert(*(d - 1) == ')');
5817 *pat_p = (char*) dst;
5819 RExC_orig_utf8 = RExC_utf8 = 1;
5824 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5825 * while recording any code block indices, and handling overloading,
5826 * nested qr// objects etc. If pat is null, it will allocate a new
5827 * string, or just return the first arg, if there's only one.
5829 * Returns the malloced/updated pat.
5830 * patternp and pat_count is the array of SVs to be concatted;
5831 * oplist is the optional list of ops that generated the SVs;
5832 * recompile_p is a pointer to a boolean that will be set if
5833 * the regex will need to be recompiled.
5834 * delim, if non-null is an SV that will be inserted between each element
5838 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5839 SV *pat, SV ** const patternp, int pat_count,
5840 OP *oplist, bool *recompile_p, SV *delim)
5844 bool use_delim = FALSE;
5845 bool alloced = FALSE;
5847 /* if we know we have at least two args, create an empty string,
5848 * then concatenate args to that. For no args, return an empty string */
5849 if (!pat && pat_count != 1) {
5855 for (svp = patternp; svp < patternp + pat_count; svp++) {
5858 STRLEN orig_patlen = 0;
5860 SV *msv = use_delim ? delim : *svp;
5861 if (!msv) msv = &PL_sv_undef;
5863 /* if we've got a delimiter, we go round the loop twice for each
5864 * svp slot (except the last), using the delimiter the second
5873 if (SvTYPE(msv) == SVt_PVAV) {
5874 /* we've encountered an interpolated array within
5875 * the pattern, e.g. /...@a..../. Expand the list of elements,
5876 * then recursively append elements.
5877 * The code in this block is based on S_pushav() */
5879 AV *const av = (AV*)msv;
5880 const SSize_t maxarg = AvFILL(av) + 1;
5884 assert(oplist->op_type == OP_PADAV
5885 || oplist->op_type == OP_RV2AV);
5886 oplist = OpSIBLING(oplist);
5889 if (SvRMAGICAL(av)) {
5892 Newx(array, maxarg, SV*);
5894 for (i=0; i < maxarg; i++) {
5895 SV ** const svp = av_fetch(av, i, FALSE);
5896 array[i] = svp ? *svp : &PL_sv_undef;
5900 array = AvARRAY(av);
5902 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5903 array, maxarg, NULL, recompile_p,
5905 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5911 /* we make the assumption here that each op in the list of
5912 * op_siblings maps to one SV pushed onto the stack,
5913 * except for code blocks, with have both an OP_NULL and
5915 * This allows us to match up the list of SVs against the
5916 * list of OPs to find the next code block.
5918 * Note that PUSHMARK PADSV PADSV ..
5920 * PADRANGE PADSV PADSV ..
5921 * so the alignment still works. */
5924 if (oplist->op_type == OP_NULL
5925 && (oplist->op_flags & OPf_SPECIAL))
5927 assert(n < pRExC_state->num_code_blocks);
5928 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5929 pRExC_state->code_blocks[n].block = oplist;
5930 pRExC_state->code_blocks[n].src_regex = NULL;
5933 oplist = OpSIBLING(oplist); /* skip CONST */
5936 oplist = OpSIBLING(oplist);;
5939 /* apply magic and QR overloading to arg */
5942 if (SvROK(msv) && SvAMAGIC(msv)) {
5943 SV *sv = AMG_CALLunary(msv, regexp_amg);
5947 if (SvTYPE(sv) != SVt_REGEXP)
5948 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5953 /* try concatenation overload ... */
5954 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5955 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5958 /* overloading involved: all bets are off over literal
5959 * code. Pretend we haven't seen it */
5960 pRExC_state->num_code_blocks -= n;
5964 /* ... or failing that, try "" overload */
5965 while (SvAMAGIC(msv)
5966 && (sv = AMG_CALLunary(msv, string_amg))
5970 && SvRV(msv) == SvRV(sv))
5975 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5979 /* this is a partially unrolled
5980 * sv_catsv_nomg(pat, msv);
5981 * that allows us to adjust code block indices if
5984 char *dst = SvPV_force_nomg(pat, dlen);
5986 if (SvUTF8(msv) && !SvUTF8(pat)) {
5987 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5988 sv_setpvn(pat, dst, dlen);
5991 sv_catsv_nomg(pat, msv);
5998 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6001 /* extract any code blocks within any embedded qr//'s */
6002 if (rx && SvTYPE(rx) == SVt_REGEXP
6003 && RX_ENGINE((REGEXP*)rx)->op_comp)
6006 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6007 if (ri->num_code_blocks) {
6009 /* the presence of an embedded qr// with code means
6010 * we should always recompile: the text of the
6011 * qr// may not have changed, but it may be a
6012 * different closure than last time */
6014 Renew(pRExC_state->code_blocks,
6015 pRExC_state->num_code_blocks + ri->num_code_blocks,
6016 struct reg_code_block);
6017 pRExC_state->num_code_blocks += ri->num_code_blocks;
6019 for (i=0; i < ri->num_code_blocks; i++) {
6020 struct reg_code_block *src, *dst;
6021 STRLEN offset = orig_patlen
6022 + ReANY((REGEXP *)rx)->pre_prefix;
6023 assert(n < pRExC_state->num_code_blocks);
6024 src = &ri->code_blocks[i];
6025 dst = &pRExC_state->code_blocks[n];
6026 dst->start = src->start + offset;
6027 dst->end = src->end + offset;
6028 dst->block = src->block;
6029 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6038 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6047 /* see if there are any run-time code blocks in the pattern.
6048 * False positives are allowed */
6051 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6052 char *pat, STRLEN plen)
6057 PERL_UNUSED_CONTEXT;
6059 for (s = 0; s < plen; s++) {
6060 if (n < pRExC_state->num_code_blocks
6061 && s == pRExC_state->code_blocks[n].start)
6063 s = pRExC_state->code_blocks[n].end;
6067 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6069 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6071 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6078 /* Handle run-time code blocks. We will already have compiled any direct
6079 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6080 * copy of it, but with any literal code blocks blanked out and
6081 * appropriate chars escaped; then feed it into
6083 * eval "qr'modified_pattern'"
6087 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6091 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6093 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6094 * and merge them with any code blocks of the original regexp.
6096 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6097 * instead, just save the qr and return FALSE; this tells our caller that
6098 * the original pattern needs upgrading to utf8.
6102 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6103 char *pat, STRLEN plen)
6107 GET_RE_DEBUG_FLAGS_DECL;
6109 if (pRExC_state->runtime_code_qr) {
6110 /* this is the second time we've been called; this should
6111 * only happen if the main pattern got upgraded to utf8
6112 * during compilation; re-use the qr we compiled first time
6113 * round (which should be utf8 too)
6115 qr = pRExC_state->runtime_code_qr;
6116 pRExC_state->runtime_code_qr = NULL;
6117 assert(RExC_utf8 && SvUTF8(qr));
6123 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6127 /* determine how many extra chars we need for ' and \ escaping */
6128 for (s = 0; s < plen; s++) {
6129 if (pat[s] == '\'' || pat[s] == '\\')
6133 Newx(newpat, newlen, char);
6135 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6137 for (s = 0; s < plen; s++) {
6138 if (n < pRExC_state->num_code_blocks
6139 && s == pRExC_state->code_blocks[n].start)
6141 /* blank out literal code block */
6142 assert(pat[s] == '(');
6143 while (s <= pRExC_state->code_blocks[n].end) {
6151 if (pat[s] == '\'' || pat[s] == '\\')
6156 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6160 PerlIO_printf(Perl_debug_log,
6161 "%sre-parsing pattern for runtime code:%s %s\n",
6162 PL_colors[4],PL_colors[5],newpat);
6165 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6170 PUSHSTACKi(PERLSI_REQUIRE);
6171 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6172 * parsing qr''; normally only q'' does this. It also alters
6174 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6175 SvREFCNT_dec_NN(sv);
6180 SV * const errsv = ERRSV;
6181 if (SvTRUE_NN(errsv))
6183 Safefree(pRExC_state->code_blocks);
6184 /* use croak_sv ? */
6185 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6188 assert(SvROK(qr_ref));
6190 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6191 /* the leaving below frees the tmp qr_ref.
6192 * Give qr a life of its own */
6200 if (!RExC_utf8 && SvUTF8(qr)) {
6201 /* first time through; the pattern got upgraded; save the
6202 * qr for the next time through */
6203 assert(!pRExC_state->runtime_code_qr);
6204 pRExC_state->runtime_code_qr = qr;
6209 /* extract any code blocks within the returned qr// */
6212 /* merge the main (r1) and run-time (r2) code blocks into one */
6214 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6215 struct reg_code_block *new_block, *dst;
6216 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6219 if (!r2->num_code_blocks) /* we guessed wrong */
6221 SvREFCNT_dec_NN(qr);
6226 r1->num_code_blocks + r2->num_code_blocks,
6227 struct reg_code_block);
6230 while ( i1 < r1->num_code_blocks
6231 || i2 < r2->num_code_blocks)
6233 struct reg_code_block *src;
6236 if (i1 == r1->num_code_blocks) {
6237 src = &r2->code_blocks[i2++];
6240 else if (i2 == r2->num_code_blocks)
6241 src = &r1->code_blocks[i1++];
6242 else if ( r1->code_blocks[i1].start
6243 < r2->code_blocks[i2].start)
6245 src = &r1->code_blocks[i1++];
6246 assert(src->end < r2->code_blocks[i2].start);
6249 assert( r1->code_blocks[i1].start
6250 > r2->code_blocks[i2].start);
6251 src = &r2->code_blocks[i2++];
6253 assert(src->end < r1->code_blocks[i1].start);
6256 assert(pat[src->start] == '(');
6257 assert(pat[src->end] == ')');
6258 dst->start = src->start;
6259 dst->end = src->end;
6260 dst->block = src->block;
6261 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6265 r1->num_code_blocks += r2->num_code_blocks;
6266 Safefree(r1->code_blocks);
6267 r1->code_blocks = new_block;
6270 SvREFCNT_dec_NN(qr);
6276 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6277 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6278 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6279 STRLEN longest_length, bool eol, bool meol)
6281 /* This is the common code for setting up the floating and fixed length
6282 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6283 * as to whether succeeded or not */
6288 if (! (longest_length
6289 || (eol /* Can't have SEOL and MULTI */
6290 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6292 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6293 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6298 /* copy the information about the longest from the reg_scan_data
6299 over to the program. */
6300 if (SvUTF8(sv_longest)) {
6301 *rx_utf8 = sv_longest;
6304 *rx_substr = sv_longest;
6307 /* end_shift is how many chars that must be matched that
6308 follow this item. We calculate it ahead of time as once the
6309 lookbehind offset is added in we lose the ability to correctly
6311 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6312 *rx_end_shift = ml - offset
6313 - longest_length + (SvTAIL(sv_longest) != 0)
6316 t = (eol/* Can't have SEOL and MULTI */
6317 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6318 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6324 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6325 * regular expression into internal code.
6326 * The pattern may be passed either as:
6327 * a list of SVs (patternp plus pat_count)
6328 * a list of OPs (expr)
6329 * If both are passed, the SV list is used, but the OP list indicates
6330 * which SVs are actually pre-compiled code blocks
6332 * The SVs in the list have magic and qr overloading applied to them (and
6333 * the list may be modified in-place with replacement SVs in the latter
6336 * If the pattern hasn't changed from old_re, then old_re will be
6339 * eng is the current engine. If that engine has an op_comp method, then
6340 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6341 * do the initial concatenation of arguments and pass on to the external
6344 * If is_bare_re is not null, set it to a boolean indicating whether the
6345 * arg list reduced (after overloading) to a single bare regex which has
6346 * been returned (i.e. /$qr/).
6348 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6350 * pm_flags contains the PMf_* flags, typically based on those from the
6351 * pm_flags field of the related PMOP. Currently we're only interested in
6352 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6354 * We can't allocate space until we know how big the compiled form will be,
6355 * but we can't compile it (and thus know how big it is) until we've got a
6356 * place to put the code. So we cheat: we compile it twice, once with code
6357 * generation turned off and size counting turned on, and once "for real".
6358 * This also means that we don't allocate space until we are sure that the
6359 * thing really will compile successfully, and we never have to move the
6360 * code and thus invalidate pointers into it. (Note that it has to be in
6361 * one piece because free() must be able to free it all.) [NB: not true in perl]
6363 * Beware that the optimization-preparation code in here knows about some
6364 * of the structure of the compiled regexp. [I'll say.]
6368 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6369 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6370 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6374 regexp_internal *ri;
6382 SV *code_blocksv = NULL;
6383 SV** new_patternp = patternp;
6385 /* these are all flags - maybe they should be turned
6386 * into a single int with different bit masks */
6387 I32 sawlookahead = 0;
6392 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6394 bool runtime_code = 0;
6396 RExC_state_t RExC_state;
6397 RExC_state_t * const pRExC_state = &RExC_state;
6398 #ifdef TRIE_STUDY_OPT
6400 RExC_state_t copyRExC_state;
6402 GET_RE_DEBUG_FLAGS_DECL;
6404 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6406 DEBUG_r(if (!PL_colorset) reginitcolors());
6408 #ifndef PERL_IN_XSUB_RE
6409 /* Initialize these here instead of as-needed, as is quick and avoids
6410 * having to test them each time otherwise */
6411 if (! PL_AboveLatin1) {
6412 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6413 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6414 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6415 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6416 PL_HasMultiCharFold =
6417 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6419 /* This is calculated here, because the Perl program that generates the
6420 * static global ones doesn't currently have access to
6421 * NUM_ANYOF_CODE_POINTS */
6422 PL_InBitmap = _new_invlist(2);
6423 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6424 NUM_ANYOF_CODE_POINTS - 1);
6428 pRExC_state->code_blocks = NULL;
6429 pRExC_state->num_code_blocks = 0;
6432 *is_bare_re = FALSE;
6434 if (expr && (expr->op_type == OP_LIST ||
6435 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6436 /* allocate code_blocks if needed */
6440 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6441 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6442 ncode++; /* count of DO blocks */
6444 pRExC_state->num_code_blocks = ncode;
6445 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6450 /* compile-time pattern with just OP_CONSTs and DO blocks */
6455 /* find how many CONSTs there are */
6458 if (expr->op_type == OP_CONST)
6461 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6462 if (o->op_type == OP_CONST)
6466 /* fake up an SV array */
6468 assert(!new_patternp);
6469 Newx(new_patternp, n, SV*);
6470 SAVEFREEPV(new_patternp);
6474 if (expr->op_type == OP_CONST)
6475 new_patternp[n] = cSVOPx_sv(expr);
6477 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6478 if (o->op_type == OP_CONST)
6479 new_patternp[n++] = cSVOPo_sv;
6484 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6485 "Assembling pattern from %d elements%s\n", pat_count,
6486 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6488 /* set expr to the first arg op */
6490 if (pRExC_state->num_code_blocks
6491 && expr->op_type != OP_CONST)
6493 expr = cLISTOPx(expr)->op_first;
6494 assert( expr->op_type == OP_PUSHMARK
6495 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6496 || expr->op_type == OP_PADRANGE);
6497 expr = OpSIBLING(expr);
6500 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6501 expr, &recompile, NULL);
6503 /* handle bare (possibly after overloading) regex: foo =~ $re */
6508 if (SvTYPE(re) == SVt_REGEXP) {
6512 Safefree(pRExC_state->code_blocks);
6513 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6514 "Precompiled pattern%s\n",
6515 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6521 exp = SvPV_nomg(pat, plen);
6523 if (!eng->op_comp) {
6524 if ((SvUTF8(pat) && IN_BYTES)
6525 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6527 /* make a temporary copy; either to convert to bytes,
6528 * or to avoid repeating get-magic / overloaded stringify */
6529 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6530 (IN_BYTES ? 0 : SvUTF8(pat)));
6532 Safefree(pRExC_state->code_blocks);
6533 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6536 /* ignore the utf8ness if the pattern is 0 length */
6537 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6538 RExC_uni_semantics = 0;
6539 RExC_contains_locale = 0;
6540 RExC_contains_i = 0;
6541 pRExC_state->runtime_code_qr = NULL;
6542 RExC_frame_head= NULL;
6543 RExC_frame_last= NULL;
6544 RExC_frame_count= 0;
6547 RExC_mysv1= sv_newmortal();
6548 RExC_mysv2= sv_newmortal();
6551 SV *dsv= sv_newmortal();
6552 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6553 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6554 PL_colors[4],PL_colors[5],s);
6558 /* we jump here if we upgrade the pattern to utf8 and have to
6561 if ((pm_flags & PMf_USE_RE_EVAL)
6562 /* this second condition covers the non-regex literal case,
6563 * i.e. $foo =~ '(?{})'. */
6564 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6566 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6568 /* return old regex if pattern hasn't changed */
6569 /* XXX: note in the below we have to check the flags as well as the
6572 * Things get a touch tricky as we have to compare the utf8 flag
6573 * independently from the compile flags. */
6577 && !!RX_UTF8(old_re) == !!RExC_utf8
6578 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6579 && RX_PRECOMP(old_re)
6580 && RX_PRELEN(old_re) == plen
6581 && memEQ(RX_PRECOMP(old_re), exp, plen)
6582 && !runtime_code /* with runtime code, always recompile */ )
6584 Safefree(pRExC_state->code_blocks);
6588 rx_flags = orig_rx_flags;
6590 if (rx_flags & PMf_FOLD) {
6591 RExC_contains_i = 1;
6593 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6595 /* Set to use unicode semantics if the pattern is in utf8 and has the
6596 * 'depends' charset specified, as it means unicode when utf8 */
6597 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6601 RExC_flags = rx_flags;
6602 RExC_pm_flags = pm_flags;
6605 if (TAINTING_get && TAINT_get)
6606 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6608 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6609 /* whoops, we have a non-utf8 pattern, whilst run-time code
6610 * got compiled as utf8. Try again with a utf8 pattern */
6611 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6612 pRExC_state->num_code_blocks);
6613 goto redo_first_pass;
6616 assert(!pRExC_state->runtime_code_qr);
6622 RExC_in_lookbehind = 0;
6623 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6625 RExC_override_recoding = 0;
6626 RExC_in_multi_char_class = 0;
6628 /* First pass: determine size, legality. */
6631 RExC_end = exp + plen;
6636 RExC_emit = (regnode *) &RExC_emit_dummy;
6637 RExC_whilem_seen = 0;
6638 RExC_open_parens = NULL;
6639 RExC_close_parens = NULL;
6641 RExC_paren_names = NULL;
6643 RExC_paren_name_list = NULL;
6645 RExC_recurse = NULL;
6646 RExC_study_chunk_recursed = NULL;
6647 RExC_study_chunk_recursed_bytes= 0;
6648 RExC_recurse_count = 0;
6649 pRExC_state->code_index = 0;
6651 #if 0 /* REGC() is (currently) a NOP at the first pass.
6652 * Clever compilers notice this and complain. --jhi */
6653 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6656 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6658 RExC_lastparse=NULL;
6660 /* reg may croak on us, not giving us a chance to free
6661 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6662 need it to survive as long as the regexp (qr/(?{})/).
6663 We must check that code_blocksv is not already set, because we may
6664 have jumped back to restart the sizing pass. */
6665 if (pRExC_state->code_blocks && !code_blocksv) {
6666 code_blocksv = newSV_type(SVt_PV);
6667 SAVEFREESV(code_blocksv);
6668 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6669 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6671 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6672 /* It's possible to write a regexp in ascii that represents Unicode
6673 codepoints outside of the byte range, such as via \x{100}. If we
6674 detect such a sequence we have to convert the entire pattern to utf8
6675 and then recompile, as our sizing calculation will have been based
6676 on 1 byte == 1 character, but we will need to use utf8 to encode
6677 at least some part of the pattern, and therefore must convert the whole
6680 if (flags & RESTART_UTF8) {
6681 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6682 pRExC_state->num_code_blocks);
6683 goto redo_first_pass;
6685 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6688 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6691 PerlIO_printf(Perl_debug_log,
6692 "Required size %"IVdf" nodes\n"
6693 "Starting second pass (creation)\n",
6696 RExC_lastparse=NULL;
6699 /* The first pass could have found things that force Unicode semantics */
6700 if ((RExC_utf8 || RExC_uni_semantics)
6701 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6703 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6706 /* Small enough for pointer-storage convention?
6707 If extralen==0, this means that we will not need long jumps. */
6708 if (RExC_size >= 0x10000L && RExC_extralen)
6709 RExC_size += RExC_extralen;
6712 if (RExC_whilem_seen > 15)
6713 RExC_whilem_seen = 15;
6715 /* Allocate space and zero-initialize. Note, the two step process
6716 of zeroing when in debug mode, thus anything assigned has to
6717 happen after that */
6718 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6720 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6721 char, regexp_internal);
6722 if ( r == NULL || ri == NULL )
6723 FAIL("Regexp out of space");
6725 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6726 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6729 /* bulk initialize base fields with 0. */
6730 Zero(ri, sizeof(regexp_internal), char);
6733 /* non-zero initialization begins here */
6736 r->extflags = rx_flags;
6737 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6739 if (pm_flags & PMf_IS_QR) {
6740 ri->code_blocks = pRExC_state->code_blocks;
6741 ri->num_code_blocks = pRExC_state->num_code_blocks;
6746 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6747 if (pRExC_state->code_blocks[n].src_regex)
6748 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6749 SAVEFREEPV(pRExC_state->code_blocks);
6753 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6754 bool has_charset = (get_regex_charset(r->extflags)
6755 != REGEX_DEPENDS_CHARSET);
6757 /* The caret is output if there are any defaults: if not all the STD
6758 * flags are set, or if no character set specifier is needed */
6760 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6762 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6763 == REG_RUN_ON_COMMENT_SEEN);
6764 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6765 >> RXf_PMf_STD_PMMOD_SHIFT);
6766 const char *fptr = STD_PAT_MODS; /*"msixn"*/
6768 /* Allocate for the worst case, which is all the std flags are turned
6769 * on. If more precision is desired, we could do a population count of
6770 * the flags set. This could be done with a small lookup table, or by
6771 * shifting, masking and adding, or even, when available, assembly
6772 * language for a machine-language population count.
6773 * We never output a minus, as all those are defaults, so are
6774 * covered by the caret */
6775 const STRLEN wraplen = plen + has_p + has_runon
6776 + has_default /* If needs a caret */
6778 /* If needs a character set specifier */
6779 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6780 + (sizeof(STD_PAT_MODS) - 1)
6781 + (sizeof("(?:)") - 1);
6783 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6784 r->xpv_len_u.xpvlenu_pv = p;
6786 SvFLAGS(rx) |= SVf_UTF8;
6789 /* If a default, cover it using the caret */
6791 *p++= DEFAULT_PAT_MOD;
6795 const char* const name = get_regex_charset_name(r->extflags, &len);
6796 Copy(name, p, len, char);
6800 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6803 while((ch = *fptr++)) {
6811 Copy(RExC_precomp, p, plen, char);
6812 assert ((RX_WRAPPED(rx) - p) < 16);
6813 r->pre_prefix = p - RX_WRAPPED(rx);
6819 SvCUR_set(rx, p - RX_WRAPPED(rx));
6823 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6825 /* setup various meta data about recursion, this all requires
6826 * RExC_npar to be correctly set, and a bit later on we clear it */
6827 if (RExC_seen & REG_RECURSE_SEEN) {
6828 Newxz(RExC_open_parens, RExC_npar,regnode *);
6829 SAVEFREEPV(RExC_open_parens);
6830 Newxz(RExC_close_parens,RExC_npar,regnode *);
6831 SAVEFREEPV(RExC_close_parens);
6833 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6834 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6835 * So its 1 if there are no parens. */
6836 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6837 ((RExC_npar & 0x07) != 0);
6838 Newx(RExC_study_chunk_recursed,
6839 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6840 SAVEFREEPV(RExC_study_chunk_recursed);
6843 /* Useful during FAIL. */
6844 #ifdef RE_TRACK_PATTERN_OFFSETS
6845 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6846 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6847 "%s %"UVuf" bytes for offset annotations.\n",
6848 ri->u.offsets ? "Got" : "Couldn't get",
6849 (UV)((2*RExC_size+1) * sizeof(U32))));
6851 SetProgLen(ri,RExC_size);
6856 /* Second pass: emit code. */
6857 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6858 RExC_pm_flags = pm_flags;
6860 RExC_end = exp + plen;
6863 RExC_emit_start = ri->program;
6864 RExC_emit = ri->program;
6865 RExC_emit_bound = ri->program + RExC_size + 1;
6866 pRExC_state->code_index = 0;
6868 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6869 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6871 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6873 /* XXXX To minimize changes to RE engine we always allocate
6874 3-units-long substrs field. */
6875 Newx(r->substrs, 1, struct reg_substr_data);
6876 if (RExC_recurse_count) {
6877 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6878 SAVEFREEPV(RExC_recurse);
6882 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6884 RExC_study_chunk_recursed_count= 0;
6886 Zero(r->substrs, 1, struct reg_substr_data);
6887 if (RExC_study_chunk_recursed) {
6888 Zero(RExC_study_chunk_recursed,
6889 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6893 #ifdef TRIE_STUDY_OPT
6895 StructCopy(&zero_scan_data, &data, scan_data_t);
6896 copyRExC_state = RExC_state;
6899 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6901 RExC_state = copyRExC_state;
6902 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6903 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6905 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6906 StructCopy(&zero_scan_data, &data, scan_data_t);
6909 StructCopy(&zero_scan_data, &data, scan_data_t);
6912 /* Dig out information for optimizations. */
6913 r->extflags = RExC_flags; /* was pm_op */
6914 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6917 SvUTF8_on(rx); /* Unicode in it? */
6918 ri->regstclass = NULL;
6919 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
6920 r->intflags |= PREGf_NAUGHTY;
6921 scan = ri->program + 1; /* First BRANCH. */
6923 /* testing for BRANCH here tells us whether there is "must appear"
6924 data in the pattern. If there is then we can use it for optimisations */
6925 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6928 STRLEN longest_float_length, longest_fixed_length;
6929 regnode_ssc ch_class; /* pointed to by data */
6931 SSize_t last_close = 0; /* pointed to by data */
6932 regnode *first= scan;
6933 regnode *first_next= regnext(first);
6935 * Skip introductions and multiplicators >= 1
6936 * so that we can extract the 'meat' of the pattern that must
6937 * match in the large if() sequence following.
6938 * NOTE that EXACT is NOT covered here, as it is normally
6939 * picked up by the optimiser separately.
6941 * This is unfortunate as the optimiser isnt handling lookahead
6942 * properly currently.
6945 while ((OP(first) == OPEN && (sawopen = 1)) ||
6946 /* An OR of *one* alternative - should not happen now. */
6947 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6948 /* for now we can't handle lookbehind IFMATCH*/
6949 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6950 (OP(first) == PLUS) ||
6951 (OP(first) == MINMOD) ||
6952 /* An {n,m} with n>0 */
6953 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6954 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6957 * the only op that could be a regnode is PLUS, all the rest
6958 * will be regnode_1 or regnode_2.
6960 * (yves doesn't think this is true)
6962 if (OP(first) == PLUS)
6965 if (OP(first) == MINMOD)
6967 first += regarglen[OP(first)];
6969 first = NEXTOPER(first);
6970 first_next= regnext(first);
6973 /* Starting-point info. */
6975 DEBUG_PEEP("first:",first,0);
6976 /* Ignore EXACT as we deal with it later. */
6977 if (PL_regkind[OP(first)] == EXACT) {
6978 if (OP(first) == EXACT || OP(first) == EXACTL)
6979 NOOP; /* Empty, get anchored substr later. */
6981 ri->regstclass = first;
6984 else if (PL_regkind[OP(first)] == TRIE &&
6985 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6987 /* this can happen only on restudy */
6988 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6991 else if (REGNODE_SIMPLE(OP(first)))
6992 ri->regstclass = first;
6993 else if (PL_regkind[OP(first)] == BOUND ||
6994 PL_regkind[OP(first)] == NBOUND)
6995 ri->regstclass = first;
6996 else if (PL_regkind[OP(first)] == BOL) {
6997 r->intflags |= (OP(first) == MBOL
7000 first = NEXTOPER(first);
7003 else if (OP(first) == GPOS) {
7004 r->intflags |= PREGf_ANCH_GPOS;
7005 first = NEXTOPER(first);
7008 else if ((!sawopen || !RExC_sawback) &&
7010 (OP(first) == STAR &&
7011 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7012 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7014 /* turn .* into ^.* with an implied $*=1 */
7016 (OP(NEXTOPER(first)) == REG_ANY)
7019 r->intflags |= (type | PREGf_IMPLICIT);
7020 first = NEXTOPER(first);
7023 if (sawplus && !sawminmod && !sawlookahead
7024 && (!sawopen || !RExC_sawback)
7025 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7026 /* x+ must match at the 1st pos of run of x's */
7027 r->intflags |= PREGf_SKIP;
7029 /* Scan is after the zeroth branch, first is atomic matcher. */
7030 #ifdef TRIE_STUDY_OPT
7033 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7034 (IV)(first - scan + 1))
7038 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7039 (IV)(first - scan + 1))
7045 * If there's something expensive in the r.e., find the
7046 * longest literal string that must appear and make it the
7047 * regmust. Resolve ties in favor of later strings, since
7048 * the regstart check works with the beginning of the r.e.
7049 * and avoiding duplication strengthens checking. Not a
7050 * strong reason, but sufficient in the absence of others.
7051 * [Now we resolve ties in favor of the earlier string if
7052 * it happens that c_offset_min has been invalidated, since the
7053 * earlier string may buy us something the later one won't.]
7056 data.longest_fixed = newSVpvs("");
7057 data.longest_float = newSVpvs("");
7058 data.last_found = newSVpvs("");
7059 data.longest = &(data.longest_fixed);
7060 ENTER_with_name("study_chunk");
7061 SAVEFREESV(data.longest_fixed);
7062 SAVEFREESV(data.longest_float);
7063 SAVEFREESV(data.last_found);
7065 if (!ri->regstclass) {
7066 ssc_init(pRExC_state, &ch_class);
7067 data.start_class = &ch_class;
7068 stclass_flag = SCF_DO_STCLASS_AND;
7069 } else /* XXXX Check for BOUND? */
7071 data.last_closep = &last_close;
7074 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7075 scan + RExC_size, /* Up to end */
7077 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7078 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7082 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7085 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7086 && data.last_start_min == 0 && data.last_end > 0
7087 && !RExC_seen_zerolen
7088 && !(RExC_seen & REG_VERBARG_SEEN)
7089 && !(RExC_seen & REG_GPOS_SEEN)
7091 r->extflags |= RXf_CHECK_ALL;
7093 scan_commit(pRExC_state, &data,&minlen,0);
7095 longest_float_length = CHR_SVLEN(data.longest_float);
7097 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7098 && data.offset_fixed == data.offset_float_min
7099 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7100 && S_setup_longest (aTHX_ pRExC_state,
7104 &(r->float_end_shift),
7105 data.lookbehind_float,
7106 data.offset_float_min,
7108 longest_float_length,
7109 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7110 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7112 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7113 r->float_max_offset = data.offset_float_max;
7114 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7115 r->float_max_offset -= data.lookbehind_float;
7116 SvREFCNT_inc_simple_void_NN(data.longest_float);
7119 r->float_substr = r->float_utf8 = NULL;
7120 longest_float_length = 0;
7123 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7125 if (S_setup_longest (aTHX_ pRExC_state,
7127 &(r->anchored_utf8),
7128 &(r->anchored_substr),
7129 &(r->anchored_end_shift),
7130 data.lookbehind_fixed,
7133 longest_fixed_length,
7134 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7135 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7137 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7138 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7141 r->anchored_substr = r->anchored_utf8 = NULL;
7142 longest_fixed_length = 0;
7144 LEAVE_with_name("study_chunk");
7147 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7148 ri->regstclass = NULL;
7150 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7152 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7153 && is_ssc_worth_it(pRExC_state, data.start_class))
7155 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7157 ssc_finalize(pRExC_state, data.start_class);
7159 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7160 StructCopy(data.start_class,
7161 (regnode_ssc*)RExC_rxi->data->data[n],
7163 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7164 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7165 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7166 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7167 PerlIO_printf(Perl_debug_log,
7168 "synthetic stclass \"%s\".\n",
7169 SvPVX_const(sv));});
7170 data.start_class = NULL;
7173 /* A temporary algorithm prefers floated substr to fixed one to dig
7175 if (longest_fixed_length > longest_float_length) {
7176 r->substrs->check_ix = 0;
7177 r->check_end_shift = r->anchored_end_shift;
7178 r->check_substr = r->anchored_substr;
7179 r->check_utf8 = r->anchored_utf8;
7180 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7181 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7182 r->intflags |= PREGf_NOSCAN;
7185 r->substrs->check_ix = 1;
7186 r->check_end_shift = r->float_end_shift;
7187 r->check_substr = r->float_substr;
7188 r->check_utf8 = r->float_utf8;
7189 r->check_offset_min = r->float_min_offset;
7190 r->check_offset_max = r->float_max_offset;
7192 if ((r->check_substr || r->check_utf8) ) {
7193 r->extflags |= RXf_USE_INTUIT;
7194 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7195 r->extflags |= RXf_INTUIT_TAIL;
7197 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7199 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7200 if ( (STRLEN)minlen < longest_float_length )
7201 minlen= longest_float_length;
7202 if ( (STRLEN)minlen < longest_fixed_length )
7203 minlen= longest_fixed_length;
7207 /* Several toplevels. Best we can is to set minlen. */
7209 regnode_ssc ch_class;
7210 SSize_t last_close = 0;
7212 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7214 scan = ri->program + 1;
7215 ssc_init(pRExC_state, &ch_class);
7216 data.start_class = &ch_class;
7217 data.last_closep = &last_close;
7220 minlen = study_chunk(pRExC_state,
7221 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7222 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7223 ? SCF_TRIE_DOING_RESTUDY
7227 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7229 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7230 = r->float_substr = r->float_utf8 = NULL;
7232 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7233 && is_ssc_worth_it(pRExC_state, data.start_class))
7235 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7237 ssc_finalize(pRExC_state, data.start_class);
7239 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7240 StructCopy(data.start_class,
7241 (regnode_ssc*)RExC_rxi->data->data[n],
7243 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7244 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7245 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7246 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7247 PerlIO_printf(Perl_debug_log,
7248 "synthetic stclass \"%s\".\n",
7249 SvPVX_const(sv));});
7250 data.start_class = NULL;
7254 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7255 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7256 r->maxlen = REG_INFTY;
7259 r->maxlen = RExC_maxlen;
7262 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7263 the "real" pattern. */
7265 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7266 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7268 r->minlenret = minlen;
7269 if (r->minlen < minlen)
7272 if (RExC_seen & REG_GPOS_SEEN)
7273 r->intflags |= PREGf_GPOS_SEEN;
7274 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7275 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7277 if (pRExC_state->num_code_blocks)
7278 r->extflags |= RXf_EVAL_SEEN;
7279 if (RExC_seen & REG_CANY_SEEN)
7280 r->intflags |= PREGf_CANY_SEEN;
7281 if (RExC_seen & REG_VERBARG_SEEN)
7283 r->intflags |= PREGf_VERBARG_SEEN;
7284 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7286 if (RExC_seen & REG_CUTGROUP_SEEN)
7287 r->intflags |= PREGf_CUTGROUP_SEEN;
7288 if (pm_flags & PMf_USE_RE_EVAL)
7289 r->intflags |= PREGf_USE_RE_EVAL;
7290 if (RExC_paren_names)
7291 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7293 RXp_PAREN_NAMES(r) = NULL;
7295 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7296 * so it can be used in pp.c */
7297 if (r->intflags & PREGf_ANCH)
7298 r->extflags |= RXf_IS_ANCHORED;
7302 /* this is used to identify "special" patterns that might result
7303 * in Perl NOT calling the regex engine and instead doing the match "itself",
7304 * particularly special cases in split//. By having the regex compiler
7305 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7306 * we avoid weird issues with equivalent patterns resulting in different behavior,
7307 * AND we allow non Perl engines to get the same optimizations by the setting the
7308 * flags appropriately - Yves */
7309 regnode *first = ri->program + 1;
7311 regnode *next = NEXTOPER(first);
7314 if (PL_regkind[fop] == NOTHING && nop == END)
7315 r->extflags |= RXf_NULL;
7316 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7317 /* when fop is SBOL first->flags will be true only when it was
7318 * produced by parsing /\A/, and not when parsing /^/. This is
7319 * very important for the split code as there we want to
7320 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7321 * See rt #122761 for more details. -- Yves */
7322 r->extflags |= RXf_START_ONLY;
7323 else if (fop == PLUS
7324 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7325 && OP(regnext(first)) == END)
7326 r->extflags |= RXf_WHITE;
7327 else if ( r->extflags & RXf_SPLIT
7328 && (fop == EXACT || fop == EXACTL)
7329 && STR_LEN(first) == 1
7330 && *(STRING(first)) == ' '
7331 && OP(regnext(first)) == END )
7332 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7336 if (RExC_contains_locale) {
7337 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7341 if (RExC_paren_names) {
7342 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7343 ri->data->data[ri->name_list_idx]
7344 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7347 ri->name_list_idx = 0;
7349 if (RExC_recurse_count) {
7350 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7351 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7352 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7355 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7356 /* assume we don't need to swap parens around before we match */
7358 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7359 (unsigned long)RExC_study_chunk_recursed_count);
7363 PerlIO_printf(Perl_debug_log,"Final program:\n");
7366 #ifdef RE_TRACK_PATTERN_OFFSETS
7367 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7368 const STRLEN len = ri->u.offsets[0];
7370 GET_RE_DEBUG_FLAGS_DECL;
7371 PerlIO_printf(Perl_debug_log,
7372 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7373 for (i = 1; i <= len; i++) {
7374 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7375 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7376 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7378 PerlIO_printf(Perl_debug_log, "\n");
7383 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7384 * by setting the regexp SV to readonly-only instead. If the
7385 * pattern's been recompiled, the USEDness should remain. */
7386 if (old_re && SvREADONLY(old_re))
7394 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7397 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7399 PERL_UNUSED_ARG(value);
7401 if (flags & RXapif_FETCH) {
7402 return reg_named_buff_fetch(rx, key, flags);
7403 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7404 Perl_croak_no_modify();
7406 } else if (flags & RXapif_EXISTS) {
7407 return reg_named_buff_exists(rx, key, flags)
7410 } else if (flags & RXapif_REGNAMES) {
7411 return reg_named_buff_all(rx, flags);
7412 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7413 return reg_named_buff_scalar(rx, flags);
7415 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7421 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7424 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7425 PERL_UNUSED_ARG(lastkey);
7427 if (flags & RXapif_FIRSTKEY)
7428 return reg_named_buff_firstkey(rx, flags);
7429 else if (flags & RXapif_NEXTKEY)
7430 return reg_named_buff_nextkey(rx, flags);
7432 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7439 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7442 AV *retarray = NULL;
7444 struct regexp *const rx = ReANY(r);
7446 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7448 if (flags & RXapif_ALL)
7451 if (rx && RXp_PAREN_NAMES(rx)) {
7452 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7455 SV* sv_dat=HeVAL(he_str);
7456 I32 *nums=(I32*)SvPVX(sv_dat);
7457 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7458 if ((I32)(rx->nparens) >= nums[i]
7459 && rx->offs[nums[i]].start != -1
7460 && rx->offs[nums[i]].end != -1)
7463 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7468 ret = newSVsv(&PL_sv_undef);
7471 av_push(retarray, ret);
7474 return newRV_noinc(MUTABLE_SV(retarray));
7481 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7484 struct regexp *const rx = ReANY(r);
7486 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7488 if (rx && RXp_PAREN_NAMES(rx)) {
7489 if (flags & RXapif_ALL) {
7490 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7492 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7494 SvREFCNT_dec_NN(sv);
7506 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7508 struct regexp *const rx = ReANY(r);
7510 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7512 if ( rx && RXp_PAREN_NAMES(rx) ) {
7513 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7515 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7522 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7524 struct regexp *const rx = ReANY(r);
7525 GET_RE_DEBUG_FLAGS_DECL;
7527 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7529 if (rx && RXp_PAREN_NAMES(rx)) {
7530 HV *hv = RXp_PAREN_NAMES(rx);
7532 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7535 SV* sv_dat = HeVAL(temphe);
7536 I32 *nums = (I32*)SvPVX(sv_dat);
7537 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7538 if ((I32)(rx->lastparen) >= nums[i] &&
7539 rx->offs[nums[i]].start != -1 &&
7540 rx->offs[nums[i]].end != -1)
7546 if (parno || flags & RXapif_ALL) {
7547 return newSVhek(HeKEY_hek(temphe));
7555 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7560 struct regexp *const rx = ReANY(r);
7562 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7564 if (rx && RXp_PAREN_NAMES(rx)) {
7565 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7566 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7567 } else if (flags & RXapif_ONE) {
7568 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7569 av = MUTABLE_AV(SvRV(ret));
7570 length = av_tindex(av);
7571 SvREFCNT_dec_NN(ret);
7572 return newSViv(length + 1);
7574 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7579 return &PL_sv_undef;
7583 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7585 struct regexp *const rx = ReANY(r);
7588 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7590 if (rx && RXp_PAREN_NAMES(rx)) {
7591 HV *hv= RXp_PAREN_NAMES(rx);
7593 (void)hv_iterinit(hv);
7594 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7597 SV* sv_dat = HeVAL(temphe);
7598 I32 *nums = (I32*)SvPVX(sv_dat);
7599 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7600 if ((I32)(rx->lastparen) >= nums[i] &&
7601 rx->offs[nums[i]].start != -1 &&
7602 rx->offs[nums[i]].end != -1)
7608 if (parno || flags & RXapif_ALL) {
7609 av_push(av, newSVhek(HeKEY_hek(temphe)));
7614 return newRV_noinc(MUTABLE_SV(av));
7618 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7621 struct regexp *const rx = ReANY(r);
7627 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7629 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7630 || n == RX_BUFF_IDX_CARET_FULLMATCH
7631 || n == RX_BUFF_IDX_CARET_POSTMATCH
7634 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7636 /* on something like
7639 * the KEEPCOPY is set on the PMOP rather than the regex */
7640 if (PL_curpm && r == PM_GETRE(PL_curpm))
7641 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7650 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7651 /* no need to distinguish between them any more */
7652 n = RX_BUFF_IDX_FULLMATCH;
7654 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7655 && rx->offs[0].start != -1)
7657 /* $`, ${^PREMATCH} */
7658 i = rx->offs[0].start;
7662 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7663 && rx->offs[0].end != -1)
7665 /* $', ${^POSTMATCH} */
7666 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7667 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7670 if ( 0 <= n && n <= (I32)rx->nparens &&
7671 (s1 = rx->offs[n].start) != -1 &&
7672 (t1 = rx->offs[n].end) != -1)
7674 /* $&, ${^MATCH}, $1 ... */
7676 s = rx->subbeg + s1 - rx->suboffset;
7681 assert(s >= rx->subbeg);
7682 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7684 #ifdef NO_TAINT_SUPPORT
7685 sv_setpvn(sv, s, i);
7687 const int oldtainted = TAINT_get;
7689 sv_setpvn(sv, s, i);
7690 TAINT_set(oldtainted);
7692 if ( (rx->intflags & PREGf_CANY_SEEN)
7693 ? (RXp_MATCH_UTF8(rx)
7694 && (!i || is_utf8_string((U8*)s, i)))
7695 : (RXp_MATCH_UTF8(rx)) )
7702 if (RXp_MATCH_TAINTED(rx)) {
7703 if (SvTYPE(sv) >= SVt_PVMG) {
7704 MAGIC* const mg = SvMAGIC(sv);
7707 SvMAGIC_set(sv, mg->mg_moremagic);
7709 if ((mgt = SvMAGIC(sv))) {
7710 mg->mg_moremagic = mgt;
7711 SvMAGIC_set(sv, mg);
7722 sv_setsv(sv,&PL_sv_undef);
7728 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7729 SV const * const value)
7731 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7733 PERL_UNUSED_ARG(rx);
7734 PERL_UNUSED_ARG(paren);
7735 PERL_UNUSED_ARG(value);
7738 Perl_croak_no_modify();
7742 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7745 struct regexp *const rx = ReANY(r);
7749 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7751 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7752 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7753 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7756 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7758 /* on something like
7761 * the KEEPCOPY is set on the PMOP rather than the regex */
7762 if (PL_curpm && r == PM_GETRE(PL_curpm))
7763 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7769 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7771 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7772 case RX_BUFF_IDX_PREMATCH: /* $` */
7773 if (rx->offs[0].start != -1) {
7774 i = rx->offs[0].start;
7783 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7784 case RX_BUFF_IDX_POSTMATCH: /* $' */
7785 if (rx->offs[0].end != -1) {
7786 i = rx->sublen - rx->offs[0].end;
7788 s1 = rx->offs[0].end;
7795 default: /* $& / ${^MATCH}, $1, $2, ... */
7796 if (paren <= (I32)rx->nparens &&
7797 (s1 = rx->offs[paren].start) != -1 &&
7798 (t1 = rx->offs[paren].end) != -1)
7804 if (ckWARN(WARN_UNINITIALIZED))
7805 report_uninit((const SV *)sv);
7810 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7811 const char * const s = rx->subbeg - rx->suboffset + s1;
7816 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7823 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7825 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7826 PERL_UNUSED_ARG(rx);
7830 return newSVpvs("Regexp");
7833 /* Scans the name of a named buffer from the pattern.
7834 * If flags is REG_RSN_RETURN_NULL returns null.
7835 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7836 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7837 * to the parsed name as looked up in the RExC_paren_names hash.
7838 * If there is an error throws a vFAIL().. type exception.
7841 #define REG_RSN_RETURN_NULL 0
7842 #define REG_RSN_RETURN_NAME 1
7843 #define REG_RSN_RETURN_DATA 2
7846 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7848 char *name_start = RExC_parse;
7850 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7852 assert (RExC_parse <= RExC_end);
7853 if (RExC_parse == RExC_end) NOOP;
7854 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7855 /* skip IDFIRST by using do...while */
7858 RExC_parse += UTF8SKIP(RExC_parse);
7859 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7863 } while (isWORDCHAR(*RExC_parse));
7865 RExC_parse++; /* so the <- from the vFAIL is after the offending
7867 vFAIL("Group name must start with a non-digit word character");
7871 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7872 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7873 if ( flags == REG_RSN_RETURN_NAME)
7875 else if (flags==REG_RSN_RETURN_DATA) {
7878 if ( ! sv_name ) /* should not happen*/
7879 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7880 if (RExC_paren_names)
7881 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7883 sv_dat = HeVAL(he_str);
7885 vFAIL("Reference to nonexistent named group");
7889 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7890 (unsigned long) flags);
7892 NOT_REACHED; /* NOT REACHED */
7897 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7899 if (RExC_lastparse!=RExC_parse) { \
7900 PerlIO_printf(Perl_debug_log, "%s", \
7901 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
7902 RExC_end - RExC_parse, 16, \
7904 PERL_PV_ESCAPE_UNI_DETECT | \
7905 PERL_PV_PRETTY_ELLIPSES | \
7906 PERL_PV_PRETTY_LTGT | \
7907 PERL_PV_ESCAPE_RE | \
7908 PERL_PV_PRETTY_EXACTSIZE \
7912 PerlIO_printf(Perl_debug_log,"%16s",""); \
7915 num = RExC_size + 1; \
7917 num=REG_NODE_NUM(RExC_emit); \
7918 if (RExC_lastnum!=num) \
7919 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7921 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7922 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7923 (int)((depth*2)), "", \
7927 RExC_lastparse=RExC_parse; \
7932 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7933 DEBUG_PARSE_MSG((funcname)); \
7934 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7936 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7937 DEBUG_PARSE_MSG((funcname)); \
7938 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7941 /* This section of code defines the inversion list object and its methods. The
7942 * interfaces are highly subject to change, so as much as possible is static to
7943 * this file. An inversion list is here implemented as a malloc'd C UV array
7944 * as an SVt_INVLIST scalar.
7946 * An inversion list for Unicode is an array of code points, sorted by ordinal
7947 * number. The zeroth element is the first code point in the list. The 1th
7948 * element is the first element beyond that not in the list. In other words,
7949 * the first range is
7950 * invlist[0]..(invlist[1]-1)
7951 * The other ranges follow. Thus every element whose index is divisible by two
7952 * marks the beginning of a range that is in the list, and every element not
7953 * divisible by two marks the beginning of a range not in the list. A single
7954 * element inversion list that contains the single code point N generally
7955 * consists of two elements
7958 * (The exception is when N is the highest representable value on the
7959 * machine, in which case the list containing just it would be a single
7960 * element, itself. By extension, if the last range in the list extends to
7961 * infinity, then the first element of that range will be in the inversion list
7962 * at a position that is divisible by two, and is the final element in the
7964 * Taking the complement (inverting) an inversion list is quite simple, if the
7965 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7966 * This implementation reserves an element at the beginning of each inversion
7967 * list to always contain 0; there is an additional flag in the header which
7968 * indicates if the list begins at the 0, or is offset to begin at the next
7971 * More about inversion lists can be found in "Unicode Demystified"
7972 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7973 * More will be coming when functionality is added later.
7975 * The inversion list data structure is currently implemented as an SV pointing
7976 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7977 * array of UV whose memory management is automatically handled by the existing
7978 * facilities for SV's.
7980 * Some of the methods should always be private to the implementation, and some
7981 * should eventually be made public */
7983 /* The header definitions are in F<inline_invlist.c> */
7985 PERL_STATIC_INLINE UV*
7986 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7988 /* Returns a pointer to the first element in the inversion list's array.
7989 * This is called upon initialization of an inversion list. Where the
7990 * array begins depends on whether the list has the code point U+0000 in it
7991 * or not. The other parameter tells it whether the code that follows this
7992 * call is about to put a 0 in the inversion list or not. The first
7993 * element is either the element reserved for 0, if TRUE, or the element
7994 * after it, if FALSE */
7996 bool* offset = get_invlist_offset_addr(invlist);
7997 UV* zero_addr = (UV *) SvPVX(invlist);
7999 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8002 assert(! _invlist_len(invlist));
8006 /* 1^1 = 0; 1^0 = 1 */
8007 *offset = 1 ^ will_have_0;
8008 return zero_addr + *offset;
8011 PERL_STATIC_INLINE UV*
8012 S_invlist_array(SV* const invlist)
8014 /* Returns the pointer to the inversion list's array. Every time the
8015 * length changes, this needs to be called in case malloc or realloc moved
8018 PERL_ARGS_ASSERT_INVLIST_ARRAY;
8020 /* Must not be empty. If these fail, you probably didn't check for <len>
8021 * being non-zero before trying to get the array */
8022 assert(_invlist_len(invlist));
8024 /* The very first element always contains zero, The array begins either
8025 * there, or if the inversion list is offset, at the element after it.
8026 * The offset header field determines which; it contains 0 or 1 to indicate
8027 * how much additionally to add */
8028 assert(0 == *(SvPVX(invlist)));
8029 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8032 PERL_STATIC_INLINE void
8033 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8035 /* Sets the current number of elements stored in the inversion list.
8036 * Updates SvCUR correspondingly */
8037 PERL_UNUSED_CONTEXT;
8038 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8040 assert(SvTYPE(invlist) == SVt_INVLIST);
8045 : TO_INTERNAL_SIZE(len + offset));
8046 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8049 #ifndef PERL_IN_XSUB_RE
8051 PERL_STATIC_INLINE IV*
8052 S_get_invlist_previous_index_addr(SV* invlist)
8054 /* Return the address of the IV that is reserved to hold the cached index
8056 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8058 assert(SvTYPE(invlist) == SVt_INVLIST);
8060 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8063 PERL_STATIC_INLINE IV
8064 S_invlist_previous_index(SV* const invlist)
8066 /* Returns cached index of previous search */
8068 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8070 return *get_invlist_previous_index_addr(invlist);
8073 PERL_STATIC_INLINE void
8074 S_invlist_set_previous_index(SV* const invlist, const IV index)
8076 /* Caches <index> for later retrieval */
8078 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8080 assert(index == 0 || index < (int) _invlist_len(invlist));
8082 *get_invlist_previous_index_addr(invlist) = index;
8085 PERL_STATIC_INLINE void
8086 S_invlist_trim(SV* const invlist)
8088 PERL_ARGS_ASSERT_INVLIST_TRIM;
8090 assert(SvTYPE(invlist) == SVt_INVLIST);
8092 /* Change the length of the inversion list to how many entries it currently
8094 SvPV_shrink_to_cur((SV *) invlist);
8097 PERL_STATIC_INLINE bool
8098 S_invlist_is_iterating(SV* const invlist)
8100 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8102 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8105 #endif /* ifndef PERL_IN_XSUB_RE */
8107 PERL_STATIC_INLINE UV
8108 S_invlist_max(SV* const invlist)
8110 /* Returns the maximum number of elements storable in the inversion list's
8111 * array, without having to realloc() */
8113 PERL_ARGS_ASSERT_INVLIST_MAX;
8115 assert(SvTYPE(invlist) == SVt_INVLIST);
8117 /* Assumes worst case, in which the 0 element is not counted in the
8118 * inversion list, so subtracts 1 for that */
8119 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8120 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8121 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8124 #ifndef PERL_IN_XSUB_RE
8126 Perl__new_invlist(pTHX_ IV initial_size)
8129 /* Return a pointer to a newly constructed inversion list, with enough
8130 * space to store 'initial_size' elements. If that number is negative, a
8131 * system default is used instead */
8135 if (initial_size < 0) {
8139 /* Allocate the initial space */
8140 new_list = newSV_type(SVt_INVLIST);
8142 /* First 1 is in case the zero element isn't in the list; second 1 is for
8144 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8145 invlist_set_len(new_list, 0, 0);
8147 /* Force iterinit() to be used to get iteration to work */
8148 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8150 *get_invlist_previous_index_addr(new_list) = 0;
8156 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8158 /* Return a pointer to a newly constructed inversion list, initialized to
8159 * point to <list>, which has to be in the exact correct inversion list
8160 * form, including internal fields. Thus this is a dangerous routine that
8161 * should not be used in the wrong hands. The passed in 'list' contains
8162 * several header fields at the beginning that are not part of the
8163 * inversion list body proper */
8165 const STRLEN length = (STRLEN) list[0];
8166 const UV version_id = list[1];
8167 const bool offset = cBOOL(list[2]);
8168 #define HEADER_LENGTH 3
8169 /* If any of the above changes in any way, you must change HEADER_LENGTH
8170 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8171 * perl -E 'say int(rand 2**31-1)'
8173 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8174 data structure type, so that one being
8175 passed in can be validated to be an
8176 inversion list of the correct vintage.
8179 SV* invlist = newSV_type(SVt_INVLIST);
8181 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8183 if (version_id != INVLIST_VERSION_ID) {
8184 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8187 /* The generated array passed in includes header elements that aren't part
8188 * of the list proper, so start it just after them */
8189 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8191 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8192 shouldn't touch it */
8194 *(get_invlist_offset_addr(invlist)) = offset;
8196 /* The 'length' passed to us is the physical number of elements in the
8197 * inversion list. But if there is an offset the logical number is one
8199 invlist_set_len(invlist, length - offset, offset);
8201 invlist_set_previous_index(invlist, 0);
8203 /* Initialize the iteration pointer. */
8204 invlist_iterfinish(invlist);
8206 SvREADONLY_on(invlist);
8210 #endif /* ifndef PERL_IN_XSUB_RE */
8213 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8215 /* Grow the maximum size of an inversion list */
8217 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8219 assert(SvTYPE(invlist) == SVt_INVLIST);
8221 /* Add one to account for the zero element at the beginning which may not
8222 * be counted by the calling parameters */
8223 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8227 S__append_range_to_invlist(pTHX_ SV* const invlist,
8228 const UV start, const UV end)
8230 /* Subject to change or removal. Append the range from 'start' to 'end' at
8231 * the end of the inversion list. The range must be above any existing
8235 UV max = invlist_max(invlist);
8236 UV len = _invlist_len(invlist);
8239 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8241 if (len == 0) { /* Empty lists must be initialized */
8242 offset = start != 0;
8243 array = _invlist_array_init(invlist, ! offset);
8246 /* Here, the existing list is non-empty. The current max entry in the
8247 * list is generally the first value not in the set, except when the
8248 * set extends to the end of permissible values, in which case it is
8249 * the first entry in that final set, and so this call is an attempt to
8250 * append out-of-order */
8252 UV final_element = len - 1;
8253 array = invlist_array(invlist);
8254 if (array[final_element] > start
8255 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8257 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8258 array[final_element], start,
8259 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8262 /* Here, it is a legal append. If the new range begins with the first
8263 * value not in the set, it is extending the set, so the new first
8264 * value not in the set is one greater than the newly extended range.
8266 offset = *get_invlist_offset_addr(invlist);
8267 if (array[final_element] == start) {
8268 if (end != UV_MAX) {
8269 array[final_element] = end + 1;
8272 /* But if the end is the maximum representable on the machine,
8273 * just let the range that this would extend to have no end */
8274 invlist_set_len(invlist, len - 1, offset);
8280 /* Here the new range doesn't extend any existing set. Add it */
8282 len += 2; /* Includes an element each for the start and end of range */
8284 /* If wll overflow the existing space, extend, which may cause the array to
8287 invlist_extend(invlist, len);
8289 /* Have to set len here to avoid assert failure in invlist_array() */
8290 invlist_set_len(invlist, len, offset);
8292 array = invlist_array(invlist);
8295 invlist_set_len(invlist, len, offset);
8298 /* The next item on the list starts the range, the one after that is
8299 * one past the new range. */
8300 array[len - 2] = start;
8301 if (end != UV_MAX) {
8302 array[len - 1] = end + 1;
8305 /* But if the end is the maximum representable on the machine, just let
8306 * the range have no end */
8307 invlist_set_len(invlist, len - 1, offset);
8311 #ifndef PERL_IN_XSUB_RE
8314 Perl__invlist_search(SV* const invlist, const UV cp)
8316 /* Searches the inversion list for the entry that contains the input code
8317 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8318 * return value is the index into the list's array of the range that
8323 IV high = _invlist_len(invlist);
8324 const IV highest_element = high - 1;
8327 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8329 /* If list is empty, return failure. */
8334 /* (We can't get the array unless we know the list is non-empty) */
8335 array = invlist_array(invlist);
8337 mid = invlist_previous_index(invlist);
8338 assert(mid >=0 && mid <= highest_element);
8340 /* <mid> contains the cache of the result of the previous call to this
8341 * function (0 the first time). See if this call is for the same result,
8342 * or if it is for mid-1. This is under the theory that calls to this
8343 * function will often be for related code points that are near each other.
8344 * And benchmarks show that caching gives better results. We also test
8345 * here if the code point is within the bounds of the list. These tests
8346 * replace others that would have had to be made anyway to make sure that
8347 * the array bounds were not exceeded, and these give us extra information
8348 * at the same time */
8349 if (cp >= array[mid]) {
8350 if (cp >= array[highest_element]) {
8351 return highest_element;
8354 /* Here, array[mid] <= cp < array[highest_element]. This means that
8355 * the final element is not the answer, so can exclude it; it also
8356 * means that <mid> is not the final element, so can refer to 'mid + 1'
8358 if (cp < array[mid + 1]) {
8364 else { /* cp < aray[mid] */
8365 if (cp < array[0]) { /* Fail if outside the array */
8369 if (cp >= array[mid - 1]) {
8374 /* Binary search. What we are looking for is <i> such that
8375 * array[i] <= cp < array[i+1]
8376 * The loop below converges on the i+1. Note that there may not be an
8377 * (i+1)th element in the array, and things work nonetheless */
8378 while (low < high) {
8379 mid = (low + high) / 2;
8380 assert(mid <= highest_element);
8381 if (array[mid] <= cp) { /* cp >= array[mid] */
8384 /* We could do this extra test to exit the loop early.
8385 if (cp < array[low]) {
8390 else { /* cp < array[mid] */
8397 invlist_set_previous_index(invlist, high);
8402 Perl__invlist_populate_swatch(SV* const invlist,
8403 const UV start, const UV end, U8* swatch)
8405 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8406 * but is used when the swash has an inversion list. This makes this much
8407 * faster, as it uses a binary search instead of a linear one. This is
8408 * intimately tied to that function, and perhaps should be in utf8.c,
8409 * except it is intimately tied to inversion lists as well. It assumes
8410 * that <swatch> is all 0's on input */
8413 const IV len = _invlist_len(invlist);
8417 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8419 if (len == 0) { /* Empty inversion list */
8423 array = invlist_array(invlist);
8425 /* Find which element it is */
8426 i = _invlist_search(invlist, start);
8428 /* We populate from <start> to <end> */
8429 while (current < end) {
8432 /* The inversion list gives the results for every possible code point
8433 * after the first one in the list. Only those ranges whose index is
8434 * even are ones that the inversion list matches. For the odd ones,
8435 * and if the initial code point is not in the list, we have to skip
8436 * forward to the next element */
8437 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8439 if (i >= len) { /* Finished if beyond the end of the array */
8443 if (current >= end) { /* Finished if beyond the end of what we
8445 if (LIKELY(end < UV_MAX)) {
8449 /* We get here when the upper bound is the maximum
8450 * representable on the machine, and we are looking for just
8451 * that code point. Have to special case it */
8453 goto join_end_of_list;
8456 assert(current >= start);
8458 /* The current range ends one below the next one, except don't go past
8461 upper = (i < len && array[i] < end) ? array[i] : end;
8463 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8464 * for each code point in it */
8465 for (; current < upper; current++) {
8466 const STRLEN offset = (STRLEN)(current - start);
8467 swatch[offset >> 3] |= 1 << (offset & 7);
8472 /* Quit if at the end of the list */
8475 /* But first, have to deal with the highest possible code point on
8476 * the platform. The previous code assumes that <end> is one
8477 * beyond where we want to populate, but that is impossible at the
8478 * platform's infinity, so have to handle it specially */
8479 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8481 const STRLEN offset = (STRLEN)(end - start);
8482 swatch[offset >> 3] |= 1 << (offset & 7);
8487 /* Advance to the next range, which will be for code points not in the
8496 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8497 const bool complement_b, SV** output)
8499 /* Take the union of two inversion lists and point <output> to it. *output
8500 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8501 * the reference count to that list will be decremented if not already a
8502 * temporary (mortal); otherwise *output will be made correspondingly
8503 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8504 * second list is returned. If <complement_b> is TRUE, the union is taken
8505 * of the complement (inversion) of <b> instead of b itself.
8507 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8508 * Richard Gillam, published by Addison-Wesley, and explained at some
8509 * length there. The preface says to incorporate its examples into your
8510 * code at your own risk.
8512 * The algorithm is like a merge sort.
8514 * XXX A potential performance improvement is to keep track as we go along
8515 * if only one of the inputs contributes to the result, meaning the other
8516 * is a subset of that one. In that case, we can skip the final copy and
8517 * return the larger of the input lists, but then outside code might need
8518 * to keep track of whether to free the input list or not */
8520 const UV* array_a; /* a's array */
8522 UV len_a; /* length of a's array */
8525 SV* u; /* the resulting union */
8529 UV i_a = 0; /* current index into a's array */
8533 /* running count, as explained in the algorithm source book; items are
8534 * stopped accumulating and are output when the count changes to/from 0.
8535 * The count is incremented when we start a range that's in the set, and
8536 * decremented when we start a range that's not in the set. So its range
8537 * is 0 to 2. Only when the count is zero is something not in the set.
8541 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8544 /* If either one is empty, the union is the other one */
8545 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8546 bool make_temp = FALSE; /* Should we mortalize the result? */
8550 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8556 *output = invlist_clone(b);
8558 _invlist_invert(*output);
8560 } /* else *output already = b; */
8563 sv_2mortal(*output);
8567 else if ((len_b = _invlist_len(b)) == 0) {
8568 bool make_temp = FALSE;
8570 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8575 /* The complement of an empty list is a list that has everything in it,
8576 * so the union with <a> includes everything too */
8579 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8583 *output = _new_invlist(1);
8584 _append_range_to_invlist(*output, 0, UV_MAX);
8586 else if (*output != a) {
8587 *output = invlist_clone(a);
8589 /* else *output already = a; */
8592 sv_2mortal(*output);
8597 /* Here both lists exist and are non-empty */
8598 array_a = invlist_array(a);
8599 array_b = invlist_array(b);
8601 /* If are to take the union of 'a' with the complement of b, set it
8602 * up so are looking at b's complement. */
8605 /* To complement, we invert: if the first element is 0, remove it. To
8606 * do this, we just pretend the array starts one later */
8607 if (array_b[0] == 0) {
8613 /* But if the first element is not zero, we pretend the list starts
8614 * at the 0 that is always stored immediately before the array. */
8620 /* Size the union for the worst case: that the sets are completely
8622 u = _new_invlist(len_a + len_b);
8624 /* Will contain U+0000 if either component does */
8625 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8626 || (len_b > 0 && array_b[0] == 0));
8628 /* Go through each list item by item, stopping when exhausted one of
8630 while (i_a < len_a && i_b < len_b) {
8631 UV cp; /* The element to potentially add to the union's array */
8632 bool cp_in_set; /* is it in the the input list's set or not */
8634 /* We need to take one or the other of the two inputs for the union.
8635 * Since we are merging two sorted lists, we take the smaller of the
8636 * next items. In case of a tie, we take the one that is in its set
8637 * first. If we took one not in the set first, it would decrement the
8638 * count, possibly to 0 which would cause it to be output as ending the
8639 * range, and the next time through we would take the same number, and
8640 * output it again as beginning the next range. By doing it the
8641 * opposite way, there is no possibility that the count will be
8642 * momentarily decremented to 0, and thus the two adjoining ranges will
8643 * be seamlessly merged. (In a tie and both are in the set or both not
8644 * in the set, it doesn't matter which we take first.) */
8645 if (array_a[i_a] < array_b[i_b]
8646 || (array_a[i_a] == array_b[i_b]
8647 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8649 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8653 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8654 cp = array_b[i_b++];
8657 /* Here, have chosen which of the two inputs to look at. Only output
8658 * if the running count changes to/from 0, which marks the
8659 * beginning/end of a range in that's in the set */
8662 array_u[i_u++] = cp;
8669 array_u[i_u++] = cp;
8674 /* Here, we are finished going through at least one of the lists, which
8675 * means there is something remaining in at most one. We check if the list
8676 * that hasn't been exhausted is positioned such that we are in the middle
8677 * of a range in its set or not. (i_a and i_b point to the element beyond
8678 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8679 * is potentially more to output.
8680 * There are four cases:
8681 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8682 * in the union is entirely from the non-exhausted set.
8683 * 2) Both were in their sets, count is 2. Nothing further should
8684 * be output, as everything that remains will be in the exhausted
8685 * list's set, hence in the union; decrementing to 1 but not 0 insures
8687 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8688 * Nothing further should be output because the union includes
8689 * everything from the exhausted set. Not decrementing ensures that.
8690 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8691 * decrementing to 0 insures that we look at the remainder of the
8692 * non-exhausted set */
8693 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8694 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8699 /* The final length is what we've output so far, plus what else is about to
8700 * be output. (If 'count' is non-zero, then the input list we exhausted
8701 * has everything remaining up to the machine's limit in its set, and hence
8702 * in the union, so there will be no further output. */
8705 /* At most one of the subexpressions will be non-zero */
8706 len_u += (len_a - i_a) + (len_b - i_b);
8709 /* Set result to final length, which can change the pointer to array_u, so
8711 if (len_u != _invlist_len(u)) {
8712 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8714 array_u = invlist_array(u);
8717 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8718 * the other) ended with everything above it not in its set. That means
8719 * that the remaining part of the union is precisely the same as the
8720 * non-exhausted list, so can just copy it unchanged. (If both list were
8721 * exhausted at the same time, then the operations below will be both 0.)
8724 IV copy_count; /* At most one will have a non-zero copy count */
8725 if ((copy_count = len_a - i_a) > 0) {
8726 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8728 else if ((copy_count = len_b - i_b) > 0) {
8729 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8733 /* We may be removing a reference to one of the inputs. If so, the output
8734 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8735 * count decremented) */
8736 if (a == *output || b == *output) {
8737 assert(! invlist_is_iterating(*output));
8738 if ((SvTEMP(*output))) {
8742 SvREFCNT_dec_NN(*output);
8752 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8753 const bool complement_b, SV** i)
8755 /* Take the intersection of two inversion lists and point <i> to it. *i
8756 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8757 * the reference count to that list will be decremented if not already a
8758 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8759 * The first list, <a>, may be NULL, in which case an empty list is
8760 * returned. If <complement_b> is TRUE, the result will be the
8761 * intersection of <a> and the complement (or inversion) of <b> instead of
8764 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8765 * Richard Gillam, published by Addison-Wesley, and explained at some
8766 * length there. The preface says to incorporate its examples into your
8767 * code at your own risk. In fact, it had bugs
8769 * The algorithm is like a merge sort, and is essentially the same as the
8773 const UV* array_a; /* a's array */
8775 UV len_a; /* length of a's array */
8778 SV* r; /* the resulting intersection */
8782 UV i_a = 0; /* current index into a's array */
8786 /* running count, as explained in the algorithm source book; items are
8787 * stopped accumulating and are output when the count changes to/from 2.
8788 * The count is incremented when we start a range that's in the set, and
8789 * decremented when we start a range that's not in the set. So its range
8790 * is 0 to 2. Only when the count is 2 is something in the intersection.
8794 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8797 /* Special case if either one is empty */
8798 len_a = (a == NULL) ? 0 : _invlist_len(a);
8799 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8800 bool make_temp = FALSE;
8802 if (len_a != 0 && complement_b) {
8804 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8805 * be empty. Here, also we are using 'b's complement, which hence
8806 * must be every possible code point. Thus the intersection is
8810 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8815 *i = invlist_clone(a);
8817 /* else *i is already 'a' */
8825 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8826 * intersection must be empty */
8828 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8833 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8837 *i = _new_invlist(0);
8845 /* Here both lists exist and are non-empty */
8846 array_a = invlist_array(a);
8847 array_b = invlist_array(b);
8849 /* If are to take the intersection of 'a' with the complement of b, set it
8850 * up so are looking at b's complement. */
8853 /* To complement, we invert: if the first element is 0, remove it. To
8854 * do this, we just pretend the array starts one later */
8855 if (array_b[0] == 0) {
8861 /* But if the first element is not zero, we pretend the list starts
8862 * at the 0 that is always stored immediately before the array. */
8868 /* Size the intersection for the worst case: that the intersection ends up
8869 * fragmenting everything to be completely disjoint */
8870 r= _new_invlist(len_a + len_b);
8872 /* Will contain U+0000 iff both components do */
8873 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8874 && len_b > 0 && array_b[0] == 0);
8876 /* Go through each list item by item, stopping when exhausted one of
8878 while (i_a < len_a && i_b < len_b) {
8879 UV cp; /* The element to potentially add to the intersection's
8881 bool cp_in_set; /* Is it in the input list's set or not */
8883 /* We need to take one or the other of the two inputs for the
8884 * intersection. Since we are merging two sorted lists, we take the
8885 * smaller of the next items. In case of a tie, we take the one that
8886 * is not in its set first (a difference from the union algorithm). If
8887 * we took one in the set first, it would increment the count, possibly
8888 * to 2 which would cause it to be output as starting a range in the
8889 * intersection, and the next time through we would take that same
8890 * number, and output it again as ending the set. By doing it the
8891 * opposite of this, there is no possibility that the count will be
8892 * momentarily incremented to 2. (In a tie and both are in the set or
8893 * both not in the set, it doesn't matter which we take first.) */
8894 if (array_a[i_a] < array_b[i_b]
8895 || (array_a[i_a] == array_b[i_b]
8896 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8898 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8902 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8906 /* Here, have chosen which of the two inputs to look at. Only output
8907 * if the running count changes to/from 2, which marks the
8908 * beginning/end of a range that's in the intersection */
8912 array_r[i_r++] = cp;
8917 array_r[i_r++] = cp;
8923 /* Here, we are finished going through at least one of the lists, which
8924 * means there is something remaining in at most one. We check if the list
8925 * that has been exhausted is positioned such that we are in the middle
8926 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8927 * the ones we care about.) There are four cases:
8928 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8929 * nothing left in the intersection.
8930 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8931 * above 2. What should be output is exactly that which is in the
8932 * non-exhausted set, as everything it has is also in the intersection
8933 * set, and everything it doesn't have can't be in the intersection
8934 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8935 * gets incremented to 2. Like the previous case, the intersection is
8936 * everything that remains in the non-exhausted set.
8937 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8938 * remains 1. And the intersection has nothing more. */
8939 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8940 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8945 /* The final length is what we've output so far plus what else is in the
8946 * intersection. At most one of the subexpressions below will be non-zero
8950 len_r += (len_a - i_a) + (len_b - i_b);
8953 /* Set result to final length, which can change the pointer to array_r, so
8955 if (len_r != _invlist_len(r)) {
8956 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8958 array_r = invlist_array(r);
8961 /* Finish outputting any remaining */
8962 if (count >= 2) { /* At most one will have a non-zero copy count */
8964 if ((copy_count = len_a - i_a) > 0) {
8965 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8967 else if ((copy_count = len_b - i_b) > 0) {
8968 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8972 /* We may be removing a reference to one of the inputs. If so, the output
8973 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8974 * count decremented) */
8975 if (a == *i || b == *i) {
8976 assert(! invlist_is_iterating(*i));
8981 SvREFCNT_dec_NN(*i);
8991 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8993 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8994 * set. A pointer to the inversion list is returned. This may actually be
8995 * a new list, in which case the passed in one has been destroyed. The
8996 * passed-in inversion list can be NULL, in which case a new one is created
8997 * with just the one range in it */
9002 if (invlist == NULL) {
9003 invlist = _new_invlist(2);
9007 len = _invlist_len(invlist);
9010 /* If comes after the final entry actually in the list, can just append it
9013 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9014 && start >= invlist_array(invlist)[len - 1]))
9016 _append_range_to_invlist(invlist, start, end);
9020 /* Here, can't just append things, create and return a new inversion list
9021 * which is the union of this range and the existing inversion list */
9022 range_invlist = _new_invlist(2);
9023 _append_range_to_invlist(range_invlist, start, end);
9025 _invlist_union(invlist, range_invlist, &invlist);
9027 /* The temporary can be freed */
9028 SvREFCNT_dec_NN(range_invlist);
9034 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9035 UV** other_elements_ptr)
9037 /* Create and return an inversion list whose contents are to be populated
9038 * by the caller. The caller gives the number of elements (in 'size') and
9039 * the very first element ('element0'). This function will set
9040 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9043 * Obviously there is some trust involved that the caller will properly
9044 * fill in the other elements of the array.
9046 * (The first element needs to be passed in, as the underlying code does
9047 * things differently depending on whether it is zero or non-zero) */
9049 SV* invlist = _new_invlist(size);
9052 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9054 _append_range_to_invlist(invlist, element0, element0);
9055 offset = *get_invlist_offset_addr(invlist);
9057 invlist_set_len(invlist, size, offset);
9058 *other_elements_ptr = invlist_array(invlist) + 1;
9064 PERL_STATIC_INLINE SV*
9065 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9066 return _add_range_to_invlist(invlist, cp, cp);
9069 #ifndef PERL_IN_XSUB_RE
9071 Perl__invlist_invert(pTHX_ SV* const invlist)
9073 /* Complement the input inversion list. This adds a 0 if the list didn't
9074 * have a zero; removes it otherwise. As described above, the data
9075 * structure is set up so that this is very efficient */
9077 PERL_ARGS_ASSERT__INVLIST_INVERT;
9079 assert(! invlist_is_iterating(invlist));
9081 /* The inverse of matching nothing is matching everything */
9082 if (_invlist_len(invlist) == 0) {
9083 _append_range_to_invlist(invlist, 0, UV_MAX);
9087 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9092 PERL_STATIC_INLINE SV*
9093 S_invlist_clone(pTHX_ SV* const invlist)
9096 /* Return a new inversion list that is a copy of the input one, which is
9097 * unchanged. The new list will not be mortal even if the old one was. */
9099 /* Need to allocate extra space to accommodate Perl's addition of a
9100 * trailing NUL to SvPV's, since it thinks they are always strings */
9101 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9102 STRLEN physical_length = SvCUR(invlist);
9103 bool offset = *(get_invlist_offset_addr(invlist));
9105 PERL_ARGS_ASSERT_INVLIST_CLONE;
9107 *(get_invlist_offset_addr(new_invlist)) = offset;
9108 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9109 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9114 PERL_STATIC_INLINE STRLEN*
9115 S_get_invlist_iter_addr(SV* invlist)
9117 /* Return the address of the UV that contains the current iteration
9120 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9122 assert(SvTYPE(invlist) == SVt_INVLIST);
9124 return &(((XINVLIST*) SvANY(invlist))->iterator);
9127 PERL_STATIC_INLINE void
9128 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9130 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9132 *get_invlist_iter_addr(invlist) = 0;
9135 PERL_STATIC_INLINE void
9136 S_invlist_iterfinish(SV* invlist)
9138 /* Terminate iterator for invlist. This is to catch development errors.
9139 * Any iteration that is interrupted before completed should call this
9140 * function. Functions that add code points anywhere else but to the end
9141 * of an inversion list assert that they are not in the middle of an
9142 * iteration. If they were, the addition would make the iteration
9143 * problematical: if the iteration hadn't reached the place where things
9144 * were being added, it would be ok */
9146 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9148 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9152 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9154 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9155 * This call sets in <*start> and <*end>, the next range in <invlist>.
9156 * Returns <TRUE> if successful and the next call will return the next
9157 * range; <FALSE> if was already at the end of the list. If the latter,
9158 * <*start> and <*end> are unchanged, and the next call to this function
9159 * will start over at the beginning of the list */
9161 STRLEN* pos = get_invlist_iter_addr(invlist);
9162 UV len = _invlist_len(invlist);
9165 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9168 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9172 array = invlist_array(invlist);
9174 *start = array[(*pos)++];
9180 *end = array[(*pos)++] - 1;
9186 PERL_STATIC_INLINE UV
9187 S_invlist_highest(SV* const invlist)
9189 /* Returns the highest code point that matches an inversion list. This API
9190 * has an ambiguity, as it returns 0 under either the highest is actually
9191 * 0, or if the list is empty. If this distinction matters to you, check
9192 * for emptiness before calling this function */
9194 UV len = _invlist_len(invlist);
9197 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9203 array = invlist_array(invlist);
9205 /* The last element in the array in the inversion list always starts a
9206 * range that goes to infinity. That range may be for code points that are
9207 * matched in the inversion list, or it may be for ones that aren't
9208 * matched. In the latter case, the highest code point in the set is one
9209 * less than the beginning of this range; otherwise it is the final element
9210 * of this range: infinity */
9211 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9213 : array[len - 1] - 1;
9216 #ifndef PERL_IN_XSUB_RE
9218 Perl__invlist_contents(pTHX_ SV* const invlist)
9220 /* Get the contents of an inversion list into a string SV so that they can
9221 * be printed out. It uses the format traditionally done for debug tracing
9225 SV* output = newSVpvs("\n");
9227 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9229 assert(! invlist_is_iterating(invlist));
9231 invlist_iterinit(invlist);
9232 while (invlist_iternext(invlist, &start, &end)) {
9233 if (end == UV_MAX) {
9234 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9236 else if (end != start) {
9237 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9241 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9249 #ifndef PERL_IN_XSUB_RE
9251 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9252 const char * const indent, SV* const invlist)
9254 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9255 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9256 * the string 'indent'. The output looks like this:
9257 [0] 0x000A .. 0x000D
9259 [4] 0x2028 .. 0x2029
9260 [6] 0x3104 .. INFINITY
9261 * This means that the first range of code points matched by the list are
9262 * 0xA through 0xD; the second range contains only the single code point
9263 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9264 * are used to define each range (except if the final range extends to
9265 * infinity, only a single element is needed). The array index of the
9266 * first element for the corresponding range is given in brackets. */
9271 PERL_ARGS_ASSERT__INVLIST_DUMP;
9273 if (invlist_is_iterating(invlist)) {
9274 Perl_dump_indent(aTHX_ level, file,
9275 "%sCan't dump inversion list because is in middle of iterating\n",
9280 invlist_iterinit(invlist);
9281 while (invlist_iternext(invlist, &start, &end)) {
9282 if (end == UV_MAX) {
9283 Perl_dump_indent(aTHX_ level, file,
9284 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9285 indent, (UV)count, start);
9287 else if (end != start) {
9288 Perl_dump_indent(aTHX_ level, file,
9289 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9290 indent, (UV)count, start, end);
9293 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9294 indent, (UV)count, start);
9301 Perl__load_PL_utf8_foldclosures (pTHX)
9303 assert(! PL_utf8_foldclosures);
9305 /* If the folds haven't been read in, call a fold function
9307 if (! PL_utf8_tofold) {
9308 U8 dummy[UTF8_MAXBYTES_CASE+1];
9310 /* This string is just a short named one above \xff */
9311 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9312 assert(PL_utf8_tofold); /* Verify that worked */
9314 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9318 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9320 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9322 /* Return a boolean as to if the two passed in inversion lists are
9323 * identical. The final argument, if TRUE, says to take the complement of
9324 * the second inversion list before doing the comparison */
9326 const UV* array_a = invlist_array(a);
9327 const UV* array_b = invlist_array(b);
9328 UV len_a = _invlist_len(a);
9329 UV len_b = _invlist_len(b);
9331 UV i = 0; /* current index into the arrays */
9332 bool retval = TRUE; /* Assume are identical until proven otherwise */
9334 PERL_ARGS_ASSERT__INVLISTEQ;
9336 /* If are to compare 'a' with the complement of b, set it
9337 * up so are looking at b's complement. */
9340 /* The complement of nothing is everything, so <a> would have to have
9341 * just one element, starting at zero (ending at infinity) */
9343 return (len_a == 1 && array_a[0] == 0);
9345 else if (array_b[0] == 0) {
9347 /* Otherwise, to complement, we invert. Here, the first element is
9348 * 0, just remove it. To do this, we just pretend the array starts
9356 /* But if the first element is not zero, we pretend the list starts
9357 * at the 0 that is always stored immediately before the array. */
9363 /* Make sure that the lengths are the same, as well as the final element
9364 * before looping through the remainder. (Thus we test the length, final,
9365 * and first elements right off the bat) */
9366 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9369 else for (i = 0; i < len_a - 1; i++) {
9370 if (array_a[i] != array_b[i]) {
9381 * As best we can, determine the characters that can match the start of
9382 * the given EXACTF-ish node.
9384 * Returns the invlist as a new SV*; it is the caller's responsibility to
9385 * call SvREFCNT_dec() when done with it.
9388 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9390 const U8 * s = (U8*)STRING(node);
9391 SSize_t bytelen = STR_LEN(node);
9393 /* Start out big enough for 2 separate code points */
9394 SV* invlist = _new_invlist(4);
9396 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9401 /* We punt and assume can match anything if the node begins
9402 * with a multi-character fold. Things are complicated. For
9403 * example, /ffi/i could match any of:
9404 * "\N{LATIN SMALL LIGATURE FFI}"
9405 * "\N{LATIN SMALL LIGATURE FF}I"
9406 * "F\N{LATIN SMALL LIGATURE FI}"
9407 * plus several other things; and making sure we have all the
9408 * possibilities is hard. */
9409 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9410 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9413 /* Any Latin1 range character can potentially match any
9414 * other depending on the locale */
9415 if (OP(node) == EXACTFL) {
9416 _invlist_union(invlist, PL_Latin1, &invlist);
9419 /* But otherwise, it matches at least itself. We can
9420 * quickly tell if it has a distinct fold, and if so,
9421 * it matches that as well */
9422 invlist = add_cp_to_invlist(invlist, uc);
9423 if (IS_IN_SOME_FOLD_L1(uc))
9424 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9427 /* Some characters match above-Latin1 ones under /i. This
9428 * is true of EXACTFL ones when the locale is UTF-8 */
9429 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9430 && (! isASCII(uc) || (OP(node) != EXACTFA
9431 && OP(node) != EXACTFA_NO_TRIE)))
9433 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9437 else { /* Pattern is UTF-8 */
9438 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9439 STRLEN foldlen = UTF8SKIP(s);
9440 const U8* e = s + bytelen;
9443 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9445 /* The only code points that aren't folded in a UTF EXACTFish
9446 * node are are the problematic ones in EXACTFL nodes */
9447 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9448 /* We need to check for the possibility that this EXACTFL
9449 * node begins with a multi-char fold. Therefore we fold
9450 * the first few characters of it so that we can make that
9455 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9457 *(d++) = (U8) toFOLD(*s);
9462 to_utf8_fold(s, d, &len);
9468 /* And set up so the code below that looks in this folded
9469 * buffer instead of the node's string */
9471 foldlen = UTF8SKIP(folded);
9475 /* When we reach here 's' points to the fold of the first
9476 * character(s) of the node; and 'e' points to far enough along
9477 * the folded string to be just past any possible multi-char
9478 * fold. 'foldlen' is the length in bytes of the first
9481 * Unlike the non-UTF-8 case, the macro for determining if a
9482 * string is a multi-char fold requires all the characters to
9483 * already be folded. This is because of all the complications
9484 * if not. Note that they are folded anyway, except in EXACTFL
9485 * nodes. Like the non-UTF case above, we punt if the node
9486 * begins with a multi-char fold */
9488 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9489 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9491 else { /* Single char fold */
9493 /* It matches all the things that fold to it, which are
9494 * found in PL_utf8_foldclosures (including itself) */
9495 invlist = add_cp_to_invlist(invlist, uc);
9496 if (! PL_utf8_foldclosures)
9497 _load_PL_utf8_foldclosures();
9498 if ((listp = hv_fetch(PL_utf8_foldclosures,
9499 (char *) s, foldlen, FALSE)))
9501 AV* list = (AV*) *listp;
9503 for (k = 0; k <= av_tindex(list); k++) {
9504 SV** c_p = av_fetch(list, k, FALSE);
9510 /* /aa doesn't allow folds between ASCII and non- */
9511 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9512 && isASCII(c) != isASCII(uc))
9517 invlist = add_cp_to_invlist(invlist, c);
9526 #undef HEADER_LENGTH
9527 #undef TO_INTERNAL_SIZE
9528 #undef FROM_INTERNAL_SIZE
9529 #undef INVLIST_VERSION_ID
9531 /* End of inversion list object */
9534 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9536 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9537 * constructs, and updates RExC_flags with them. On input, RExC_parse
9538 * should point to the first flag; it is updated on output to point to the
9539 * final ')' or ':'. There needs to be at least one flag, or this will
9542 /* for (?g), (?gc), and (?o) warnings; warning
9543 about (?c) will warn about (?g) -- japhy */
9545 #define WASTED_O 0x01
9546 #define WASTED_G 0x02
9547 #define WASTED_C 0x04
9548 #define WASTED_GC (WASTED_G|WASTED_C)
9549 I32 wastedflags = 0x00;
9550 U32 posflags = 0, negflags = 0;
9551 U32 *flagsp = &posflags;
9552 char has_charset_modifier = '\0';
9554 bool has_use_defaults = FALSE;
9555 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9556 int x_mod_count = 0;
9558 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9560 /* '^' as an initial flag sets certain defaults */
9561 if (UCHARAT(RExC_parse) == '^') {
9563 has_use_defaults = TRUE;
9564 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9565 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9566 ? REGEX_UNICODE_CHARSET
9567 : REGEX_DEPENDS_CHARSET);
9570 cs = get_regex_charset(RExC_flags);
9571 if (cs == REGEX_DEPENDS_CHARSET
9572 && (RExC_utf8 || RExC_uni_semantics))
9574 cs = REGEX_UNICODE_CHARSET;
9577 while (*RExC_parse) {
9578 /* && strchr("iogcmsx", *RExC_parse) */
9579 /* (?g), (?gc) and (?o) are useless here
9580 and must be globally applied -- japhy */
9581 switch (*RExC_parse) {
9583 /* Code for the imsxn flags */
9584 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9586 case LOCALE_PAT_MOD:
9587 if (has_charset_modifier) {
9588 goto excess_modifier;
9590 else if (flagsp == &negflags) {
9593 cs = REGEX_LOCALE_CHARSET;
9594 has_charset_modifier = LOCALE_PAT_MOD;
9596 case UNICODE_PAT_MOD:
9597 if (has_charset_modifier) {
9598 goto excess_modifier;
9600 else if (flagsp == &negflags) {
9603 cs = REGEX_UNICODE_CHARSET;
9604 has_charset_modifier = UNICODE_PAT_MOD;
9606 case ASCII_RESTRICT_PAT_MOD:
9607 if (flagsp == &negflags) {
9610 if (has_charset_modifier) {
9611 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9612 goto excess_modifier;
9614 /* Doubled modifier implies more restricted */
9615 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9618 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9620 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9622 case DEPENDS_PAT_MOD:
9623 if (has_use_defaults) {
9624 goto fail_modifiers;
9626 else if (flagsp == &negflags) {
9629 else if (has_charset_modifier) {
9630 goto excess_modifier;
9633 /* The dual charset means unicode semantics if the
9634 * pattern (or target, not known until runtime) are
9635 * utf8, or something in the pattern indicates unicode
9637 cs = (RExC_utf8 || RExC_uni_semantics)
9638 ? REGEX_UNICODE_CHARSET
9639 : REGEX_DEPENDS_CHARSET;
9640 has_charset_modifier = DEPENDS_PAT_MOD;
9644 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9645 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9647 else if (has_charset_modifier == *(RExC_parse - 1)) {
9648 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9652 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9654 NOT_REACHED; /*NOTREACHED*/
9657 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9659 NOT_REACHED; /*NOTREACHED*/
9660 case ONCE_PAT_MOD: /* 'o' */
9661 case GLOBAL_PAT_MOD: /* 'g' */
9662 if (PASS2 && ckWARN(WARN_REGEXP)) {
9663 const I32 wflagbit = *RExC_parse == 'o'
9666 if (! (wastedflags & wflagbit) ) {
9667 wastedflags |= wflagbit;
9668 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9671 "Useless (%s%c) - %suse /%c modifier",
9672 flagsp == &negflags ? "?-" : "?",
9674 flagsp == &negflags ? "don't " : "",
9681 case CONTINUE_PAT_MOD: /* 'c' */
9682 if (PASS2 && ckWARN(WARN_REGEXP)) {
9683 if (! (wastedflags & WASTED_C) ) {
9684 wastedflags |= WASTED_GC;
9685 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9688 "Useless (%sc) - %suse /gc modifier",
9689 flagsp == &negflags ? "?-" : "?",
9690 flagsp == &negflags ? "don't " : ""
9695 case KEEPCOPY_PAT_MOD: /* 'p' */
9696 if (flagsp == &negflags) {
9698 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9700 *flagsp |= RXf_PMf_KEEPCOPY;
9704 /* A flag is a default iff it is following a minus, so
9705 * if there is a minus, it means will be trying to
9706 * re-specify a default which is an error */
9707 if (has_use_defaults || flagsp == &negflags) {
9708 goto fail_modifiers;
9711 wastedflags = 0; /* reset so (?g-c) warns twice */
9715 RExC_flags |= posflags;
9716 RExC_flags &= ~negflags;
9717 set_regex_charset(&RExC_flags, cs);
9718 if (RExC_flags & RXf_PMf_FOLD) {
9719 RExC_contains_i = 1;
9722 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9728 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9729 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9730 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9731 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9732 NOT_REACHED; /*NOTREACHED*/
9739 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9744 - reg - regular expression, i.e. main body or parenthesized thing
9746 * Caller must absorb opening parenthesis.
9748 * Combining parenthesis handling with the base level of regular expression
9749 * is a trifle forced, but the need to tie the tails of the branches to what
9750 * follows makes it hard to avoid.
9752 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9754 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9756 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9759 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9760 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9761 needs to be restarted.
9762 Otherwise would only return NULL if regbranch() returns NULL, which
9765 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9766 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9767 * 2 is like 1, but indicates that nextchar() has been called to advance
9768 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9769 * this flag alerts us to the need to check for that */
9771 regnode *ret; /* Will be the head of the group. */
9774 regnode *ender = NULL;
9777 U32 oregflags = RExC_flags;
9778 bool have_branch = 0;
9780 I32 freeze_paren = 0;
9781 I32 after_freeze = 0;
9782 I32 num; /* numeric backreferences */
9784 char * parse_start = RExC_parse; /* MJD */
9785 char * const oregcomp_parse = RExC_parse;
9787 GET_RE_DEBUG_FLAGS_DECL;
9789 PERL_ARGS_ASSERT_REG;
9790 DEBUG_PARSE("reg ");
9792 *flagp = 0; /* Tentatively. */
9795 /* Make an OPEN node, if parenthesized. */
9798 /* Under /x, space and comments can be gobbled up between the '(' and
9799 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9800 * intervening space, as the sequence is a token, and a token should be
9802 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9804 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9805 char *start_verb = RExC_parse;
9806 STRLEN verb_len = 0;
9807 char *start_arg = NULL;
9808 unsigned char op = 0;
9810 int internal_argval = 0; /* internal_argval is only useful if
9813 if (has_intervening_patws) {
9815 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9817 while ( *RExC_parse && *RExC_parse != ')' ) {
9818 if ( *RExC_parse == ':' ) {
9819 start_arg = RExC_parse + 1;
9825 verb_len = RExC_parse - start_verb;
9828 while ( *RExC_parse && *RExC_parse != ')' )
9830 if ( *RExC_parse != ')' )
9831 vFAIL("Unterminated verb pattern argument");
9832 if ( RExC_parse == start_arg )
9835 if ( *RExC_parse != ')' )
9836 vFAIL("Unterminated verb pattern");
9839 switch ( *start_verb ) {
9840 case 'A': /* (*ACCEPT) */
9841 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9843 internal_argval = RExC_nestroot;
9846 case 'C': /* (*COMMIT) */
9847 if ( memEQs(start_verb,verb_len,"COMMIT") )
9850 case 'F': /* (*FAIL) */
9851 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9856 case ':': /* (*:NAME) */
9857 case 'M': /* (*MARK:NAME) */
9858 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9863 case 'P': /* (*PRUNE) */
9864 if ( memEQs(start_verb,verb_len,"PRUNE") )
9867 case 'S': /* (*SKIP) */
9868 if ( memEQs(start_verb,verb_len,"SKIP") )
9871 case 'T': /* (*THEN) */
9872 /* [19:06] <TimToady> :: is then */
9873 if ( memEQs(start_verb,verb_len,"THEN") ) {
9875 RExC_seen |= REG_CUTGROUP_SEEN;
9880 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9882 "Unknown verb pattern '%"UTF8f"'",
9883 UTF8fARG(UTF, verb_len, start_verb));
9886 if ( start_arg && internal_argval ) {
9887 vFAIL3("Verb pattern '%.*s' may not have an argument",
9888 verb_len, start_verb);
9889 } else if ( argok < 0 && !start_arg ) {
9890 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9891 verb_len, start_verb);
9893 ret = reganode(pRExC_state, op, internal_argval);
9894 if ( ! internal_argval && ! SIZE_ONLY ) {
9896 SV *sv = newSVpvn( start_arg,
9897 RExC_parse - start_arg);
9898 ARG(ret) = add_data( pRExC_state,
9900 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9907 if (!internal_argval)
9908 RExC_seen |= REG_VERBARG_SEEN;
9909 } else if ( start_arg ) {
9910 vFAIL3("Verb pattern '%.*s' may not have an argument",
9911 verb_len, start_verb);
9913 ret = reg_node(pRExC_state, op);
9915 nextchar(pRExC_state);
9918 else if (*RExC_parse == '?') { /* (?...) */
9919 bool is_logical = 0;
9920 const char * const seqstart = RExC_parse;
9921 const char * endptr;
9922 if (has_intervening_patws) {
9924 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9928 paren = *RExC_parse++;
9929 ret = NULL; /* For look-ahead/behind. */
9932 case 'P': /* (?P...) variants for those used to PCRE/Python */
9933 paren = *RExC_parse++;
9934 if ( paren == '<') /* (?P<...>) named capture */
9936 else if (paren == '>') { /* (?P>name) named recursion */
9937 goto named_recursion;
9939 else if (paren == '=') { /* (?P=...) named backref */
9940 /* this pretty much dupes the code for \k<NAME> in
9941 * regatom(), if you change this make sure you change that
9943 char* name_start = RExC_parse;
9945 SV *sv_dat = reg_scan_name(pRExC_state,
9946 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9947 if (RExC_parse == name_start || *RExC_parse != ')')
9948 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9949 vFAIL2("Sequence %.3s... not terminated",parse_start);
9952 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9953 RExC_rxi->data->data[num]=(void*)sv_dat;
9954 SvREFCNT_inc_simple_void(sv_dat);
9957 ret = reganode(pRExC_state,
9960 : (ASCII_FOLD_RESTRICTED)
9962 : (AT_LEAST_UNI_SEMANTICS)
9970 Set_Node_Offset(ret, parse_start+1);
9971 Set_Node_Cur_Length(ret, parse_start);
9973 nextchar(pRExC_state);
9977 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9978 vFAIL3("Sequence (%.*s...) not recognized",
9979 RExC_parse-seqstart, seqstart);
9980 NOT_REACHED; /*NOTREACHED*/
9981 case '<': /* (?<...) */
9982 if (*RExC_parse == '!')
9984 else if (*RExC_parse != '=')
9990 case '\'': /* (?'...') */
9991 name_start= RExC_parse;
9992 svname = reg_scan_name(pRExC_state,
9993 SIZE_ONLY /* reverse test from the others */
9994 ? REG_RSN_RETURN_NAME
9995 : REG_RSN_RETURN_NULL);
9996 if (RExC_parse == name_start || *RExC_parse != paren)
9997 vFAIL2("Sequence (?%c... not terminated",
9998 paren=='>' ? '<' : paren);
10002 if (!svname) /* shouldn't happen */
10004 "panic: reg_scan_name returned NULL");
10005 if (!RExC_paren_names) {
10006 RExC_paren_names= newHV();
10007 sv_2mortal(MUTABLE_SV(RExC_paren_names));
10009 RExC_paren_name_list= newAV();
10010 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10013 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10015 sv_dat = HeVAL(he_str);
10017 /* croak baby croak */
10019 "panic: paren_name hash element allocation failed");
10020 } else if ( SvPOK(sv_dat) ) {
10021 /* (?|...) can mean we have dupes so scan to check
10022 its already been stored. Maybe a flag indicating
10023 we are inside such a construct would be useful,
10024 but the arrays are likely to be quite small, so
10025 for now we punt -- dmq */
10026 IV count = SvIV(sv_dat);
10027 I32 *pv = (I32*)SvPVX(sv_dat);
10029 for ( i = 0 ; i < count ; i++ ) {
10030 if ( pv[i] == RExC_npar ) {
10036 pv = (I32*)SvGROW(sv_dat,
10037 SvCUR(sv_dat) + sizeof(I32)+1);
10038 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10039 pv[count] = RExC_npar;
10040 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10043 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10044 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10047 SvIV_set(sv_dat, 1);
10050 /* Yes this does cause a memory leak in debugging Perls
10052 if (!av_store(RExC_paren_name_list,
10053 RExC_npar, SvREFCNT_inc(svname)))
10054 SvREFCNT_dec_NN(svname);
10057 /*sv_dump(sv_dat);*/
10059 nextchar(pRExC_state);
10061 goto capturing_parens;
10063 RExC_seen |= REG_LOOKBEHIND_SEEN;
10064 RExC_in_lookbehind++;
10067 case '=': /* (?=...) */
10068 RExC_seen_zerolen++;
10070 case '!': /* (?!...) */
10071 RExC_seen_zerolen++;
10072 /* check if we're really just a "FAIL" assertion */
10074 nextchar(pRExC_state);
10075 if (*RExC_parse == ')') {
10076 ret=reg_node(pRExC_state, OPFAIL);
10077 nextchar(pRExC_state);
10081 case '|': /* (?|...) */
10082 /* branch reset, behave like a (?:...) except that
10083 buffers in alternations share the same numbers */
10085 after_freeze = freeze_paren = RExC_npar;
10087 case ':': /* (?:...) */
10088 case '>': /* (?>...) */
10090 case '$': /* (?$...) */
10091 case '@': /* (?@...) */
10092 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10094 case '0' : /* (?0) */
10095 case 'R' : /* (?R) */
10096 if (*RExC_parse != ')')
10097 FAIL("Sequence (?R) not terminated");
10098 ret = reg_node(pRExC_state, GOSTART);
10099 RExC_seen |= REG_GOSTART_SEEN;
10100 *flagp |= POSTPONED;
10101 nextchar(pRExC_state);
10104 /* named and numeric backreferences */
10105 case '&': /* (?&NAME) */
10106 parse_start = RExC_parse - 1;
10109 SV *sv_dat = reg_scan_name(pRExC_state,
10110 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10111 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10113 if (RExC_parse == RExC_end || *RExC_parse != ')')
10114 vFAIL("Sequence (?&... not terminated");
10115 goto gen_recurse_regop;
10118 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10120 vFAIL("Illegal pattern");
10122 goto parse_recursion;
10124 case '-': /* (?-1) */
10125 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10126 RExC_parse--; /* rewind to let it be handled later */
10130 case '1': case '2': case '3': case '4': /* (?1) */
10131 case '5': case '6': case '7': case '8': case '9':
10135 bool is_neg = FALSE;
10136 parse_start = RExC_parse - 1; /* MJD */
10137 if (*RExC_parse == '-') {
10141 num = grok_atou(RExC_parse, &endptr);
10143 RExC_parse = (char*)endptr;
10145 /* Some limit for num? */
10149 if (*RExC_parse!=')')
10150 vFAIL("Expecting close bracket");
10153 if ( paren == '-' ) {
10155 Diagram of capture buffer numbering.
10156 Top line is the normal capture buffer numbers
10157 Bottom line is the negative indexing as from
10161 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10165 num = RExC_npar + num;
10168 vFAIL("Reference to nonexistent group");
10170 } else if ( paren == '+' ) {
10171 num = RExC_npar + num - 1;
10174 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10176 if (num > (I32)RExC_rx->nparens) {
10178 vFAIL("Reference to nonexistent group");
10180 RExC_recurse_count++;
10181 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10182 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10183 22, "| |", (int)(depth * 2 + 1), "",
10184 (UV)ARG(ret), (IV)ARG2L(ret)));
10186 RExC_seen |= REG_RECURSE_SEEN;
10187 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10188 Set_Node_Offset(ret, parse_start); /* MJD */
10190 *flagp |= POSTPONED;
10191 nextchar(pRExC_state);
10196 case '?': /* (??...) */
10198 if (*RExC_parse != '{') {
10200 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10202 "Sequence (%"UTF8f"...) not recognized",
10203 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10204 NOT_REACHED; /*NOTREACHED*/
10206 *flagp |= POSTPONED;
10207 paren = *RExC_parse++;
10209 case '{': /* (?{...}) */
10212 struct reg_code_block *cb;
10214 RExC_seen_zerolen++;
10216 if ( !pRExC_state->num_code_blocks
10217 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10218 || pRExC_state->code_blocks[pRExC_state->code_index].start
10219 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10222 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10223 FAIL("panic: Sequence (?{...}): no code block found\n");
10224 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10226 /* this is a pre-compiled code block (?{...}) */
10227 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10228 RExC_parse = RExC_start + cb->end;
10231 if (cb->src_regex) {
10232 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10233 RExC_rxi->data->data[n] =
10234 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10235 RExC_rxi->data->data[n+1] = (void*)o;
10238 n = add_data(pRExC_state,
10239 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10240 RExC_rxi->data->data[n] = (void*)o;
10243 pRExC_state->code_index++;
10244 nextchar(pRExC_state);
10248 ret = reg_node(pRExC_state, LOGICAL);
10250 eval = reg2Lanode(pRExC_state, EVAL,
10253 /* for later propagation into (??{})
10255 RExC_flags & RXf_PMf_COMPILETIME
10260 REGTAIL(pRExC_state, ret, eval);
10261 /* deal with the length of this later - MJD */
10264 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10265 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10266 Set_Node_Offset(ret, parse_start);
10269 case '(': /* (?(?{...})...) and (?(?=...)...) */
10272 const int DEFINE_len = sizeof("DEFINE") - 1;
10273 if (RExC_parse[0] == '?') { /* (?(?...)) */
10274 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10275 || RExC_parse[1] == '<'
10276 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10280 ret = reg_node(pRExC_state, LOGICAL);
10284 tail = reg(pRExC_state, 1, &flag, depth+1);
10285 if (flag & RESTART_UTF8) {
10286 *flagp = RESTART_UTF8;
10289 REGTAIL(pRExC_state, ret, tail);
10292 /* Fall through to ‘Unknown switch condition’ at the
10293 end of the if/else chain. */
10295 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10296 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10298 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10299 char *name_start= RExC_parse++;
10301 SV *sv_dat=reg_scan_name(pRExC_state,
10302 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10303 if (RExC_parse == name_start || *RExC_parse != ch)
10304 vFAIL2("Sequence (?(%c... not terminated",
10305 (ch == '>' ? '<' : ch));
10308 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10309 RExC_rxi->data->data[num]=(void*)sv_dat;
10310 SvREFCNT_inc_simple_void(sv_dat);
10312 ret = reganode(pRExC_state,NGROUPP,num);
10313 goto insert_if_check_paren;
10315 else if (strnEQ(RExC_parse, "DEFINE",
10316 MIN(DEFINE_len, RExC_end - RExC_parse)))
10318 ret = reganode(pRExC_state,DEFINEP,0);
10319 RExC_parse += DEFINE_len;
10321 goto insert_if_check_paren;
10323 else if (RExC_parse[0] == 'R') {
10326 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10327 parno = grok_atou(RExC_parse, &endptr);
10329 RExC_parse = (char*)endptr;
10330 } else if (RExC_parse[0] == '&') {
10333 sv_dat = reg_scan_name(pRExC_state,
10335 ? REG_RSN_RETURN_NULL
10336 : REG_RSN_RETURN_DATA);
10337 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10339 ret = reganode(pRExC_state,INSUBP,parno);
10340 goto insert_if_check_paren;
10342 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10346 parno = grok_atou(RExC_parse, &endptr);
10348 RExC_parse = (char*)endptr;
10349 ret = reganode(pRExC_state, GROUPP, parno);
10351 insert_if_check_paren:
10352 if (*(tmp = nextchar(pRExC_state)) != ')') {
10353 /* nextchar also skips comments, so undo its work
10354 * and skip over the the next character.
10357 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10358 vFAIL("Switch condition not recognized");
10361 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10362 br = regbranch(pRExC_state, &flags, 1,depth+1);
10364 if (flags & RESTART_UTF8) {
10365 *flagp = RESTART_UTF8;
10368 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10371 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10373 c = *nextchar(pRExC_state);
10374 if (flags&HASWIDTH)
10375 *flagp |= HASWIDTH;
10378 vFAIL("(?(DEFINE)....) does not allow branches");
10380 /* Fake one for optimizer. */
10381 lastbr = reganode(pRExC_state, IFTHEN, 0);
10383 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10384 if (flags & RESTART_UTF8) {
10385 *flagp = RESTART_UTF8;
10388 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10391 REGTAIL(pRExC_state, ret, lastbr);
10392 if (flags&HASWIDTH)
10393 *flagp |= HASWIDTH;
10394 c = *nextchar(pRExC_state);
10399 if (RExC_parse>RExC_end)
10400 vFAIL("Switch (?(condition)... not terminated");
10402 vFAIL("Switch (?(condition)... contains too many branches");
10404 ender = reg_node(pRExC_state, TAIL);
10405 REGTAIL(pRExC_state, br, ender);
10407 REGTAIL(pRExC_state, lastbr, ender);
10408 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10411 REGTAIL(pRExC_state, ret, ender);
10412 RExC_size++; /* XXX WHY do we need this?!!
10413 For large programs it seems to be required
10414 but I can't figure out why. -- dmq*/
10417 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10418 vFAIL("Unknown switch condition (?(...))");
10420 case '[': /* (?[ ... ]) */
10421 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10424 RExC_parse--; /* for vFAIL to print correctly */
10425 vFAIL("Sequence (? incomplete");
10427 default: /* e.g., (?i) */
10430 parse_lparen_question_flags(pRExC_state);
10431 if (UCHARAT(RExC_parse) != ':') {
10432 nextchar(pRExC_state);
10437 nextchar(pRExC_state);
10442 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
10447 ret = reganode(pRExC_state, OPEN, parno);
10449 if (!RExC_nestroot)
10450 RExC_nestroot = parno;
10451 if (RExC_seen & REG_RECURSE_SEEN
10452 && !RExC_open_parens[parno-1])
10454 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10455 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10456 22, "| |", (int)(depth * 2 + 1), "",
10457 (IV)parno, REG_NODE_NUM(ret)));
10458 RExC_open_parens[parno-1]= ret;
10461 Set_Node_Length(ret, 1); /* MJD */
10462 Set_Node_Offset(ret, RExC_parse); /* MJD */
10472 /* Pick up the branches, linking them together. */
10473 parse_start = RExC_parse; /* MJD */
10474 br = regbranch(pRExC_state, &flags, 1,depth+1);
10476 /* branch_len = (paren != 0); */
10479 if (flags & RESTART_UTF8) {
10480 *flagp = RESTART_UTF8;
10483 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10485 if (*RExC_parse == '|') {
10486 if (!SIZE_ONLY && RExC_extralen) {
10487 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10490 reginsert(pRExC_state, BRANCH, br, depth+1);
10491 Set_Node_Length(br, paren != 0);
10492 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10496 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10498 else if (paren == ':') {
10499 *flagp |= flags&SIMPLE;
10501 if (is_open) { /* Starts with OPEN. */
10502 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10504 else if (paren != '?') /* Not Conditional */
10506 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10508 while (*RExC_parse == '|') {
10509 if (!SIZE_ONLY && RExC_extralen) {
10510 ender = reganode(pRExC_state, LONGJMP,0);
10512 /* Append to the previous. */
10513 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10516 RExC_extralen += 2; /* Account for LONGJMP. */
10517 nextchar(pRExC_state);
10518 if (freeze_paren) {
10519 if (RExC_npar > after_freeze)
10520 after_freeze = RExC_npar;
10521 RExC_npar = freeze_paren;
10523 br = regbranch(pRExC_state, &flags, 0, depth+1);
10526 if (flags & RESTART_UTF8) {
10527 *flagp = RESTART_UTF8;
10530 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10532 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10534 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10537 if (have_branch || paren != ':') {
10538 /* Make a closing node, and hook it on the end. */
10541 ender = reg_node(pRExC_state, TAIL);
10544 ender = reganode(pRExC_state, CLOSE, parno);
10545 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10546 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10547 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10548 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10549 RExC_close_parens[parno-1]= ender;
10550 if (RExC_nestroot == parno)
10553 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10554 Set_Node_Length(ender,1); /* MJD */
10560 *flagp &= ~HASWIDTH;
10563 ender = reg_node(pRExC_state, SUCCEED);
10566 ender = reg_node(pRExC_state, END);
10568 assert(!RExC_opend); /* there can only be one! */
10569 RExC_opend = ender;
10573 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10574 DEBUG_PARSE_MSG("lsbr");
10575 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10576 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10577 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10578 SvPV_nolen_const(RExC_mysv1),
10579 (IV)REG_NODE_NUM(lastbr),
10580 SvPV_nolen_const(RExC_mysv2),
10581 (IV)REG_NODE_NUM(ender),
10582 (IV)(ender - lastbr)
10585 REGTAIL(pRExC_state, lastbr, ender);
10587 if (have_branch && !SIZE_ONLY) {
10588 char is_nothing= 1;
10590 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10592 /* Hook the tails of the branches to the closing node. */
10593 for (br = ret; br; br = regnext(br)) {
10594 const U8 op = PL_regkind[OP(br)];
10595 if (op == BRANCH) {
10596 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10597 if ( OP(NEXTOPER(br)) != NOTHING
10598 || regnext(NEXTOPER(br)) != ender)
10601 else if (op == BRANCHJ) {
10602 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10603 /* for now we always disable this optimisation * /
10604 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10605 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10611 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10612 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10613 DEBUG_PARSE_MSG("NADA");
10614 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10615 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10616 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10617 SvPV_nolen_const(RExC_mysv1),
10618 (IV)REG_NODE_NUM(ret),
10619 SvPV_nolen_const(RExC_mysv2),
10620 (IV)REG_NODE_NUM(ender),
10625 if (OP(ender) == TAIL) {
10630 for ( opt= br + 1; opt < ender ; opt++ )
10631 OP(opt)= OPTIMIZED;
10632 NEXT_OFF(br)= ender - br;
10640 static const char parens[] = "=!<,>";
10642 if (paren && (p = strchr(parens, paren))) {
10643 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10644 int flag = (p - parens) > 1;
10647 node = SUSPEND, flag = 0;
10648 reginsert(pRExC_state, node,ret, depth+1);
10649 Set_Node_Cur_Length(ret, parse_start);
10650 Set_Node_Offset(ret, parse_start + 1);
10652 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10656 /* Check for proper termination. */
10658 /* restore original flags, but keep (?p) */
10659 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10660 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10661 RExC_parse = oregcomp_parse;
10662 vFAIL("Unmatched (");
10665 else if (!paren && RExC_parse < RExC_end) {
10666 if (*RExC_parse == ')') {
10668 vFAIL("Unmatched )");
10671 FAIL("Junk on end of regexp"); /* "Can't happen". */
10672 NOT_REACHED; /* NOTREACHED */
10675 if (RExC_in_lookbehind) {
10676 RExC_in_lookbehind--;
10678 if (after_freeze > RExC_npar)
10679 RExC_npar = after_freeze;
10684 - regbranch - one alternative of an | operator
10686 * Implements the concatenation operator.
10688 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10692 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10695 regnode *chain = NULL;
10697 I32 flags = 0, c = 0;
10698 GET_RE_DEBUG_FLAGS_DECL;
10700 PERL_ARGS_ASSERT_REGBRANCH;
10702 DEBUG_PARSE("brnc");
10707 if (!SIZE_ONLY && RExC_extralen)
10708 ret = reganode(pRExC_state, BRANCHJ,0);
10710 ret = reg_node(pRExC_state, BRANCH);
10711 Set_Node_Length(ret, 1);
10715 if (!first && SIZE_ONLY)
10716 RExC_extralen += 1; /* BRANCHJ */
10718 *flagp = WORST; /* Tentatively. */
10721 nextchar(pRExC_state);
10722 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10723 flags &= ~TRYAGAIN;
10724 latest = regpiece(pRExC_state, &flags,depth+1);
10725 if (latest == NULL) {
10726 if (flags & TRYAGAIN)
10728 if (flags & RESTART_UTF8) {
10729 *flagp = RESTART_UTF8;
10732 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10734 else if (ret == NULL)
10736 *flagp |= flags&(HASWIDTH|POSTPONED);
10737 if (chain == NULL) /* First piece. */
10738 *flagp |= flags&SPSTART;
10740 /* FIXME adding one for every branch after the first is probably
10741 * excessive now we have TRIE support. (hv) */
10743 REGTAIL(pRExC_state, chain, latest);
10748 if (chain == NULL) { /* Loop ran zero times. */
10749 chain = reg_node(pRExC_state, NOTHING);
10754 *flagp |= flags&SIMPLE;
10761 - regpiece - something followed by possible [*+?]
10763 * Note that the branching code sequences used for ? and the general cases
10764 * of * and + are somewhat optimized: they use the same NOTHING node as
10765 * both the endmarker for their branch list and the body of the last branch.
10766 * It might seem that this node could be dispensed with entirely, but the
10767 * endmarker role is not redundant.
10769 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10771 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10775 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10781 const char * const origparse = RExC_parse;
10783 I32 max = REG_INFTY;
10784 #ifdef RE_TRACK_PATTERN_OFFSETS
10787 const char *maxpos = NULL;
10789 /* Save the original in case we change the emitted regop to a FAIL. */
10790 regnode * const orig_emit = RExC_emit;
10792 GET_RE_DEBUG_FLAGS_DECL;
10794 PERL_ARGS_ASSERT_REGPIECE;
10796 DEBUG_PARSE("piec");
10798 ret = regatom(pRExC_state, &flags,depth+1);
10800 if (flags & (TRYAGAIN|RESTART_UTF8))
10801 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10803 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10809 if (op == '{' && regcurly(RExC_parse)) {
10811 #ifdef RE_TRACK_PATTERN_OFFSETS
10812 parse_start = RExC_parse; /* MJD */
10814 next = RExC_parse + 1;
10815 while (isDIGIT(*next) || *next == ',') {
10816 if (*next == ',') {
10824 if (*next == '}') { /* got one */
10825 const char* endptr;
10829 min = grok_atou(RExC_parse, &endptr);
10830 if (*maxpos == ',')
10833 maxpos = RExC_parse;
10834 max = grok_atou(maxpos, &endptr);
10835 if (!max && *maxpos != '0')
10836 max = REG_INFTY; /* meaning "infinity" */
10837 else if (max >= REG_INFTY)
10838 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10840 nextchar(pRExC_state);
10841 if (max < min) { /* If can't match, warn and optimize to fail
10845 /* We can't back off the size because we have to reserve
10846 * enough space for all the things we are about to throw
10847 * away, but we can shrink it by the ammount we are about
10848 * to re-use here */
10849 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10852 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10853 RExC_emit = orig_emit;
10855 ret = reg_node(pRExC_state, OPFAIL);
10858 else if (min == max
10859 && RExC_parse < RExC_end
10860 && (*RExC_parse == '?' || *RExC_parse == '+'))
10863 ckWARN2reg(RExC_parse + 1,
10864 "Useless use of greediness modifier '%c'",
10867 /* Absorb the modifier, so later code doesn't see nor use
10869 nextchar(pRExC_state);
10873 if ((flags&SIMPLE)) {
10874 MARK_NAUGHTY_EXP(2, 2);
10875 reginsert(pRExC_state, CURLY, ret, depth+1);
10876 Set_Node_Offset(ret, parse_start+1); /* MJD */
10877 Set_Node_Cur_Length(ret, parse_start);
10880 regnode * const w = reg_node(pRExC_state, WHILEM);
10883 REGTAIL(pRExC_state, ret, w);
10884 if (!SIZE_ONLY && RExC_extralen) {
10885 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10886 reginsert(pRExC_state, NOTHING,ret, depth+1);
10887 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10889 reginsert(pRExC_state, CURLYX,ret, depth+1);
10891 Set_Node_Offset(ret, parse_start+1);
10892 Set_Node_Length(ret,
10893 op == '{' ? (RExC_parse - parse_start) : 1);
10895 if (!SIZE_ONLY && RExC_extralen)
10896 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10897 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10899 RExC_whilem_seen++, RExC_extralen += 3;
10900 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
10907 *flagp |= HASWIDTH;
10909 ARG1_SET(ret, (U16)min);
10910 ARG2_SET(ret, (U16)max);
10912 if (max == REG_INFTY)
10913 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10919 if (!ISMULT1(op)) {
10924 #if 0 /* Now runtime fix should be reliable. */
10926 /* if this is reinstated, don't forget to put this back into perldiag:
10928 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10930 (F) The part of the regexp subject to either the * or + quantifier
10931 could match an empty string. The {#} shows in the regular
10932 expression about where the problem was discovered.
10936 if (!(flags&HASWIDTH) && op != '?')
10937 vFAIL("Regexp *+ operand could be empty");
10940 #ifdef RE_TRACK_PATTERN_OFFSETS
10941 parse_start = RExC_parse;
10943 nextchar(pRExC_state);
10945 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10947 if (op == '*' && (flags&SIMPLE)) {
10948 reginsert(pRExC_state, STAR, ret, depth+1);
10951 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10953 else if (op == '*') {
10957 else if (op == '+' && (flags&SIMPLE)) {
10958 reginsert(pRExC_state, PLUS, ret, depth+1);
10961 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10963 else if (op == '+') {
10967 else if (op == '?') {
10972 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10973 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10974 ckWARN2reg(RExC_parse,
10975 "%"UTF8f" matches null string many times",
10976 UTF8fARG(UTF, (RExC_parse >= origparse
10977 ? RExC_parse - origparse
10980 (void)ReREFCNT_inc(RExC_rx_sv);
10983 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10984 nextchar(pRExC_state);
10985 reginsert(pRExC_state, MINMOD, ret, depth+1);
10986 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10989 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10991 nextchar(pRExC_state);
10992 ender = reg_node(pRExC_state, SUCCEED);
10993 REGTAIL(pRExC_state, ret, ender);
10994 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10996 ender = reg_node(pRExC_state, TAIL);
10997 REGTAIL(pRExC_state, ret, ender);
11000 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11002 vFAIL("Nested quantifiers");
11009 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
11010 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
11014 /* This is expected to be called by a parser routine that has recognized '\N'
11015 and needs to handle the rest. RExC_parse is expected to point at the first
11016 char following the N at the time of the call. On successful return,
11017 RExC_parse has been updated to point to just after the sequence identified
11018 by this routine, <*flagp> has been updated, and the non-NULL input pointers
11019 have been set appropriately.
11021 The typical case for this is \N{some character name}. This is usually
11022 called while parsing the input, filling in or ready to fill in an EXACTish
11023 node, and the code point for the character should be returned, so that it
11024 can be added to the node, and parsing continued with the next input
11025 character. But it may be that instead of a single character the \N{}
11026 expands to more than one, a named sequence. In this case any following
11027 quantifier applies to the whole sequence, and it is easier, given the code
11028 structure that calls this, to handle it from a different area of the code.
11029 For this reason, the input parameters can be set so that it returns valid
11030 only on one or the other of these cases.
11032 Another possibility is for the input to be an empty \N{}, which for
11033 backwards compatibility we accept, but generate a NOTHING node which should
11034 later get optimized out. This is handled from the area of code which can
11035 handle a named sequence, so if called with the parameters for the other, it
11038 Still another possibility is for the \N to mean [^\n], and not a single
11039 character or explicit sequence at all. This is determined by context.
11040 Again, this is handled from the area of code which can handle a named
11041 sequence, so if called with the parameters for the other, it also fails.
11043 And the final possibility is for the \N to be called from within a bracketed
11044 character class. In this case the [^\n] meaning makes no sense, and so is
11045 an error. Other anomalous situations are left to the calling code to handle.
11047 For non-single-quoted regexes, the tokenizer has attempted to decide which
11048 of the above applies, and in the case of a named sequence, has converted it
11049 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11050 where c1... are the characters in the sequence. For single-quoted regexes,
11051 the tokenizer passes the \N sequence through unchanged; this code will not
11052 attempt to determine this nor expand those, instead raising a syntax error.
11053 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11054 or there is no '}', it signals that this \N occurrence means to match a
11055 non-newline. (This mostly was done because of [perl #56444].)
11057 The API is somewhat convoluted due to historical and the above reasons.
11059 The function raises an error (via vFAIL), and doesn't return for various
11060 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
11061 it returns a count of how many characters were accounted for by it. (This
11062 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11063 points in the sequence. It sets <node_p>, <valuep>, and/or
11064 <substitute_parse> on success.
11066 If <valuep> is non-null, it means the caller can accept an input sequence
11067 consisting of just a single code point; <*valuep> is set to the value of the
11068 only or first code point in the input.
11070 If <substitute_parse> is non-null, it means the caller can accept an input
11071 sequence consisting of one or more code points; <*substitute_parse> is a
11072 newly created mortal SV* in this case, containing \x{} escapes representing
11075 Both <valuep> and <substitute_parse> can be non-NULL.
11077 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
11078 that the caller can accept any legal sequence other than a single code
11079 point. To wit, <*node_p> is set as follows:
11080 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11081 2) \N{}: points to a new NOTHING node; return is 0
11082 3) otherwise: points to a new EXACT node containing the resolved
11083 string; return is the number of code points in the
11084 string. This will never be 1.
11085 Note that failure is returned for single code point sequences if <valuep> is
11086 null and <node_p> is not.
11089 char * endbrace; /* '}' following the name */
11091 char *endchar; /* Points to '.' or '}' ending cur char in the input
11093 bool has_multiple_chars; /* true if the input stream contains a sequence of
11094 more than one character */
11095 bool in_char_class = substitute_parse != NULL;
11096 STRLEN count = 0; /* Number of characters in this sequence */
11098 GET_RE_DEBUG_FLAGS_DECL;
11100 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11102 GET_RE_DEBUG_FLAGS;
11104 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
11105 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11107 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11108 * modifier. The other meaning does not, so use a temporary until we find
11109 * out which we are being called with */
11110 p = (RExC_flags & RXf_PMf_EXTENDED)
11111 ? regpatws(pRExC_state, RExC_parse,
11112 TRUE) /* means recognize comments */
11115 /* Disambiguate between \N meaning a named character versus \N meaning
11116 * [^\n]. The former is assumed when it can't be the latter. */
11117 if (*p != '{' || regcurly(p)) {
11120 /* no bare \N allowed in a charclass */
11121 if (in_char_class) {
11122 vFAIL("\\N in a character class must be a named character: \\N{...}");
11124 return (STRLEN) -1;
11126 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11128 nextchar(pRExC_state);
11129 *node_p = reg_node(pRExC_state, REG_ANY);
11130 *flagp |= HASWIDTH|SIMPLE;
11132 Set_Node_Length(*node_p, 1); /* MJD */
11136 /* Here, we have decided it should be a named character or sequence */
11138 /* The test above made sure that the next real character is a '{', but
11139 * under the /x modifier, it could be separated by space (or a comment and
11140 * \n) and this is not allowed (for consistency with \x{...} and the
11141 * tokenizer handling of \N{NAME}). */
11142 if (*RExC_parse != '{') {
11143 vFAIL("Missing braces on \\N{}");
11146 RExC_parse++; /* Skip past the '{' */
11148 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11149 || ! (endbrace == RExC_parse /* nothing between the {} */
11150 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11151 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11154 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11155 vFAIL("\\N{NAME} must be resolved by the lexer");
11158 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11160 if (endbrace == RExC_parse) { /* empty: \N{} */
11162 *node_p = reg_node(pRExC_state,NOTHING);
11164 else if (! in_char_class) {
11165 return (STRLEN) -1;
11167 nextchar(pRExC_state);
11171 RExC_parse += 2; /* Skip past the 'U+' */
11173 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11175 /* Code points are separated by dots. If none, there is only one code
11176 * point, and is terminated by the brace */
11177 has_multiple_chars = (endchar < endbrace);
11179 /* We get the first code point if we want it, and either there is only one,
11180 * or we can accept both cases of one and there is more than one */
11181 if (valuep && (substitute_parse || ! has_multiple_chars)) {
11182 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11183 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11184 | PERL_SCAN_DISALLOW_PREFIX
11186 /* No errors in the first pass (See [perl
11187 * #122671].) We let the code below find the
11188 * errors when there are multiple chars. */
11189 | ((SIZE_ONLY || has_multiple_chars)
11190 ? PERL_SCAN_SILENT_ILLDIGIT
11193 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11195 /* The tokenizer should have guaranteed validity, but it's possible to
11196 * bypass it by using single quoting, so check. Don't do the check
11197 * here when there are multiple chars; we do it below anyway. */
11198 if (! has_multiple_chars) {
11199 if (length_of_hex == 0
11200 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11202 RExC_parse += length_of_hex; /* Includes all the valid */
11203 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11204 ? UTF8SKIP(RExC_parse)
11206 /* Guard against malformed utf8 */
11207 if (RExC_parse >= endchar) {
11208 RExC_parse = endchar;
11210 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11213 RExC_parse = endbrace + 1;
11218 /* Here, we should have already handled the case where a single character
11219 * is expected and found. So it is a failure if we aren't expecting
11220 * multiple chars and got them; or didn't get them but wanted them. We
11221 * fail without advancing the parse, so that the caller can try again with
11222 * different acceptance criteria */
11223 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11225 return (STRLEN) -1;
11229 /* What is done here is to convert this to a sub-pattern of the form
11230 * \x{char1}\x{char2}...
11231 * and then either return it in <*substitute_parse> if non-null; or
11232 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
11233 * way, it retains its atomicness, while not having to worry about
11234 * special handling that some code points may have. toke.c has
11235 * converted the original Unicode values to native, so that we can just
11236 * pass on the hex values unchanged. We do have to set a flag to keep
11237 * recoding from happening in the recursion */
11241 char *orig_end = RExC_end;
11244 if (substitute_parse) {
11245 *substitute_parse = newSVpvs("");
11248 substitute_parse = &dummy;
11249 *substitute_parse = newSVpvs("?:");
11251 *substitute_parse = sv_2mortal(*substitute_parse);
11253 while (RExC_parse < endbrace) {
11255 /* Convert to notation the rest of the code understands */
11256 sv_catpv(*substitute_parse, "\\x{");
11257 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11258 sv_catpv(*substitute_parse, "}");
11260 /* Point to the beginning of the next character in the sequence. */
11261 RExC_parse = endchar + 1;
11262 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11266 if (! in_char_class) {
11267 sv_catpv(*substitute_parse, ")");
11270 RExC_parse = SvPV(*substitute_parse, len);
11272 /* Don't allow empty number */
11273 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11274 RExC_parse = endbrace;
11275 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11277 RExC_end = RExC_parse + len;
11279 /* The values are Unicode, and therefore not subject to recoding */
11280 RExC_override_recoding = 1;
11283 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11284 if (flags & RESTART_UTF8) {
11285 *flagp = RESTART_UTF8;
11286 return (STRLEN) -1;
11288 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11291 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11294 RExC_parse = endbrace;
11295 RExC_end = orig_end;
11296 RExC_override_recoding = 0;
11298 nextchar(pRExC_state);
11308 * It returns the code point in utf8 for the value in *encp.
11309 * value: a code value in the source encoding
11310 * encp: a pointer to an Encode object
11312 * If the result from Encode is not a single character,
11313 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11316 S_reg_recode(pTHX_ const char value, SV **encp)
11319 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11320 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11321 const STRLEN newlen = SvCUR(sv);
11322 UV uv = UNICODE_REPLACEMENT;
11324 PERL_ARGS_ASSERT_REG_RECODE;
11328 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11331 if (!newlen || numlen != newlen) {
11332 uv = UNICODE_REPLACEMENT;
11338 PERL_STATIC_INLINE U8
11339 S_compute_EXACTish(RExC_state_t *pRExC_state)
11343 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11351 op = get_regex_charset(RExC_flags);
11352 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11353 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11354 been, so there is no hole */
11357 return op + EXACTF;
11360 PERL_STATIC_INLINE void
11361 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11362 regnode *node, I32* flagp, STRLEN len, UV code_point,
11365 /* This knows the details about sizing an EXACTish node, setting flags for
11366 * it (by setting <*flagp>, and potentially populating it with a single
11369 * If <len> (the length in bytes) is non-zero, this function assumes that
11370 * the node has already been populated, and just does the sizing. In this
11371 * case <code_point> should be the final code point that has already been
11372 * placed into the node. This value will be ignored except that under some
11373 * circumstances <*flagp> is set based on it.
11375 * If <len> is zero, the function assumes that the node is to contain only
11376 * the single character given by <code_point> and calculates what <len>
11377 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11378 * additionally will populate the node's STRING with <code_point> or its
11381 * In both cases <*flagp> is appropriately set
11383 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11384 * 255, must be folded (the former only when the rules indicate it can
11387 * When it does the populating, it looks at the flag 'downgradable'. If
11388 * true with a node that folds, it checks if the single code point
11389 * participates in a fold, and if not downgrades the node to an EXACT.
11390 * This helps the optimizer */
11392 bool len_passed_in = cBOOL(len != 0);
11393 U8 character[UTF8_MAXBYTES_CASE+1];
11395 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11397 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11398 * sizing difference, and is extra work that is thrown away */
11399 if (downgradable && ! PASS2) {
11400 downgradable = FALSE;
11403 if (! len_passed_in) {
11405 if (UVCHR_IS_INVARIANT(code_point)) {
11406 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11407 *character = (U8) code_point;
11409 else { /* Here is /i and not /l. (toFOLD() is defined on just
11410 ASCII, which isn't the same thing as INVARIANT on
11411 EBCDIC, but it works there, as the extra invariants
11412 fold to themselves) */
11413 *character = toFOLD((U8) code_point);
11415 /* We can downgrade to an EXACT node if this character
11416 * isn't a folding one. Note that this assumes that
11417 * nothing above Latin1 folds to some other invariant than
11418 * one of these alphabetics; otherwise we would also have
11420 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11421 * || ASCII_FOLD_RESTRICTED))
11423 if (downgradable && PL_fold[code_point] == code_point) {
11429 else if (FOLD && (! LOC
11430 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11431 { /* Folding, and ok to do so now */
11432 UV folded = _to_uni_fold_flags(
11436 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11437 ? FOLD_FLAGS_NOMIX_ASCII
11440 && folded == code_point /* This quickly rules out many
11441 cases, avoiding the
11442 _invlist_contains_cp() overhead
11444 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11451 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11453 /* Not folding this cp, and can output it directly */
11454 *character = UTF8_TWO_BYTE_HI(code_point);
11455 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11459 uvchr_to_utf8( character, code_point);
11460 len = UTF8SKIP(character);
11462 } /* Else pattern isn't UTF8. */
11464 *character = (U8) code_point;
11466 } /* Else is folded non-UTF8 */
11467 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11469 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11470 * comments at join_exact()); */
11471 *character = (U8) code_point;
11474 /* Can turn into an EXACT node if we know the fold at compile time,
11475 * and it folds to itself and doesn't particpate in other folds */
11478 && PL_fold_latin1[code_point] == code_point
11479 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11480 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11484 } /* else is Sharp s. May need to fold it */
11485 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11487 *(character + 1) = 's';
11491 *character = LATIN_SMALL_LETTER_SHARP_S;
11497 RExC_size += STR_SZ(len);
11500 RExC_emit += STR_SZ(len);
11501 STR_LEN(node) = len;
11502 if (! len_passed_in) {
11503 Copy((char *) character, STRING(node), len, char);
11507 *flagp |= HASWIDTH;
11509 /* A single character node is SIMPLE, except for the special-cased SHARP S
11511 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11512 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11513 || ! FOLD || ! DEPENDS_SEMANTICS))
11518 /* The OP may not be well defined in PASS1 */
11519 if (PASS2 && OP(node) == EXACTFL) {
11520 RExC_contains_locale = 1;
11525 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11526 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11529 S_backref_value(char *p)
11531 const char* endptr;
11532 UV val = grok_atou(p, &endptr);
11533 if (endptr == p || endptr == NULL || val > I32_MAX)
11540 - regatom - the lowest level
11542 Try to identify anything special at the start of the pattern. If there
11543 is, then handle it as required. This may involve generating a single regop,
11544 such as for an assertion; or it may involve recursing, such as to
11545 handle a () structure.
11547 If the string doesn't start with something special then we gobble up
11548 as much literal text as we can.
11550 Once we have been able to handle whatever type of thing started the
11551 sequence, we return.
11553 Note: we have to be careful with escapes, as they can be both literal
11554 and special, and in the case of \10 and friends, context determines which.
11556 A summary of the code structure is:
11558 switch (first_byte) {
11559 cases for each special:
11560 handle this special;
11563 switch (2nd byte) {
11564 cases for each unambiguous special:
11565 handle this special;
11567 cases for each ambigous special/literal:
11569 if (special) handle here
11571 default: // unambiguously literal:
11574 default: // is a literal char
11577 create EXACTish node for literal;
11578 while (more input and node isn't full) {
11579 switch (input_byte) {
11580 cases for each special;
11581 make sure parse pointer is set so that the next call to
11582 regatom will see this special first
11583 goto loopdone; // EXACTish node terminated by prev. char
11585 append char to EXACTISH node;
11587 get next input byte;
11591 return the generated node;
11593 Specifically there are two separate switches for handling
11594 escape sequences, with the one for handling literal escapes requiring
11595 a dummy entry for all of the special escapes that are actually handled
11598 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11600 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11602 Otherwise does not return NULL.
11606 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11608 regnode *ret = NULL;
11610 char *parse_start = RExC_parse;
11615 GET_RE_DEBUG_FLAGS_DECL;
11617 *flagp = WORST; /* Tentatively. */
11619 DEBUG_PARSE("atom");
11621 PERL_ARGS_ASSERT_REGATOM;
11624 switch ((U8)*RExC_parse) {
11626 RExC_seen_zerolen++;
11627 nextchar(pRExC_state);
11628 if (RExC_flags & RXf_PMf_MULTILINE)
11629 ret = reg_node(pRExC_state, MBOL);
11631 ret = reg_node(pRExC_state, SBOL);
11632 Set_Node_Length(ret, 1); /* MJD */
11635 nextchar(pRExC_state);
11637 RExC_seen_zerolen++;
11638 if (RExC_flags & RXf_PMf_MULTILINE)
11639 ret = reg_node(pRExC_state, MEOL);
11641 ret = reg_node(pRExC_state, SEOL);
11642 Set_Node_Length(ret, 1); /* MJD */
11645 nextchar(pRExC_state);
11646 if (RExC_flags & RXf_PMf_SINGLELINE)
11647 ret = reg_node(pRExC_state, SANY);
11649 ret = reg_node(pRExC_state, REG_ANY);
11650 *flagp |= HASWIDTH|SIMPLE;
11652 Set_Node_Length(ret, 1); /* MJD */
11656 char * const oregcomp_parse = ++RExC_parse;
11657 ret = regclass(pRExC_state, flagp,depth+1,
11658 FALSE, /* means parse the whole char class */
11659 TRUE, /* allow multi-char folds */
11660 FALSE, /* don't silence non-portable warnings. */
11662 if (*RExC_parse != ']') {
11663 RExC_parse = oregcomp_parse;
11664 vFAIL("Unmatched [");
11667 if (*flagp & RESTART_UTF8)
11669 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11672 nextchar(pRExC_state);
11673 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11677 nextchar(pRExC_state);
11678 ret = reg(pRExC_state, 2, &flags,depth+1);
11680 if (flags & TRYAGAIN) {
11681 if (RExC_parse == RExC_end) {
11682 /* Make parent create an empty node if needed. */
11683 *flagp |= TRYAGAIN;
11688 if (flags & RESTART_UTF8) {
11689 *flagp = RESTART_UTF8;
11692 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11695 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11699 if (flags & TRYAGAIN) {
11700 *flagp |= TRYAGAIN;
11703 vFAIL("Internal urp");
11704 /* Supposed to be caught earlier. */
11710 vFAIL("Quantifier follows nothing");
11715 This switch handles escape sequences that resolve to some kind
11716 of special regop and not to literal text. Escape sequnces that
11717 resolve to literal text are handled below in the switch marked
11720 Every entry in this switch *must* have a corresponding entry
11721 in the literal escape switch. However, the opposite is not
11722 required, as the default for this switch is to jump to the
11723 literal text handling code.
11725 switch ((U8)*++RExC_parse) {
11726 /* Special Escapes */
11728 RExC_seen_zerolen++;
11729 ret = reg_node(pRExC_state, SBOL);
11730 /* SBOL is shared with /^/ so we set the flags so we can tell
11731 * /\A/ from /^/ in split. We check ret because first pass we
11732 * have no regop struct to set the flags on. */
11736 goto finish_meta_pat;
11738 ret = reg_node(pRExC_state, GPOS);
11739 RExC_seen |= REG_GPOS_SEEN;
11741 goto finish_meta_pat;
11743 RExC_seen_zerolen++;
11744 ret = reg_node(pRExC_state, KEEPS);
11746 /* XXX:dmq : disabling in-place substitution seems to
11747 * be necessary here to avoid cases of memory corruption, as
11748 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11750 RExC_seen |= REG_LOOKBEHIND_SEEN;
11751 goto finish_meta_pat;
11753 ret = reg_node(pRExC_state, SEOL);
11755 RExC_seen_zerolen++; /* Do not optimize RE away */
11756 goto finish_meta_pat;
11758 ret = reg_node(pRExC_state, EOS);
11760 RExC_seen_zerolen++; /* Do not optimize RE away */
11761 goto finish_meta_pat;
11763 ret = reg_node(pRExC_state, CANY);
11764 RExC_seen |= REG_CANY_SEEN;
11765 *flagp |= HASWIDTH|SIMPLE;
11767 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11769 goto finish_meta_pat;
11771 ret = reg_node(pRExC_state, CLUMP);
11772 *flagp |= HASWIDTH;
11773 goto finish_meta_pat;
11779 arg = ANYOF_WORDCHAR;
11783 RExC_seen_zerolen++;
11784 RExC_seen |= REG_LOOKBEHIND_SEEN;
11785 op = BOUND + get_regex_charset(RExC_flags);
11786 if (op > BOUNDA) { /* /aa is same as /a */
11789 else if (op == BOUNDL) {
11790 RExC_contains_locale = 1;
11792 ret = reg_node(pRExC_state, op);
11793 FLAGS(ret) = get_regex_charset(RExC_flags);
11795 if ((U8) *(RExC_parse + 1) == '{') {
11796 /* diag_listed_as: Use "%s" instead of "%s" */
11797 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11799 goto finish_meta_pat;
11801 RExC_seen_zerolen++;
11802 RExC_seen |= REG_LOOKBEHIND_SEEN;
11803 op = NBOUND + get_regex_charset(RExC_flags);
11804 if (op > NBOUNDA) { /* /aa is same as /a */
11807 else if (op == NBOUNDL) {
11808 RExC_contains_locale = 1;
11810 ret = reg_node(pRExC_state, op);
11811 FLAGS(ret) = get_regex_charset(RExC_flags);
11813 if ((U8) *(RExC_parse + 1) == '{') {
11814 /* diag_listed_as: Use "%s" instead of "%s" */
11815 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11817 goto finish_meta_pat;
11827 ret = reg_node(pRExC_state, LNBREAK);
11828 *flagp |= HASWIDTH|SIMPLE;
11829 goto finish_meta_pat;
11837 goto join_posix_op_known;
11843 arg = ANYOF_VERTWS;
11845 goto join_posix_op_known;
11855 op = POSIXD + get_regex_charset(RExC_flags);
11856 if (op > POSIXA) { /* /aa is same as /a */
11859 else if (op == POSIXL) {
11860 RExC_contains_locale = 1;
11863 join_posix_op_known:
11866 op += NPOSIXD - POSIXD;
11869 ret = reg_node(pRExC_state, op);
11871 FLAGS(ret) = namedclass_to_classnum(arg);
11874 *flagp |= HASWIDTH|SIMPLE;
11878 nextchar(pRExC_state);
11879 Set_Node_Length(ret, 2); /* MJD */
11885 char* parse_start = RExC_parse - 2;
11890 ret = regclass(pRExC_state, flagp,depth+1,
11891 TRUE, /* means just parse this element */
11892 FALSE, /* don't allow multi-char folds */
11893 FALSE, /* don't silence non-portable warnings.
11894 It would be a bug if these returned
11897 /* regclass() can only return RESTART_UTF8 if multi-char folds
11900 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11905 Set_Node_Offset(ret, parse_start + 2);
11906 Set_Node_Cur_Length(ret, parse_start);
11907 nextchar(pRExC_state);
11911 /* Handle \N and \N{NAME} with multiple code points here and not
11912 * below because it can be multicharacter. join_exact() will join
11913 * them up later on. Also this makes sure that things like
11914 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11915 * The options to the grok function call causes it to fail if the
11916 * sequence is just a single code point. We then go treat it as
11917 * just another character in the current EXACT node, and hence it
11918 * gets uniform treatment with all the other characters. The
11919 * special treatment for quantifiers is not needed for such single
11920 * character sequences */
11922 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11925 if (*flagp & RESTART_UTF8)
11931 case 'k': /* Handle \k<NAME> and \k'NAME' */
11934 char ch= RExC_parse[1];
11935 if (ch != '<' && ch != '\'' && ch != '{') {
11937 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11938 vFAIL2("Sequence %.2s... not terminated",parse_start);
11940 /* this pretty much dupes the code for (?P=...) in reg(), if
11941 you change this make sure you change that */
11942 char* name_start = (RExC_parse += 2);
11944 SV *sv_dat = reg_scan_name(pRExC_state,
11945 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11946 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11947 if (RExC_parse == name_start || *RExC_parse != ch)
11948 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11949 vFAIL2("Sequence %.3s... not terminated",parse_start);
11952 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11953 RExC_rxi->data->data[num]=(void*)sv_dat;
11954 SvREFCNT_inc_simple_void(sv_dat);
11958 ret = reganode(pRExC_state,
11961 : (ASCII_FOLD_RESTRICTED)
11963 : (AT_LEAST_UNI_SEMANTICS)
11969 *flagp |= HASWIDTH;
11971 /* override incorrect value set in reganode MJD */
11972 Set_Node_Offset(ret, parse_start+1);
11973 Set_Node_Cur_Length(ret, parse_start);
11974 nextchar(pRExC_state);
11980 case '1': case '2': case '3': case '4':
11981 case '5': case '6': case '7': case '8': case '9':
11986 if (*RExC_parse == 'g') {
11990 if (*RExC_parse == '{') {
11994 if (*RExC_parse == '-') {
11998 if (hasbrace && !isDIGIT(*RExC_parse)) {
11999 if (isrel) RExC_parse--;
12001 goto parse_named_seq;
12004 num = S_backref_value(RExC_parse);
12006 vFAIL("Reference to invalid group 0");
12007 else if (num == I32_MAX) {
12008 if (isDIGIT(*RExC_parse))
12009 vFAIL("Reference to nonexistent group");
12011 vFAIL("Unterminated \\g... pattern");
12015 num = RExC_npar - num;
12017 vFAIL("Reference to nonexistent or unclosed group");
12021 num = S_backref_value(RExC_parse);
12022 /* bare \NNN might be backref or octal - if it is larger than or equal
12023 * RExC_npar then it is assumed to be and octal escape.
12024 * Note RExC_npar is +1 from the actual number of parens*/
12025 if (num == I32_MAX || (num > 9 && num >= RExC_npar
12026 && *RExC_parse != '8' && *RExC_parse != '9'))
12028 /* Probably a character specified in octal, e.g. \35 */
12033 /* at this point RExC_parse definitely points to a backref
12036 #ifdef RE_TRACK_PATTERN_OFFSETS
12037 char * const parse_start = RExC_parse - 1; /* MJD */
12039 while (isDIGIT(*RExC_parse))
12042 if (*RExC_parse != '}')
12043 vFAIL("Unterminated \\g{...} pattern");
12047 if (num > (I32)RExC_rx->nparens)
12048 vFAIL("Reference to nonexistent group");
12051 ret = reganode(pRExC_state,
12054 : (ASCII_FOLD_RESTRICTED)
12056 : (AT_LEAST_UNI_SEMANTICS)
12062 *flagp |= HASWIDTH;
12064 /* override incorrect value set in reganode MJD */
12065 Set_Node_Offset(ret, parse_start+1);
12066 Set_Node_Cur_Length(ret, parse_start);
12068 nextchar(pRExC_state);
12073 if (RExC_parse >= RExC_end)
12074 FAIL("Trailing \\");
12077 /* Do not generate "unrecognized" warnings here, we fall
12078 back into the quick-grab loop below */
12085 if (RExC_flags & RXf_PMf_EXTENDED) {
12086 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12087 if (RExC_parse < RExC_end)
12094 parse_start = RExC_parse - 1;
12103 #define MAX_NODE_STRING_SIZE 127
12104 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12106 U8 upper_parse = MAX_NODE_STRING_SIZE;
12107 U8 node_type = compute_EXACTish(pRExC_state);
12108 bool next_is_quantifier;
12109 char * oldp = NULL;
12111 /* We can convert EXACTF nodes to EXACTFU if they contain only
12112 * characters that match identically regardless of the target
12113 * string's UTF8ness. The reason to do this is that EXACTF is not
12114 * trie-able, EXACTFU is.
12116 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12117 * contain only above-Latin1 characters (hence must be in UTF8),
12118 * which don't participate in folds with Latin1-range characters,
12119 * as the latter's folds aren't known until runtime. (We don't
12120 * need to figure this out until pass 2) */
12121 bool maybe_exactfu = PASS2
12122 && (node_type == EXACTF || node_type == EXACTFL);
12124 /* If a folding node contains only code points that don't
12125 * participate in folds, it can be changed into an EXACT node,
12126 * which allows the optimizer more things to look for */
12129 ret = reg_node(pRExC_state, node_type);
12131 /* In pass1, folded, we use a temporary buffer instead of the
12132 * actual node, as the node doesn't exist yet */
12133 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12139 /* We do the EXACTFish to EXACT node only if folding. (And we
12140 * don't need to figure this out until pass 2) */
12141 maybe_exact = FOLD && PASS2;
12143 /* XXX The node can hold up to 255 bytes, yet this only goes to
12144 * 127. I (khw) do not know why. Keeping it somewhat less than
12145 * 255 allows us to not have to worry about overflow due to
12146 * converting to utf8 and fold expansion, but that value is
12147 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12148 * split up by this limit into a single one using the real max of
12149 * 255. Even at 127, this breaks under rare circumstances. If
12150 * folding, we do not want to split a node at a character that is a
12151 * non-final in a multi-char fold, as an input string could just
12152 * happen to want to match across the node boundary. The join
12153 * would solve that problem if the join actually happens. But a
12154 * series of more than two nodes in a row each of 127 would cause
12155 * the first join to succeed to get to 254, but then there wouldn't
12156 * be room for the next one, which could at be one of those split
12157 * multi-char folds. I don't know of any fool-proof solution. One
12158 * could back off to end with only a code point that isn't such a
12159 * non-final, but it is possible for there not to be any in the
12161 for (p = RExC_parse - 1;
12162 len < upper_parse && p < RExC_end;
12167 if (RExC_flags & RXf_PMf_EXTENDED)
12168 p = regpatws(pRExC_state, p,
12169 TRUE); /* means recognize comments */
12180 /* Literal Escapes Switch
12182 This switch is meant to handle escape sequences that
12183 resolve to a literal character.
12185 Every escape sequence that represents something
12186 else, like an assertion or a char class, is handled
12187 in the switch marked 'Special Escapes' above in this
12188 routine, but also has an entry here as anything that
12189 isn't explicitly mentioned here will be treated as
12190 an unescaped equivalent literal.
12193 switch ((U8)*++p) {
12194 /* These are all the special escapes. */
12195 case 'A': /* Start assertion */
12196 case 'b': case 'B': /* Word-boundary assertion*/
12197 case 'C': /* Single char !DANGEROUS! */
12198 case 'd': case 'D': /* digit class */
12199 case 'g': case 'G': /* generic-backref, pos assertion */
12200 case 'h': case 'H': /* HORIZWS */
12201 case 'k': case 'K': /* named backref, keep marker */
12202 case 'p': case 'P': /* Unicode property */
12203 case 'R': /* LNBREAK */
12204 case 's': case 'S': /* space class */
12205 case 'v': case 'V': /* VERTWS */
12206 case 'w': case 'W': /* word class */
12207 case 'X': /* eXtended Unicode "combining
12208 character sequence" */
12209 case 'z': case 'Z': /* End of line/string assertion */
12213 /* Anything after here is an escape that resolves to a
12214 literal. (Except digits, which may or may not)
12220 case 'N': /* Handle a single-code point named character. */
12221 /* The options cause it to fail if a multiple code
12222 * point sequence. Handle those in the switch() above
12224 RExC_parse = p + 1;
12225 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12231 if (*flagp & RESTART_UTF8)
12232 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12233 RExC_parse = p = oldp;
12237 if (ender > 0xff) {
12254 ender = ESC_NATIVE;
12264 const char* error_msg;
12266 bool valid = grok_bslash_o(&p,
12269 PASS2, /* out warnings */
12270 FALSE, /* not strict */
12271 TRUE, /* Output warnings
12276 RExC_parse = p; /* going to die anyway; point
12277 to exact spot of failure */
12281 if (IN_ENCODING && ender < 0x100) {
12282 goto recode_encoding;
12284 if (ender > 0xff) {
12291 UV result = UV_MAX; /* initialize to erroneous
12293 const char* error_msg;
12295 bool valid = grok_bslash_x(&p,
12298 PASS2, /* out warnings */
12299 FALSE, /* not strict */
12300 TRUE, /* Output warnings
12305 RExC_parse = p; /* going to die anyway; point
12306 to exact spot of failure */
12311 if (IN_ENCODING && ender < 0x100) {
12312 goto recode_encoding;
12314 if (ender > 0xff) {
12321 ender = grok_bslash_c(*p++, PASS2);
12323 case '8': case '9': /* must be a backreference */
12326 case '1': case '2': case '3':case '4':
12327 case '5': case '6': case '7':
12328 /* When we parse backslash escapes there is ambiguity
12329 * between backreferences and octal escapes. Any escape
12330 * from \1 - \9 is a backreference, any multi-digit
12331 * escape which does not start with 0 and which when
12332 * evaluated as decimal could refer to an already
12333 * parsed capture buffer is a backslash. Anything else
12336 * Note this implies that \118 could be interpreted as
12337 * 118 OR as "\11" . "8" depending on whether there
12338 * were 118 capture buffers defined already in the
12341 /* NOTE, RExC_npar is 1 more than the actual number of
12342 * parens we have seen so far, hence the < RExC_npar below. */
12344 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12345 { /* Not to be treated as an octal constant, go
12353 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12355 ender = grok_oct(p, &numlen, &flags, NULL);
12356 if (ender > 0xff) {
12360 if (PASS2 /* like \08, \178 */
12363 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12365 reg_warn_non_literal_string(
12367 form_short_octal_warning(p, numlen));
12370 if (IN_ENCODING && ender < 0x100)
12371 goto recode_encoding;
12374 if (! RExC_override_recoding) {
12375 SV* enc = _get_encoding();
12376 ender = reg_recode((const char)(U8)ender, &enc);
12378 ckWARNreg(p, "Invalid escape in the specified encoding");
12384 FAIL("Trailing \\");
12387 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12388 /* Include any { following the alpha to emphasize
12389 * that it could be part of an escape at some point
12391 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12392 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12394 goto normal_default;
12395 } /* End of switch on '\' */
12398 /* Currently we don't warn when the lbrace is at the start
12399 * of a construct. This catches it in the middle of a
12400 * literal string, or when its the first thing after
12401 * something like "\b" */
12403 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12405 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12408 default: /* A literal character */
12410 if (UTF8_IS_START(*p) && UTF) {
12412 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12413 &numlen, UTF8_ALLOW_DEFAULT);
12419 } /* End of switch on the literal */
12421 /* Here, have looked at the literal character and <ender>
12422 * contains its ordinal, <p> points to the character after it
12425 if ( RExC_flags & RXf_PMf_EXTENDED)
12426 p = regpatws(pRExC_state, p,
12427 TRUE); /* means recognize comments */
12429 /* If the next thing is a quantifier, it applies to this
12430 * character only, which means that this character has to be in
12431 * its own node and can't just be appended to the string in an
12432 * existing node, so if there are already other characters in
12433 * the node, close the node with just them, and set up to do
12434 * this character again next time through, when it will be the
12435 * only thing in its new node */
12436 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12442 if (! FOLD /* The simple case, just append the literal */
12443 || (LOC /* Also don't fold for tricky chars under /l */
12444 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12447 const STRLEN unilen = reguni(pRExC_state, ender, s);
12453 /* The loop increments <len> each time, as all but this
12454 * path (and one other) through it add a single byte to
12455 * the EXACTish node. But this one has changed len to
12456 * be the correct final value, so subtract one to
12457 * cancel out the increment that follows */
12461 REGC((char)ender, s++);
12464 /* Can get here if folding only if is one of the /l
12465 * characters whose fold depends on the locale. The
12466 * occurrence of any of these indicate that we can't
12467 * simplify things */
12469 maybe_exact = FALSE;
12470 maybe_exactfu = FALSE;
12475 /* See comments for join_exact() as to why we fold this
12476 * non-UTF at compile time */
12477 || (node_type == EXACTFU
12478 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12480 /* Here, are folding and are not UTF-8 encoded; therefore
12481 * the character must be in the range 0-255, and is not /l
12482 * (Not /l because we already handled these under /l in
12483 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12484 if (IS_IN_SOME_FOLD_L1(ender)) {
12485 maybe_exact = FALSE;
12487 /* See if the character's fold differs between /d and
12488 * /u. This includes the multi-char fold SHARP S to
12491 && (PL_fold[ender] != PL_fold_latin1[ender]
12492 || ender == LATIN_SMALL_LETTER_SHARP_S
12494 && isALPHA_FOLD_EQ(ender, 's')
12495 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12497 maybe_exactfu = FALSE;
12501 /* Even when folding, we store just the input character, as
12502 * we have an array that finds its fold quickly */
12503 *(s++) = (char) ender;
12505 else { /* FOLD and UTF */
12506 /* Unlike the non-fold case, we do actually have to
12507 * calculate the results here in pass 1. This is for two
12508 * reasons, the folded length may be longer than the
12509 * unfolded, and we have to calculate how many EXACTish
12510 * nodes it will take; and we may run out of room in a node
12511 * in the middle of a potential multi-char fold, and have
12512 * to back off accordingly. (Hence we can't use REGC for
12513 * the simple case just below.) */
12516 if (isASCII_uni(ender)) {
12517 folded = toFOLD(ender);
12518 *(s)++ = (U8) folded;
12523 folded = _to_uni_fold_flags(
12527 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12528 ? FOLD_FLAGS_NOMIX_ASCII
12532 /* The loop increments <len> each time, as all but this
12533 * path (and one other) through it add a single byte to
12534 * the EXACTish node. But this one has changed len to
12535 * be the correct final value, so subtract one to
12536 * cancel out the increment that follows */
12537 len += foldlen - 1;
12539 /* If this node only contains non-folding code points so
12540 * far, see if this new one is also non-folding */
12542 if (folded != ender) {
12543 maybe_exact = FALSE;
12546 /* Here the fold is the original; we have to check
12547 * further to see if anything folds to it */
12548 if (_invlist_contains_cp(PL_utf8_foldable,
12551 maybe_exact = FALSE;
12558 if (next_is_quantifier) {
12560 /* Here, the next input is a quantifier, and to get here,
12561 * the current character is the only one in the node.
12562 * Also, here <len> doesn't include the final byte for this
12568 } /* End of loop through literal characters */
12570 /* Here we have either exhausted the input or ran out of room in
12571 * the node. (If we encountered a character that can't be in the
12572 * node, transfer is made directly to <loopdone>, and so we
12573 * wouldn't have fallen off the end of the loop.) In the latter
12574 * case, we artificially have to split the node into two, because
12575 * we just don't have enough space to hold everything. This
12576 * creates a problem if the final character participates in a
12577 * multi-character fold in the non-final position, as a match that
12578 * should have occurred won't, due to the way nodes are matched,
12579 * and our artificial boundary. So back off until we find a non-
12580 * problematic character -- one that isn't at the beginning or
12581 * middle of such a fold. (Either it doesn't participate in any
12582 * folds, or appears only in the final position of all the folds it
12583 * does participate in.) A better solution with far fewer false
12584 * positives, and that would fill the nodes more completely, would
12585 * be to actually have available all the multi-character folds to
12586 * test against, and to back-off only far enough to be sure that
12587 * this node isn't ending with a partial one. <upper_parse> is set
12588 * further below (if we need to reparse the node) to include just
12589 * up through that final non-problematic character that this code
12590 * identifies, so when it is set to less than the full node, we can
12591 * skip the rest of this */
12592 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12594 const STRLEN full_len = len;
12596 assert(len >= MAX_NODE_STRING_SIZE);
12598 /* Here, <s> points to the final byte of the final character.
12599 * Look backwards through the string until find a non-
12600 * problematic character */
12604 /* This has no multi-char folds to non-UTF characters */
12605 if (ASCII_FOLD_RESTRICTED) {
12609 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12613 if (! PL_NonL1NonFinalFold) {
12614 PL_NonL1NonFinalFold = _new_invlist_C_array(
12615 NonL1_Perl_Non_Final_Folds_invlist);
12618 /* Point to the first byte of the final character */
12619 s = (char *) utf8_hop((U8 *) s, -1);
12621 while (s >= s0) { /* Search backwards until find
12622 non-problematic char */
12623 if (UTF8_IS_INVARIANT(*s)) {
12625 /* There are no ascii characters that participate
12626 * in multi-char folds under /aa. In EBCDIC, the
12627 * non-ascii invariants are all control characters,
12628 * so don't ever participate in any folds. */
12629 if (ASCII_FOLD_RESTRICTED
12630 || ! IS_NON_FINAL_FOLD(*s))
12635 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12636 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12642 else if (! _invlist_contains_cp(
12643 PL_NonL1NonFinalFold,
12644 valid_utf8_to_uvchr((U8 *) s, NULL)))
12649 /* Here, the current character is problematic in that
12650 * it does occur in the non-final position of some
12651 * fold, so try the character before it, but have to
12652 * special case the very first byte in the string, so
12653 * we don't read outside the string */
12654 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12655 } /* End of loop backwards through the string */
12657 /* If there were only problematic characters in the string,
12658 * <s> will point to before s0, in which case the length
12659 * should be 0, otherwise include the length of the
12660 * non-problematic character just found */
12661 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12664 /* Here, have found the final character, if any, that is
12665 * non-problematic as far as ending the node without splitting
12666 * it across a potential multi-char fold. <len> contains the
12667 * number of bytes in the node up-to and including that
12668 * character, or is 0 if there is no such character, meaning
12669 * the whole node contains only problematic characters. In
12670 * this case, give up and just take the node as-is. We can't
12675 /* If the node ends in an 's' we make sure it stays EXACTF,
12676 * as if it turns into an EXACTFU, it could later get
12677 * joined with another 's' that would then wrongly match
12679 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12681 maybe_exactfu = FALSE;
12685 /* Here, the node does contain some characters that aren't
12686 * problematic. If one such is the final character in the
12687 * node, we are done */
12688 if (len == full_len) {
12691 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12693 /* If the final character is problematic, but the
12694 * penultimate is not, back-off that last character to
12695 * later start a new node with it */
12700 /* Here, the final non-problematic character is earlier
12701 * in the input than the penultimate character. What we do
12702 * is reparse from the beginning, going up only as far as
12703 * this final ok one, thus guaranteeing that the node ends
12704 * in an acceptable character. The reason we reparse is
12705 * that we know how far in the character is, but we don't
12706 * know how to correlate its position with the input parse.
12707 * An alternate implementation would be to build that
12708 * correlation as we go along during the original parse,
12709 * but that would entail extra work for every node, whereas
12710 * this code gets executed only when the string is too
12711 * large for the node, and the final two characters are
12712 * problematic, an infrequent occurrence. Yet another
12713 * possible strategy would be to save the tail of the
12714 * string, and the next time regatom is called, initialize
12715 * with that. The problem with this is that unless you
12716 * back off one more character, you won't be guaranteed
12717 * regatom will get called again, unless regbranch,
12718 * regpiece ... are also changed. If you do back off that
12719 * extra character, so that there is input guaranteed to
12720 * force calling regatom, you can't handle the case where
12721 * just the first character in the node is acceptable. I
12722 * (khw) decided to try this method which doesn't have that
12723 * pitfall; if performance issues are found, we can do a
12724 * combination of the current approach plus that one */
12730 } /* End of verifying node ends with an appropriate char */
12732 loopdone: /* Jumped to when encounters something that shouldn't be in
12735 /* I (khw) don't know if you can get here with zero length, but the
12736 * old code handled this situation by creating a zero-length EXACT
12737 * node. Might as well be NOTHING instead */
12743 /* If 'maybe_exact' is still set here, means there are no
12744 * code points in the node that participate in folds;
12745 * similarly for 'maybe_exactfu' and code points that match
12746 * differently depending on UTF8ness of the target string
12747 * (for /u), or depending on locale for /l */
12753 else if (maybe_exactfu) {
12759 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12760 FALSE /* Don't look to see if could
12761 be turned into an EXACT
12762 node, as we have already
12767 RExC_parse = p - 1;
12768 Set_Node_Cur_Length(ret, parse_start);
12769 nextchar(pRExC_state);
12771 /* len is STRLEN which is unsigned, need to copy to signed */
12774 vFAIL("Internal disaster");
12777 } /* End of label 'defchar:' */
12779 } /* End of giant switch on input character */
12785 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12787 /* Returns the next non-pattern-white space, non-comment character (the
12788 * latter only if 'recognize_comment is true) in the string p, which is
12789 * ended by RExC_end. See also reg_skipcomment */
12790 const char *e = RExC_end;
12792 PERL_ARGS_ASSERT_REGPATWS;
12796 if ((len = is_PATWS_safe(p, e, UTF))) {
12799 else if (recognize_comment && *p == '#') {
12800 p = reg_skipcomment(pRExC_state, p);
12809 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12811 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12812 * sets up the bitmap and any flags, removing those code points from the
12813 * inversion list, setting it to NULL should it become completely empty */
12815 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12816 assert(PL_regkind[OP(node)] == ANYOF);
12818 ANYOF_BITMAP_ZERO(node);
12819 if (*invlist_ptr) {
12821 /* This gets set if we actually need to modify things */
12822 bool change_invlist = FALSE;
12826 /* Start looking through *invlist_ptr */
12827 invlist_iterinit(*invlist_ptr);
12828 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12832 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12833 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12835 else if (end >= NUM_ANYOF_CODE_POINTS) {
12836 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12839 /* Quit if are above what we should change */
12840 if (start >= NUM_ANYOF_CODE_POINTS) {
12844 change_invlist = TRUE;
12846 /* Set all the bits in the range, up to the max that we are doing */
12847 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12849 : NUM_ANYOF_CODE_POINTS - 1;
12850 for (i = start; i <= (int) high; i++) {
12851 if (! ANYOF_BITMAP_TEST(node, i)) {
12852 ANYOF_BITMAP_SET(node, i);
12856 invlist_iterfinish(*invlist_ptr);
12858 /* Done with loop; remove any code points that are in the bitmap from
12859 * *invlist_ptr; similarly for code points above the bitmap if we have
12860 * a flag to match all of them anyways */
12861 if (change_invlist) {
12862 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12864 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12865 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12868 /* If have completely emptied it, remove it completely */
12869 if (_invlist_len(*invlist_ptr) == 0) {
12870 SvREFCNT_dec_NN(*invlist_ptr);
12871 *invlist_ptr = NULL;
12876 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12877 Character classes ([:foo:]) can also be negated ([:^foo:]).
12878 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12879 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12880 but trigger failures because they are currently unimplemented. */
12882 #define POSIXCC_DONE(c) ((c) == ':')
12883 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12884 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12886 PERL_STATIC_INLINE I32
12887 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12889 I32 namedclass = OOB_NAMEDCLASS;
12891 PERL_ARGS_ASSERT_REGPPOSIXCC;
12893 if (value == '[' && RExC_parse + 1 < RExC_end &&
12894 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12895 POSIXCC(UCHARAT(RExC_parse)))
12897 const char c = UCHARAT(RExC_parse);
12898 char* const s = RExC_parse++;
12900 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12902 if (RExC_parse == RExC_end) {
12905 /* Try to give a better location for the error (than the end of
12906 * the string) by looking for the matching ']' */
12908 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12911 vFAIL2("Unmatched '%c' in POSIX class", c);
12913 /* Grandfather lone [:, [=, [. */
12917 const char* const t = RExC_parse++; /* skip over the c */
12920 if (UCHARAT(RExC_parse) == ']') {
12921 const char *posixcc = s + 1;
12922 RExC_parse++; /* skip over the ending ] */
12925 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12926 const I32 skip = t - posixcc;
12928 /* Initially switch on the length of the name. */
12931 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12932 this is the Perl \w
12934 namedclass = ANYOF_WORDCHAR;
12937 /* Names all of length 5. */
12938 /* alnum alpha ascii blank cntrl digit graph lower
12939 print punct space upper */
12940 /* Offset 4 gives the best switch position. */
12941 switch (posixcc[4]) {
12943 if (memEQ(posixcc, "alph", 4)) /* alpha */
12944 namedclass = ANYOF_ALPHA;
12947 if (memEQ(posixcc, "spac", 4)) /* space */
12948 namedclass = ANYOF_PSXSPC;
12951 if (memEQ(posixcc, "grap", 4)) /* graph */
12952 namedclass = ANYOF_GRAPH;
12955 if (memEQ(posixcc, "asci", 4)) /* ascii */
12956 namedclass = ANYOF_ASCII;
12959 if (memEQ(posixcc, "blan", 4)) /* blank */
12960 namedclass = ANYOF_BLANK;
12963 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12964 namedclass = ANYOF_CNTRL;
12967 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12968 namedclass = ANYOF_ALPHANUMERIC;
12971 if (memEQ(posixcc, "lowe", 4)) /* lower */
12972 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12973 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12974 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12977 if (memEQ(posixcc, "digi", 4)) /* digit */
12978 namedclass = ANYOF_DIGIT;
12979 else if (memEQ(posixcc, "prin", 4)) /* print */
12980 namedclass = ANYOF_PRINT;
12981 else if (memEQ(posixcc, "punc", 4)) /* punct */
12982 namedclass = ANYOF_PUNCT;
12987 if (memEQ(posixcc, "xdigit", 6))
12988 namedclass = ANYOF_XDIGIT;
12992 if (namedclass == OOB_NAMEDCLASS)
12994 "POSIX class [:%"UTF8f":] unknown",
12995 UTF8fARG(UTF, t - s - 1, s + 1));
12997 /* The #defines are structured so each complement is +1 to
12998 * the normal one */
13002 assert (posixcc[skip] == ':');
13003 assert (posixcc[skip+1] == ']');
13004 } else if (!SIZE_ONLY) {
13005 /* [[=foo=]] and [[.foo.]] are still future. */
13007 /* adjust RExC_parse so the warning shows after
13008 the class closes */
13009 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13011 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13014 /* Maternal grandfather:
13015 * "[:" ending in ":" but not in ":]" */
13017 vFAIL("Unmatched '[' in POSIX class");
13020 /* Grandfather lone [:, [=, [. */
13030 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13032 /* This applies some heuristics at the current parse position (which should
13033 * be at a '[') to see if what follows might be intended to be a [:posix:]
13034 * class. It returns true if it really is a posix class, of course, but it
13035 * also can return true if it thinks that what was intended was a posix
13036 * class that didn't quite make it.
13038 * It will return true for
13040 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13041 * ')' indicating the end of the (?[
13042 * [:any garbage including %^&$ punctuation:]
13044 * This is designed to be called only from S_handle_regex_sets; it could be
13045 * easily adapted to be called from the spot at the beginning of regclass()
13046 * that checks to see in a normal bracketed class if the surrounding []
13047 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13048 * change long-standing behavior, so I (khw) didn't do that */
13049 char* p = RExC_parse + 1;
13050 char first_char = *p;
13052 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13054 assert(*(p - 1) == '[');
13056 if (! POSIXCC(first_char)) {
13061 while (p < RExC_end && isWORDCHAR(*p)) p++;
13063 if (p >= RExC_end) {
13067 if (p - RExC_parse > 2 /* Got at least 1 word character */
13068 && (*p == first_char
13069 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13074 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13077 && p - RExC_parse > 2 /* [:] evaluates to colon;
13078 [::] is a bad posix class. */
13079 && first_char == *(p - 1));
13083 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13084 I32 *flagp, U32 depth,
13085 char * const oregcomp_parse)
13087 /* Handle the (?[...]) construct to do set operations */
13090 UV start, end; /* End points of code point ranges */
13092 char *save_end, *save_parse;
13097 const bool save_fold = FOLD;
13099 GET_RE_DEBUG_FLAGS_DECL;
13101 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13104 vFAIL("(?[...]) not valid in locale");
13106 RExC_uni_semantics = 1;
13108 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13109 * (such as EXACT). Thus we can skip most everything if just sizing. We
13110 * call regclass to handle '[]' so as to not have to reinvent its parsing
13111 * rules here (throwing away the size it computes each time). And, we exit
13112 * upon an unescaped ']' that isn't one ending a regclass. To do both
13113 * these things, we need to realize that something preceded by a backslash
13114 * is escaped, so we have to keep track of backslashes */
13116 Perl_ck_warner_d(aTHX_
13117 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13118 "The regex_sets feature is experimental" REPORT_LOCATION,
13119 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13121 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13122 RExC_precomp + (RExC_parse - RExC_precomp)));
13125 UV depth = 0; /* how many nested (?[...]) constructs */
13127 while (RExC_parse < RExC_end) {
13128 SV* current = NULL;
13129 RExC_parse = regpatws(pRExC_state, RExC_parse,
13130 TRUE); /* means recognize comments */
13131 switch (*RExC_parse) {
13133 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13138 /* Skip the next byte (which could cause us to end up in
13139 * the middle of a UTF-8 character, but since none of those
13140 * are confusable with anything we currently handle in this
13141 * switch (invariants all), it's safe. We'll just hit the
13142 * default: case next time and keep on incrementing until
13143 * we find one of the invariants we do handle. */
13148 /* If this looks like it is a [:posix:] class, leave the
13149 * parse pointer at the '[' to fool regclass() into
13150 * thinking it is part of a '[[:posix:]]'. That function
13151 * will use strict checking to force a syntax error if it
13152 * doesn't work out to a legitimate class */
13153 bool is_posix_class
13154 = could_it_be_a_POSIX_class(pRExC_state);
13155 if (! is_posix_class) {
13159 /* regclass() can only return RESTART_UTF8 if multi-char
13160 folds are allowed. */
13161 if (!regclass(pRExC_state, flagp,depth+1,
13162 is_posix_class, /* parse the whole char
13163 class only if not a
13165 FALSE, /* don't allow multi-char folds */
13166 TRUE, /* silence non-portable warnings. */
13168 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13171 /* function call leaves parse pointing to the ']', except
13172 * if we faked it */
13173 if (is_posix_class) {
13177 SvREFCNT_dec(current); /* In case it returned something */
13182 if (depth--) break;
13184 if (RExC_parse < RExC_end
13185 && *RExC_parse == ')')
13187 node = reganode(pRExC_state, ANYOF, 0);
13188 RExC_size += ANYOF_SKIP;
13189 nextchar(pRExC_state);
13190 Set_Node_Length(node,
13191 RExC_parse - oregcomp_parse + 1); /* MJD */
13200 FAIL("Syntax error in (?[...])");
13203 /* Pass 2 only after this. Everything in this construct is a
13204 * metacharacter. Operands begin with either a '\' (for an escape
13205 * sequence), or a '[' for a bracketed character class. Any other
13206 * character should be an operator, or parenthesis for grouping. Both
13207 * types of operands are handled by calling regclass() to parse them. It
13208 * is called with a parameter to indicate to return the computed inversion
13209 * list. The parsing here is implemented via a stack. Each entry on the
13210 * stack is a single character representing one of the operators, or the
13211 * '('; or else a pointer to an operand inversion list. */
13213 #define IS_OPERAND(a) (! SvIOK(a))
13215 /* The stack starts empty. It is a syntax error if the first thing parsed
13216 * is a binary operator; everything else is pushed on the stack. When an
13217 * operand is parsed, the top of the stack is examined. If it is a binary
13218 * operator, the item before it should be an operand, and both are replaced
13219 * by the result of doing that operation on the new operand and the one on
13220 * the stack. Thus a sequence of binary operands is reduced to a single
13221 * one before the next one is parsed.
13223 * A unary operator may immediately follow a binary in the input, for
13226 * When an operand is parsed and the top of the stack is a unary operator,
13227 * the operation is performed, and then the stack is rechecked to see if
13228 * this new operand is part of a binary operation; if so, it is handled as
13231 * A '(' is simply pushed on the stack; it is valid only if the stack is
13232 * empty, or the top element of the stack is an operator or another '('
13233 * (for which the parenthesized expression will become an operand). By the
13234 * time the corresponding ')' is parsed everything in between should have
13235 * been parsed and evaluated to a single operand (or else is a syntax
13236 * error), and is handled as a regular operand */
13238 sv_2mortal((SV *)(stack = newAV()));
13240 while (RExC_parse < RExC_end) {
13241 I32 top_index = av_tindex(stack);
13243 SV* current = NULL;
13245 /* Skip white space */
13246 RExC_parse = regpatws(pRExC_state, RExC_parse,
13247 TRUE /* means recognize comments */ );
13248 if (RExC_parse >= RExC_end) {
13249 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13251 if ((curchar = UCHARAT(RExC_parse)) == ']') {
13258 if (av_tindex(stack) >= 0 /* This makes sure that we can
13259 safely subtract 1 from
13260 RExC_parse in the next clause.
13261 If we have something on the
13262 stack, we have parsed something
13264 && UCHARAT(RExC_parse - 1) == '('
13265 && RExC_parse < RExC_end)
13267 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13268 * This happens when we have some thing like
13270 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13272 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13274 * Here we would be handling the interpolated
13275 * '$thai_or_lao'. We handle this by a recursive call to
13276 * ourselves which returns the inversion list the
13277 * interpolated expression evaluates to. We use the flags
13278 * from the interpolated pattern. */
13279 U32 save_flags = RExC_flags;
13280 const char * const save_parse = ++RExC_parse;
13282 parse_lparen_question_flags(pRExC_state);
13284 if (RExC_parse == save_parse /* Makes sure there was at
13285 least one flag (or this
13286 embedding wasn't compiled)
13288 || RExC_parse >= RExC_end - 4
13289 || UCHARAT(RExC_parse) != ':'
13290 || UCHARAT(++RExC_parse) != '('
13291 || UCHARAT(++RExC_parse) != '?'
13292 || UCHARAT(++RExC_parse) != '[')
13295 /* In combination with the above, this moves the
13296 * pointer to the point just after the first erroneous
13297 * character (or if there are no flags, to where they
13298 * should have been) */
13299 if (RExC_parse >= RExC_end - 4) {
13300 RExC_parse = RExC_end;
13302 else if (RExC_parse != save_parse) {
13303 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13305 vFAIL("Expecting '(?flags:(?[...'");
13308 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13309 depth+1, oregcomp_parse);
13311 /* Here, 'current' contains the embedded expression's
13312 * inversion list, and RExC_parse points to the trailing
13313 * ']'; the next character should be the ')' which will be
13314 * paired with the '(' that has been put on the stack, so
13315 * the whole embedded expression reduces to '(operand)' */
13318 RExC_flags = save_flags;
13319 goto handle_operand;
13324 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13325 vFAIL("Unexpected character");
13328 /* regclass() can only return RESTART_UTF8 if multi-char
13329 folds are allowed. */
13330 if (!regclass(pRExC_state, flagp,depth+1,
13331 TRUE, /* means parse just the next thing */
13332 FALSE, /* don't allow multi-char folds */
13333 FALSE, /* don't silence non-portable warnings. */
13335 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13337 /* regclass() will return with parsing just the \ sequence,
13338 * leaving the parse pointer at the next thing to parse */
13340 goto handle_operand;
13342 case '[': /* Is a bracketed character class */
13344 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13346 if (! is_posix_class) {
13350 /* regclass() can only return RESTART_UTF8 if multi-char
13351 folds are allowed. */
13352 if(!regclass(pRExC_state, flagp,depth+1,
13353 is_posix_class, /* parse the whole char class
13354 only if not a posix class */
13355 FALSE, /* don't allow multi-char folds */
13356 FALSE, /* don't silence non-portable warnings. */
13358 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13360 /* function call leaves parse pointing to the ']', except if we
13362 if (is_posix_class) {
13366 goto handle_operand;
13375 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13376 || ! IS_OPERAND(*top_ptr))
13379 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13381 av_push(stack, newSVuv(curchar));
13385 av_push(stack, newSVuv(curchar));
13389 if (top_index >= 0) {
13390 top_ptr = av_fetch(stack, top_index, FALSE);
13392 if (IS_OPERAND(*top_ptr)) {
13394 vFAIL("Unexpected '(' with no preceding operator");
13397 av_push(stack, newSVuv(curchar));
13404 || ! (current = av_pop(stack))
13405 || ! IS_OPERAND(current)
13406 || ! (lparen = av_pop(stack))
13407 || IS_OPERAND(lparen)
13408 || SvUV(lparen) != '(')
13410 SvREFCNT_dec(current);
13412 vFAIL("Unexpected ')'");
13415 SvREFCNT_dec_NN(lparen);
13422 /* Here, we have an operand to process, in 'current' */
13424 if (top_index < 0) { /* Just push if stack is empty */
13425 av_push(stack, current);
13428 SV* top = av_pop(stack);
13430 char current_operator;
13432 if (IS_OPERAND(top)) {
13433 SvREFCNT_dec_NN(top);
13434 SvREFCNT_dec_NN(current);
13435 vFAIL("Operand with no preceding operator");
13437 current_operator = (char) SvUV(top);
13438 switch (current_operator) {
13439 case '(': /* Push the '(' back on followed by the new
13441 av_push(stack, top);
13442 av_push(stack, current);
13443 SvREFCNT_inc(top); /* Counters the '_dec' done
13444 just after the 'break', so
13445 it doesn't get wrongly freed
13450 _invlist_invert(current);
13452 /* Unlike binary operators, the top of the stack,
13453 * now that this unary one has been popped off, may
13454 * legally be an operator, and we now have operand
13457 SvREFCNT_dec_NN(top);
13458 goto handle_operand;
13461 prev = av_pop(stack);
13462 _invlist_intersection(prev,
13465 av_push(stack, current);
13470 prev = av_pop(stack);
13471 _invlist_union(prev, current, ¤t);
13472 av_push(stack, current);
13476 prev = av_pop(stack);;
13477 _invlist_subtract(prev, current, ¤t);
13478 av_push(stack, current);
13481 case '^': /* The union minus the intersection */
13487 prev = av_pop(stack);
13488 _invlist_union(prev, current, &u);
13489 _invlist_intersection(prev, current, &i);
13490 /* _invlist_subtract will overwrite current
13491 without freeing what it already contains */
13493 _invlist_subtract(u, i, ¤t);
13494 av_push(stack, current);
13495 SvREFCNT_dec_NN(i);
13496 SvREFCNT_dec_NN(u);
13497 SvREFCNT_dec_NN(element);
13502 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13504 SvREFCNT_dec_NN(top);
13505 SvREFCNT_dec(prev);
13509 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13512 if (av_tindex(stack) < 0 /* Was empty */
13513 || ((final = av_pop(stack)) == NULL)
13514 || ! IS_OPERAND(final)
13515 || av_tindex(stack) >= 0) /* More left on stack */
13517 vFAIL("Incomplete expression within '(?[ ])'");
13520 /* Here, 'final' is the resultant inversion list from evaluating the
13521 * expression. Return it if so requested */
13522 if (return_invlist) {
13523 *return_invlist = final;
13527 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13528 * expecting a string of ranges and individual code points */
13529 invlist_iterinit(final);
13530 result_string = newSVpvs("");
13531 while (invlist_iternext(final, &start, &end)) {
13532 if (start == end) {
13533 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13536 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13541 save_parse = RExC_parse;
13542 RExC_parse = SvPV(result_string, len);
13543 save_end = RExC_end;
13544 RExC_end = RExC_parse + len;
13546 /* We turn off folding around the call, as the class we have constructed
13547 * already has all folding taken into consideration, and we don't want
13548 * regclass() to add to that */
13549 RExC_flags &= ~RXf_PMf_FOLD;
13550 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13552 node = regclass(pRExC_state, flagp,depth+1,
13553 FALSE, /* means parse the whole char class */
13554 FALSE, /* don't allow multi-char folds */
13555 TRUE, /* silence non-portable warnings. The above may very
13556 well have generated non-portable code points, but
13557 they're valid on this machine */
13560 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13563 RExC_flags |= RXf_PMf_FOLD;
13565 RExC_parse = save_parse + 1;
13566 RExC_end = save_end;
13567 SvREFCNT_dec_NN(final);
13568 SvREFCNT_dec_NN(result_string);
13570 nextchar(pRExC_state);
13571 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13577 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13579 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13580 * innocent-looking character class, like /[ks]/i won't have to go out to
13581 * disk to find the possible matches.
13583 * This should be called only for a Latin1-range code points, cp, which is
13584 * known to be involved in a simple fold with other code points above
13585 * Latin1. It would give false results if /aa has been specified.
13586 * Multi-char folds are outside the scope of this, and must be handled
13589 * XXX It would be better to generate these via regen, in case a new
13590 * version of the Unicode standard adds new mappings, though that is not
13591 * really likely, and may be caught by the default: case of the switch
13594 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13596 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13602 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13606 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13609 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13610 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13612 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13613 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13614 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13616 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13617 *invlist = add_cp_to_invlist(*invlist,
13618 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13620 case LATIN_SMALL_LETTER_SHARP_S:
13621 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13624 /* Use deprecated warning to increase the chances of this being
13627 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13634 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13636 /* This adds the string scalar <multi_string> to the array
13637 * <multi_char_matches>. <multi_string> is known to have exactly
13638 * <cp_count> code points in it. This is used when constructing a
13639 * bracketed character class and we find something that needs to match more
13640 * than a single character.
13642 * <multi_char_matches> is actually an array of arrays. Each top-level
13643 * element is an array that contains all the strings known so far that are
13644 * the same length. And that length (in number of code points) is the same
13645 * as the index of the top-level array. Hence, the [2] element is an
13646 * array, each element thereof is a string containing TWO code points;
13647 * while element [3] is for strings of THREE characters, and so on. Since
13648 * this is for multi-char strings there can never be a [0] nor [1] element.
13650 * When we rewrite the character class below, we will do so such that the
13651 * longest strings are written first, so that it prefers the longest
13652 * matching strings first. This is done even if it turns out that any
13653 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13654 * Christiansen has agreed that this is ok. This makes the test for the
13655 * ligature 'ffi' come before the test for 'ff', for example */
13658 AV** this_array_ptr;
13660 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13662 if (! multi_char_matches) {
13663 multi_char_matches = newAV();
13666 if (av_exists(multi_char_matches, cp_count)) {
13667 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13668 this_array = *this_array_ptr;
13671 this_array = newAV();
13672 av_store(multi_char_matches, cp_count,
13675 av_push(this_array, multi_string);
13677 return multi_char_matches;
13680 /* The names of properties whose definitions are not known at compile time are
13681 * stored in this SV, after a constant heading. So if the length has been
13682 * changed since initialization, then there is a run-time definition. */
13683 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13684 (SvCUR(listsv) != initial_listsv_len)
13687 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13688 const bool stop_at_1, /* Just parse the next thing, don't
13689 look for a full character class */
13690 bool allow_multi_folds,
13691 const bool silence_non_portable, /* Don't output warnings
13694 SV** ret_invlist) /* Return an inversion list, not a node */
13696 /* parse a bracketed class specification. Most of these will produce an
13697 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13698 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13699 * under /i with multi-character folds: it will be rewritten following the
13700 * paradigm of this example, where the <multi-fold>s are characters which
13701 * fold to multiple character sequences:
13702 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13703 * gets effectively rewritten as:
13704 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13705 * reg() gets called (recursively) on the rewritten version, and this
13706 * function will return what it constructs. (Actually the <multi-fold>s
13707 * aren't physically removed from the [abcdefghi], it's just that they are
13708 * ignored in the recursion by means of a flag:
13709 * <RExC_in_multi_char_class>.)
13711 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13712 * characters, with the corresponding bit set if that character is in the
13713 * list. For characters above this, a range list or swash is used. There
13714 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13715 * determinable at compile time
13717 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13718 * to be restarted. This can only happen if ret_invlist is non-NULL.
13721 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13723 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13726 IV namedclass = OOB_NAMEDCLASS;
13727 char *rangebegin = NULL;
13728 bool need_class = 0;
13730 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13731 than just initialized. */
13732 SV* properties = NULL; /* Code points that match \p{} \P{} */
13733 SV* posixes = NULL; /* Code points that match classes like [:word:],
13734 extended beyond the Latin1 range. These have to
13735 be kept separate from other code points for much
13736 of this function because their handling is
13737 different under /i, and for most classes under
13739 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13740 separate for a while from the non-complemented
13741 versions because of complications with /d
13743 UV element_count = 0; /* Number of distinct elements in the class.
13744 Optimizations may be possible if this is tiny */
13745 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13746 character; used under /i */
13748 char * stop_ptr = RExC_end; /* where to stop parsing */
13749 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13751 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13753 /* Unicode properties are stored in a swash; this holds the current one
13754 * being parsed. If this swash is the only above-latin1 component of the
13755 * character class, an optimization is to pass it directly on to the
13756 * execution engine. Otherwise, it is set to NULL to indicate that there
13757 * are other things in the class that have to be dealt with at execution
13759 SV* swash = NULL; /* Code points that match \p{} \P{} */
13761 /* Set if a component of this character class is user-defined; just passed
13762 * on to the engine */
13763 bool has_user_defined_property = FALSE;
13765 /* inversion list of code points this node matches only when the target
13766 * string is in UTF-8. (Because is under /d) */
13767 SV* depends_list = NULL;
13769 /* Inversion list of code points this node matches regardless of things
13770 * like locale, folding, utf8ness of the target string */
13771 SV* cp_list = NULL;
13773 /* Like cp_list, but code points on this list need to be checked for things
13774 * that fold to/from them under /i */
13775 SV* cp_foldable_list = NULL;
13777 /* Like cp_list, but code points on this list are valid only when the
13778 * runtime locale is UTF-8 */
13779 SV* only_utf8_locale_list = NULL;
13782 /* In a range, counts how many 0-2 of the ends of it came from literals,
13783 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13784 UV literal_endpoint = 0;
13786 /* Is the range unicode? which means on a platform that isn't 1-1 native
13787 * to Unicode (i.e. non-ASCII), each code point in it should be considered
13788 * to be a Unicode value. */
13789 bool unicode_range = FALSE;
13791 bool invert = FALSE; /* Is this class to be complemented */
13793 bool warn_super = ALWAYS_WARN_SUPER;
13795 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13796 case we need to change the emitted regop to an EXACT. */
13797 const char * orig_parse = RExC_parse;
13798 const SSize_t orig_size = RExC_size;
13799 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13800 GET_RE_DEBUG_FLAGS_DECL;
13802 PERL_ARGS_ASSERT_REGCLASS;
13804 PERL_UNUSED_ARG(depth);
13807 DEBUG_PARSE("clas");
13809 /* Assume we are going to generate an ANYOF node. */
13810 ret = reganode(pRExC_state,
13817 RExC_size += ANYOF_SKIP;
13818 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13821 ANYOF_FLAGS(ret) = 0;
13823 RExC_emit += ANYOF_SKIP;
13824 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13825 initial_listsv_len = SvCUR(listsv);
13826 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13830 RExC_parse = regpatws(pRExC_state, RExC_parse,
13831 FALSE /* means don't recognize comments */ );
13834 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13837 allow_multi_folds = FALSE;
13840 RExC_parse = regpatws(pRExC_state, RExC_parse,
13841 FALSE /* means don't recognize comments */ );
13845 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13846 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13847 const char *s = RExC_parse;
13848 const char c = *s++;
13850 while (isWORDCHAR(*s))
13852 if (*s && c == *s && s[1] == ']') {
13853 SAVEFREESV(RExC_rx_sv);
13855 "POSIX syntax [%c %c] belongs inside character classes",
13857 (void)ReREFCNT_inc(RExC_rx_sv);
13861 /* If the caller wants us to just parse a single element, accomplish this
13862 * by faking the loop ending condition */
13863 if (stop_at_1 && RExC_end > RExC_parse) {
13864 stop_ptr = RExC_parse + 1;
13867 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13868 if (UCHARAT(RExC_parse) == ']')
13869 goto charclassloop;
13872 if (RExC_parse >= stop_ptr) {
13877 RExC_parse = regpatws(pRExC_state, RExC_parse,
13878 FALSE /* means don't recognize comments */ );
13881 if (UCHARAT(RExC_parse) == ']') {
13887 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13888 save_value = value;
13889 save_prevvalue = prevvalue;
13892 rangebegin = RExC_parse;
13895 literal_endpoint = 0;
13899 value = utf8n_to_uvchr((U8*)RExC_parse,
13900 RExC_end - RExC_parse,
13901 &numlen, UTF8_ALLOW_DEFAULT);
13902 RExC_parse += numlen;
13905 value = UCHARAT(RExC_parse++);
13908 && RExC_parse < RExC_end
13909 && POSIXCC(UCHARAT(RExC_parse)))
13911 namedclass = regpposixcc(pRExC_state, value, strict);
13913 else if (value != '\\') {
13915 literal_endpoint++;
13919 /* Is a backslash; get the code point of the char after it */
13920 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13921 value = utf8n_to_uvchr((U8*)RExC_parse,
13922 RExC_end - RExC_parse,
13923 &numlen, UTF8_ALLOW_DEFAULT);
13924 RExC_parse += numlen;
13927 value = UCHARAT(RExC_parse++);
13929 /* Some compilers cannot handle switching on 64-bit integer
13930 * values, therefore value cannot be an UV. Yes, this will
13931 * be a problem later if we want switch on Unicode.
13932 * A similar issue a little bit later when switching on
13933 * namedclass. --jhi */
13935 /* If the \ is escaping white space when white space is being
13936 * skipped, it means that that white space is wanted literally, and
13937 * is already in 'value'. Otherwise, need to translate the escape
13938 * into what it signifies. */
13939 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13941 case 'w': namedclass = ANYOF_WORDCHAR; break;
13942 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13943 case 's': namedclass = ANYOF_SPACE; break;
13944 case 'S': namedclass = ANYOF_NSPACE; break;
13945 case 'd': namedclass = ANYOF_DIGIT; break;
13946 case 'D': namedclass = ANYOF_NDIGIT; break;
13947 case 'v': namedclass = ANYOF_VERTWS; break;
13948 case 'V': namedclass = ANYOF_NVERTWS; break;
13949 case 'h': namedclass = ANYOF_HORIZWS; break;
13950 case 'H': namedclass = ANYOF_NHORIZWS; break;
13951 case 'N': /* Handle \N{NAME} in class */
13954 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13955 flagp, depth, &as_text);
13956 if (*flagp & RESTART_UTF8)
13957 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13958 if (cp_count != 1) { /* The typical case drops through */
13959 assert(cp_count != (STRLEN) -1);
13960 if (cp_count == 0) {
13962 RExC_parse++; /* Position after the "}" */
13963 vFAIL("Zero length \\N{}");
13966 ckWARNreg(RExC_parse,
13967 "Ignoring zero length \\N{} in character class");
13970 else { /* cp_count > 1 */
13971 if (! RExC_in_multi_char_class) {
13972 if (invert || range || *RExC_parse == '-') {
13975 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13978 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13983 = add_multi_match(multi_char_matches,
13987 break; /* <value> contains the first code
13988 point. Drop out of the switch to
13991 } /* End of cp_count != 1 */
13993 /* This element should not be processed further in this
13996 value = save_value;
13997 prevvalue = save_prevvalue;
13998 continue; /* Back to top of loop to get next char */
14000 /* Here, is a single code point, and <value> contains it */
14002 /* We consider named characters to be literal characters,
14003 * and they are Unicode */
14004 literal_endpoint++;
14005 unicode_range = TRUE;
14014 /* We will handle any undefined properties ourselves */
14015 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14016 /* And we actually would prefer to get
14017 * the straight inversion list of the
14018 * swash, since we will be accessing it
14019 * anyway, to save a little time */
14020 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14022 if (RExC_parse >= RExC_end)
14023 vFAIL2("Empty \\%c{}", (U8)value);
14024 if (*RExC_parse == '{') {
14025 const U8 c = (U8)value;
14026 e = strchr(RExC_parse++, '}');
14028 vFAIL2("Missing right brace on \\%c{}", c);
14029 while (isSPACE(*RExC_parse))
14031 if (e == RExC_parse)
14032 vFAIL2("Empty \\%c{}", c);
14033 n = e - RExC_parse;
14034 while (isSPACE(*(RExC_parse + n - 1)))
14045 if (UCHARAT(RExC_parse) == '^') {
14048 /* toggle. (The rhs xor gets the single bit that
14049 * differs between P and p; the other xor inverts just
14051 value ^= 'P' ^ 'p';
14053 while (isSPACE(*RExC_parse)) {
14058 /* Try to get the definition of the property into
14059 * <invlist>. If /i is in effect, the effective property
14060 * will have its name be <__NAME_i>. The design is
14061 * discussed in commit
14062 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14063 name = savepv(Perl_form(aTHX_
14065 (FOLD) ? "__" : "",
14071 /* Look up the property name, and get its swash and
14072 * inversion list, if the property is found */
14074 SvREFCNT_dec_NN(swash);
14076 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14079 NULL, /* No inversion list */
14082 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14083 HV* curpkg = (IN_PERL_COMPILETIME)
14085 : CopSTASH(PL_curcop);
14087 SvREFCNT_dec_NN(swash);
14091 /* Here didn't find it. It could be a user-defined
14092 * property that will be available at run-time. If we
14093 * accept only compile-time properties, is an error;
14094 * otherwise add it to the list for run-time look up */
14096 RExC_parse = e + 1;
14098 "Property '%"UTF8f"' is unknown",
14099 UTF8fARG(UTF, n, name));
14102 /* If the property name doesn't already have a package
14103 * name, add the current one to it so that it can be
14104 * referred to outside it. [perl #121777] */
14105 if (curpkg && ! instr(name, "::")) {
14106 char* pkgname = HvNAME(curpkg);
14107 if (strNE(pkgname, "main")) {
14108 char* full_name = Perl_form(aTHX_
14112 n = strlen(full_name);
14114 name = savepvn(full_name, n);
14117 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14118 (value == 'p' ? '+' : '!'),
14119 UTF8fARG(UTF, n, name));
14120 has_user_defined_property = TRUE;
14122 /* We don't know yet, so have to assume that the
14123 * property could match something in the Latin1 range,
14124 * hence something that isn't utf8. Note that this
14125 * would cause things in <depends_list> to match
14126 * inappropriately, except that any \p{}, including
14127 * this one forces Unicode semantics, which means there
14128 * is no <depends_list> */
14130 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14134 /* Here, did get the swash and its inversion list. If
14135 * the swash is from a user-defined property, then this
14136 * whole character class should be regarded as such */
14137 if (swash_init_flags
14138 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14140 has_user_defined_property = TRUE;
14143 /* We warn on matching an above-Unicode code point
14144 * if the match would return true, except don't
14145 * warn for \p{All}, which has exactly one element
14147 (_invlist_contains_cp(invlist, 0x110000)
14148 && (! (_invlist_len(invlist) == 1
14149 && *invlist_array(invlist) == 0)))
14155 /* Invert if asking for the complement */
14156 if (value == 'P') {
14157 _invlist_union_complement_2nd(properties,
14161 /* The swash can't be used as-is, because we've
14162 * inverted things; delay removing it to here after
14163 * have copied its invlist above */
14164 SvREFCNT_dec_NN(swash);
14168 _invlist_union(properties, invlist, &properties);
14173 RExC_parse = e + 1;
14174 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14177 /* \p means they want Unicode semantics */
14178 RExC_uni_semantics = 1;
14181 case 'n': value = '\n'; break;
14182 case 'r': value = '\r'; break;
14183 case 't': value = '\t'; break;
14184 case 'f': value = '\f'; break;
14185 case 'b': value = '\b'; break;
14186 case 'e': value = ESC_NATIVE; break;
14187 case 'a': value = '\a'; break;
14189 RExC_parse--; /* function expects to be pointed at the 'o' */
14191 const char* error_msg;
14192 bool valid = grok_bslash_o(&RExC_parse,
14195 PASS2, /* warnings only in
14198 silence_non_portable,
14204 if (IN_ENCODING && value < 0x100) {
14205 goto recode_encoding;
14209 RExC_parse--; /* function expects to be pointed at the 'x' */
14211 const char* error_msg;
14212 bool valid = grok_bslash_x(&RExC_parse,
14215 PASS2, /* Output warnings */
14217 silence_non_portable,
14223 if (IN_ENCODING && value < 0x100)
14224 goto recode_encoding;
14227 value = grok_bslash_c(*RExC_parse++, PASS2);
14229 case '0': case '1': case '2': case '3': case '4':
14230 case '5': case '6': case '7':
14232 /* Take 1-3 octal digits */
14233 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14234 numlen = (strict) ? 4 : 3;
14235 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14236 RExC_parse += numlen;
14239 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14240 vFAIL("Need exactly 3 octal digits");
14242 else if (! SIZE_ONLY /* like \08, \178 */
14244 && RExC_parse < RExC_end
14245 && isDIGIT(*RExC_parse)
14246 && ckWARN(WARN_REGEXP))
14248 SAVEFREESV(RExC_rx_sv);
14249 reg_warn_non_literal_string(
14251 form_short_octal_warning(RExC_parse, numlen));
14252 (void)ReREFCNT_inc(RExC_rx_sv);
14255 if (IN_ENCODING && value < 0x100)
14256 goto recode_encoding;
14260 if (! RExC_override_recoding) {
14261 SV* enc = _get_encoding();
14262 value = reg_recode((const char)(U8)value, &enc);
14265 vFAIL("Invalid escape in the specified encoding");
14268 ckWARNreg(RExC_parse,
14269 "Invalid escape in the specified encoding");
14275 /* Allow \_ to not give an error */
14276 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14278 vFAIL2("Unrecognized escape \\%c in character class",
14282 SAVEFREESV(RExC_rx_sv);
14283 ckWARN2reg(RExC_parse,
14284 "Unrecognized escape \\%c in character class passed through",
14286 (void)ReREFCNT_inc(RExC_rx_sv);
14290 } /* End of switch on char following backslash */
14291 } /* end of handling backslash escape sequences */
14293 /* Here, we have the current token in 'value' */
14295 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14298 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14299 * literal, as is the character that began the false range, i.e.
14300 * the 'a' in the examples */
14303 const int w = (RExC_parse >= rangebegin)
14304 ? RExC_parse - rangebegin
14308 "False [] range \"%"UTF8f"\"",
14309 UTF8fARG(UTF, w, rangebegin));
14312 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14313 ckWARN2reg(RExC_parse,
14314 "False [] range \"%"UTF8f"\"",
14315 UTF8fARG(UTF, w, rangebegin));
14316 (void)ReREFCNT_inc(RExC_rx_sv);
14317 cp_list = add_cp_to_invlist(cp_list, '-');
14318 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14323 range = 0; /* this was not a true range */
14324 element_count += 2; /* So counts for three values */
14327 classnum = namedclass_to_classnum(namedclass);
14329 if (LOC && namedclass < ANYOF_POSIXL_MAX
14330 #ifndef HAS_ISASCII
14331 && classnum != _CC_ASCII
14334 /* What the Posix classes (like \w, [:space:]) match in locale
14335 * isn't knowable under locale until actual match time. Room
14336 * must be reserved (one time per outer bracketed class) to
14337 * store such classes. The space will contain a bit for each
14338 * named class that is to be matched against. This isn't
14339 * needed for \p{} and pseudo-classes, as they are not affected
14340 * by locale, and hence are dealt with separately */
14341 if (! need_class) {
14344 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14347 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14349 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14350 ANYOF_POSIXL_ZERO(ret);
14353 /* Coverity thinks it is possible for this to be negative; both
14354 * jhi and khw think it's not, but be safer */
14355 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14356 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14358 /* See if it already matches the complement of this POSIX
14360 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14361 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14365 posixl_matches_all = TRUE;
14366 break; /* No need to continue. Since it matches both
14367 e.g., \w and \W, it matches everything, and the
14368 bracketed class can be optimized into qr/./s */
14371 /* Add this class to those that should be checked at runtime */
14372 ANYOF_POSIXL_SET(ret, namedclass);
14374 /* The above-Latin1 characters are not subject to locale rules.
14375 * Just add them, in the second pass, to the
14376 * unconditionally-matched list */
14378 SV* scratch_list = NULL;
14380 /* Get the list of the above-Latin1 code points this
14382 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14383 PL_XPosix_ptrs[classnum],
14385 /* Odd numbers are complements, like
14386 * NDIGIT, NASCII, ... */
14387 namedclass % 2 != 0,
14389 /* Checking if 'cp_list' is NULL first saves an extra
14390 * clone. Its reference count will be decremented at the
14391 * next union, etc, or if this is the only instance, at the
14392 * end of the routine */
14394 cp_list = scratch_list;
14397 _invlist_union(cp_list, scratch_list, &cp_list);
14398 SvREFCNT_dec_NN(scratch_list);
14400 continue; /* Go get next character */
14403 else if (! SIZE_ONLY) {
14405 /* Here, not in pass1 (in that pass we skip calculating the
14406 * contents of this class), and is /l, or is a POSIX class for
14407 * which /l doesn't matter (or is a Unicode property, which is
14408 * skipped here). */
14409 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14410 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14412 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14413 * nor /l make a difference in what these match,
14414 * therefore we just add what they match to cp_list. */
14415 if (classnum != _CC_VERTSPACE) {
14416 assert( namedclass == ANYOF_HORIZWS
14417 || namedclass == ANYOF_NHORIZWS);
14419 /* It turns out that \h is just a synonym for
14421 classnum = _CC_BLANK;
14424 _invlist_union_maybe_complement_2nd(
14426 PL_XPosix_ptrs[classnum],
14427 namedclass % 2 != 0, /* Complement if odd
14428 (NHORIZWS, NVERTWS)
14433 else { /* Garden variety class. If is NASCII, NDIGIT, ...
14434 complement and use nposixes */
14435 SV** posixes_ptr = namedclass % 2 == 0
14438 SV** source_ptr = &PL_XPosix_ptrs[classnum];
14439 _invlist_union_maybe_complement_2nd(
14442 namedclass % 2 != 0,
14446 } /* end of namedclass \blah */
14449 RExC_parse = regpatws(pRExC_state, RExC_parse,
14450 FALSE /* means don't recognize comments */ );
14453 /* If 'range' is set, 'value' is the ending of a range--check its
14454 * validity. (If value isn't a single code point in the case of a
14455 * range, we should have figured that out above in the code that
14456 * catches false ranges). Later, we will handle each individual code
14457 * point in the range. If 'range' isn't set, this could be the
14458 * beginning of a range, so check for that by looking ahead to see if
14459 * the next real character to be processed is the range indicator--the
14464 /* For unicode ranges, we have to test that the Unicode as opposed
14465 * to the native values are not decreasing. (Above 255, and there
14466 * is no difference between native and Unicode) */
14467 if (unicode_range && prevvalue < 255 && value < 255) {
14468 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14469 goto backwards_range;
14474 if (prevvalue > value) /* b-a */ {
14479 w = RExC_parse - rangebegin;
14481 "Invalid [] range \"%"UTF8f"\"",
14482 UTF8fARG(UTF, w, rangebegin));
14483 NOT_REACHED; /* NOT REACHED */
14487 prevvalue = value; /* save the beginning of the potential range */
14488 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14489 && *RExC_parse == '-')
14491 char* next_char_ptr = RExC_parse + 1;
14492 if (skip_white) { /* Get the next real char after the '-' */
14493 next_char_ptr = regpatws(pRExC_state,
14495 FALSE); /* means don't recognize
14499 /* If the '-' is at the end of the class (just before the ']',
14500 * it is a literal minus; otherwise it is a range */
14501 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14502 RExC_parse = next_char_ptr;
14504 /* a bad range like \w-, [:word:]- ? */
14505 if (namedclass > OOB_NAMEDCLASS) {
14506 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14507 const int w = RExC_parse >= rangebegin
14508 ? RExC_parse - rangebegin
14511 vFAIL4("False [] range \"%*.*s\"",
14516 "False [] range \"%*.*s\"",
14521 cp_list = add_cp_to_invlist(cp_list, '-');
14525 range = 1; /* yeah, it's a range! */
14526 continue; /* but do it the next time */
14531 if (namedclass > OOB_NAMEDCLASS) {
14535 /* Here, we have a single value this time through the loop, and
14536 * <prevvalue> is the beginning of the range, if any; or <value> if
14539 /* non-Latin1 code point implies unicode semantics. Must be set in
14540 * pass1 so is there for the whole of pass 2 */
14542 RExC_uni_semantics = 1;
14545 /* Ready to process either the single value, or the completed range.
14546 * For single-valued non-inverted ranges, we consider the possibility
14547 * of multi-char folds. (We made a conscious decision to not do this
14548 * for the other cases because it can often lead to non-intuitive
14549 * results. For example, you have the peculiar case that:
14550 * "s s" =~ /^[^\xDF]+$/i => Y
14551 * "ss" =~ /^[^\xDF]+$/i => N
14553 * See [perl #89750] */
14554 if (FOLD && allow_multi_folds && value == prevvalue) {
14555 if (value == LATIN_SMALL_LETTER_SHARP_S
14556 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14559 /* Here <value> is indeed a multi-char fold. Get what it is */
14561 U8 foldbuf[UTF8_MAXBYTES_CASE];
14564 UV folded = _to_uni_fold_flags(
14568 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14569 ? FOLD_FLAGS_NOMIX_ASCII
14573 /* Here, <folded> should be the first character of the
14574 * multi-char fold of <value>, with <foldbuf> containing the
14575 * whole thing. But, if this fold is not allowed (because of
14576 * the flags), <fold> will be the same as <value>, and should
14577 * be processed like any other character, so skip the special
14579 if (folded != value) {
14581 /* Skip if we are recursed, currently parsing the class
14582 * again. Otherwise add this character to the list of
14583 * multi-char folds. */
14584 if (! RExC_in_multi_char_class) {
14585 STRLEN cp_count = utf8_length(foldbuf,
14586 foldbuf + foldlen);
14587 SV* multi_fold = sv_2mortal(newSVpvs(""));
14589 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14592 = add_multi_match(multi_char_matches,
14598 /* This element should not be processed further in this
14601 value = save_value;
14602 prevvalue = save_prevvalue;
14608 /* Deal with this element of the class */
14611 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14614 /* On non-ASCII platforms, for ranges that span all of 0..255, and
14615 * ones that don't require special handling, we can just add the
14616 * range like we do for ASCII platforms */
14617 if ((UNLIKELY(prevvalue == 0) && value >= 255)
14618 || ! (prevvalue < 256
14620 || (literal_endpoint == 2
14621 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14622 || (isUPPER_A(prevvalue)
14623 && isUPPER_A(value)))))))
14625 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14629 /* Here, requires special handling. This can be because it is
14630 * a range whose code points are considered to be Unicode, and
14631 * so must be individually translated into native, or because
14632 * its a subrange of 'A-Z' or 'a-z' which each aren't
14633 * contiguous in EBCDIC, but we have defined them to include
14634 * only the "expected" upper or lower case ASCII alphabetics.
14635 * Subranges above 255 are the same in native and Unicode, so
14636 * can be added as a range */
14637 U8 start = NATIVE_TO_LATIN1(prevvalue);
14639 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
14640 for (j = start; j <= end; j++) {
14641 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
14644 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14651 range = 0; /* this range (if it was one) is done now */
14652 } /* End of loop through all the text within the brackets */
14654 /* If anything in the class expands to more than one character, we have to
14655 * deal with them by building up a substitute parse string, and recursively
14656 * calling reg() on it, instead of proceeding */
14657 if (multi_char_matches) {
14658 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14661 char *save_end = RExC_end;
14662 char *save_parse = RExC_parse;
14663 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14668 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14669 because too confusing */
14671 sv_catpv(substitute_parse, "(?:");
14675 /* Look at the longest folds first */
14676 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14678 if (av_exists(multi_char_matches, cp_count)) {
14679 AV** this_array_ptr;
14682 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14684 while ((this_sequence = av_pop(*this_array_ptr)) !=
14687 if (! first_time) {
14688 sv_catpv(substitute_parse, "|");
14690 first_time = FALSE;
14692 sv_catpv(substitute_parse, SvPVX(this_sequence));
14697 /* If the character class contains anything else besides these
14698 * multi-character folds, have to include it in recursive parsing */
14699 if (element_count) {
14700 sv_catpv(substitute_parse, "|[");
14701 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14702 sv_catpv(substitute_parse, "]");
14705 sv_catpv(substitute_parse, ")");
14708 /* This is a way to get the parse to skip forward a whole named
14709 * sequence instead of matching the 2nd character when it fails the
14711 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14715 RExC_parse = SvPV(substitute_parse, len);
14716 RExC_end = RExC_parse + len;
14717 RExC_in_multi_char_class = 1;
14718 RExC_override_recoding = 1;
14719 RExC_emit = (regnode *)orig_emit;
14721 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14723 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14725 RExC_parse = save_parse;
14726 RExC_end = save_end;
14727 RExC_in_multi_char_class = 0;
14728 RExC_override_recoding = 0;
14729 SvREFCNT_dec_NN(multi_char_matches);
14733 /* Here, we've gone through the entire class and dealt with multi-char
14734 * folds. We are now in a position that we can do some checks to see if we
14735 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14736 * Currently we only do two checks:
14737 * 1) is in the unlikely event that the user has specified both, eg. \w and
14738 * \W under /l, then the class matches everything. (This optimization
14739 * is done only to make the optimizer code run later work.)
14740 * 2) if the character class contains only a single element (including a
14741 * single range), we see if there is an equivalent node for it.
14742 * Other checks are possible */
14743 if (! ret_invlist /* Can't optimize if returning the constructed
14745 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14750 if (UNLIKELY(posixl_matches_all)) {
14753 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14754 \w or [:digit:] or \p{foo}
14757 /* All named classes are mapped into POSIXish nodes, with its FLAG
14758 * argument giving which class it is */
14759 switch ((I32)namedclass) {
14760 case ANYOF_UNIPROP:
14763 /* These don't depend on the charset modifiers. They always
14764 * match under /u rules */
14765 case ANYOF_NHORIZWS:
14766 case ANYOF_HORIZWS:
14767 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14770 case ANYOF_NVERTWS:
14775 /* The actual POSIXish node for all the rest depends on the
14776 * charset modifier. The ones in the first set depend only on
14777 * ASCII or, if available on this platform, locale */
14781 op = (LOC) ? POSIXL : POSIXA;
14792 /* under /a could be alpha */
14794 if (ASCII_RESTRICTED) {
14795 namedclass = ANYOF_ALPHA + (namedclass % 2);
14803 /* The rest have more possibilities depending on the charset.
14804 * We take advantage of the enum ordering of the charset
14805 * modifiers to get the exact node type, */
14807 op = POSIXD + get_regex_charset(RExC_flags);
14808 if (op > POSIXA) { /* /aa is same as /a */
14813 /* The odd numbered ones are the complements of the
14814 * next-lower even number one */
14815 if (namedclass % 2 == 1) {
14819 arg = namedclass_to_classnum(namedclass);
14823 else if (value == prevvalue) {
14825 /* Here, the class consists of just a single code point */
14828 if (! LOC && value == '\n') {
14829 op = REG_ANY; /* Optimize [^\n] */
14830 *flagp |= HASWIDTH|SIMPLE;
14834 else if (value < 256 || UTF) {
14836 /* Optimize a single value into an EXACTish node, but not if it
14837 * would require converting the pattern to UTF-8. */
14838 op = compute_EXACTish(pRExC_state);
14840 } /* Otherwise is a range */
14841 else if (! LOC) { /* locale could vary these */
14842 if (prevvalue == '0') {
14843 if (value == '9') {
14848 else if (prevvalue == 'A') {
14851 && literal_endpoint == 2
14854 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14858 else if (prevvalue == 'a') {
14861 && literal_endpoint == 2
14864 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14870 /* Here, we have changed <op> away from its initial value iff we found
14871 * an optimization */
14874 /* Throw away this ANYOF regnode, and emit the calculated one,
14875 * which should correspond to the beginning, not current, state of
14877 const char * cur_parse = RExC_parse;
14878 RExC_parse = (char *)orig_parse;
14882 /* To get locale nodes to not use the full ANYOF size would
14883 * require moving the code above that writes the portions
14884 * of it that aren't in other nodes to after this point.
14885 * e.g. ANYOF_POSIXL_SET */
14886 RExC_size = orig_size;
14890 RExC_emit = (regnode *)orig_emit;
14891 if (PL_regkind[op] == POSIXD) {
14892 if (op == POSIXL) {
14893 RExC_contains_locale = 1;
14896 op += NPOSIXD - POSIXD;
14901 ret = reg_node(pRExC_state, op);
14903 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14907 *flagp |= HASWIDTH|SIMPLE;
14909 else if (PL_regkind[op] == EXACT) {
14910 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14911 TRUE /* downgradable to EXACT */
14915 RExC_parse = (char *) cur_parse;
14917 SvREFCNT_dec(posixes);
14918 SvREFCNT_dec(nposixes);
14919 SvREFCNT_dec(cp_list);
14920 SvREFCNT_dec(cp_foldable_list);
14927 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14929 /* If folding, we calculate all characters that could fold to or from the
14930 * ones already on the list */
14931 if (cp_foldable_list) {
14933 UV start, end; /* End points of code point ranges */
14935 SV* fold_intersection = NULL;
14938 /* Our calculated list will be for Unicode rules. For locale
14939 * matching, we have to keep a separate list that is consulted at
14940 * runtime only when the locale indicates Unicode rules. For
14941 * non-locale, we just use to the general list */
14943 use_list = &only_utf8_locale_list;
14946 use_list = &cp_list;
14949 /* Only the characters in this class that participate in folds need
14950 * be checked. Get the intersection of this class and all the
14951 * possible characters that are foldable. This can quickly narrow
14952 * down a large class */
14953 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14954 &fold_intersection);
14956 /* The folds for all the Latin1 characters are hard-coded into this
14957 * program, but we have to go out to disk to get the others. */
14958 if (invlist_highest(cp_foldable_list) >= 256) {
14960 /* This is a hash that for a particular fold gives all
14961 * characters that are involved in it */
14962 if (! PL_utf8_foldclosures) {
14963 _load_PL_utf8_foldclosures();
14967 /* Now look at the foldable characters in this class individually */
14968 invlist_iterinit(fold_intersection);
14969 while (invlist_iternext(fold_intersection, &start, &end)) {
14972 /* Look at every character in the range */
14973 for (j = start; j <= end; j++) {
14974 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14980 if (IS_IN_SOME_FOLD_L1(j)) {
14982 /* ASCII is always matched; non-ASCII is matched
14983 * only under Unicode rules (which could happen
14984 * under /l if the locale is a UTF-8 one */
14985 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14986 *use_list = add_cp_to_invlist(*use_list,
14987 PL_fold_latin1[j]);
14991 add_cp_to_invlist(depends_list,
14992 PL_fold_latin1[j]);
14996 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14997 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14999 add_above_Latin1_folds(pRExC_state,
15006 /* Here is an above Latin1 character. We don't have the
15007 * rules hard-coded for it. First, get its fold. This is
15008 * the simple fold, as the multi-character folds have been
15009 * handled earlier and separated out */
15010 _to_uni_fold_flags(j, foldbuf, &foldlen,
15011 (ASCII_FOLD_RESTRICTED)
15012 ? FOLD_FLAGS_NOMIX_ASCII
15015 /* Single character fold of above Latin1. Add everything in
15016 * its fold closure to the list that this node should match.
15017 * The fold closures data structure is a hash with the keys
15018 * being the UTF-8 of every character that is folded to, like
15019 * 'k', and the values each an array of all code points that
15020 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
15021 * Multi-character folds are not included */
15022 if ((listp = hv_fetch(PL_utf8_foldclosures,
15023 (char *) foldbuf, foldlen, FALSE)))
15025 AV* list = (AV*) *listp;
15027 for (k = 0; k <= av_tindex(list); k++) {
15028 SV** c_p = av_fetch(list, k, FALSE);
15034 /* /aa doesn't allow folds between ASCII and non- */
15035 if ((ASCII_FOLD_RESTRICTED
15036 && (isASCII(c) != isASCII(j))))
15041 /* Folds under /l which cross the 255/256 boundary
15042 * are added to a separate list. (These are valid
15043 * only when the locale is UTF-8.) */
15044 if (c < 256 && LOC) {
15045 *use_list = add_cp_to_invlist(*use_list, c);
15049 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15051 cp_list = add_cp_to_invlist(cp_list, c);
15054 /* Similarly folds involving non-ascii Latin1
15055 * characters under /d are added to their list */
15056 depends_list = add_cp_to_invlist(depends_list,
15063 SvREFCNT_dec_NN(fold_intersection);
15066 /* Now that we have finished adding all the folds, there is no reason
15067 * to keep the foldable list separate */
15068 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15069 SvREFCNT_dec_NN(cp_foldable_list);
15072 /* And combine the result (if any) with any inversion list from posix
15073 * classes. The lists are kept separate up to now because we don't want to
15074 * fold the classes (folding of those is automatically handled by the swash
15075 * fetching code) */
15076 if (posixes || nposixes) {
15077 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15078 /* Under /a and /aa, nothing above ASCII matches these */
15079 _invlist_intersection(posixes,
15080 PL_XPosix_ptrs[_CC_ASCII],
15084 if (DEPENDS_SEMANTICS) {
15085 /* Under /d, everything in the upper half of the Latin1 range
15086 * matches these complements */
15087 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15089 else if (AT_LEAST_ASCII_RESTRICTED) {
15090 /* Under /a and /aa, everything above ASCII matches these
15092 _invlist_union_complement_2nd(nposixes,
15093 PL_XPosix_ptrs[_CC_ASCII],
15097 _invlist_union(posixes, nposixes, &posixes);
15098 SvREFCNT_dec_NN(nposixes);
15101 posixes = nposixes;
15104 if (! DEPENDS_SEMANTICS) {
15106 _invlist_union(cp_list, posixes, &cp_list);
15107 SvREFCNT_dec_NN(posixes);
15114 /* Under /d, we put into a separate list the Latin1 things that
15115 * match only when the target string is utf8 */
15116 SV* nonascii_but_latin1_properties = NULL;
15117 _invlist_intersection(posixes, PL_UpperLatin1,
15118 &nonascii_but_latin1_properties);
15119 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15122 _invlist_union(cp_list, posixes, &cp_list);
15123 SvREFCNT_dec_NN(posixes);
15129 if (depends_list) {
15130 _invlist_union(depends_list, nonascii_but_latin1_properties,
15132 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15135 depends_list = nonascii_but_latin1_properties;
15140 /* And combine the result (if any) with any inversion list from properties.
15141 * The lists are kept separate up to now so that we can distinguish the two
15142 * in regards to matching above-Unicode. A run-time warning is generated
15143 * if a Unicode property is matched against a non-Unicode code point. But,
15144 * we allow user-defined properties to match anything, without any warning,
15145 * and we also suppress the warning if there is a portion of the character
15146 * class that isn't a Unicode property, and which matches above Unicode, \W
15147 * or [\x{110000}] for example.
15148 * (Note that in this case, unlike the Posix one above, there is no
15149 * <depends_list>, because having a Unicode property forces Unicode
15154 /* If it matters to the final outcome, see if a non-property
15155 * component of the class matches above Unicode. If so, the
15156 * warning gets suppressed. This is true even if just a single
15157 * such code point is specified, as though not strictly correct if
15158 * another such code point is matched against, the fact that they
15159 * are using above-Unicode code points indicates they should know
15160 * the issues involved */
15162 warn_super = ! (invert
15163 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15166 _invlist_union(properties, cp_list, &cp_list);
15167 SvREFCNT_dec_NN(properties);
15170 cp_list = properties;
15174 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15178 /* Here, we have calculated what code points should be in the character
15181 * Now we can see about various optimizations. Fold calculation (which we
15182 * did above) needs to take place before inversion. Otherwise /[^k]/i
15183 * would invert to include K, which under /i would match k, which it
15184 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15185 * folded until runtime */
15187 /* If we didn't do folding, it's because some information isn't available
15188 * until runtime; set the run-time fold flag for these. (We don't have to
15189 * worry about properties folding, as that is taken care of by the swash
15190 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15191 * locales, or the class matches at least one 0-255 range code point */
15193 if (only_utf8_locale_list) {
15194 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15196 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15199 invlist_iterinit(cp_list);
15200 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15201 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15203 invlist_iterfinish(cp_list);
15207 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15208 * at compile time. Besides not inverting folded locale now, we can't
15209 * invert if there are things such as \w, which aren't known until runtime
15213 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15215 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15217 _invlist_invert(cp_list);
15219 /* Any swash can't be used as-is, because we've inverted things */
15221 SvREFCNT_dec_NN(swash);
15225 /* Clear the invert flag since have just done it here */
15230 *ret_invlist = cp_list;
15231 SvREFCNT_dec(swash);
15233 /* Discard the generated node */
15235 RExC_size = orig_size;
15238 RExC_emit = orig_emit;
15243 /* Some character classes are equivalent to other nodes. Such nodes take
15244 * up less room and generally fewer operations to execute than ANYOF nodes.
15245 * Above, we checked for and optimized into some such equivalents for
15246 * certain common classes that are easy to test. Getting to this point in
15247 * the code means that the class didn't get optimized there. Since this
15248 * code is only executed in Pass 2, it is too late to save space--it has
15249 * been allocated in Pass 1, and currently isn't given back. But turning
15250 * things into an EXACTish node can allow the optimizer to join it to any
15251 * adjacent such nodes. And if the class is equivalent to things like /./,
15252 * expensive run-time swashes can be avoided. Now that we have more
15253 * complete information, we can find things necessarily missed by the
15254 * earlier code. I (khw) am not sure how much to look for here. It would
15255 * be easy, but perhaps too slow, to check any candidates against all the
15256 * node types they could possibly match using _invlistEQ(). */
15261 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15262 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15264 /* We don't optimize if we are supposed to make sure all non-Unicode
15265 * code points raise a warning, as only ANYOF nodes have this check.
15267 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15270 U8 op = END; /* The optimzation node-type */
15271 const char * cur_parse= RExC_parse;
15273 invlist_iterinit(cp_list);
15274 if (! invlist_iternext(cp_list, &start, &end)) {
15276 /* Here, the list is empty. This happens, for example, when a
15277 * Unicode property is the only thing in the character class, and
15278 * it doesn't match anything. (perluniprops.pod notes such
15281 *flagp |= HASWIDTH|SIMPLE;
15283 else if (start == end) { /* The range is a single code point */
15284 if (! invlist_iternext(cp_list, &start, &end)
15286 /* Don't do this optimization if it would require changing
15287 * the pattern to UTF-8 */
15288 && (start < 256 || UTF))
15290 /* Here, the list contains a single code point. Can optimize
15291 * into an EXACTish node */
15302 /* A locale node under folding with one code point can be
15303 * an EXACTFL, as its fold won't be calculated until
15309 /* Here, we are generally folding, but there is only one
15310 * code point to match. If we have to, we use an EXACT
15311 * node, but it would be better for joining with adjacent
15312 * nodes in the optimization pass if we used the same
15313 * EXACTFish node that any such are likely to be. We can
15314 * do this iff the code point doesn't participate in any
15315 * folds. For example, an EXACTF of a colon is the same as
15316 * an EXACT one, since nothing folds to or from a colon. */
15318 if (IS_IN_SOME_FOLD_L1(value)) {
15323 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15328 /* If we haven't found the node type, above, it means we
15329 * can use the prevailing one */
15331 op = compute_EXACTish(pRExC_state);
15336 else if (start == 0) {
15337 if (end == UV_MAX) {
15339 *flagp |= HASWIDTH|SIMPLE;
15342 else if (end == '\n' - 1
15343 && invlist_iternext(cp_list, &start, &end)
15344 && start == '\n' + 1 && end == UV_MAX)
15347 *flagp |= HASWIDTH|SIMPLE;
15351 invlist_iterfinish(cp_list);
15354 RExC_parse = (char *)orig_parse;
15355 RExC_emit = (regnode *)orig_emit;
15357 ret = reg_node(pRExC_state, op);
15359 RExC_parse = (char *)cur_parse;
15361 if (PL_regkind[op] == EXACT) {
15362 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15363 TRUE /* downgradable to EXACT */
15367 SvREFCNT_dec_NN(cp_list);
15372 /* Here, <cp_list> contains all the code points we can determine at
15373 * compile time that match under all conditions. Go through it, and
15374 * for things that belong in the bitmap, put them there, and delete from
15375 * <cp_list>. While we are at it, see if everything above 255 is in the
15376 * list, and if so, set a flag to speed up execution */
15378 populate_ANYOF_from_invlist(ret, &cp_list);
15381 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15384 /* Here, the bitmap has been populated with all the Latin1 code points that
15385 * always match. Can now add to the overall list those that match only
15386 * when the target string is UTF-8 (<depends_list>). */
15387 if (depends_list) {
15389 _invlist_union(cp_list, depends_list, &cp_list);
15390 SvREFCNT_dec_NN(depends_list);
15393 cp_list = depends_list;
15395 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15398 /* If there is a swash and more than one element, we can't use the swash in
15399 * the optimization below. */
15400 if (swash && element_count > 1) {
15401 SvREFCNT_dec_NN(swash);
15405 /* Note that the optimization of using 'swash' if it is the only thing in
15406 * the class doesn't have us change swash at all, so it can include things
15407 * that are also in the bitmap; otherwise we have purposely deleted that
15408 * duplicate information */
15409 set_ANYOF_arg(pRExC_state, ret, cp_list,
15410 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15412 only_utf8_locale_list,
15413 swash, has_user_defined_property);
15415 *flagp |= HASWIDTH|SIMPLE;
15417 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15418 RExC_contains_locale = 1;
15424 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15427 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15428 regnode* const node,
15430 SV* const runtime_defns,
15431 SV* const only_utf8_locale_list,
15433 const bool has_user_defined_property)
15435 /* Sets the arg field of an ANYOF-type node 'node', using information about
15436 * the node passed-in. If there is nothing outside the node's bitmap, the
15437 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15438 * the count returned by add_data(), having allocated and stored an array,
15439 * av, that that count references, as follows:
15440 * av[0] stores the character class description in its textual form.
15441 * This is used later (regexec.c:Perl_regclass_swash()) to
15442 * initialize the appropriate swash, and is also useful for dumping
15443 * the regnode. This is set to &PL_sv_undef if the textual
15444 * description is not needed at run-time (as happens if the other
15445 * elements completely define the class)
15446 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15447 * computed from av[0]. But if no further computation need be done,
15448 * the swash is stored here now (and av[0] is &PL_sv_undef).
15449 * av[2] stores the inversion list of code points that match only if the
15450 * current locale is UTF-8
15451 * av[3] stores the cp_list inversion list for use in addition or instead
15452 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15453 * (Otherwise everything needed is already in av[0] and av[1])
15454 * av[4] is set if any component of the class is from a user-defined
15455 * property; used only if av[3] exists */
15459 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15461 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15462 assert(! (ANYOF_FLAGS(node)
15463 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15464 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15465 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15468 AV * const av = newAV();
15471 assert(ANYOF_FLAGS(node)
15472 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15473 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15475 av_store(av, 0, (runtime_defns)
15476 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15479 av_store(av, 1, swash);
15480 SvREFCNT_dec_NN(cp_list);
15483 av_store(av, 1, &PL_sv_undef);
15485 av_store(av, 3, cp_list);
15486 av_store(av, 4, newSVuv(has_user_defined_property));
15490 if (only_utf8_locale_list) {
15491 av_store(av, 2, only_utf8_locale_list);
15494 av_store(av, 2, &PL_sv_undef);
15497 rv = newRV_noinc(MUTABLE_SV(av));
15498 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15499 RExC_rxi->data->data[n] = (void*)rv;
15504 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15506 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15507 const regnode* node,
15510 SV** only_utf8_locale_ptr,
15514 /* For internal core use only.
15515 * Returns the swash for the input 'node' in the regex 'prog'.
15516 * If <doinit> is 'true', will attempt to create the swash if not already
15518 * If <listsvp> is non-null, will return the printable contents of the
15519 * swash. This can be used to get debugging information even before the
15520 * swash exists, by calling this function with 'doinit' set to false, in
15521 * which case the components that will be used to eventually create the
15522 * swash are returned (in a printable form).
15523 * If <exclude_list> is not NULL, it is an inversion list of things to
15524 * exclude from what's returned in <listsvp>.
15525 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15526 * that, in spite of this function's name, the swash it returns may include
15527 * the bitmap data as well */
15530 SV *si = NULL; /* Input swash initialization string */
15531 SV* invlist = NULL;
15533 RXi_GET_DECL(prog,progi);
15534 const struct reg_data * const data = prog ? progi->data : NULL;
15536 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15538 assert(ANYOF_FLAGS(node)
15539 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15540 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15542 if (data && data->count) {
15543 const U32 n = ARG(node);
15545 if (data->what[n] == 's') {
15546 SV * const rv = MUTABLE_SV(data->data[n]);
15547 AV * const av = MUTABLE_AV(SvRV(rv));
15548 SV **const ary = AvARRAY(av);
15549 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15551 si = *ary; /* ary[0] = the string to initialize the swash with */
15553 /* Elements 3 and 4 are either both present or both absent. [3] is
15554 * any inversion list generated at compile time; [4] indicates if
15555 * that inversion list has any user-defined properties in it. */
15556 if (av_tindex(av) >= 2) {
15557 if (only_utf8_locale_ptr
15559 && ary[2] != &PL_sv_undef)
15561 *only_utf8_locale_ptr = ary[2];
15564 assert(only_utf8_locale_ptr);
15565 *only_utf8_locale_ptr = NULL;
15568 if (av_tindex(av) >= 3) {
15570 if (SvUV(ary[4])) {
15571 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15579 /* Element [1] is reserved for the set-up swash. If already there,
15580 * return it; if not, create it and store it there */
15581 if (ary[1] && SvROK(ary[1])) {
15584 else if (doinit && ((si && si != &PL_sv_undef)
15585 || (invlist && invlist != &PL_sv_undef))) {
15587 sw = _core_swash_init("utf8", /* the utf8 package */
15591 0, /* not from tr/// */
15593 &swash_init_flags);
15594 (void)av_store(av, 1, sw);
15599 /* If requested, return a printable version of what this swash matches */
15601 SV* matches_string = newSVpvs("");
15603 /* The swash should be used, if possible, to get the data, as it
15604 * contains the resolved data. But this function can be called at
15605 * compile-time, before everything gets resolved, in which case we
15606 * return the currently best available information, which is the string
15607 * that will eventually be used to do that resolving, 'si' */
15608 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15609 && (si && si != &PL_sv_undef))
15611 sv_catsv(matches_string, si);
15614 /* Add the inversion list to whatever we have. This may have come from
15615 * the swash, or from an input parameter */
15617 if (exclude_list) {
15618 SV* clone = invlist_clone(invlist);
15619 _invlist_subtract(clone, exclude_list, &clone);
15620 sv_catsv(matches_string, _invlist_contents(clone));
15621 SvREFCNT_dec_NN(clone);
15624 sv_catsv(matches_string, _invlist_contents(invlist));
15627 *listsvp = matches_string;
15632 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15634 /* reg_skipcomment()
15636 Absorbs an /x style # comment from the input stream,
15637 returning a pointer to the first character beyond the comment, or if the
15638 comment terminates the pattern without anything following it, this returns
15639 one past the final character of the pattern (in other words, RExC_end) and
15640 sets the REG_RUN_ON_COMMENT_SEEN flag.
15642 Note it's the callers responsibility to ensure that we are
15643 actually in /x mode
15647 PERL_STATIC_INLINE char*
15648 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15650 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15654 while (p < RExC_end) {
15655 if (*(++p) == '\n') {
15660 /* we ran off the end of the pattern without ending the comment, so we have
15661 * to add an \n when wrapping */
15662 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15668 Advances the parse position, and optionally absorbs
15669 "whitespace" from the inputstream.
15671 Without /x "whitespace" means (?#...) style comments only,
15672 with /x this means (?#...) and # comments and whitespace proper.
15674 Returns the RExC_parse point from BEFORE the scan occurs.
15676 This is the /x friendly way of saying RExC_parse++.
15680 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15682 char* const retval = RExC_parse++;
15684 PERL_ARGS_ASSERT_NEXTCHAR;
15687 if (RExC_end - RExC_parse >= 3
15688 && *RExC_parse == '('
15689 && RExC_parse[1] == '?'
15690 && RExC_parse[2] == '#')
15692 while (*RExC_parse != ')') {
15693 if (RExC_parse == RExC_end)
15694 FAIL("Sequence (?#... not terminated");
15700 if (RExC_flags & RXf_PMf_EXTENDED) {
15701 char * p = regpatws(pRExC_state, RExC_parse,
15702 TRUE); /* means recognize comments */
15703 if (p != RExC_parse) {
15713 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15715 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15716 * space. In pass1, it aligns and increments RExC_size; in pass2,
15719 regnode * const ret = RExC_emit;
15720 GET_RE_DEBUG_FLAGS_DECL;
15722 PERL_ARGS_ASSERT_REGNODE_GUTS;
15724 assert(extra_size >= regarglen[op]);
15727 SIZE_ALIGN(RExC_size);
15728 RExC_size += 1 + extra_size;
15731 if (RExC_emit >= RExC_emit_bound)
15732 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15733 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15735 NODE_ALIGN_FILL(ret);
15736 #ifndef RE_TRACK_PATTERN_OFFSETS
15737 PERL_UNUSED_ARG(name);
15739 if (RExC_offsets) { /* MJD */
15741 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15744 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15745 ? "Overwriting end of array!\n" : "OK",
15746 (UV)(RExC_emit - RExC_emit_start),
15747 (UV)(RExC_parse - RExC_start),
15748 (UV)RExC_offsets[0]));
15749 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15756 - reg_node - emit a node
15758 STATIC regnode * /* Location. */
15759 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15761 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15763 PERL_ARGS_ASSERT_REG_NODE;
15765 assert(regarglen[op] == 0);
15768 regnode *ptr = ret;
15769 FILL_ADVANCE_NODE(ptr, op);
15776 - reganode - emit a node with an argument
15778 STATIC regnode * /* Location. */
15779 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15781 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15783 PERL_ARGS_ASSERT_REGANODE;
15785 assert(regarglen[op] == 1);
15788 regnode *ptr = ret;
15789 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15796 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15798 /* emit a node with U32 and I32 arguments */
15800 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15802 PERL_ARGS_ASSERT_REG2LANODE;
15804 assert(regarglen[op] == 2);
15807 regnode *ptr = ret;
15808 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15815 - reguni - emit (if appropriate) a Unicode character
15817 PERL_STATIC_INLINE STRLEN
15818 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15820 PERL_ARGS_ASSERT_REGUNI;
15822 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15826 - reginsert - insert an operator in front of already-emitted operand
15828 * Means relocating the operand.
15831 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15836 const int offset = regarglen[(U8)op];
15837 const int size = NODE_STEP_REGNODE + offset;
15838 GET_RE_DEBUG_FLAGS_DECL;
15840 PERL_ARGS_ASSERT_REGINSERT;
15841 PERL_UNUSED_CONTEXT;
15842 PERL_UNUSED_ARG(depth);
15843 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15844 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15853 if (RExC_open_parens) {
15855 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15856 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15857 if ( RExC_open_parens[paren] >= opnd ) {
15858 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15859 RExC_open_parens[paren] += size;
15861 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15863 if ( RExC_close_parens[paren] >= opnd ) {
15864 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15865 RExC_close_parens[paren] += size;
15867 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15872 while (src > opnd) {
15873 StructCopy(--src, --dst, regnode);
15874 #ifdef RE_TRACK_PATTERN_OFFSETS
15875 if (RExC_offsets) { /* MJD 20010112 */
15877 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15881 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15882 ? "Overwriting end of array!\n" : "OK",
15883 (UV)(src - RExC_emit_start),
15884 (UV)(dst - RExC_emit_start),
15885 (UV)RExC_offsets[0]));
15886 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15887 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15893 place = opnd; /* Op node, where operand used to be. */
15894 #ifdef RE_TRACK_PATTERN_OFFSETS
15895 if (RExC_offsets) { /* MJD */
15897 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15901 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15902 ? "Overwriting end of array!\n" : "OK",
15903 (UV)(place - RExC_emit_start),
15904 (UV)(RExC_parse - RExC_start),
15905 (UV)RExC_offsets[0]));
15906 Set_Node_Offset(place, RExC_parse);
15907 Set_Node_Length(place, 1);
15910 src = NEXTOPER(place);
15911 FILL_ADVANCE_NODE(place, op);
15912 Zero(src, offset, regnode);
15916 - regtail - set the next-pointer at the end of a node chain of p to val.
15917 - SEE ALSO: regtail_study
15919 /* TODO: All three parms should be const */
15921 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15922 const regnode *val,U32 depth)
15925 GET_RE_DEBUG_FLAGS_DECL;
15927 PERL_ARGS_ASSERT_REGTAIL;
15929 PERL_UNUSED_ARG(depth);
15935 /* Find last node. */
15938 regnode * const temp = regnext(scan);
15940 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15941 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15942 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15943 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15944 (temp == NULL ? "->" : ""),
15945 (temp == NULL ? PL_reg_name[OP(val)] : "")
15953 if (reg_off_by_arg[OP(scan)]) {
15954 ARG_SET(scan, val - scan);
15957 NEXT_OFF(scan) = val - scan;
15963 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15964 - Look for optimizable sequences at the same time.
15965 - currently only looks for EXACT chains.
15967 This is experimental code. The idea is to use this routine to perform
15968 in place optimizations on branches and groups as they are constructed,
15969 with the long term intention of removing optimization from study_chunk so
15970 that it is purely analytical.
15972 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15973 to control which is which.
15976 /* TODO: All four parms should be const */
15979 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15980 const regnode *val,U32 depth)
15984 #ifdef EXPERIMENTAL_INPLACESCAN
15987 GET_RE_DEBUG_FLAGS_DECL;
15989 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15995 /* Find last node. */
15999 regnode * const temp = regnext(scan);
16000 #ifdef EXPERIMENTAL_INPLACESCAN
16001 if (PL_regkind[OP(scan)] == EXACT) {
16002 bool unfolded_multi_char; /* Unexamined in this routine */
16003 if (join_exact(pRExC_state, scan, &min,
16004 &unfolded_multi_char, 1, val, depth+1))
16009 switch (OP(scan)) {
16013 case EXACTFA_NO_TRIE:
16019 if( exact == PSEUDO )
16021 else if ( exact != OP(scan) )
16030 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16031 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16032 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16033 SvPV_nolen_const(RExC_mysv),
16034 REG_NODE_NUM(scan),
16035 PL_reg_name[exact]);
16042 DEBUG_PARSE_MSG("");
16043 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16044 PerlIO_printf(Perl_debug_log,
16045 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16046 SvPV_nolen_const(RExC_mysv),
16047 (IV)REG_NODE_NUM(val),
16051 if (reg_off_by_arg[OP(scan)]) {
16052 ARG_SET(scan, val - scan);
16055 NEXT_OFF(scan) = val - scan;
16063 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16068 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16073 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16075 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16076 if (flags & (1<<bit)) {
16077 if (!set++ && lead)
16078 PerlIO_printf(Perl_debug_log, "%s",lead);
16079 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16084 PerlIO_printf(Perl_debug_log, "\n");
16086 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16091 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16097 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16099 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16100 if (flags & (1<<bit)) {
16101 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16104 if (!set++ && lead)
16105 PerlIO_printf(Perl_debug_log, "%s",lead);
16106 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16109 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16110 if (!set++ && lead) {
16111 PerlIO_printf(Perl_debug_log, "%s",lead);
16114 case REGEX_UNICODE_CHARSET:
16115 PerlIO_printf(Perl_debug_log, "UNICODE");
16117 case REGEX_LOCALE_CHARSET:
16118 PerlIO_printf(Perl_debug_log, "LOCALE");
16120 case REGEX_ASCII_RESTRICTED_CHARSET:
16121 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16123 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16124 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16127 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16133 PerlIO_printf(Perl_debug_log, "\n");
16135 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16141 Perl_regdump(pTHX_ const regexp *r)
16144 SV * const sv = sv_newmortal();
16145 SV *dsv= sv_newmortal();
16146 RXi_GET_DECL(r,ri);
16147 GET_RE_DEBUG_FLAGS_DECL;
16149 PERL_ARGS_ASSERT_REGDUMP;
16151 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16153 /* Header fields of interest. */
16154 if (r->anchored_substr) {
16155 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16156 RE_SV_DUMPLEN(r->anchored_substr), 30);
16157 PerlIO_printf(Perl_debug_log,
16158 "anchored %s%s at %"IVdf" ",
16159 s, RE_SV_TAIL(r->anchored_substr),
16160 (IV)r->anchored_offset);
16161 } else if (r->anchored_utf8) {
16162 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16163 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16164 PerlIO_printf(Perl_debug_log,
16165 "anchored utf8 %s%s at %"IVdf" ",
16166 s, RE_SV_TAIL(r->anchored_utf8),
16167 (IV)r->anchored_offset);
16169 if (r->float_substr) {
16170 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16171 RE_SV_DUMPLEN(r->float_substr), 30);
16172 PerlIO_printf(Perl_debug_log,
16173 "floating %s%s at %"IVdf"..%"UVuf" ",
16174 s, RE_SV_TAIL(r->float_substr),
16175 (IV)r->float_min_offset, (UV)r->float_max_offset);
16176 } else if (r->float_utf8) {
16177 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16178 RE_SV_DUMPLEN(r->float_utf8), 30);
16179 PerlIO_printf(Perl_debug_log,
16180 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16181 s, RE_SV_TAIL(r->float_utf8),
16182 (IV)r->float_min_offset, (UV)r->float_max_offset);
16184 if (r->check_substr || r->check_utf8)
16185 PerlIO_printf(Perl_debug_log,
16187 (r->check_substr == r->float_substr
16188 && r->check_utf8 == r->float_utf8
16189 ? "(checking floating" : "(checking anchored"));
16190 if (r->intflags & PREGf_NOSCAN)
16191 PerlIO_printf(Perl_debug_log, " noscan");
16192 if (r->extflags & RXf_CHECK_ALL)
16193 PerlIO_printf(Perl_debug_log, " isall");
16194 if (r->check_substr || r->check_utf8)
16195 PerlIO_printf(Perl_debug_log, ") ");
16197 if (ri->regstclass) {
16198 regprop(r, sv, ri->regstclass, NULL, NULL);
16199 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16201 if (r->intflags & PREGf_ANCH) {
16202 PerlIO_printf(Perl_debug_log, "anchored");
16203 if (r->intflags & PREGf_ANCH_MBOL)
16204 PerlIO_printf(Perl_debug_log, "(MBOL)");
16205 if (r->intflags & PREGf_ANCH_SBOL)
16206 PerlIO_printf(Perl_debug_log, "(SBOL)");
16207 if (r->intflags & PREGf_ANCH_GPOS)
16208 PerlIO_printf(Perl_debug_log, "(GPOS)");
16209 PerlIO_putc(Perl_debug_log, ' ');
16211 if (r->intflags & PREGf_GPOS_SEEN)
16212 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16213 if (r->intflags & PREGf_SKIP)
16214 PerlIO_printf(Perl_debug_log, "plus ");
16215 if (r->intflags & PREGf_IMPLICIT)
16216 PerlIO_printf(Perl_debug_log, "implicit ");
16217 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16218 if (r->extflags & RXf_EVAL_SEEN)
16219 PerlIO_printf(Perl_debug_log, "with eval ");
16220 PerlIO_printf(Perl_debug_log, "\n");
16222 regdump_extflags("r->extflags: ",r->extflags);
16223 regdump_intflags("r->intflags: ",r->intflags);
16226 PERL_ARGS_ASSERT_REGDUMP;
16227 PERL_UNUSED_CONTEXT;
16228 PERL_UNUSED_ARG(r);
16229 #endif /* DEBUGGING */
16233 - regprop - printable representation of opcode, with run time support
16237 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16242 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16243 static const char * const anyofs[] = {
16244 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16245 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16246 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16247 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16248 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
16249 || _CC_VERTSPACE != 16
16250 #error Need to adjust order of anyofs[]
16287 RXi_GET_DECL(prog,progi);
16288 GET_RE_DEBUG_FLAGS_DECL;
16290 PERL_ARGS_ASSERT_REGPROP;
16292 sv_setpvn(sv, "", 0);
16294 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16295 /* It would be nice to FAIL() here, but this may be called from
16296 regexec.c, and it would be hard to supply pRExC_state. */
16297 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16298 (int)OP(o), (int)REGNODE_MAX);
16299 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16301 k = PL_regkind[OP(o)];
16304 sv_catpvs(sv, " ");
16305 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16306 * is a crude hack but it may be the best for now since
16307 * we have no flag "this EXACTish node was UTF-8"
16309 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16310 PERL_PV_ESCAPE_UNI_DETECT |
16311 PERL_PV_ESCAPE_NONASCII |
16312 PERL_PV_PRETTY_ELLIPSES |
16313 PERL_PV_PRETTY_LTGT |
16314 PERL_PV_PRETTY_NOCLEAR
16316 } else if (k == TRIE) {
16317 /* print the details of the trie in dumpuntil instead, as
16318 * progi->data isn't available here */
16319 const char op = OP(o);
16320 const U32 n = ARG(o);
16321 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16322 (reg_ac_data *)progi->data->data[n] :
16324 const reg_trie_data * const trie
16325 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16327 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16328 DEBUG_TRIE_COMPILE_r(
16329 Perl_sv_catpvf(aTHX_ sv,
16330 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16331 (UV)trie->startstate,
16332 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16333 (UV)trie->wordcount,
16336 (UV)TRIE_CHARCOUNT(trie),
16337 (UV)trie->uniquecharcount
16340 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16341 sv_catpvs(sv, "[");
16342 (void) put_charclass_bitmap_innards(sv,
16343 (IS_ANYOF_TRIE(op))
16345 : TRIE_BITMAP(trie),
16347 sv_catpvs(sv, "]");
16350 } else if (k == CURLY) {
16351 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16352 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16353 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16355 else if (k == WHILEM && o->flags) /* Ordinal/of */
16356 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16357 else if (k == REF || k == OPEN || k == CLOSE
16358 || k == GROUPP || OP(o)==ACCEPT)
16360 AV *name_list= NULL;
16361 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16362 if ( RXp_PAREN_NAMES(prog) ) {
16363 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16364 } else if ( pRExC_state ) {
16365 name_list= RExC_paren_name_list;
16368 if ( k != REF || (OP(o) < NREF)) {
16369 SV **name= av_fetch(name_list, ARG(o), 0 );
16371 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16374 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16375 I32 *nums=(I32*)SvPVX(sv_dat);
16376 SV **name= av_fetch(name_list, nums[0], 0 );
16379 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16380 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16381 (n ? "," : ""), (IV)nums[n]);
16383 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16387 if ( k == REF && reginfo) {
16388 U32 n = ARG(o); /* which paren pair */
16389 I32 ln = prog->offs[n].start;
16390 if (prog->lastparen < n || ln == -1)
16391 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16392 else if (ln == prog->offs[n].end)
16393 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16395 const char *s = reginfo->strbeg + ln;
16396 Perl_sv_catpvf(aTHX_ sv, ": ");
16397 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16398 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16401 } else if (k == GOSUB) {
16402 AV *name_list= NULL;
16403 if ( RXp_PAREN_NAMES(prog) ) {
16404 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16405 } else if ( pRExC_state ) {
16406 name_list= RExC_paren_name_list;
16409 /* Paren and offset */
16410 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16412 SV **name= av_fetch(name_list, ARG(o), 0 );
16414 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16417 else if (k == VERB) {
16419 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16420 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16421 } else if (k == LOGICAL)
16422 /* 2: embedded, otherwise 1 */
16423 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16424 else if (k == ANYOF) {
16425 const U8 flags = ANYOF_FLAGS(o);
16427 SV* bitmap_invlist; /* Will hold what the bit map contains */
16430 if (OP(o) == ANYOFL)
16431 sv_catpvs(sv, "{loc}");
16432 if (flags & ANYOF_LOC_FOLD)
16433 sv_catpvs(sv, "{i}");
16434 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16435 if (flags & ANYOF_INVERT)
16436 sv_catpvs(sv, "^");
16438 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16440 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16443 /* output any special charclass tests (used entirely under use
16445 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16447 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16448 if (ANYOF_POSIXL_TEST(o,i)) {
16449 sv_catpv(sv, anyofs[i]);
16455 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16456 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16457 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16461 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16462 if (flags & ANYOF_INVERT)
16463 /*make sure the invert info is in each */
16464 sv_catpvs(sv, "^");
16467 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16468 sv_catpvs(sv, "{non-utf8-latin1-all}");
16471 /* output information about the unicode matching */
16472 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16473 sv_catpvs(sv, "{above_bitmap_all}");
16474 else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16475 SV *lv; /* Set if there is something outside the bit map. */
16476 bool byte_output = FALSE; /* If something in the bitmap has
16478 SV *only_utf8_locale;
16480 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16481 * is used to guarantee that nothing in the bitmap gets
16483 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16484 &lv, &only_utf8_locale,
16486 if (lv && lv != &PL_sv_undef) {
16487 char *s = savesvpv(lv);
16488 char * const origs = s;
16490 while (*s && *s != '\n')
16494 const char * const t = ++s;
16496 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16497 sv_catpvs(sv, "{outside bitmap}");
16500 sv_catpvs(sv, "{utf8}");
16504 sv_catpvs(sv, " ");
16510 /* Truncate very long output */
16511 if (s - origs > 256) {
16512 Perl_sv_catpvf(aTHX_ sv,
16514 (int) (s - origs - 1),
16520 else if (*s == '\t') {
16534 SvREFCNT_dec_NN(lv);
16537 if ((flags & ANYOF_LOC_FOLD)
16538 && only_utf8_locale
16539 && only_utf8_locale != &PL_sv_undef)
16542 int max_entries = 256;
16544 sv_catpvs(sv, "{utf8 locale}");
16545 invlist_iterinit(only_utf8_locale);
16546 while (invlist_iternext(only_utf8_locale,
16548 put_range(sv, start, end, FALSE);
16550 if (max_entries < 0) {
16551 sv_catpvs(sv, "...");
16555 invlist_iterfinish(only_utf8_locale);
16559 SvREFCNT_dec(bitmap_invlist);
16562 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16564 else if (k == POSIXD || k == NPOSIXD) {
16565 U8 index = FLAGS(o) * 2;
16566 if (index < C_ARRAY_LENGTH(anyofs)) {
16567 if (*anyofs[index] != '[') {
16570 sv_catpv(sv, anyofs[index]);
16571 if (*anyofs[index] != '[') {
16576 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16579 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16580 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16581 else if (OP(o) == SBOL)
16582 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16584 PERL_UNUSED_CONTEXT;
16585 PERL_UNUSED_ARG(sv);
16586 PERL_UNUSED_ARG(o);
16587 PERL_UNUSED_ARG(prog);
16588 PERL_UNUSED_ARG(reginfo);
16589 PERL_UNUSED_ARG(pRExC_state);
16590 #endif /* DEBUGGING */
16596 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16597 { /* Assume that RE_INTUIT is set */
16598 struct regexp *const prog = ReANY(r);
16599 GET_RE_DEBUG_FLAGS_DECL;
16601 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16602 PERL_UNUSED_CONTEXT;
16606 const char * const s = SvPV_nolen_const(prog->check_substr
16607 ? prog->check_substr : prog->check_utf8);
16609 if (!PL_colorset) reginitcolors();
16610 PerlIO_printf(Perl_debug_log,
16611 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16613 prog->check_substr ? "" : "utf8 ",
16614 PL_colors[5],PL_colors[0],
16617 (strlen(s) > 60 ? "..." : ""));
16620 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16626 handles refcounting and freeing the perl core regexp structure. When
16627 it is necessary to actually free the structure the first thing it
16628 does is call the 'free' method of the regexp_engine associated to
16629 the regexp, allowing the handling of the void *pprivate; member
16630 first. (This routine is not overridable by extensions, which is why
16631 the extensions free is called first.)
16633 See regdupe and regdupe_internal if you change anything here.
16635 #ifndef PERL_IN_XSUB_RE
16637 Perl_pregfree(pTHX_ REGEXP *r)
16643 Perl_pregfree2(pTHX_ REGEXP *rx)
16645 struct regexp *const r = ReANY(rx);
16646 GET_RE_DEBUG_FLAGS_DECL;
16648 PERL_ARGS_ASSERT_PREGFREE2;
16650 if (r->mother_re) {
16651 ReREFCNT_dec(r->mother_re);
16653 CALLREGFREE_PVT(rx); /* free the private data */
16654 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16655 Safefree(r->xpv_len_u.xpvlenu_pv);
16658 SvREFCNT_dec(r->anchored_substr);
16659 SvREFCNT_dec(r->anchored_utf8);
16660 SvREFCNT_dec(r->float_substr);
16661 SvREFCNT_dec(r->float_utf8);
16662 Safefree(r->substrs);
16664 RX_MATCH_COPY_FREE(rx);
16665 #ifdef PERL_ANY_COW
16666 SvREFCNT_dec(r->saved_copy);
16669 SvREFCNT_dec(r->qr_anoncv);
16670 rx->sv_u.svu_rx = 0;
16675 This is a hacky workaround to the structural issue of match results
16676 being stored in the regexp structure which is in turn stored in
16677 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16678 could be PL_curpm in multiple contexts, and could require multiple
16679 result sets being associated with the pattern simultaneously, such
16680 as when doing a recursive match with (??{$qr})
16682 The solution is to make a lightweight copy of the regexp structure
16683 when a qr// is returned from the code executed by (??{$qr}) this
16684 lightweight copy doesn't actually own any of its data except for
16685 the starp/end and the actual regexp structure itself.
16691 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16693 struct regexp *ret;
16694 struct regexp *const r = ReANY(rx);
16695 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16697 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16700 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16702 SvOK_off((SV *)ret_x);
16704 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16705 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16706 made both spots point to the same regexp body.) */
16707 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16708 assert(!SvPVX(ret_x));
16709 ret_x->sv_u.svu_rx = temp->sv_any;
16710 temp->sv_any = NULL;
16711 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16712 SvREFCNT_dec_NN(temp);
16713 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16714 ing below will not set it. */
16715 SvCUR_set(ret_x, SvCUR(rx));
16718 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16719 sv_force_normal(sv) is called. */
16721 ret = ReANY(ret_x);
16723 SvFLAGS(ret_x) |= SvUTF8(rx);
16724 /* We share the same string buffer as the original regexp, on which we
16725 hold a reference count, incremented when mother_re is set below.
16726 The string pointer is copied here, being part of the regexp struct.
16728 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16729 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16731 const I32 npar = r->nparens+1;
16732 Newx(ret->offs, npar, regexp_paren_pair);
16733 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16736 Newx(ret->substrs, 1, struct reg_substr_data);
16737 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16739 SvREFCNT_inc_void(ret->anchored_substr);
16740 SvREFCNT_inc_void(ret->anchored_utf8);
16741 SvREFCNT_inc_void(ret->float_substr);
16742 SvREFCNT_inc_void(ret->float_utf8);
16744 /* check_substr and check_utf8, if non-NULL, point to either their
16745 anchored or float namesakes, and don't hold a second reference. */
16747 RX_MATCH_COPIED_off(ret_x);
16748 #ifdef PERL_ANY_COW
16749 ret->saved_copy = NULL;
16751 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16752 SvREFCNT_inc_void(ret->qr_anoncv);
16758 /* regfree_internal()
16760 Free the private data in a regexp. This is overloadable by
16761 extensions. Perl takes care of the regexp structure in pregfree(),
16762 this covers the *pprivate pointer which technically perl doesn't
16763 know about, however of course we have to handle the
16764 regexp_internal structure when no extension is in use.
16766 Note this is called before freeing anything in the regexp
16771 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16773 struct regexp *const r = ReANY(rx);
16774 RXi_GET_DECL(r,ri);
16775 GET_RE_DEBUG_FLAGS_DECL;
16777 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16783 SV *dsv= sv_newmortal();
16784 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16785 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16786 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16787 PL_colors[4],PL_colors[5],s);
16790 #ifdef RE_TRACK_PATTERN_OFFSETS
16792 Safefree(ri->u.offsets); /* 20010421 MJD */
16794 if (ri->code_blocks) {
16796 for (n = 0; n < ri->num_code_blocks; n++)
16797 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16798 Safefree(ri->code_blocks);
16802 int n = ri->data->count;
16805 /* If you add a ->what type here, update the comment in regcomp.h */
16806 switch (ri->data->what[n]) {
16812 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16815 Safefree(ri->data->data[n]);
16821 { /* Aho Corasick add-on structure for a trie node.
16822 Used in stclass optimization only */
16824 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16825 #ifdef USE_ITHREADS
16829 refcount = --aho->refcount;
16832 PerlMemShared_free(aho->states);
16833 PerlMemShared_free(aho->fail);
16834 /* do this last!!!! */
16835 PerlMemShared_free(ri->data->data[n]);
16836 /* we should only ever get called once, so
16837 * assert as much, and also guard the free
16838 * which /might/ happen twice. At the least
16839 * it will make code anlyzers happy and it
16840 * doesn't cost much. - Yves */
16841 assert(ri->regstclass);
16842 if (ri->regstclass) {
16843 PerlMemShared_free(ri->regstclass);
16844 ri->regstclass = 0;
16851 /* trie structure. */
16853 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16854 #ifdef USE_ITHREADS
16858 refcount = --trie->refcount;
16861 PerlMemShared_free(trie->charmap);
16862 PerlMemShared_free(trie->states);
16863 PerlMemShared_free(trie->trans);
16865 PerlMemShared_free(trie->bitmap);
16867 PerlMemShared_free(trie->jump);
16868 PerlMemShared_free(trie->wordinfo);
16869 /* do this last!!!! */
16870 PerlMemShared_free(ri->data->data[n]);
16875 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16876 ri->data->what[n]);
16879 Safefree(ri->data->what);
16880 Safefree(ri->data);
16886 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16887 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16888 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16891 re_dup - duplicate a regexp.
16893 This routine is expected to clone a given regexp structure. It is only
16894 compiled under USE_ITHREADS.
16896 After all of the core data stored in struct regexp is duplicated
16897 the regexp_engine.dupe method is used to copy any private data
16898 stored in the *pprivate pointer. This allows extensions to handle
16899 any duplication it needs to do.
16901 See pregfree() and regfree_internal() if you change anything here.
16903 #if defined(USE_ITHREADS)
16904 #ifndef PERL_IN_XSUB_RE
16906 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16910 const struct regexp *r = ReANY(sstr);
16911 struct regexp *ret = ReANY(dstr);
16913 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16915 npar = r->nparens+1;
16916 Newx(ret->offs, npar, regexp_paren_pair);
16917 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16919 if (ret->substrs) {
16920 /* Do it this way to avoid reading from *r after the StructCopy().
16921 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16922 cache, it doesn't matter. */
16923 const bool anchored = r->check_substr
16924 ? r->check_substr == r->anchored_substr
16925 : r->check_utf8 == r->anchored_utf8;
16926 Newx(ret->substrs, 1, struct reg_substr_data);
16927 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16929 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16930 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16931 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16932 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16934 /* check_substr and check_utf8, if non-NULL, point to either their
16935 anchored or float namesakes, and don't hold a second reference. */
16937 if (ret->check_substr) {
16939 assert(r->check_utf8 == r->anchored_utf8);
16940 ret->check_substr = ret->anchored_substr;
16941 ret->check_utf8 = ret->anchored_utf8;
16943 assert(r->check_substr == r->float_substr);
16944 assert(r->check_utf8 == r->float_utf8);
16945 ret->check_substr = ret->float_substr;
16946 ret->check_utf8 = ret->float_utf8;
16948 } else if (ret->check_utf8) {
16950 ret->check_utf8 = ret->anchored_utf8;
16952 ret->check_utf8 = ret->float_utf8;
16957 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16958 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16961 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16963 if (RX_MATCH_COPIED(dstr))
16964 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16966 ret->subbeg = NULL;
16967 #ifdef PERL_ANY_COW
16968 ret->saved_copy = NULL;
16971 /* Whether mother_re be set or no, we need to copy the string. We
16972 cannot refrain from copying it when the storage points directly to
16973 our mother regexp, because that's
16974 1: a buffer in a different thread
16975 2: something we no longer hold a reference on
16976 so we need to copy it locally. */
16977 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16978 ret->mother_re = NULL;
16980 #endif /* PERL_IN_XSUB_RE */
16985 This is the internal complement to regdupe() which is used to copy
16986 the structure pointed to by the *pprivate pointer in the regexp.
16987 This is the core version of the extension overridable cloning hook.
16988 The regexp structure being duplicated will be copied by perl prior
16989 to this and will be provided as the regexp *r argument, however
16990 with the /old/ structures pprivate pointer value. Thus this routine
16991 may override any copying normally done by perl.
16993 It returns a pointer to the new regexp_internal structure.
16997 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17000 struct regexp *const r = ReANY(rx);
17001 regexp_internal *reti;
17003 RXi_GET_DECL(r,ri);
17005 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17009 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17010 char, regexp_internal);
17011 Copy(ri->program, reti->program, len+1, regnode);
17013 reti->num_code_blocks = ri->num_code_blocks;
17014 if (ri->code_blocks) {
17016 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17017 struct reg_code_block);
17018 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17019 struct reg_code_block);
17020 for (n = 0; n < ri->num_code_blocks; n++)
17021 reti->code_blocks[n].src_regex = (REGEXP*)
17022 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17025 reti->code_blocks = NULL;
17027 reti->regstclass = NULL;
17030 struct reg_data *d;
17031 const int count = ri->data->count;
17034 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17035 char, struct reg_data);
17036 Newx(d->what, count, U8);
17039 for (i = 0; i < count; i++) {
17040 d->what[i] = ri->data->what[i];
17041 switch (d->what[i]) {
17042 /* see also regcomp.h and regfree_internal() */
17043 case 'a': /* actually an AV, but the dup function is identical. */
17047 case 'u': /* actually an HV, but the dup function is identical. */
17048 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17051 /* This is cheating. */
17052 Newx(d->data[i], 1, regnode_ssc);
17053 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17054 reti->regstclass = (regnode*)d->data[i];
17057 /* Trie stclasses are readonly and can thus be shared
17058 * without duplication. We free the stclass in pregfree
17059 * when the corresponding reg_ac_data struct is freed.
17061 reti->regstclass= ri->regstclass;
17065 ((reg_trie_data*)ri->data->data[i])->refcount++;
17070 d->data[i] = ri->data->data[i];
17073 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17074 ri->data->what[i]);
17083 reti->name_list_idx = ri->name_list_idx;
17085 #ifdef RE_TRACK_PATTERN_OFFSETS
17086 if (ri->u.offsets) {
17087 Newx(reti->u.offsets, 2*len+1, U32);
17088 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17091 SetProgLen(reti,len);
17094 return (void*)reti;
17097 #endif /* USE_ITHREADS */
17099 #ifndef PERL_IN_XSUB_RE
17102 - regnext - dig the "next" pointer out of a node
17105 Perl_regnext(pTHX_ regnode *p)
17112 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17113 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17114 (int)OP(p), (int)REGNODE_MAX);
17117 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17126 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17129 STRLEN l1 = strlen(pat1);
17130 STRLEN l2 = strlen(pat2);
17133 const char *message;
17135 PERL_ARGS_ASSERT_RE_CROAK2;
17141 Copy(pat1, buf, l1 , char);
17142 Copy(pat2, buf + l1, l2 , char);
17143 buf[l1 + l2] = '\n';
17144 buf[l1 + l2 + 1] = '\0';
17145 va_start(args, pat2);
17146 msv = vmess(buf, &args);
17148 message = SvPV_const(msv,l1);
17151 Copy(message, buf, l1 , char);
17152 /* l1-1 to avoid \n */
17153 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17157 /* Certain characters are output as a sequence with the first being a
17159 #define isBACKSLASHED_PUNCT(c) \
17160 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17163 S_put_code_point(pTHX_ SV *sv, UV c)
17165 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17168 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17170 else if (isPRINT(c)) {
17171 const char string = (char) c;
17172 if (isBACKSLASHED_PUNCT(c))
17173 sv_catpvs(sv, "\\");
17174 sv_catpvn(sv, &string, 1);
17177 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17179 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17182 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17187 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17190 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17192 /* Appends to 'sv' a displayable version of the range of code points from
17193 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17194 * as-is (though some of these will be escaped by put_code_point()). */
17196 const unsigned int min_range_count = 3;
17198 assert(start <= end);
17200 PERL_ARGS_ASSERT_PUT_RANGE;
17202 while (start <= end) {
17204 const char * format;
17206 if (end - start < min_range_count) {
17208 /* Individual chars in short ranges */
17209 for (; start <= end; start++) {
17210 put_code_point(sv, start);
17215 /* If permitted by the input options, and there is a possibility that
17216 * this range contains a printable literal, look to see if there is
17218 if (allow_literals && start <= MAX_PRINT_A) {
17220 /* If the range begin isn't an ASCII printable, effectively split
17221 * the range into two parts:
17222 * 1) the portion before the first such printable,
17224 * and output them separately. */
17225 if (! isPRINT_A(start)) {
17226 UV temp_end = start + 1;
17228 /* There is no point looking beyond the final possible
17229 * printable, in MAX_PRINT_A */
17230 UV max = MIN(end, MAX_PRINT_A);
17232 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17236 /* Here, temp_end points to one beyond the first printable if
17237 * found, or to one beyond 'max' if not. If none found, make
17238 * sure that we use the entire range */
17239 if (temp_end > MAX_PRINT_A) {
17240 temp_end = end + 1;
17243 /* Output the first part of the split range, the part that
17244 * doesn't have printables, with no looking for literals
17245 * (otherwise we would infinitely recurse) */
17246 put_range(sv, start, temp_end - 1, FALSE);
17248 /* The 2nd part of the range (if any) starts here. */
17251 /* We continue instead of dropping down because even if the 2nd
17252 * part is non-empty, it could be so short that we want to
17253 * output it specially, as tested for at the top of this loop.
17258 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17259 * output a sub-range of just the digits or letters, then process
17260 * the remaining portion as usual. */
17261 if (isALPHANUMERIC_A(start)) {
17262 UV mask = (isDIGIT_A(start))
17267 UV temp_end = start + 1;
17269 /* Find the end of the sub-range that includes just the
17270 * characters in the same class as the first character in it */
17271 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17276 /* For short ranges, don't duplicate the code above to output
17277 * them; just call recursively */
17278 if (temp_end - start < min_range_count) {
17279 put_range(sv, start, temp_end, FALSE);
17281 else { /* Output as a range */
17282 put_code_point(sv, start);
17283 sv_catpvs(sv, "-");
17284 put_code_point(sv, temp_end);
17286 start = temp_end + 1;
17290 /* We output any other printables as individual characters */
17291 if (isPUNCT_A(start) || isSPACE_A(start)) {
17292 while (start <= end && (isPUNCT_A(start)
17293 || isSPACE_A(start)))
17295 put_code_point(sv, start);
17300 } /* End of looking for literals */
17302 /* Here is not to output as a literal. Some control characters have
17303 * mnemonic names. Split off any of those at the beginning and end of
17304 * the range to print mnemonically. It isn't possible for many of
17305 * these to be in a row, so this won't overwhelm with output */
17306 while (isMNEMONIC_CNTRL(start) && start <= end) {
17307 put_code_point(sv, start);
17310 if (start < end && isMNEMONIC_CNTRL(end)) {
17312 /* Here, the final character in the range has a mnemonic name.
17313 * Work backwards from the end to find the final non-mnemonic */
17314 UV temp_end = end - 1;
17315 while (isMNEMONIC_CNTRL(temp_end)) {
17319 /* And separately output the range that doesn't have mnemonics */
17320 put_range(sv, start, temp_end, FALSE);
17322 /* Then output the mnemonic trailing controls */
17323 start = temp_end + 1;
17324 while (start <= end) {
17325 put_code_point(sv, start);
17331 /* As a final resort, output the range or subrange as hex. */
17333 this_end = (end < NUM_ANYOF_CODE_POINTS)
17335 : NUM_ANYOF_CODE_POINTS - 1;
17336 format = (this_end < 256)
17337 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17338 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17339 GCC_DIAG_IGNORE(-Wformat-nonliteral);
17340 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17347 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17349 /* Appends to 'sv' a displayable version of the innards of the bracketed
17350 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17351 * output anything, and bitmap_invlist, if not NULL, will point to an
17352 * inversion list of what is in the bit map */
17356 unsigned int punct_count = 0;
17357 SV* invlist = NULL;
17358 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17359 bool allow_literals = TRUE;
17361 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17363 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17365 /* Worst case is exactly every-other code point is in the list */
17366 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17368 /* Convert the bit map to an inversion list, keeping track of how many
17369 * ASCII puncts are set, including an extra amount for the backslashed
17371 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17372 if (BITMAP_TEST(bitmap, i)) {
17373 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17374 if (isPUNCT_A(i)) {
17376 if isBACKSLASHED_PUNCT(i) {
17383 /* Nothing to output */
17384 if (_invlist_len(*invlist_ptr) == 0) {
17385 SvREFCNT_dec(invlist);
17389 /* Generally, it is more readable if printable characters are output as
17390 * literals, but if a range (nearly) spans all of them, it's best to output
17391 * it as a single range. This code will use a single range if all but 2
17392 * printables are in it */
17393 invlist_iterinit(*invlist_ptr);
17394 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17396 /* If range starts beyond final printable, it doesn't have any in it */
17397 if (start > MAX_PRINT_A) {
17401 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17402 * all but two, the range must start and end no later than 2 from
17404 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17405 if (end > MAX_PRINT_A) {
17411 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17412 allow_literals = FALSE;
17417 invlist_iterfinish(*invlist_ptr);
17419 /* The legibility of the output depends mostly on how many punctuation
17420 * characters are output. There are 32 possible ASCII ones, and some have
17421 * an additional backslash, bringing it to currently 36, so if any more
17422 * than 18 are to be output, we can instead output it as its complement,
17423 * yielding fewer puncts, and making it more legible. But give some weight
17424 * to the fact that outputting it as a complement is less legible than a
17425 * straight output, so don't complement unless we are somewhat over the 18
17427 if (allow_literals && punct_count > 22) {
17428 sv_catpvs(sv, "^");
17430 /* Add everything remaining to the list, so when we invert it just
17431 * below, it will be excluded */
17432 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17433 _invlist_invert(*invlist_ptr);
17436 /* Here we have figured things out. Output each range */
17437 invlist_iterinit(*invlist_ptr);
17438 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17439 if (start >= NUM_ANYOF_CODE_POINTS) {
17442 put_range(sv, start, end, allow_literals);
17444 invlist_iterfinish(*invlist_ptr);
17449 #define CLEAR_OPTSTART \
17450 if (optstart) STMT_START { \
17451 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17452 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17456 #define DUMPUNTIL(b,e) \
17458 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17460 STATIC const regnode *
17461 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17462 const regnode *last, const regnode *plast,
17463 SV* sv, I32 indent, U32 depth)
17465 U8 op = PSEUDO; /* Arbitrary non-END op. */
17466 const regnode *next;
17467 const regnode *optstart= NULL;
17469 RXi_GET_DECL(r,ri);
17470 GET_RE_DEBUG_FLAGS_DECL;
17472 PERL_ARGS_ASSERT_DUMPUNTIL;
17474 #ifdef DEBUG_DUMPUNTIL
17475 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17476 last ? last-start : 0,plast ? plast-start : 0);
17479 if (plast && plast < last)
17482 while (PL_regkind[op] != END && (!last || node < last)) {
17484 /* While that wasn't END last time... */
17487 if (op == CLOSE || op == WHILEM)
17489 next = regnext((regnode *)node);
17492 if (OP(node) == OPTIMIZED) {
17493 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17500 regprop(r, sv, node, NULL, NULL);
17501 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17502 (int)(2*indent + 1), "", SvPVX_const(sv));
17504 if (OP(node) != OPTIMIZED) {
17505 if (next == NULL) /* Next ptr. */
17506 PerlIO_printf(Perl_debug_log, " (0)");
17507 else if (PL_regkind[(U8)op] == BRANCH
17508 && PL_regkind[OP(next)] != BRANCH )
17509 PerlIO_printf(Perl_debug_log, " (FAIL)");
17511 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17512 (void)PerlIO_putc(Perl_debug_log, '\n');
17516 if (PL_regkind[(U8)op] == BRANCHJ) {
17519 const regnode *nnode = (OP(next) == LONGJMP
17520 ? regnext((regnode *)next)
17522 if (last && nnode > last)
17524 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17527 else if (PL_regkind[(U8)op] == BRANCH) {
17529 DUMPUNTIL(NEXTOPER(node), next);
17531 else if ( PL_regkind[(U8)op] == TRIE ) {
17532 const regnode *this_trie = node;
17533 const char op = OP(node);
17534 const U32 n = ARG(node);
17535 const reg_ac_data * const ac = op>=AHOCORASICK ?
17536 (reg_ac_data *)ri->data->data[n] :
17538 const reg_trie_data * const trie =
17539 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17541 AV *const trie_words
17542 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17544 const regnode *nextbranch= NULL;
17547 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17548 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17550 PerlIO_printf(Perl_debug_log, "%*s%s ",
17551 (int)(2*(indent+3)), "",
17553 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17554 SvCUR(*elem_ptr), 60,
17555 PL_colors[0], PL_colors[1],
17557 ? PERL_PV_ESCAPE_UNI
17559 | PERL_PV_PRETTY_ELLIPSES
17560 | PERL_PV_PRETTY_LTGT
17565 U16 dist= trie->jump[word_idx+1];
17566 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17567 (UV)((dist ? this_trie + dist : next) - start));
17570 nextbranch= this_trie + trie->jump[0];
17571 DUMPUNTIL(this_trie + dist, nextbranch);
17573 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17574 nextbranch= regnext((regnode *)nextbranch);
17576 PerlIO_printf(Perl_debug_log, "\n");
17579 if (last && next > last)
17584 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17585 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17586 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17588 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17590 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17592 else if ( op == PLUS || op == STAR) {
17593 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17595 else if (PL_regkind[(U8)op] == ANYOF) {
17596 /* arglen 1 + class block */
17597 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17598 ? ANYOF_POSIXL_SKIP
17600 node = NEXTOPER(node);
17602 else if (PL_regkind[(U8)op] == EXACT) {
17603 /* Literal string, where present. */
17604 node += NODE_SZ_STR(node) - 1;
17605 node = NEXTOPER(node);
17608 node = NEXTOPER(node);
17609 node += regarglen[(U8)op];
17611 if (op == CURLYX || op == OPEN)
17615 #ifdef DEBUG_DUMPUNTIL
17616 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17621 #endif /* DEBUGGING */
17625 * c-indentation-style: bsd
17626 * c-basic-offset: 4
17627 * indent-tabs-mode: nil
17630 * ex: set ts=8 sts=4 sw=4 et: