5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 /* this is a chain of data about sub patterns we are processing that
110 need to be handled separately/specially in study_chunk. Its so
111 we can simulate recursion without losing state. */
113 typedef struct scan_frame {
114 regnode *last_regnode; /* last node to process in this frame */
115 regnode *next_regnode; /* next node to process when last is reached */
116 U32 prev_recursed_depth;
117 I32 stopparen; /* what stopparen do we use */
118 U32 is_top_frame; /* what flags do we use? */
120 struct scan_frame *this_prev_frame; /* this previous frame */
121 struct scan_frame *prev_frame; /* previous frame */
122 struct scan_frame *next_frame; /* next frame */
125 struct RExC_state_t {
126 U32 flags; /* RXf_* are we folding, multilining? */
127 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
128 char *precomp; /* uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 SSize_t whilem_seen; /* number of WHILEM in this expr */
137 regnode *emit_start; /* Start of emitted-code area */
138 regnode *emit_bound; /* First regnode outside of the
140 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
141 implies compiling, so don't emit */
142 regnode_ssc emit_dummy; /* placeholder for emit to point to;
143 large enough for the largest
144 non-EXACTish node, so can use it as
146 I32 naughty; /* How bad is this pattern? */
147 I32 sawback; /* Did we see \1, ...? */
149 SSize_t size; /* Code size. */
150 I32 npar; /* Capture buffer count, (OPEN) plus
151 one. ("par" 0 is the whole
153 I32 nestroot; /* root parens we are in - used by
157 regnode **open_parens; /* pointers to open parens */
158 regnode **close_parens; /* pointers to close parens */
159 regnode *opend; /* END node in program */
160 I32 utf8; /* whether the pattern is utf8 or not */
161 I32 orig_utf8; /* whether the pattern was originally in utf8 */
162 /* XXX use this for future optimisation of case
163 * where pattern must be upgraded to utf8. */
164 I32 uni_semantics; /* If a d charset modifier should use unicode
165 rules, even if the pattern is not in
167 HV *paren_names; /* Paren names */
169 regnode **recurse; /* Recurse regops */
170 I32 recurse_count; /* Number of recurse regops */
171 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
173 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
177 I32 override_recoding;
178 I32 in_multi_char_class;
179 struct reg_code_block *code_blocks; /* positions of literal (?{})
181 int num_code_blocks; /* size of code_blocks[] */
182 int code_index; /* next code_blocks[] slot */
183 SSize_t maxlen; /* mininum possible number of chars in string to match */
184 scan_frame *frame_head;
185 scan_frame *frame_last;
187 #ifdef ADD_TO_REGEXEC
188 char *starttry; /* -Dr: where regtry was called. */
189 #define RExC_starttry (pRExC_state->starttry)
191 SV *runtime_code_qr; /* qr with the runtime code blocks */
193 const char *lastparse;
195 AV *paren_name_list; /* idx -> name */
196 U32 study_chunk_recursed_count;
199 #define RExC_lastparse (pRExC_state->lastparse)
200 #define RExC_lastnum (pRExC_state->lastnum)
201 #define RExC_paren_name_list (pRExC_state->paren_name_list)
202 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
203 #define RExC_mysv (pRExC_state->mysv1)
204 #define RExC_mysv1 (pRExC_state->mysv1)
205 #define RExC_mysv2 (pRExC_state->mysv2)
210 #define RExC_flags (pRExC_state->flags)
211 #define RExC_pm_flags (pRExC_state->pm_flags)
212 #define RExC_precomp (pRExC_state->precomp)
213 #define RExC_rx_sv (pRExC_state->rx_sv)
214 #define RExC_rx (pRExC_state->rx)
215 #define RExC_rxi (pRExC_state->rxi)
216 #define RExC_start (pRExC_state->start)
217 #define RExC_end (pRExC_state->end)
218 #define RExC_parse (pRExC_state->parse)
219 #define RExC_whilem_seen (pRExC_state->whilem_seen)
220 #ifdef RE_TRACK_PATTERN_OFFSETS
221 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
224 #define RExC_emit (pRExC_state->emit)
225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
226 #define RExC_emit_start (pRExC_state->emit_start)
227 #define RExC_emit_bound (pRExC_state->emit_bound)
228 #define RExC_naughty (pRExC_state->naughty)
229 #define RExC_sawback (pRExC_state->sawback)
230 #define RExC_seen (pRExC_state->seen)
231 #define RExC_size (pRExC_state->size)
232 #define RExC_maxlen (pRExC_state->maxlen)
233 #define RExC_npar (pRExC_state->npar)
234 #define RExC_nestroot (pRExC_state->nestroot)
235 #define RExC_extralen (pRExC_state->extralen)
236 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
237 #define RExC_utf8 (pRExC_state->utf8)
238 #define RExC_uni_semantics (pRExC_state->uni_semantics)
239 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
240 #define RExC_open_parens (pRExC_state->open_parens)
241 #define RExC_close_parens (pRExC_state->close_parens)
242 #define RExC_opend (pRExC_state->opend)
243 #define RExC_paren_names (pRExC_state->paren_names)
244 #define RExC_recurse (pRExC_state->recurse)
245 #define RExC_recurse_count (pRExC_state->recurse_count)
246 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
247 #define RExC_study_chunk_recursed_bytes \
248 (pRExC_state->study_chunk_recursed_bytes)
249 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
250 #define RExC_contains_locale (pRExC_state->contains_locale)
251 #define RExC_contains_i (pRExC_state->contains_i)
252 #define RExC_override_recoding (pRExC_state->override_recoding)
253 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
254 #define RExC_frame_head (pRExC_state->frame_head)
255 #define RExC_frame_last (pRExC_state->frame_last)
256 #define RExC_frame_count (pRExC_state->frame_count)
259 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
260 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
261 ((*s) == '{' && regcurly(s)))
264 * Flags to be passed up and down.
266 #define WORST 0 /* Worst case. */
267 #define HASWIDTH 0x01 /* Known to match non-null strings. */
269 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
270 * character. (There needs to be a case: in the switch statement in regexec.c
271 * for any node marked SIMPLE.) Note that this is not the same thing as
274 #define SPSTART 0x04 /* Starts with * or + */
275 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
276 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
277 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
279 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
281 /* whether trie related optimizations are enabled */
282 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
283 #define TRIE_STUDY_OPT
284 #define FULL_TRIE_STUDY
290 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
291 #define PBITVAL(paren) (1 << ((paren) & 7))
292 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
293 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
294 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
296 #define REQUIRE_UTF8 STMT_START { \
298 *flagp = RESTART_UTF8; \
303 /* This converts the named class defined in regcomp.h to its equivalent class
304 * number defined in handy.h. */
305 #define namedclass_to_classnum(class) ((int) ((class) / 2))
306 #define classnum_to_namedclass(classnum) ((classnum) * 2)
308 #define _invlist_union_complement_2nd(a, b, output) \
309 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
310 #define _invlist_intersection_complement_2nd(a, b, output) \
311 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
313 /* About scan_data_t.
315 During optimisation we recurse through the regexp program performing
316 various inplace (keyhole style) optimisations. In addition study_chunk
317 and scan_commit populate this data structure with information about
318 what strings MUST appear in the pattern. We look for the longest
319 string that must appear at a fixed location, and we look for the
320 longest string that may appear at a floating location. So for instance
325 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
326 strings (because they follow a .* construct). study_chunk will identify
327 both FOO and BAR as being the longest fixed and floating strings respectively.
329 The strings can be composites, for instance
333 will result in a composite fixed substring 'foo'.
335 For each string some basic information is maintained:
337 - offset or min_offset
338 This is the position the string must appear at, or not before.
339 It also implicitly (when combined with minlenp) tells us how many
340 characters must match before the string we are searching for.
341 Likewise when combined with minlenp and the length of the string it
342 tells us how many characters must appear after the string we have
346 Only used for floating strings. This is the rightmost point that
347 the string can appear at. If set to SSize_t_MAX it indicates that the
348 string can occur infinitely far to the right.
351 A pointer to the minimum number of characters of the pattern that the
352 string was found inside. This is important as in the case of positive
353 lookahead or positive lookbehind we can have multiple patterns
358 The minimum length of the pattern overall is 3, the minimum length
359 of the lookahead part is 3, but the minimum length of the part that
360 will actually match is 1. So 'FOO's minimum length is 3, but the
361 minimum length for the F is 1. This is important as the minimum length
362 is used to determine offsets in front of and behind the string being
363 looked for. Since strings can be composites this is the length of the
364 pattern at the time it was committed with a scan_commit. Note that
365 the length is calculated by study_chunk, so that the minimum lengths
366 are not known until the full pattern has been compiled, thus the
367 pointer to the value.
371 In the case of lookbehind the string being searched for can be
372 offset past the start point of the final matching string.
373 If this value was just blithely removed from the min_offset it would
374 invalidate some of the calculations for how many chars must match
375 before or after (as they are derived from min_offset and minlen and
376 the length of the string being searched for).
377 When the final pattern is compiled and the data is moved from the
378 scan_data_t structure into the regexp structure the information
379 about lookbehind is factored in, with the information that would
380 have been lost precalculated in the end_shift field for the
383 The fields pos_min and pos_delta are used to store the minimum offset
384 and the delta to the maximum offset at the current point in the pattern.
388 typedef struct scan_data_t {
389 /*I32 len_min; unused */
390 /*I32 len_delta; unused */
394 SSize_t last_end; /* min value, <0 unless valid. */
395 SSize_t last_start_min;
396 SSize_t last_start_max;
397 SV **longest; /* Either &l_fixed, or &l_float. */
398 SV *longest_fixed; /* longest fixed string found in pattern */
399 SSize_t offset_fixed; /* offset where it starts */
400 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
401 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
402 SV *longest_float; /* longest floating string found in pattern */
403 SSize_t offset_float_min; /* earliest point in string it can appear */
404 SSize_t offset_float_max; /* latest point in string it can appear */
405 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
406 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
409 SSize_t *last_closep;
410 regnode_ssc *start_class;
414 * Forward declarations for pregcomp()'s friends.
417 static const scan_data_t zero_scan_data =
418 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
420 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
421 #define SF_BEFORE_SEOL 0x0001
422 #define SF_BEFORE_MEOL 0x0002
423 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
424 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
426 #define SF_FIX_SHIFT_EOL (+2)
427 #define SF_FL_SHIFT_EOL (+4)
429 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
430 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
432 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
433 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
434 #define SF_IS_INF 0x0040
435 #define SF_HAS_PAR 0x0080
436 #define SF_IN_PAR 0x0100
437 #define SF_HAS_EVAL 0x0200
438 #define SCF_DO_SUBSTR 0x0400
439 #define SCF_DO_STCLASS_AND 0x0800
440 #define SCF_DO_STCLASS_OR 0x1000
441 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
442 #define SCF_WHILEM_VISITED_POS 0x2000
444 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
445 #define SCF_SEEN_ACCEPT 0x8000
446 #define SCF_TRIE_DOING_RESTUDY 0x10000
447 #define SCF_IN_DEFINE 0x20000
452 #define UTF cBOOL(RExC_utf8)
454 /* The enums for all these are ordered so things work out correctly */
455 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
456 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
457 == REGEX_DEPENDS_CHARSET)
458 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
459 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
460 >= REGEX_UNICODE_CHARSET)
461 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
462 == REGEX_ASCII_RESTRICTED_CHARSET)
463 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
464 >= REGEX_ASCII_RESTRICTED_CHARSET)
465 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
466 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
468 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
470 /* For programs that want to be strictly Unicode compatible by dying if any
471 * attempt is made to match a non-Unicode code point against a Unicode
473 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
475 #define OOB_NAMEDCLASS -1
477 /* There is no code point that is out-of-bounds, so this is problematic. But
478 * its only current use is to initialize a variable that is always set before
480 #define OOB_UNICODE 0xDEADBEEF
482 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
483 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
486 /* length of regex to show in messages that don't mark a position within */
487 #define RegexLengthToShowInErrorMessages 127
490 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
491 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
492 * op/pragma/warn/regcomp.
494 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
495 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
497 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
498 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
500 #define REPORT_LOCATION_ARGS(offset) \
501 UTF8fARG(UTF, offset, RExC_precomp), \
502 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
505 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
506 * arg. Show regex, up to a maximum length. If it's too long, chop and add
509 #define _FAIL(code) STMT_START { \
510 const char *ellipses = ""; \
511 IV len = RExC_end - RExC_precomp; \
514 SAVEFREESV(RExC_rx_sv); \
515 if (len > RegexLengthToShowInErrorMessages) { \
516 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
517 len = RegexLengthToShowInErrorMessages - 10; \
523 #define FAIL(msg) _FAIL( \
524 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
525 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
527 #define FAIL2(msg,arg) _FAIL( \
528 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
529 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
532 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
534 #define Simple_vFAIL(m) STMT_START { \
536 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
537 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
538 m, REPORT_LOCATION_ARGS(offset)); \
542 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
544 #define vFAIL(m) STMT_START { \
546 SAVEFREESV(RExC_rx_sv); \
551 * Like Simple_vFAIL(), but accepts two arguments.
553 #define Simple_vFAIL2(m,a1) STMT_START { \
554 const IV offset = RExC_parse - RExC_precomp; \
555 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
556 REPORT_LOCATION_ARGS(offset)); \
560 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
562 #define vFAIL2(m,a1) STMT_START { \
564 SAVEFREESV(RExC_rx_sv); \
565 Simple_vFAIL2(m, a1); \
570 * Like Simple_vFAIL(), but accepts three arguments.
572 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
573 const IV offset = RExC_parse - RExC_precomp; \
574 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
575 REPORT_LOCATION_ARGS(offset)); \
579 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
581 #define vFAIL3(m,a1,a2) STMT_START { \
583 SAVEFREESV(RExC_rx_sv); \
584 Simple_vFAIL3(m, a1, a2); \
588 * Like Simple_vFAIL(), but accepts four arguments.
590 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
591 const IV offset = RExC_parse - RExC_precomp; \
592 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
593 REPORT_LOCATION_ARGS(offset)); \
596 #define vFAIL4(m,a1,a2,a3) STMT_START { \
598 SAVEFREESV(RExC_rx_sv); \
599 Simple_vFAIL4(m, a1, a2, a3); \
602 /* A specialized version of vFAIL2 that works with UTF8f */
603 #define vFAIL2utf8f(m, a1) STMT_START { \
604 const IV offset = RExC_parse - RExC_precomp; \
606 SAVEFREESV(RExC_rx_sv); \
607 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
608 REPORT_LOCATION_ARGS(offset)); \
611 /* These have asserts in them because of [perl #122671] Many warnings in
612 * regcomp.c can occur twice. If they get output in pass1 and later in that
613 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
614 * would get output again. So they should be output in pass2, and these
615 * asserts make sure new warnings follow that paradigm. */
617 /* m is not necessarily a "literal string", in this macro */
618 #define reg_warn_non_literal_string(loc, m) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
621 m, REPORT_LOCATION_ARGS(offset)); \
624 #define ckWARNreg(loc,m) STMT_START { \
625 const IV offset = loc - RExC_precomp; \
626 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
627 REPORT_LOCATION_ARGS(offset)); \
630 #define vWARN_dep(loc, m) STMT_START { \
631 const IV offset = loc - RExC_precomp; \
632 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
633 REPORT_LOCATION_ARGS(offset)); \
636 #define ckWARNdep(loc,m) STMT_START { \
637 const IV offset = loc - RExC_precomp; \
638 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
640 REPORT_LOCATION_ARGS(offset)); \
643 #define ckWARNregdep(loc,m) STMT_START { \
644 const IV offset = loc - RExC_precomp; \
645 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
647 REPORT_LOCATION_ARGS(offset)); \
650 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
654 a1, REPORT_LOCATION_ARGS(offset)); \
657 #define ckWARN2reg(loc, m, a1) STMT_START { \
658 const IV offset = loc - RExC_precomp; \
659 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
660 a1, REPORT_LOCATION_ARGS(offset)); \
663 #define vWARN3(loc, m, a1, a2) STMT_START { \
664 const IV offset = loc - RExC_precomp; \
665 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
666 a1, a2, REPORT_LOCATION_ARGS(offset)); \
669 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
670 const IV offset = loc - RExC_precomp; \
671 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
672 a1, a2, REPORT_LOCATION_ARGS(offset)); \
675 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
676 const IV offset = loc - RExC_precomp; \
677 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
678 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
681 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
682 const IV offset = loc - RExC_precomp; \
683 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
684 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
687 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
688 const IV offset = loc - RExC_precomp; \
689 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
690 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
694 /* Allow for side effects in s */
695 #define REGC(c,s) STMT_START { \
696 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
699 /* Macros for recording node offsets. 20001227 mjd@plover.com
700 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
701 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
702 * Element 0 holds the number n.
703 * Position is 1 indexed.
705 #ifndef RE_TRACK_PATTERN_OFFSETS
706 #define Set_Node_Offset_To_R(node,byte)
707 #define Set_Node_Offset(node,byte)
708 #define Set_Cur_Node_Offset
709 #define Set_Node_Length_To_R(node,len)
710 #define Set_Node_Length(node,len)
711 #define Set_Node_Cur_Length(node,start)
712 #define Node_Offset(n)
713 #define Node_Length(n)
714 #define Set_Node_Offset_Length(node,offset,len)
715 #define ProgLen(ri) ri->u.proglen
716 #define SetProgLen(ri,x) ri->u.proglen = x
718 #define ProgLen(ri) ri->u.offsets[0]
719 #define SetProgLen(ri,x) ri->u.offsets[0] = x
720 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
722 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
723 __LINE__, (int)(node), (int)(byte))); \
725 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
728 RExC_offsets[2*(node)-1] = (byte); \
733 #define Set_Node_Offset(node,byte) \
734 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
735 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
737 #define Set_Node_Length_To_R(node,len) STMT_START { \
739 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
740 __LINE__, (int)(node), (int)(len))); \
742 Perl_croak(aTHX_ "value of node is %d in Length macro", \
745 RExC_offsets[2*(node)] = (len); \
750 #define Set_Node_Length(node,len) \
751 Set_Node_Length_To_R((node)-RExC_emit_start, len)
752 #define Set_Node_Cur_Length(node, start) \
753 Set_Node_Length(node, RExC_parse - start)
755 /* Get offsets and lengths */
756 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
757 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
759 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
760 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
761 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
765 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
766 #define EXPERIMENTAL_INPLACESCAN
767 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
769 #define DEBUG_RExC_seen() \
770 DEBUG_OPTIMISE_MORE_r({ \
771 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
773 if (RExC_seen & REG_ZERO_LEN_SEEN) \
774 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
776 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
777 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
779 if (RExC_seen & REG_GPOS_SEEN) \
780 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
782 if (RExC_seen & REG_CANY_SEEN) \
783 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
785 if (RExC_seen & REG_RECURSE_SEEN) \
786 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
788 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
789 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
791 if (RExC_seen & REG_VERBARG_SEEN) \
792 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
794 if (RExC_seen & REG_CUTGROUP_SEEN) \
795 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
797 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
798 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
800 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
801 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
803 if (RExC_seen & REG_GOSTART_SEEN) \
804 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
806 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
807 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
809 PerlIO_printf(Perl_debug_log,"\n"); \
812 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
813 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
815 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
817 PerlIO_printf(Perl_debug_log, "%s", open_str); \
818 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
819 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
820 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
821 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
822 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
823 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
824 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
825 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
826 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
827 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
828 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
829 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
830 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
831 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
832 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
833 PerlIO_printf(Perl_debug_log, "%s", close_str); \
837 #define DEBUG_STUDYDATA(str,data,depth) \
838 DEBUG_OPTIMISE_MORE_r(if(data){ \
839 PerlIO_printf(Perl_debug_log, \
840 "%*s" str "Pos:%"IVdf"/%"IVdf \
842 (int)(depth)*2, "", \
843 (IV)((data)->pos_min), \
844 (IV)((data)->pos_delta), \
845 (UV)((data)->flags) \
847 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
848 PerlIO_printf(Perl_debug_log, \
849 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
850 (IV)((data)->whilem_c), \
851 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
852 is_inf ? "INF " : "" \
854 if ((data)->last_found) \
855 PerlIO_printf(Perl_debug_log, \
856 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
857 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
858 SvPVX_const((data)->last_found), \
859 (IV)((data)->last_end), \
860 (IV)((data)->last_start_min), \
861 (IV)((data)->last_start_max), \
862 ((data)->longest && \
863 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
864 SvPVX_const((data)->longest_fixed), \
865 (IV)((data)->offset_fixed), \
866 ((data)->longest && \
867 (data)->longest==&((data)->longest_float)) ? "*" : "", \
868 SvPVX_const((data)->longest_float), \
869 (IV)((data)->offset_float_min), \
870 (IV)((data)->offset_float_max) \
872 PerlIO_printf(Perl_debug_log,"\n"); \
877 /* is c a control character for which we have a mnemonic? */
878 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
881 S_cntrl_to_mnemonic(const U8 c)
883 /* Returns the mnemonic string that represents character 'c', if one
884 * exists; NULL otherwise. The only ones that exist for the purposes of
885 * this routine are a few control characters */
888 case '\a': return "\\a";
889 case '\b': return "\\b";
890 case ESC_NATIVE: return "\\e";
891 case '\f': return "\\f";
892 case '\n': return "\\n";
893 case '\r': return "\\r";
894 case '\t': return "\\t";
902 /* Mark that we cannot extend a found fixed substring at this point.
903 Update the longest found anchored substring and the longest found
904 floating substrings if needed. */
907 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
908 SSize_t *minlenp, int is_inf)
910 const STRLEN l = CHR_SVLEN(data->last_found);
911 const STRLEN old_l = CHR_SVLEN(*data->longest);
912 GET_RE_DEBUG_FLAGS_DECL;
914 PERL_ARGS_ASSERT_SCAN_COMMIT;
916 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
917 SvSetMagicSV(*data->longest, data->last_found);
918 if (*data->longest == data->longest_fixed) {
919 data->offset_fixed = l ? data->last_start_min : data->pos_min;
920 if (data->flags & SF_BEFORE_EOL)
922 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
924 data->flags &= ~SF_FIX_BEFORE_EOL;
925 data->minlen_fixed=minlenp;
926 data->lookbehind_fixed=0;
928 else { /* *data->longest == data->longest_float */
929 data->offset_float_min = l ? data->last_start_min : data->pos_min;
930 data->offset_float_max = (l
931 ? data->last_start_max
932 : (data->pos_delta == SSize_t_MAX
934 : data->pos_min + data->pos_delta));
936 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
937 data->offset_float_max = SSize_t_MAX;
938 if (data->flags & SF_BEFORE_EOL)
940 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
942 data->flags &= ~SF_FL_BEFORE_EOL;
943 data->minlen_float=minlenp;
944 data->lookbehind_float=0;
947 SvCUR_set(data->last_found, 0);
949 SV * const sv = data->last_found;
950 if (SvUTF8(sv) && SvMAGICAL(sv)) {
951 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
957 data->flags &= ~SF_BEFORE_EOL;
958 DEBUG_STUDYDATA("commit: ",data,0);
961 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
962 * list that describes which code points it matches */
965 S_ssc_anything(pTHX_ regnode_ssc *ssc)
967 /* Set the SSC 'ssc' to match an empty string or any code point */
969 PERL_ARGS_ASSERT_SSC_ANYTHING;
971 assert(is_ANYOF_SYNTHETIC(ssc));
973 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
974 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
975 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
979 S_ssc_is_anything(const regnode_ssc *ssc)
981 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
982 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
983 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
984 * in any way, so there's no point in using it */
989 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
991 assert(is_ANYOF_SYNTHETIC(ssc));
993 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
997 /* See if the list consists solely of the range 0 - Infinity */
998 invlist_iterinit(ssc->invlist);
999 ret = invlist_iternext(ssc->invlist, &start, &end)
1003 invlist_iterfinish(ssc->invlist);
1009 /* If e.g., both \w and \W are set, matches everything */
1010 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1012 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1013 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1023 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1025 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1026 * string, any code point, or any posix class under locale */
1028 PERL_ARGS_ASSERT_SSC_INIT;
1030 Zero(ssc, 1, regnode_ssc);
1031 set_ANYOF_SYNTHETIC(ssc);
1032 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1035 /* If any portion of the regex is to operate under locale rules,
1036 * initialization includes it. The reason this isn't done for all regexes
1037 * is that the optimizer was written under the assumption that locale was
1038 * all-or-nothing. Given the complexity and lack of documentation in the
1039 * optimizer, and that there are inadequate test cases for locale, many
1040 * parts of it may not work properly, it is safest to avoid locale unless
1042 if (RExC_contains_locale) {
1043 ANYOF_POSIXL_SETALL(ssc);
1046 ANYOF_POSIXL_ZERO(ssc);
1051 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1052 const regnode_ssc *ssc)
1054 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1055 * to the list of code points matched, and locale posix classes; hence does
1056 * not check its flags) */
1061 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1063 assert(is_ANYOF_SYNTHETIC(ssc));
1065 invlist_iterinit(ssc->invlist);
1066 ret = invlist_iternext(ssc->invlist, &start, &end)
1070 invlist_iterfinish(ssc->invlist);
1076 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1084 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1085 const regnode_charclass* const node)
1087 /* Returns a mortal inversion list defining which code points are matched
1088 * by 'node', which is of type ANYOF. Handles complementing the result if
1089 * appropriate. If some code points aren't knowable at this time, the
1090 * returned list must, and will, contain every code point that is a
1093 SV* invlist = sv_2mortal(_new_invlist(0));
1094 SV* only_utf8_locale_invlist = NULL;
1096 const U32 n = ARG(node);
1097 bool new_node_has_latin1 = FALSE;
1099 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1101 /* Look at the data structure created by S_set_ANYOF_arg() */
1102 if (n != ANYOF_ONLY_HAS_BITMAP) {
1103 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1104 AV * const av = MUTABLE_AV(SvRV(rv));
1105 SV **const ary = AvARRAY(av);
1106 assert(RExC_rxi->data->what[n] == 's');
1108 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1109 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1111 else if (ary[0] && ary[0] != &PL_sv_undef) {
1113 /* Here, no compile-time swash, and there are things that won't be
1114 * known until runtime -- we have to assume it could be anything */
1115 return _add_range_to_invlist(invlist, 0, UV_MAX);
1117 else if (ary[3] && ary[3] != &PL_sv_undef) {
1119 /* Here no compile-time swash, and no run-time only data. Use the
1120 * node's inversion list */
1121 invlist = sv_2mortal(invlist_clone(ary[3]));
1124 /* Get the code points valid only under UTF-8 locales */
1125 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1126 && ary[2] && ary[2] != &PL_sv_undef)
1128 only_utf8_locale_invlist = ary[2];
1132 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1133 * code points, and an inversion list for the others, but if there are code
1134 * points that should match only conditionally on the target string being
1135 * UTF-8, those are placed in the inversion list, and not the bitmap.
1136 * Since there are circumstances under which they could match, they are
1137 * included in the SSC. But if the ANYOF node is to be inverted, we have
1138 * to exclude them here, so that when we invert below, the end result
1139 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1140 * have to do this here before we add the unconditionally matched code
1142 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1143 _invlist_intersection_complement_2nd(invlist,
1148 /* Add in the points from the bit map */
1149 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1150 if (ANYOF_BITMAP_TEST(node, i)) {
1151 invlist = add_cp_to_invlist(invlist, i);
1152 new_node_has_latin1 = TRUE;
1156 /* If this can match all upper Latin1 code points, have to add them
1158 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1159 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1162 /* Similarly for these */
1163 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1164 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1167 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168 _invlist_invert(invlist);
1170 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1172 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1173 * locale. We can skip this if there are no 0-255 at all. */
1174 _invlist_union(invlist, PL_Latin1, &invlist);
1177 /* Similarly add the UTF-8 locale possible matches. These have to be
1178 * deferred until after the non-UTF-8 locale ones are taken care of just
1179 * above, or it leads to wrong results under ANYOF_INVERT */
1180 if (only_utf8_locale_invlist) {
1181 _invlist_union_maybe_complement_2nd(invlist,
1182 only_utf8_locale_invlist,
1183 ANYOF_FLAGS(node) & ANYOF_INVERT,
1190 /* These two functions currently do the exact same thing */
1191 #define ssc_init_zero ssc_init
1193 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1194 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1196 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1197 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1198 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1201 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1202 const regnode_charclass *and_with)
1204 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1205 * another SSC or a regular ANYOF class. Can create false positives. */
1210 PERL_ARGS_ASSERT_SSC_AND;
1212 assert(is_ANYOF_SYNTHETIC(ssc));
1214 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1215 * the code point inversion list and just the relevant flags */
1216 if (is_ANYOF_SYNTHETIC(and_with)) {
1217 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1218 anded_flags = ANYOF_FLAGS(and_with);
1220 /* XXX This is a kludge around what appears to be deficiencies in the
1221 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1222 * there are paths through the optimizer where it doesn't get weeded
1223 * out when it should. And if we don't make some extra provision for
1224 * it like the code just below, it doesn't get added when it should.
1225 * This solution is to add it only when AND'ing, which is here, and
1226 * only when what is being AND'ed is the pristine, original node
1227 * matching anything. Thus it is like adding it to ssc_anything() but
1228 * only when the result is to be AND'ed. Probably the same solution
1229 * could be adopted for the same problem we have with /l matching,
1230 * which is solved differently in S_ssc_init(), and that would lead to
1231 * fewer false positives than that solution has. But if this solution
1232 * creates bugs, the consequences are only that a warning isn't raised
1233 * that should be; while the consequences for having /l bugs is
1234 * incorrect matches */
1235 if (ssc_is_anything((regnode_ssc *)and_with)) {
1236 anded_flags |= ANYOF_WARN_SUPER;
1240 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1241 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1244 ANYOF_FLAGS(ssc) &= anded_flags;
1246 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1247 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1248 * 'and_with' may be inverted. When not inverted, we have the situation of
1250 * (C1 | P1) & (C2 | P2)
1251 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1252 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1253 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1254 * <= ((C1 & C2) | P1 | P2)
1255 * Alternatively, the last few steps could be:
1256 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1257 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1258 * <= (C1 | C2 | (P1 & P2))
1259 * We favor the second approach if either P1 or P2 is non-empty. This is
1260 * because these components are a barrier to doing optimizations, as what
1261 * they match cannot be known until the moment of matching as they are
1262 * dependent on the current locale, 'AND"ing them likely will reduce or
1264 * But we can do better if we know that C1,P1 are in their initial state (a
1265 * frequent occurrence), each matching everything:
1266 * (<everything>) & (C2 | P2) = C2 | P2
1267 * Similarly, if C2,P2 are in their initial state (again a frequent
1268 * occurrence), the result is a no-op
1269 * (C1 | P1) & (<everything>) = C1 | P1
1272 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1273 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1274 * <= (C1 & ~C2) | (P1 & ~P2)
1277 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1278 && ! is_ANYOF_SYNTHETIC(and_with))
1282 ssc_intersection(ssc,
1284 FALSE /* Has already been inverted */
1287 /* If either P1 or P2 is empty, the intersection will be also; can skip
1289 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1290 ANYOF_POSIXL_ZERO(ssc);
1292 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1294 /* Note that the Posix class component P from 'and_with' actually
1296 * P = Pa | Pb | ... | Pn
1297 * where each component is one posix class, such as in [\w\s].
1299 * ~P = ~(Pa | Pb | ... | Pn)
1300 * = ~Pa & ~Pb & ... & ~Pn
1301 * <= ~Pa | ~Pb | ... | ~Pn
1302 * The last is something we can easily calculate, but unfortunately
1303 * is likely to have many false positives. We could do better
1304 * in some (but certainly not all) instances if two classes in
1305 * P have known relationships. For example
1306 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1308 * :lower: & :print: = :lower:
1309 * And similarly for classes that must be disjoint. For example,
1310 * since \s and \w can have no elements in common based on rules in
1311 * the POSIX standard,
1312 * \w & ^\S = nothing
1313 * Unfortunately, some vendor locales do not meet the Posix
1314 * standard, in particular almost everything by Microsoft.
1315 * The loop below just changes e.g., \w into \W and vice versa */
1317 regnode_charclass_posixl temp;
1318 int add = 1; /* To calculate the index of the complement */
1320 ANYOF_POSIXL_ZERO(&temp);
1321 for (i = 0; i < ANYOF_MAX; i++) {
1323 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1324 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1326 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1327 ANYOF_POSIXL_SET(&temp, i + add);
1329 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1331 ANYOF_POSIXL_AND(&temp, ssc);
1333 } /* else ssc already has no posixes */
1334 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1335 in its initial state */
1336 else if (! is_ANYOF_SYNTHETIC(and_with)
1337 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1339 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1340 * copy it over 'ssc' */
1341 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1342 if (is_ANYOF_SYNTHETIC(and_with)) {
1343 StructCopy(and_with, ssc, regnode_ssc);
1346 ssc->invlist = anded_cp_list;
1347 ANYOF_POSIXL_ZERO(ssc);
1348 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1349 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1353 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1354 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1356 /* One or the other of P1, P2 is non-empty. */
1357 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1358 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1360 ssc_union(ssc, anded_cp_list, FALSE);
1362 else { /* P1 = P2 = empty */
1363 ssc_intersection(ssc, anded_cp_list, FALSE);
1369 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1370 const regnode_charclass *or_with)
1372 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1373 * another SSC or a regular ANYOF class. Can create false positives if
1374 * 'or_with' is to be inverted. */
1379 PERL_ARGS_ASSERT_SSC_OR;
1381 assert(is_ANYOF_SYNTHETIC(ssc));
1383 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1384 * the code point inversion list and just the relevant flags */
1385 if (is_ANYOF_SYNTHETIC(or_with)) {
1386 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1387 ored_flags = ANYOF_FLAGS(or_with);
1390 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1391 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1394 ANYOF_FLAGS(ssc) |= ored_flags;
1396 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1397 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1398 * 'or_with' may be inverted. When not inverted, we have the simple
1399 * situation of computing:
1400 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1401 * If P1|P2 yields a situation with both a class and its complement are
1402 * set, like having both \w and \W, this matches all code points, and we
1403 * can delete these from the P component of the ssc going forward. XXX We
1404 * might be able to delete all the P components, but I (khw) am not certain
1405 * about this, and it is better to be safe.
1408 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1409 * <= (C1 | P1) | ~C2
1410 * <= (C1 | ~C2) | P1
1411 * (which results in actually simpler code than the non-inverted case)
1414 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1415 && ! is_ANYOF_SYNTHETIC(or_with))
1417 /* We ignore P2, leaving P1 going forward */
1418 } /* else Not inverted */
1419 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1420 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1421 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1423 for (i = 0; i < ANYOF_MAX; i += 2) {
1424 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1426 ssc_match_all_cp(ssc);
1427 ANYOF_POSIXL_CLEAR(ssc, i);
1428 ANYOF_POSIXL_CLEAR(ssc, i+1);
1436 FALSE /* Already has been inverted */
1440 PERL_STATIC_INLINE void
1441 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1443 PERL_ARGS_ASSERT_SSC_UNION;
1445 assert(is_ANYOF_SYNTHETIC(ssc));
1447 _invlist_union_maybe_complement_2nd(ssc->invlist,
1453 PERL_STATIC_INLINE void
1454 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1456 const bool invert2nd)
1458 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1460 assert(is_ANYOF_SYNTHETIC(ssc));
1462 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1468 PERL_STATIC_INLINE void
1469 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1471 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1473 assert(is_ANYOF_SYNTHETIC(ssc));
1475 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1478 PERL_STATIC_INLINE void
1479 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1481 /* AND just the single code point 'cp' into the SSC 'ssc' */
1483 SV* cp_list = _new_invlist(2);
1485 PERL_ARGS_ASSERT_SSC_CP_AND;
1487 assert(is_ANYOF_SYNTHETIC(ssc));
1489 cp_list = add_cp_to_invlist(cp_list, cp);
1490 ssc_intersection(ssc, cp_list,
1491 FALSE /* Not inverted */
1493 SvREFCNT_dec_NN(cp_list);
1496 PERL_STATIC_INLINE void
1497 S_ssc_clear_locale(regnode_ssc *ssc)
1499 /* Set the SSC 'ssc' to not match any locale things */
1500 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1502 assert(is_ANYOF_SYNTHETIC(ssc));
1504 ANYOF_POSIXL_ZERO(ssc);
1505 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1508 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1511 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1513 /* The synthetic start class is used to hopefully quickly winnow down
1514 * places where a pattern could start a match in the target string. If it
1515 * doesn't really narrow things down that much, there isn't much point to
1516 * having the overhead of using it. This function uses some very crude
1517 * heuristics to decide if to use the ssc or not.
1519 * It returns TRUE if 'ssc' rules out more than half what it considers to
1520 * be the "likely" possible matches, but of course it doesn't know what the
1521 * actual things being matched are going to be; these are only guesses
1523 * For /l matches, it assumes that the only likely matches are going to be
1524 * in the 0-255 range, uniformly distributed, so half of that is 127
1525 * For /a and /d matches, it assumes that the likely matches will be just
1526 * the ASCII range, so half of that is 63
1527 * For /u and there isn't anything matching above the Latin1 range, it
1528 * assumes that that is the only range likely to be matched, and uses
1529 * half that as the cut-off: 127. If anything matches above Latin1,
1530 * it assumes that all of Unicode could match (uniformly), except for
1531 * non-Unicode code points and things in the General Category "Other"
1532 * (unassigned, private use, surrogates, controls and formats). This
1533 * is a much large number. */
1535 const U32 max_match = (LOC)
1539 : (invlist_highest(ssc->invlist) < 256)
1541 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1542 U32 count = 0; /* Running total of number of code points matched by
1544 UV start, end; /* Start and end points of current range in inversion
1547 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1549 invlist_iterinit(ssc->invlist);
1550 while (invlist_iternext(ssc->invlist, &start, &end)) {
1552 /* /u is the only thing that we expect to match above 255; so if not /u
1553 * and even if there are matches above 255, ignore them. This catches
1554 * things like \d under /d which does match the digits above 255, but
1555 * since the pattern is /d, it is not likely to be expecting them */
1556 if (! UNI_SEMANTICS) {
1560 end = MIN(end, 255);
1562 count += end - start + 1;
1563 if (count > max_match) {
1564 invlist_iterfinish(ssc->invlist);
1574 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1576 /* The inversion list in the SSC is marked mortal; now we need a more
1577 * permanent copy, which is stored the same way that is done in a regular
1578 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1581 SV* invlist = invlist_clone(ssc->invlist);
1583 PERL_ARGS_ASSERT_SSC_FINALIZE;
1585 assert(is_ANYOF_SYNTHETIC(ssc));
1587 /* The code in this file assumes that all but these flags aren't relevant
1588 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1589 * by the time we reach here */
1590 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1592 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1594 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1595 NULL, NULL, NULL, FALSE);
1597 /* Make sure is clone-safe */
1598 ssc->invlist = NULL;
1600 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1601 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1604 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1607 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1608 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1609 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1610 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1611 ? (TRIE_LIST_CUR( idx ) - 1) \
1617 dump_trie(trie,widecharmap,revcharmap)
1618 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1619 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1621 These routines dump out a trie in a somewhat readable format.
1622 The _interim_ variants are used for debugging the interim
1623 tables that are used to generate the final compressed
1624 representation which is what dump_trie expects.
1626 Part of the reason for their existence is to provide a form
1627 of documentation as to how the different representations function.
1632 Dumps the final compressed table form of the trie to Perl_debug_log.
1633 Used for debugging make_trie().
1637 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1638 AV *revcharmap, U32 depth)
1641 SV *sv=sv_newmortal();
1642 int colwidth= widecharmap ? 6 : 4;
1644 GET_RE_DEBUG_FLAGS_DECL;
1646 PERL_ARGS_ASSERT_DUMP_TRIE;
1648 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1649 (int)depth * 2 + 2,"",
1650 "Match","Base","Ofs" );
1652 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1653 SV ** const tmp = av_fetch( revcharmap, state, 0);
1655 PerlIO_printf( Perl_debug_log, "%*s",
1657 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1658 PL_colors[0], PL_colors[1],
1659 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1660 PERL_PV_ESCAPE_FIRSTCHAR
1665 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1666 (int)depth * 2 + 2,"");
1668 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1669 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1670 PerlIO_printf( Perl_debug_log, "\n");
1672 for( state = 1 ; state < trie->statecount ; state++ ) {
1673 const U32 base = trie->states[ state ].trans.base;
1675 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1676 (int)depth * 2 + 2,"", (UV)state);
1678 if ( trie->states[ state ].wordnum ) {
1679 PerlIO_printf( Perl_debug_log, " W%4X",
1680 trie->states[ state ].wordnum );
1682 PerlIO_printf( Perl_debug_log, "%6s", "" );
1685 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1690 while( ( base + ofs < trie->uniquecharcount ) ||
1691 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1692 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1696 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1698 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1699 if ( ( base + ofs >= trie->uniquecharcount )
1700 && ( base + ofs - trie->uniquecharcount
1702 && trie->trans[ base + ofs
1703 - trie->uniquecharcount ].check == state )
1705 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1707 (UV)trie->trans[ base + ofs
1708 - trie->uniquecharcount ].next );
1710 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1714 PerlIO_printf( Perl_debug_log, "]");
1717 PerlIO_printf( Perl_debug_log, "\n" );
1719 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1721 for (word=1; word <= trie->wordcount; word++) {
1722 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1723 (int)word, (int)(trie->wordinfo[word].prev),
1724 (int)(trie->wordinfo[word].len));
1726 PerlIO_printf(Perl_debug_log, "\n" );
1729 Dumps a fully constructed but uncompressed trie in list form.
1730 List tries normally only are used for construction when the number of
1731 possible chars (trie->uniquecharcount) is very high.
1732 Used for debugging make_trie().
1735 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1736 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1740 SV *sv=sv_newmortal();
1741 int colwidth= widecharmap ? 6 : 4;
1742 GET_RE_DEBUG_FLAGS_DECL;
1744 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1746 /* print out the table precompression. */
1747 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1748 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1749 "------:-----+-----------------\n" );
1751 for( state=1 ; state < next_alloc ; state ++ ) {
1754 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1755 (int)depth * 2 + 2,"", (UV)state );
1756 if ( ! trie->states[ state ].wordnum ) {
1757 PerlIO_printf( Perl_debug_log, "%5s| ","");
1759 PerlIO_printf( Perl_debug_log, "W%4x| ",
1760 trie->states[ state ].wordnum
1763 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1764 SV ** const tmp = av_fetch( revcharmap,
1765 TRIE_LIST_ITEM(state,charid).forid, 0);
1767 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1769 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1771 PL_colors[0], PL_colors[1],
1772 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1773 | PERL_PV_ESCAPE_FIRSTCHAR
1775 TRIE_LIST_ITEM(state,charid).forid,
1776 (UV)TRIE_LIST_ITEM(state,charid).newstate
1779 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1780 (int)((depth * 2) + 14), "");
1783 PerlIO_printf( Perl_debug_log, "\n");
1788 Dumps a fully constructed but uncompressed trie in table form.
1789 This is the normal DFA style state transition table, with a few
1790 twists to facilitate compression later.
1791 Used for debugging make_trie().
1794 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1795 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1800 SV *sv=sv_newmortal();
1801 int colwidth= widecharmap ? 6 : 4;
1802 GET_RE_DEBUG_FLAGS_DECL;
1804 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1807 print out the table precompression so that we can do a visual check
1808 that they are identical.
1811 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1813 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1814 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1816 PerlIO_printf( Perl_debug_log, "%*s",
1818 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1819 PL_colors[0], PL_colors[1],
1820 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1821 PERL_PV_ESCAPE_FIRSTCHAR
1827 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1829 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1830 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1833 PerlIO_printf( Perl_debug_log, "\n" );
1835 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1837 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1838 (int)depth * 2 + 2,"",
1839 (UV)TRIE_NODENUM( state ) );
1841 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1844 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1846 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1848 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1849 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1850 (UV)trie->trans[ state ].check );
1852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1853 (UV)trie->trans[ state ].check,
1854 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1862 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1863 startbranch: the first branch in the whole branch sequence
1864 first : start branch of sequence of branch-exact nodes.
1865 May be the same as startbranch
1866 last : Thing following the last branch.
1867 May be the same as tail.
1868 tail : item following the branch sequence
1869 count : words in the sequence
1870 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1871 depth : indent depth
1873 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1875 A trie is an N'ary tree where the branches are determined by digital
1876 decomposition of the key. IE, at the root node you look up the 1st character and
1877 follow that branch repeat until you find the end of the branches. Nodes can be
1878 marked as "accepting" meaning they represent a complete word. Eg:
1882 would convert into the following structure. Numbers represent states, letters
1883 following numbers represent valid transitions on the letter from that state, if
1884 the number is in square brackets it represents an accepting state, otherwise it
1885 will be in parenthesis.
1887 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1891 (1) +-i->(6)-+-s->[7]
1893 +-s->(3)-+-h->(4)-+-e->[5]
1895 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1897 This shows that when matching against the string 'hers' we will begin at state 1
1898 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1899 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1900 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1901 single traverse. We store a mapping from accepting to state to which word was
1902 matched, and then when we have multiple possibilities we try to complete the
1903 rest of the regex in the order in which they occured in the alternation.
1905 The only prior NFA like behaviour that would be changed by the TRIE support is
1906 the silent ignoring of duplicate alternations which are of the form:
1908 / (DUPE|DUPE) X? (?{ ... }) Y /x
1910 Thus EVAL blocks following a trie may be called a different number of times with
1911 and without the optimisation. With the optimisations dupes will be silently
1912 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1913 the following demonstrates:
1915 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1917 which prints out 'word' three times, but
1919 'words'=~/(word|word|word)(?{ print $1 })S/
1921 which doesnt print it out at all. This is due to other optimisations kicking in.
1923 Example of what happens on a structural level:
1925 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1927 1: CURLYM[1] {1,32767}(18)
1938 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1939 and should turn into:
1941 1: CURLYM[1] {1,32767}(18)
1943 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1951 Cases where tail != last would be like /(?foo|bar)baz/:
1961 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1962 and would end up looking like:
1965 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1972 d = uvchr_to_utf8_flags(d, uv, 0);
1974 is the recommended Unicode-aware way of saying
1979 #define TRIE_STORE_REVCHAR(val) \
1982 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1983 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1984 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1985 SvCUR_set(zlopp, kapow - flrbbbbb); \
1988 av_push(revcharmap, zlopp); \
1990 char ooooff = (char)val; \
1991 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1995 /* This gets the next character from the input, folding it if not already
1997 #define TRIE_READ_CHAR STMT_START { \
2000 /* if it is UTF then it is either already folded, or does not need \
2002 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2004 else if (folder == PL_fold_latin1) { \
2005 /* This folder implies Unicode rules, which in the range expressible \
2006 * by not UTF is the lower case, with the two exceptions, one of \
2007 * which should have been taken care of before calling this */ \
2008 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2009 uvc = toLOWER_L1(*uc); \
2010 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2013 /* raw data, will be folded later if needed */ \
2021 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2022 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2023 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2024 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2026 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2027 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2028 TRIE_LIST_CUR( state )++; \
2031 #define TRIE_LIST_NEW(state) STMT_START { \
2032 Newxz( trie->states[ state ].trans.list, \
2033 4, reg_trie_trans_le ); \
2034 TRIE_LIST_CUR( state ) = 1; \
2035 TRIE_LIST_LEN( state ) = 4; \
2038 #define TRIE_HANDLE_WORD(state) STMT_START { \
2039 U16 dupe= trie->states[ state ].wordnum; \
2040 regnode * const noper_next = regnext( noper ); \
2043 /* store the word for dumping */ \
2045 if (OP(noper) != NOTHING) \
2046 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2048 tmp = newSVpvn_utf8( "", 0, UTF ); \
2049 av_push( trie_words, tmp ); \
2053 trie->wordinfo[curword].prev = 0; \
2054 trie->wordinfo[curword].len = wordlen; \
2055 trie->wordinfo[curword].accept = state; \
2057 if ( noper_next < tail ) { \
2059 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2061 trie->jump[curword] = (U16)(noper_next - convert); \
2063 jumper = noper_next; \
2065 nextbranch= regnext(cur); \
2069 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2070 /* chain, so that when the bits of chain are later */\
2071 /* linked together, the dups appear in the chain */\
2072 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2073 trie->wordinfo[dupe].prev = curword; \
2075 /* we haven't inserted this word yet. */ \
2076 trie->states[ state ].wordnum = curword; \
2081 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2082 ( ( base + charid >= ucharcount \
2083 && base + charid < ubound \
2084 && state == trie->trans[ base - ucharcount + charid ].check \
2085 && trie->trans[ base - ucharcount + charid ].next ) \
2086 ? trie->trans[ base - ucharcount + charid ].next \
2087 : ( state==1 ? special : 0 ) \
2091 #define MADE_JUMP_TRIE 2
2092 #define MADE_EXACT_TRIE 4
2095 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2096 regnode *first, regnode *last, regnode *tail,
2097 U32 word_count, U32 flags, U32 depth)
2099 /* first pass, loop through and scan words */
2100 reg_trie_data *trie;
2101 HV *widecharmap = NULL;
2102 AV *revcharmap = newAV();
2108 regnode *jumper = NULL;
2109 regnode *nextbranch = NULL;
2110 regnode *convert = NULL;
2111 U32 *prev_states; /* temp array mapping each state to previous one */
2112 /* we just use folder as a flag in utf8 */
2113 const U8 * folder = NULL;
2116 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2117 AV *trie_words = NULL;
2118 /* along with revcharmap, this only used during construction but both are
2119 * useful during debugging so we store them in the struct when debugging.
2122 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2123 STRLEN trie_charcount=0;
2125 SV *re_trie_maxbuff;
2126 GET_RE_DEBUG_FLAGS_DECL;
2128 PERL_ARGS_ASSERT_MAKE_TRIE;
2130 PERL_UNUSED_ARG(depth);
2137 case EXACTFU: folder = PL_fold_latin1; break;
2138 case EXACTF: folder = PL_fold; break;
2139 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2142 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2144 trie->startstate = 1;
2145 trie->wordcount = word_count;
2146 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2147 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2149 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2150 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2151 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2154 trie_words = newAV();
2157 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2158 assert(re_trie_maxbuff);
2159 if (!SvIOK(re_trie_maxbuff)) {
2160 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2162 DEBUG_TRIE_COMPILE_r({
2163 PerlIO_printf( Perl_debug_log,
2164 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2165 (int)depth * 2 + 2, "",
2166 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2167 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2170 /* Find the node we are going to overwrite */
2171 if ( first == startbranch && OP( last ) != BRANCH ) {
2172 /* whole branch chain */
2175 /* branch sub-chain */
2176 convert = NEXTOPER( first );
2179 /* -- First loop and Setup --
2181 We first traverse the branches and scan each word to determine if it
2182 contains widechars, and how many unique chars there are, this is
2183 important as we have to build a table with at least as many columns as we
2186 We use an array of integers to represent the character codes 0..255
2187 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2188 the native representation of the character value as the key and IV's for
2191 *TODO* If we keep track of how many times each character is used we can
2192 remap the columns so that the table compression later on is more
2193 efficient in terms of memory by ensuring the most common value is in the
2194 middle and the least common are on the outside. IMO this would be better
2195 than a most to least common mapping as theres a decent chance the most
2196 common letter will share a node with the least common, meaning the node
2197 will not be compressible. With a middle is most common approach the worst
2198 case is when we have the least common nodes twice.
2202 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2203 regnode *noper = NEXTOPER( cur );
2204 const U8 *uc = (U8*)STRING( noper );
2205 const U8 *e = uc + STR_LEN( noper );
2207 U32 wordlen = 0; /* required init */
2208 STRLEN minchars = 0;
2209 STRLEN maxchars = 0;
2210 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2213 if (OP(noper) == NOTHING) {
2214 regnode *noper_next= regnext(noper);
2215 if (noper_next != tail && OP(noper_next) == flags) {
2217 uc= (U8*)STRING(noper);
2218 e= uc + STR_LEN(noper);
2219 trie->minlen= STR_LEN(noper);
2226 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2227 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2228 regardless of encoding */
2229 if (OP( noper ) == EXACTFU_SS) {
2230 /* false positives are ok, so just set this */
2231 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2234 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2236 TRIE_CHARCOUNT(trie)++;
2239 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2240 * is in effect. Under /i, this character can match itself, or
2241 * anything that folds to it. If not under /i, it can match just
2242 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2243 * all fold to k, and all are single characters. But some folds
2244 * expand to more than one character, so for example LATIN SMALL
2245 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2246 * the string beginning at 'uc' is 'ffi', it could be matched by
2247 * three characters, or just by the one ligature character. (It
2248 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2249 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2250 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2251 * match.) The trie needs to know the minimum and maximum number
2252 * of characters that could match so that it can use size alone to
2253 * quickly reject many match attempts. The max is simple: it is
2254 * the number of folded characters in this branch (since a fold is
2255 * never shorter than what folds to it. */
2259 /* And the min is equal to the max if not under /i (indicated by
2260 * 'folder' being NULL), or there are no multi-character folds. If
2261 * there is a multi-character fold, the min is incremented just
2262 * once, for the character that folds to the sequence. Each
2263 * character in the sequence needs to be added to the list below of
2264 * characters in the trie, but we count only the first towards the
2265 * min number of characters needed. This is done through the
2266 * variable 'foldlen', which is returned by the macros that look
2267 * for these sequences as the number of bytes the sequence
2268 * occupies. Each time through the loop, we decrement 'foldlen' by
2269 * how many bytes the current char occupies. Only when it reaches
2270 * 0 do we increment 'minchars' or look for another multi-character
2272 if (folder == NULL) {
2275 else if (foldlen > 0) {
2276 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2281 /* See if *uc is the beginning of a multi-character fold. If
2282 * so, we decrement the length remaining to look at, to account
2283 * for the current character this iteration. (We can use 'uc'
2284 * instead of the fold returned by TRIE_READ_CHAR because for
2285 * non-UTF, the latin1_safe macro is smart enough to account
2286 * for all the unfolded characters, and because for UTF, the
2287 * string will already have been folded earlier in the
2288 * compilation process */
2290 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2291 foldlen -= UTF8SKIP(uc);
2294 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2299 /* The current character (and any potential folds) should be added
2300 * to the possible matching characters for this position in this
2304 U8 folded= folder[ (U8) uvc ];
2305 if ( !trie->charmap[ folded ] ) {
2306 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2307 TRIE_STORE_REVCHAR( folded );
2310 if ( !trie->charmap[ uvc ] ) {
2311 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2312 TRIE_STORE_REVCHAR( uvc );
2315 /* store the codepoint in the bitmap, and its folded
2317 TRIE_BITMAP_SET(trie, uvc);
2319 /* store the folded codepoint */
2320 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2323 /* store first byte of utf8 representation of
2324 variant codepoints */
2325 if (! UVCHR_IS_INVARIANT(uvc)) {
2326 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2329 set_bit = 0; /* We've done our bit :-) */
2333 /* XXX We could come up with the list of code points that fold
2334 * to this using PL_utf8_foldclosures, except not for
2335 * multi-char folds, as there may be multiple combinations
2336 * there that could work, which needs to wait until runtime to
2337 * resolve (The comment about LIGATURE FFI above is such an
2342 widecharmap = newHV();
2344 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2347 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2349 if ( !SvTRUE( *svpp ) ) {
2350 sv_setiv( *svpp, ++trie->uniquecharcount );
2351 TRIE_STORE_REVCHAR(uvc);
2354 } /* end loop through characters in this branch of the trie */
2356 /* We take the min and max for this branch and combine to find the min
2357 * and max for all branches processed so far */
2358 if( cur == first ) {
2359 trie->minlen = minchars;
2360 trie->maxlen = maxchars;
2361 } else if (minchars < trie->minlen) {
2362 trie->minlen = minchars;
2363 } else if (maxchars > trie->maxlen) {
2364 trie->maxlen = maxchars;
2366 } /* end first pass */
2367 DEBUG_TRIE_COMPILE_r(
2368 PerlIO_printf( Perl_debug_log,
2369 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2370 (int)depth * 2 + 2,"",
2371 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2372 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2373 (int)trie->minlen, (int)trie->maxlen )
2377 We now know what we are dealing with in terms of unique chars and
2378 string sizes so we can calculate how much memory a naive
2379 representation using a flat table will take. If it's over a reasonable
2380 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2381 conservative but potentially much slower representation using an array
2384 At the end we convert both representations into the same compressed
2385 form that will be used in regexec.c for matching with. The latter
2386 is a form that cannot be used to construct with but has memory
2387 properties similar to the list form and access properties similar
2388 to the table form making it both suitable for fast searches and
2389 small enough that its feasable to store for the duration of a program.
2391 See the comment in the code where the compressed table is produced
2392 inplace from the flat tabe representation for an explanation of how
2393 the compression works.
2398 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2401 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2402 > SvIV(re_trie_maxbuff) )
2405 Second Pass -- Array Of Lists Representation
2407 Each state will be represented by a list of charid:state records
2408 (reg_trie_trans_le) the first such element holds the CUR and LEN
2409 points of the allocated array. (See defines above).
2411 We build the initial structure using the lists, and then convert
2412 it into the compressed table form which allows faster lookups
2413 (but cant be modified once converted).
2416 STRLEN transcount = 1;
2418 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2419 "%*sCompiling trie using list compiler\n",
2420 (int)depth * 2 + 2, ""));
2422 trie->states = (reg_trie_state *)
2423 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2424 sizeof(reg_trie_state) );
2428 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2430 regnode *noper = NEXTOPER( cur );
2431 U8 *uc = (U8*)STRING( noper );
2432 const U8 *e = uc + STR_LEN( noper );
2433 U32 state = 1; /* required init */
2434 U16 charid = 0; /* sanity init */
2435 U32 wordlen = 0; /* required init */
2437 if (OP(noper) == NOTHING) {
2438 regnode *noper_next= regnext(noper);
2439 if (noper_next != tail && OP(noper_next) == flags) {
2441 uc= (U8*)STRING(noper);
2442 e= uc + STR_LEN(noper);
2446 if (OP(noper) != NOTHING) {
2447 for ( ; uc < e ; uc += len ) {
2452 charid = trie->charmap[ uvc ];
2454 SV** const svpp = hv_fetch( widecharmap,
2461 charid=(U16)SvIV( *svpp );
2464 /* charid is now 0 if we dont know the char read, or
2465 * nonzero if we do */
2472 if ( !trie->states[ state ].trans.list ) {
2473 TRIE_LIST_NEW( state );
2476 check <= TRIE_LIST_USED( state );
2479 if ( TRIE_LIST_ITEM( state, check ).forid
2482 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2487 newstate = next_alloc++;
2488 prev_states[newstate] = state;
2489 TRIE_LIST_PUSH( state, charid, newstate );
2494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2498 TRIE_HANDLE_WORD(state);
2500 } /* end second pass */
2502 /* next alloc is the NEXT state to be allocated */
2503 trie->statecount = next_alloc;
2504 trie->states = (reg_trie_state *)
2505 PerlMemShared_realloc( trie->states,
2507 * sizeof(reg_trie_state) );
2509 /* and now dump it out before we compress it */
2510 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2511 revcharmap, next_alloc,
2515 trie->trans = (reg_trie_trans *)
2516 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2523 for( state=1 ; state < next_alloc ; state ++ ) {
2527 DEBUG_TRIE_COMPILE_MORE_r(
2528 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2532 if (trie->states[state].trans.list) {
2533 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2537 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2538 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2539 if ( forid < minid ) {
2541 } else if ( forid > maxid ) {
2545 if ( transcount < tp + maxid - minid + 1) {
2547 trie->trans = (reg_trie_trans *)
2548 PerlMemShared_realloc( trie->trans,
2550 * sizeof(reg_trie_trans) );
2551 Zero( trie->trans + (transcount / 2),
2555 base = trie->uniquecharcount + tp - minid;
2556 if ( maxid == minid ) {
2558 for ( ; zp < tp ; zp++ ) {
2559 if ( ! trie->trans[ zp ].next ) {
2560 base = trie->uniquecharcount + zp - minid;
2561 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2563 trie->trans[ zp ].check = state;
2569 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2571 trie->trans[ tp ].check = state;
2576 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2577 const U32 tid = base
2578 - trie->uniquecharcount
2579 + TRIE_LIST_ITEM( state, idx ).forid;
2580 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2582 trie->trans[ tid ].check = state;
2584 tp += ( maxid - minid + 1 );
2586 Safefree(trie->states[ state ].trans.list);
2589 DEBUG_TRIE_COMPILE_MORE_r(
2590 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2593 trie->states[ state ].trans.base=base;
2595 trie->lasttrans = tp + 1;
2599 Second Pass -- Flat Table Representation.
2601 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2602 each. We know that we will need Charcount+1 trans at most to store
2603 the data (one row per char at worst case) So we preallocate both
2604 structures assuming worst case.
2606 We then construct the trie using only the .next slots of the entry
2609 We use the .check field of the first entry of the node temporarily
2610 to make compression both faster and easier by keeping track of how
2611 many non zero fields are in the node.
2613 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2616 There are two terms at use here: state as a TRIE_NODEIDX() which is
2617 a number representing the first entry of the node, and state as a
2618 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2619 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2620 if there are 2 entrys per node. eg:
2628 The table is internally in the right hand, idx form. However as we
2629 also have to deal with the states array which is indexed by nodenum
2630 we have to use TRIE_NODENUM() to convert.
2633 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2634 "%*sCompiling trie using table compiler\n",
2635 (int)depth * 2 + 2, ""));
2637 trie->trans = (reg_trie_trans *)
2638 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2639 * trie->uniquecharcount + 1,
2640 sizeof(reg_trie_trans) );
2641 trie->states = (reg_trie_state *)
2642 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2643 sizeof(reg_trie_state) );
2644 next_alloc = trie->uniquecharcount + 1;
2647 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2649 regnode *noper = NEXTOPER( cur );
2650 const U8 *uc = (U8*)STRING( noper );
2651 const U8 *e = uc + STR_LEN( noper );
2653 U32 state = 1; /* required init */
2655 U16 charid = 0; /* sanity init */
2656 U32 accept_state = 0; /* sanity init */
2658 U32 wordlen = 0; /* required init */
2660 if (OP(noper) == NOTHING) {
2661 regnode *noper_next= regnext(noper);
2662 if (noper_next != tail && OP(noper_next) == flags) {
2664 uc= (U8*)STRING(noper);
2665 e= uc + STR_LEN(noper);
2669 if ( OP(noper) != NOTHING ) {
2670 for ( ; uc < e ; uc += len ) {
2675 charid = trie->charmap[ uvc ];
2677 SV* const * const svpp = hv_fetch( widecharmap,
2681 charid = svpp ? (U16)SvIV(*svpp) : 0;
2685 if ( !trie->trans[ state + charid ].next ) {
2686 trie->trans[ state + charid ].next = next_alloc;
2687 trie->trans[ state ].check++;
2688 prev_states[TRIE_NODENUM(next_alloc)]
2689 = TRIE_NODENUM(state);
2690 next_alloc += trie->uniquecharcount;
2692 state = trie->trans[ state + charid ].next;
2694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2696 /* charid is now 0 if we dont know the char read, or
2697 * nonzero if we do */
2700 accept_state = TRIE_NODENUM( state );
2701 TRIE_HANDLE_WORD(accept_state);
2703 } /* end second pass */
2705 /* and now dump it out before we compress it */
2706 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2708 next_alloc, depth+1));
2712 * Inplace compress the table.*
2714 For sparse data sets the table constructed by the trie algorithm will
2715 be mostly 0/FAIL transitions or to put it another way mostly empty.
2716 (Note that leaf nodes will not contain any transitions.)
2718 This algorithm compresses the tables by eliminating most such
2719 transitions, at the cost of a modest bit of extra work during lookup:
2721 - Each states[] entry contains a .base field which indicates the
2722 index in the state[] array wheres its transition data is stored.
2724 - If .base is 0 there are no valid transitions from that node.
2726 - If .base is nonzero then charid is added to it to find an entry in
2729 -If trans[states[state].base+charid].check!=state then the
2730 transition is taken to be a 0/Fail transition. Thus if there are fail
2731 transitions at the front of the node then the .base offset will point
2732 somewhere inside the previous nodes data (or maybe even into a node
2733 even earlier), but the .check field determines if the transition is
2737 The following process inplace converts the table to the compressed
2738 table: We first do not compress the root node 1,and mark all its
2739 .check pointers as 1 and set its .base pointer as 1 as well. This
2740 allows us to do a DFA construction from the compressed table later,
2741 and ensures that any .base pointers we calculate later are greater
2744 - We set 'pos' to indicate the first entry of the second node.
2746 - We then iterate over the columns of the node, finding the first and
2747 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2748 and set the .check pointers accordingly, and advance pos
2749 appropriately and repreat for the next node. Note that when we copy
2750 the next pointers we have to convert them from the original
2751 NODEIDX form to NODENUM form as the former is not valid post
2754 - If a node has no transitions used we mark its base as 0 and do not
2755 advance the pos pointer.
2757 - If a node only has one transition we use a second pointer into the
2758 structure to fill in allocated fail transitions from other states.
2759 This pointer is independent of the main pointer and scans forward
2760 looking for null transitions that are allocated to a state. When it
2761 finds one it writes the single transition into the "hole". If the
2762 pointer doesnt find one the single transition is appended as normal.
2764 - Once compressed we can Renew/realloc the structures to release the
2767 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2768 specifically Fig 3.47 and the associated pseudocode.
2772 const U32 laststate = TRIE_NODENUM( next_alloc );
2775 trie->statecount = laststate;
2777 for ( state = 1 ; state < laststate ; state++ ) {
2779 const U32 stateidx = TRIE_NODEIDX( state );
2780 const U32 o_used = trie->trans[ stateidx ].check;
2781 U32 used = trie->trans[ stateidx ].check;
2782 trie->trans[ stateidx ].check = 0;
2785 used && charid < trie->uniquecharcount;
2788 if ( flag || trie->trans[ stateidx + charid ].next ) {
2789 if ( trie->trans[ stateidx + charid ].next ) {
2791 for ( ; zp < pos ; zp++ ) {
2792 if ( ! trie->trans[ zp ].next ) {
2796 trie->states[ state ].trans.base
2798 + trie->uniquecharcount
2800 trie->trans[ zp ].next
2801 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2803 trie->trans[ zp ].check = state;
2804 if ( ++zp > pos ) pos = zp;
2811 trie->states[ state ].trans.base
2812 = pos + trie->uniquecharcount - charid ;
2814 trie->trans[ pos ].next
2815 = SAFE_TRIE_NODENUM(
2816 trie->trans[ stateidx + charid ].next );
2817 trie->trans[ pos ].check = state;
2822 trie->lasttrans = pos + 1;
2823 trie->states = (reg_trie_state *)
2824 PerlMemShared_realloc( trie->states, laststate
2825 * sizeof(reg_trie_state) );
2826 DEBUG_TRIE_COMPILE_MORE_r(
2827 PerlIO_printf( Perl_debug_log,
2828 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2829 (int)depth * 2 + 2,"",
2830 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2834 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2837 } /* end table compress */
2839 DEBUG_TRIE_COMPILE_MORE_r(
2840 PerlIO_printf(Perl_debug_log,
2841 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2842 (int)depth * 2 + 2, "",
2843 (UV)trie->statecount,
2844 (UV)trie->lasttrans)
2846 /* resize the trans array to remove unused space */
2847 trie->trans = (reg_trie_trans *)
2848 PerlMemShared_realloc( trie->trans, trie->lasttrans
2849 * sizeof(reg_trie_trans) );
2851 { /* Modify the program and insert the new TRIE node */
2852 U8 nodetype =(U8)(flags & 0xFF);
2856 regnode *optimize = NULL;
2857 #ifdef RE_TRACK_PATTERN_OFFSETS
2860 U32 mjd_nodelen = 0;
2861 #endif /* RE_TRACK_PATTERN_OFFSETS */
2862 #endif /* DEBUGGING */
2864 This means we convert either the first branch or the first Exact,
2865 depending on whether the thing following (in 'last') is a branch
2866 or not and whther first is the startbranch (ie is it a sub part of
2867 the alternation or is it the whole thing.)
2868 Assuming its a sub part we convert the EXACT otherwise we convert
2869 the whole branch sequence, including the first.
2871 /* Find the node we are going to overwrite */
2872 if ( first != startbranch || OP( last ) == BRANCH ) {
2873 /* branch sub-chain */
2874 NEXT_OFF( first ) = (U16)(last - first);
2875 #ifdef RE_TRACK_PATTERN_OFFSETS
2877 mjd_offset= Node_Offset((convert));
2878 mjd_nodelen= Node_Length((convert));
2881 /* whole branch chain */
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2886 const regnode *nop = NEXTOPER( convert );
2887 mjd_offset= Node_Offset((nop));
2888 mjd_nodelen= Node_Length((nop));
2892 PerlIO_printf(Perl_debug_log,
2893 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2894 (int)depth * 2 + 2, "",
2895 (UV)mjd_offset, (UV)mjd_nodelen)
2898 /* But first we check to see if there is a common prefix we can
2899 split out as an EXACT and put in front of the TRIE node. */
2900 trie->startstate= 1;
2901 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2903 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2907 const U32 base = trie->states[ state ].trans.base;
2909 if ( trie->states[state].wordnum )
2912 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2913 if ( ( base + ofs >= trie->uniquecharcount ) &&
2914 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2915 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2917 if ( ++count > 1 ) {
2918 SV **tmp = av_fetch( revcharmap, ofs, 0);
2919 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2920 if ( state == 1 ) break;
2922 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2924 PerlIO_printf(Perl_debug_log,
2925 "%*sNew Start State=%"UVuf" Class: [",
2926 (int)depth * 2 + 2, "",
2929 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2930 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2932 TRIE_BITMAP_SET(trie,*ch);
2934 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2936 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2940 TRIE_BITMAP_SET(trie,*ch);
2942 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2943 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2949 SV **tmp = av_fetch( revcharmap, idx, 0);
2951 char *ch = SvPV( *tmp, len );
2953 SV *sv=sv_newmortal();
2954 PerlIO_printf( Perl_debug_log,
2955 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2956 (int)depth * 2 + 2, "",
2958 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2959 PL_colors[0], PL_colors[1],
2960 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2961 PERL_PV_ESCAPE_FIRSTCHAR
2966 OP( convert ) = nodetype;
2967 str=STRING(convert);
2970 STR_LEN(convert) += len;
2976 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2981 trie->prefixlen = (state-1);
2983 regnode *n = convert+NODE_SZ_STR(convert);
2984 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2985 trie->startstate = state;
2986 trie->minlen -= (state - 1);
2987 trie->maxlen -= (state - 1);
2989 /* At least the UNICOS C compiler choked on this
2990 * being argument to DEBUG_r(), so let's just have
2993 #ifdef PERL_EXT_RE_BUILD
2999 regnode *fix = convert;
3000 U32 word = trie->wordcount;
3002 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3003 while( ++fix < n ) {
3004 Set_Node_Offset_Length(fix, 0, 0);
3007 SV ** const tmp = av_fetch( trie_words, word, 0 );
3009 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3010 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3012 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3020 NEXT_OFF(convert) = (U16)(tail - convert);
3021 DEBUG_r(optimize= n);
3027 if ( trie->maxlen ) {
3028 NEXT_OFF( convert ) = (U16)(tail - convert);
3029 ARG_SET( convert, data_slot );
3030 /* Store the offset to the first unabsorbed branch in
3031 jump[0], which is otherwise unused by the jump logic.
3032 We use this when dumping a trie and during optimisation. */
3034 trie->jump[0] = (U16)(nextbranch - convert);
3036 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3037 * and there is a bitmap
3038 * and the first "jump target" node we found leaves enough room
3039 * then convert the TRIE node into a TRIEC node, with the bitmap
3040 * embedded inline in the opcode - this is hypothetically faster.
3042 if ( !trie->states[trie->startstate].wordnum
3044 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3046 OP( convert ) = TRIEC;
3047 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3048 PerlMemShared_free(trie->bitmap);
3051 OP( convert ) = TRIE;
3053 /* store the type in the flags */
3054 convert->flags = nodetype;
3058 + regarglen[ OP( convert ) ];
3060 /* XXX We really should free up the resource in trie now,
3061 as we won't use them - (which resources?) dmq */
3063 /* needed for dumping*/
3064 DEBUG_r(if (optimize) {
3065 regnode *opt = convert;
3067 while ( ++opt < optimize) {
3068 Set_Node_Offset_Length(opt,0,0);
3071 Try to clean up some of the debris left after the
3074 while( optimize < jumper ) {
3075 mjd_nodelen += Node_Length((optimize));
3076 OP( optimize ) = OPTIMIZED;
3077 Set_Node_Offset_Length(optimize,0,0);
3080 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3082 } /* end node insert */
3084 /* Finish populating the prev field of the wordinfo array. Walk back
3085 * from each accept state until we find another accept state, and if
3086 * so, point the first word's .prev field at the second word. If the
3087 * second already has a .prev field set, stop now. This will be the
3088 * case either if we've already processed that word's accept state,
3089 * or that state had multiple words, and the overspill words were
3090 * already linked up earlier.
3097 for (word=1; word <= trie->wordcount; word++) {
3099 if (trie->wordinfo[word].prev)
3101 state = trie->wordinfo[word].accept;
3103 state = prev_states[state];
3106 prev = trie->states[state].wordnum;
3110 trie->wordinfo[word].prev = prev;
3112 Safefree(prev_states);
3116 /* and now dump out the compressed format */
3117 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3119 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3121 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3122 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3124 SvREFCNT_dec_NN(revcharmap);
3128 : trie->startstate>1
3134 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3136 /* The Trie is constructed and compressed now so we can build a fail array if
3139 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3141 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3145 We find the fail state for each state in the trie, this state is the longest
3146 proper suffix of the current state's 'word' that is also a proper prefix of
3147 another word in our trie. State 1 represents the word '' and is thus the
3148 default fail state. This allows the DFA not to have to restart after its
3149 tried and failed a word at a given point, it simply continues as though it
3150 had been matching the other word in the first place.
3152 'abcdgu'=~/abcdefg|cdgu/
3153 When we get to 'd' we are still matching the first word, we would encounter
3154 'g' which would fail, which would bring us to the state representing 'd' in
3155 the second word where we would try 'g' and succeed, proceeding to match
3158 /* add a fail transition */
3159 const U32 trie_offset = ARG(source);
3160 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3162 const U32 ucharcount = trie->uniquecharcount;
3163 const U32 numstates = trie->statecount;
3164 const U32 ubound = trie->lasttrans + ucharcount;
3168 U32 base = trie->states[ 1 ].trans.base;
3171 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3173 GET_RE_DEBUG_FLAGS_DECL;
3175 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3176 PERL_UNUSED_CONTEXT;
3178 PERL_UNUSED_ARG(depth);
3181 if ( OP(source) == TRIE ) {
3182 struct regnode_1 *op = (struct regnode_1 *)
3183 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3184 StructCopy(source,op,struct regnode_1);
3185 stclass = (regnode *)op;
3187 struct regnode_charclass *op = (struct regnode_charclass *)
3188 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3189 StructCopy(source,op,struct regnode_charclass);
3190 stclass = (regnode *)op;
3192 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3194 ARG_SET( stclass, data_slot );
3195 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3196 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3197 aho->trie=trie_offset;
3198 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3199 Copy( trie->states, aho->states, numstates, reg_trie_state );
3200 Newxz( q, numstates, U32);
3201 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3204 /* initialize fail[0..1] to be 1 so that we always have
3205 a valid final fail state */
3206 fail[ 0 ] = fail[ 1 ] = 1;
3208 for ( charid = 0; charid < ucharcount ; charid++ ) {
3209 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3211 q[ q_write ] = newstate;
3212 /* set to point at the root */
3213 fail[ q[ q_write++ ] ]=1;
3216 while ( q_read < q_write) {
3217 const U32 cur = q[ q_read++ % numstates ];
3218 base = trie->states[ cur ].trans.base;
3220 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3221 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3223 U32 fail_state = cur;
3226 fail_state = fail[ fail_state ];
3227 fail_base = aho->states[ fail_state ].trans.base;
3228 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3230 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3231 fail[ ch_state ] = fail_state;
3232 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3234 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3236 q[ q_write++ % numstates] = ch_state;
3240 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3241 when we fail in state 1, this allows us to use the
3242 charclass scan to find a valid start char. This is based on the principle
3243 that theres a good chance the string being searched contains lots of stuff
3244 that cant be a start char.
3246 fail[ 0 ] = fail[ 1 ] = 0;
3247 DEBUG_TRIE_COMPILE_r({
3248 PerlIO_printf(Perl_debug_log,
3249 "%*sStclass Failtable (%"UVuf" states): 0",
3250 (int)(depth * 2), "", (UV)numstates
3252 for( q_read=1; q_read<numstates; q_read++ ) {
3253 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3255 PerlIO_printf(Perl_debug_log, "\n");
3258 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3263 #define DEBUG_PEEP(str,scan,depth) \
3264 DEBUG_OPTIMISE_r({if (scan){ \
3265 regnode *Next = regnext(scan); \
3266 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3267 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3268 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3269 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3270 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3271 PerlIO_printf(Perl_debug_log, "\n"); \
3274 /* The below joins as many adjacent EXACTish nodes as possible into a single
3275 * one. The regop may be changed if the node(s) contain certain sequences that
3276 * require special handling. The joining is only done if:
3277 * 1) there is room in the current conglomerated node to entirely contain the
3279 * 2) they are the exact same node type
3281 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3282 * these get optimized out
3284 * If a node is to match under /i (folded), the number of characters it matches
3285 * can be different than its character length if it contains a multi-character
3286 * fold. *min_subtract is set to the total delta number of characters of the
3289 * And *unfolded_multi_char is set to indicate whether or not the node contains
3290 * an unfolded multi-char fold. This happens when whether the fold is valid or
3291 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3292 * SMALL LETTER SHARP S, as only if the target string being matched against
3293 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3294 * folding rules depend on the locale in force at runtime. (Multi-char folds
3295 * whose components are all above the Latin1 range are not run-time locale
3296 * dependent, and have already been folded by the time this function is
3299 * This is as good a place as any to discuss the design of handling these
3300 * multi-character fold sequences. It's been wrong in Perl for a very long
3301 * time. There are three code points in Unicode whose multi-character folds
3302 * were long ago discovered to mess things up. The previous designs for
3303 * dealing with these involved assigning a special node for them. This
3304 * approach doesn't always work, as evidenced by this example:
3305 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3306 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3307 * would match just the \xDF, it won't be able to handle the case where a
3308 * successful match would have to cross the node's boundary. The new approach
3309 * that hopefully generally solves the problem generates an EXACTFU_SS node
3310 * that is "sss" in this case.
3312 * It turns out that there are problems with all multi-character folds, and not
3313 * just these three. Now the code is general, for all such cases. The
3314 * approach taken is:
3315 * 1) This routine examines each EXACTFish node that could contain multi-
3316 * character folded sequences. Since a single character can fold into
3317 * such a sequence, the minimum match length for this node is less than
3318 * the number of characters in the node. This routine returns in
3319 * *min_subtract how many characters to subtract from the the actual
3320 * length of the string to get a real minimum match length; it is 0 if
3321 * there are no multi-char foldeds. This delta is used by the caller to
3322 * adjust the min length of the match, and the delta between min and max,
3323 * so that the optimizer doesn't reject these possibilities based on size
3325 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3326 * is used for an EXACTFU node that contains at least one "ss" sequence in
3327 * it. For non-UTF-8 patterns and strings, this is the only case where
3328 * there is a possible fold length change. That means that a regular
3329 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3330 * with length changes, and so can be processed faster. regexec.c takes
3331 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3332 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3333 * known until runtime). This saves effort in regex matching. However,
3334 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3335 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3336 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3337 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3338 * possibilities for the non-UTF8 patterns are quite simple, except for
3339 * the sharp s. All the ones that don't involve a UTF-8 target string are
3340 * members of a fold-pair, and arrays are set up for all of them so that
3341 * the other member of the pair can be found quickly. Code elsewhere in
3342 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3343 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3344 * described in the next item.
3345 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3346 * validity of the fold won't be known until runtime, and so must remain
3347 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3348 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3349 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3350 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3351 * The reason this is a problem is that the optimizer part of regexec.c
3352 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3353 * that a character in the pattern corresponds to at most a single
3354 * character in the target string. (And I do mean character, and not byte
3355 * here, unlike other parts of the documentation that have never been
3356 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3357 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3358 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3359 * nodes, violate the assumption, and they are the only instances where it
3360 * is violated. I'm reluctant to try to change the assumption, as the
3361 * code involved is impenetrable to me (khw), so instead the code here
3362 * punts. This routine examines EXACTFL nodes, and (when the pattern
3363 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3364 * boolean indicating whether or not the node contains such a fold. When
3365 * it is true, the caller sets a flag that later causes the optimizer in
3366 * this file to not set values for the floating and fixed string lengths,
3367 * and thus avoids the optimizer code in regexec.c that makes the invalid
3368 * assumption. Thus, there is no optimization based on string lengths for
3369 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3370 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3371 * assumption is wrong only in these cases is that all other non-UTF-8
3372 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3373 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3374 * EXACTF nodes because we don't know at compile time if it actually
3375 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3376 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3377 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3378 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3379 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3380 * string would require the pattern to be forced into UTF-8, the overhead
3381 * of which we want to avoid. Similarly the unfolded multi-char folds in
3382 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3385 * Similarly, the code that generates tries doesn't currently handle
3386 * not-already-folded multi-char folds, and it looks like a pain to change
3387 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3388 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3389 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3390 * using /iaa matching will be doing so almost entirely with ASCII
3391 * strings, so this should rarely be encountered in practice */
3393 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3394 if (PL_regkind[OP(scan)] == EXACT) \
3395 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3399 UV *min_subtract, bool *unfolded_multi_char,
3400 U32 flags,regnode *val, U32 depth)
3402 /* Merge several consecutive EXACTish nodes into one. */
3403 regnode *n = regnext(scan);
3405 regnode *next = scan + NODE_SZ_STR(scan);
3409 regnode *stop = scan;
3410 GET_RE_DEBUG_FLAGS_DECL;
3412 PERL_UNUSED_ARG(depth);
3415 PERL_ARGS_ASSERT_JOIN_EXACT;
3416 #ifndef EXPERIMENTAL_INPLACESCAN
3417 PERL_UNUSED_ARG(flags);
3418 PERL_UNUSED_ARG(val);
3420 DEBUG_PEEP("join",scan,depth);
3422 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3423 * EXACT ones that are mergeable to the current one. */
3425 && (PL_regkind[OP(n)] == NOTHING
3426 || (stringok && OP(n) == OP(scan)))
3428 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3431 if (OP(n) == TAIL || n > next)
3433 if (PL_regkind[OP(n)] == NOTHING) {
3434 DEBUG_PEEP("skip:",n,depth);
3435 NEXT_OFF(scan) += NEXT_OFF(n);
3436 next = n + NODE_STEP_REGNODE;
3443 else if (stringok) {
3444 const unsigned int oldl = STR_LEN(scan);
3445 regnode * const nnext = regnext(n);
3447 /* XXX I (khw) kind of doubt that this works on platforms (should
3448 * Perl ever run on one) where U8_MAX is above 255 because of lots
3449 * of other assumptions */
3450 /* Don't join if the sum can't fit into a single node */
3451 if (oldl + STR_LEN(n) > U8_MAX)
3454 DEBUG_PEEP("merg",n,depth);
3457 NEXT_OFF(scan) += NEXT_OFF(n);
3458 STR_LEN(scan) += STR_LEN(n);
3459 next = n + NODE_SZ_STR(n);
3460 /* Now we can overwrite *n : */
3461 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3469 #ifdef EXPERIMENTAL_INPLACESCAN
3470 if (flags && !NEXT_OFF(n)) {
3471 DEBUG_PEEP("atch", val, depth);
3472 if (reg_off_by_arg[OP(n)]) {
3473 ARG_SET(n, val - n);
3476 NEXT_OFF(n) = val - n;
3484 *unfolded_multi_char = FALSE;
3486 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3487 * can now analyze for sequences of problematic code points. (Prior to
3488 * this final joining, sequences could have been split over boundaries, and
3489 * hence missed). The sequences only happen in folding, hence for any
3490 * non-EXACT EXACTish node */
3491 if (OP(scan) != EXACT) {
3492 U8* s0 = (U8*) STRING(scan);
3494 U8* s_end = s0 + STR_LEN(scan);
3496 int total_count_delta = 0; /* Total delta number of characters that
3497 multi-char folds expand to */
3499 /* One pass is made over the node's string looking for all the
3500 * possibilities. To avoid some tests in the loop, there are two main
3501 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3506 if (OP(scan) == EXACTFL) {
3509 /* An EXACTFL node would already have been changed to another
3510 * node type unless there is at least one character in it that
3511 * is problematic; likely a character whose fold definition
3512 * won't be known until runtime, and so has yet to be folded.
3513 * For all but the UTF-8 locale, folds are 1-1 in length, but
3514 * to handle the UTF-8 case, we need to create a temporary
3515 * folded copy using UTF-8 locale rules in order to analyze it.
3516 * This is because our macros that look to see if a sequence is
3517 * a multi-char fold assume everything is folded (otherwise the
3518 * tests in those macros would be too complicated and slow).
3519 * Note that here, the non-problematic folds will have already
3520 * been done, so we can just copy such characters. We actually
3521 * don't completely fold the EXACTFL string. We skip the
3522 * unfolded multi-char folds, as that would just create work
3523 * below to figure out the size they already are */
3525 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3528 STRLEN s_len = UTF8SKIP(s);
3529 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3530 Copy(s, d, s_len, U8);
3533 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3534 *unfolded_multi_char = TRUE;
3535 Copy(s, d, s_len, U8);
3538 else if (isASCII(*s)) {
3539 *(d++) = toFOLD(*s);
3543 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3549 /* Point the remainder of the routine to look at our temporary
3553 } /* End of creating folded copy of EXACTFL string */
3555 /* Examine the string for a multi-character fold sequence. UTF-8
3556 * patterns have all characters pre-folded by the time this code is
3558 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3559 length sequence we are looking for is 2 */
3561 int count = 0; /* How many characters in a multi-char fold */
3562 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3563 if (! len) { /* Not a multi-char fold: get next char */
3568 /* Nodes with 'ss' require special handling, except for
3569 * EXACTFA-ish for which there is no multi-char fold to this */
3570 if (len == 2 && *s == 's' && *(s+1) == 's'
3571 && OP(scan) != EXACTFA
3572 && OP(scan) != EXACTFA_NO_TRIE)
3575 if (OP(scan) != EXACTFL) {
3576 OP(scan) = EXACTFU_SS;
3580 else { /* Here is a generic multi-char fold. */
3581 U8* multi_end = s + len;
3583 /* Count how many characters are in it. In the case of
3584 * /aa, no folds which contain ASCII code points are
3585 * allowed, so check for those, and skip if found. */
3586 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3587 count = utf8_length(s, multi_end);
3591 while (s < multi_end) {
3594 goto next_iteration;
3604 /* The delta is how long the sequence is minus 1 (1 is how long
3605 * the character that folds to the sequence is) */
3606 total_count_delta += count - 1;
3610 /* We created a temporary folded copy of the string in EXACTFL
3611 * nodes. Therefore we need to be sure it doesn't go below zero,
3612 * as the real string could be shorter */
3613 if (OP(scan) == EXACTFL) {
3614 int total_chars = utf8_length((U8*) STRING(scan),
3615 (U8*) STRING(scan) + STR_LEN(scan));
3616 if (total_count_delta > total_chars) {
3617 total_count_delta = total_chars;
3621 *min_subtract += total_count_delta;
3624 else if (OP(scan) == EXACTFA) {
3626 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3627 * fold to the ASCII range (and there are no existing ones in the
3628 * upper latin1 range). But, as outlined in the comments preceding
3629 * this function, we need to flag any occurrences of the sharp s.
3630 * This character forbids trie formation (because of added
3633 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3634 OP(scan) = EXACTFA_NO_TRIE;
3635 *unfolded_multi_char = TRUE;
3644 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3645 * folds that are all Latin1. As explained in the comments
3646 * preceding this function, we look also for the sharp s in EXACTF
3647 * and EXACTFL nodes; it can be in the final position. Otherwise
3648 * we can stop looking 1 byte earlier because have to find at least
3649 * two characters for a multi-fold */
3650 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3655 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3656 if (! len) { /* Not a multi-char fold. */
3657 if (*s == LATIN_SMALL_LETTER_SHARP_S
3658 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3660 *unfolded_multi_char = TRUE;
3667 && isALPHA_FOLD_EQ(*s, 's')
3668 && isALPHA_FOLD_EQ(*(s+1), 's'))
3671 /* EXACTF nodes need to know that the minimum length
3672 * changed so that a sharp s in the string can match this
3673 * ss in the pattern, but they remain EXACTF nodes, as they
3674 * won't match this unless the target string is is UTF-8,
3675 * which we don't know until runtime. EXACTFL nodes can't
3676 * transform into EXACTFU nodes */
3677 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3678 OP(scan) = EXACTFU_SS;
3682 *min_subtract += len - 1;
3689 /* Allow dumping but overwriting the collection of skipped
3690 * ops and/or strings with fake optimized ops */
3691 n = scan + NODE_SZ_STR(scan);
3699 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3703 /* REx optimizer. Converts nodes into quicker variants "in place".
3704 Finds fixed substrings. */
3706 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3707 to the position after last scanned or to NULL. */
3709 #define INIT_AND_WITHP \
3710 assert(!and_withp); \
3711 Newx(and_withp,1, regnode_ssc); \
3712 SAVEFREEPV(and_withp)
3716 S_unwind_scan_frames(pTHX_ const void *p)
3718 scan_frame *f= (scan_frame *)p;
3720 scan_frame *n= f->next_frame;
3728 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3729 SSize_t *minlenp, SSize_t *deltap,
3734 regnode_ssc *and_withp,
3735 U32 flags, U32 depth)
3736 /* scanp: Start here (read-write). */
3737 /* deltap: Write maxlen-minlen here. */
3738 /* last: Stop before this one. */
3739 /* data: string data about the pattern */
3740 /* stopparen: treat close N as END */
3741 /* recursed: which subroutines have we recursed into */
3742 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3744 /* There must be at least this number of characters to match */
3747 regnode *scan = *scanp, *next;
3749 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3750 int is_inf_internal = 0; /* The studied chunk is infinite */
3751 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3752 scan_data_t data_fake;
3753 SV *re_trie_maxbuff = NULL;
3754 regnode *first_non_open = scan;
3755 SSize_t stopmin = SSize_t_MAX;
3756 scan_frame *frame = NULL;
3757 GET_RE_DEBUG_FLAGS_DECL;
3759 PERL_ARGS_ASSERT_STUDY_CHUNK;
3763 while (first_non_open && OP(first_non_open) == OPEN)
3764 first_non_open=regnext(first_non_open);
3770 RExC_study_chunk_recursed_count++;
3772 DEBUG_OPTIMISE_MORE_r(
3774 PerlIO_printf(Perl_debug_log,
3775 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3776 ((int) depth*2), "", (long)stopparen,
3777 (unsigned long)RExC_study_chunk_recursed_count,
3778 (unsigned long)depth, (unsigned long)recursed_depth,
3781 if (recursed_depth) {
3784 for ( j = 0 ; j < recursed_depth ; j++ ) {
3785 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3787 PAREN_TEST(RExC_study_chunk_recursed +
3788 ( j * RExC_study_chunk_recursed_bytes), i )
3791 !PAREN_TEST(RExC_study_chunk_recursed +
3792 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3795 PerlIO_printf(Perl_debug_log," %d",i);
3799 if ( j + 1 < recursed_depth ) {
3800 PerlIO_printf(Perl_debug_log, ",");
3804 PerlIO_printf(Perl_debug_log,"\n");
3807 while ( scan && OP(scan) != END && scan < last ){
3808 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3809 node length to get a real minimum (because
3810 the folded version may be shorter) */
3811 bool unfolded_multi_char = FALSE;
3812 /* Peephole optimizer: */
3813 DEBUG_STUDYDATA("Peep:", data, depth);
3814 DEBUG_PEEP("Peep", scan, depth);
3817 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3818 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3819 * by a different invocation of reg() -- Yves
3821 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3823 /* Follow the next-chain of the current node and optimize
3824 away all the NOTHINGs from it. */
3825 if (OP(scan) != CURLYX) {
3826 const int max = (reg_off_by_arg[OP(scan)]
3828 /* I32 may be smaller than U16 on CRAYs! */
3829 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3830 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3834 /* Skip NOTHING and LONGJMP. */
3835 while ((n = regnext(n))
3836 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3837 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3838 && off + noff < max)
3840 if (reg_off_by_arg[OP(scan)])
3843 NEXT_OFF(scan) = off;
3846 /* The principal pseudo-switch. Cannot be a switch, since we
3847 look into several different things. */
3848 if ( OP(scan) == DEFINEP ) {
3850 SSize_t deltanext = 0;
3851 SSize_t fake_last_close = 0;
3852 I32 f = SCF_IN_DEFINE;
3854 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3855 scan = regnext(scan);
3856 assert( OP(scan) == IFTHEN );
3857 DEBUG_PEEP("expect IFTHEN", scan, depth);
3859 data_fake.last_closep= &fake_last_close;
3861 next = regnext(scan);
3862 scan = NEXTOPER(NEXTOPER(scan));
3863 DEBUG_PEEP("scan", scan, depth);
3864 DEBUG_PEEP("next", next, depth);
3866 /* we suppose the run is continuous, last=next...
3867 * NOTE we dont use the return here! */
3868 (void)study_chunk(pRExC_state, &scan, &minlen,
3869 &deltanext, next, &data_fake, stopparen,
3870 recursed_depth, NULL, f, depth+1);
3875 OP(scan) == BRANCH ||
3876 OP(scan) == BRANCHJ ||
3879 next = regnext(scan);
3882 /* The op(next)==code check below is to see if we
3883 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3884 * IFTHEN is special as it might not appear in pairs.
3885 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3886 * we dont handle it cleanly. */
3887 if (OP(next) == code || code == IFTHEN) {
3888 /* NOTE - There is similar code to this block below for
3889 * handling TRIE nodes on a re-study. If you change stuff here
3890 * check there too. */
3891 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3893 regnode * const startbranch=scan;
3895 if (flags & SCF_DO_SUBSTR) {
3896 /* Cannot merge strings after this. */
3897 scan_commit(pRExC_state, data, minlenp, is_inf);
3900 if (flags & SCF_DO_STCLASS)
3901 ssc_init_zero(pRExC_state, &accum);
3903 while (OP(scan) == code) {
3904 SSize_t deltanext, minnext, fake;
3906 regnode_ssc this_class;
3908 DEBUG_PEEP("Branch", scan, depth);
3911 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3913 data_fake.whilem_c = data->whilem_c;
3914 data_fake.last_closep = data->last_closep;
3917 data_fake.last_closep = &fake;
3919 data_fake.pos_delta = delta;
3920 next = regnext(scan);
3922 scan = NEXTOPER(scan); /* everything */
3923 if (code != BRANCH) /* everything but BRANCH */
3924 scan = NEXTOPER(scan);
3926 if (flags & SCF_DO_STCLASS) {
3927 ssc_init(pRExC_state, &this_class);
3928 data_fake.start_class = &this_class;
3929 f = SCF_DO_STCLASS_AND;
3931 if (flags & SCF_WHILEM_VISITED_POS)
3932 f |= SCF_WHILEM_VISITED_POS;
3934 /* we suppose the run is continuous, last=next...*/
3935 minnext = study_chunk(pRExC_state, &scan, minlenp,
3936 &deltanext, next, &data_fake, stopparen,
3937 recursed_depth, NULL, f,depth+1);
3941 if (deltanext == SSize_t_MAX) {
3942 is_inf = is_inf_internal = 1;
3944 } else if (max1 < minnext + deltanext)
3945 max1 = minnext + deltanext;
3947 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3949 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3950 if ( stopmin > minnext)
3951 stopmin = min + min1;
3952 flags &= ~SCF_DO_SUBSTR;
3954 data->flags |= SCF_SEEN_ACCEPT;
3957 if (data_fake.flags & SF_HAS_EVAL)
3958 data->flags |= SF_HAS_EVAL;
3959 data->whilem_c = data_fake.whilem_c;
3961 if (flags & SCF_DO_STCLASS)
3962 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3964 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3966 if (flags & SCF_DO_SUBSTR) {
3967 data->pos_min += min1;
3968 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3969 data->pos_delta = SSize_t_MAX;
3971 data->pos_delta += max1 - min1;
3972 if (max1 != min1 || is_inf)
3973 data->longest = &(data->longest_float);
3976 if (delta == SSize_t_MAX
3977 || SSize_t_MAX - delta - (max1 - min1) < 0)
3978 delta = SSize_t_MAX;
3980 delta += max1 - min1;
3981 if (flags & SCF_DO_STCLASS_OR) {
3982 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3984 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3985 flags &= ~SCF_DO_STCLASS;
3988 else if (flags & SCF_DO_STCLASS_AND) {
3990 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3991 flags &= ~SCF_DO_STCLASS;
3994 /* Switch to OR mode: cache the old value of
3995 * data->start_class */
3997 StructCopy(data->start_class, and_withp, regnode_ssc);
3998 flags &= ~SCF_DO_STCLASS_AND;
3999 StructCopy(&accum, data->start_class, regnode_ssc);
4000 flags |= SCF_DO_STCLASS_OR;
4004 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4005 OP( startbranch ) == BRANCH )
4009 Assuming this was/is a branch we are dealing with: 'scan'
4010 now points at the item that follows the branch sequence,
4011 whatever it is. We now start at the beginning of the
4012 sequence and look for subsequences of
4018 which would be constructed from a pattern like
4021 If we can find such a subsequence we need to turn the first
4022 element into a trie and then add the subsequent branch exact
4023 strings to the trie.
4027 1. patterns where the whole set of branches can be
4030 2. patterns where only a subset can be converted.
4032 In case 1 we can replace the whole set with a single regop
4033 for the trie. In case 2 we need to keep the start and end
4036 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4037 becomes BRANCH TRIE; BRANCH X;
4039 There is an additional case, that being where there is a
4040 common prefix, which gets split out into an EXACT like node
4041 preceding the TRIE node.
4043 If x(1..n)==tail then we can do a simple trie, if not we make
4044 a "jump" trie, such that when we match the appropriate word
4045 we "jump" to the appropriate tail node. Essentially we turn
4046 a nested if into a case structure of sorts.
4051 if (!re_trie_maxbuff) {
4052 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4053 if (!SvIOK(re_trie_maxbuff))
4054 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4056 if ( SvIV(re_trie_maxbuff)>=0 ) {
4058 regnode *first = (regnode *)NULL;
4059 regnode *last = (regnode *)NULL;
4060 regnode *tail = scan;
4064 /* var tail is used because there may be a TAIL
4065 regop in the way. Ie, the exacts will point to the
4066 thing following the TAIL, but the last branch will
4067 point at the TAIL. So we advance tail. If we
4068 have nested (?:) we may have to move through several
4072 while ( OP( tail ) == TAIL ) {
4073 /* this is the TAIL generated by (?:) */
4074 tail = regnext( tail );
4078 DEBUG_TRIE_COMPILE_r({
4079 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4080 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4081 (int)depth * 2 + 2, "",
4082 "Looking for TRIE'able sequences. Tail node is: ",
4083 SvPV_nolen_const( RExC_mysv )
4089 Step through the branches
4090 cur represents each branch,
4091 noper is the first thing to be matched as part
4093 noper_next is the regnext() of that node.
4095 We normally handle a case like this
4096 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4097 support building with NOJUMPTRIE, which restricts
4098 the trie logic to structures like /FOO|BAR/.
4100 If noper is a trieable nodetype then the branch is
4101 a possible optimization target. If we are building
4102 under NOJUMPTRIE then we require that noper_next is
4103 the same as scan (our current position in the regex
4106 Once we have two or more consecutive such branches
4107 we can create a trie of the EXACT's contents and
4108 stitch it in place into the program.
4110 If the sequence represents all of the branches in
4111 the alternation we replace the entire thing with a
4114 Otherwise when it is a subsequence we need to
4115 stitch it in place and replace only the relevant
4116 branches. This means the first branch has to remain
4117 as it is used by the alternation logic, and its
4118 next pointer, and needs to be repointed at the item
4119 on the branch chain following the last branch we
4120 have optimized away.
4122 This could be either a BRANCH, in which case the
4123 subsequence is internal, or it could be the item
4124 following the branch sequence in which case the
4125 subsequence is at the end (which does not
4126 necessarily mean the first node is the start of the
4129 TRIE_TYPE(X) is a define which maps the optype to a
4133 ----------------+-----------
4137 EXACTFU_SS | EXACTFU
4142 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
4143 ( EXACT == (X) ) ? EXACT : \
4144 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
4145 ( EXACTFA == (X) ) ? EXACTFA : \
4148 /* dont use tail as the end marker for this traverse */
4149 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4150 regnode * const noper = NEXTOPER( cur );
4151 U8 noper_type = OP( noper );
4152 U8 noper_trietype = TRIE_TYPE( noper_type );
4153 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4154 regnode * const noper_next = regnext( noper );
4155 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4156 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4159 DEBUG_TRIE_COMPILE_r({
4160 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4161 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4162 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4164 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4165 PerlIO_printf( Perl_debug_log, " -> %s",
4166 SvPV_nolen_const(RExC_mysv));
4169 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4170 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4171 SvPV_nolen_const(RExC_mysv));
4173 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4174 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4175 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4179 /* Is noper a trieable nodetype that can be merged
4180 * with the current trie (if there is one)? */
4184 ( noper_trietype == NOTHING)
4185 || ( trietype == NOTHING )
4186 || ( trietype == noper_trietype )
4189 && noper_next == tail
4193 /* Handle mergable triable node Either we are
4194 * the first node in a new trieable sequence,
4195 * in which case we do some bookkeeping,
4196 * otherwise we update the end pointer. */
4199 if ( noper_trietype == NOTHING ) {
4200 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4201 regnode * const noper_next = regnext( noper );
4202 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4203 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4206 if ( noper_next_trietype ) {
4207 trietype = noper_next_trietype;
4208 } else if (noper_next_type) {
4209 /* a NOTHING regop is 1 regop wide.
4210 * We need at least two for a trie
4211 * so we can't merge this in */
4215 trietype = noper_trietype;
4218 if ( trietype == NOTHING )
4219 trietype = noper_trietype;
4224 } /* end handle mergable triable node */
4226 /* handle unmergable node -
4227 * noper may either be a triable node which can
4228 * not be tried together with the current trie,
4229 * or a non triable node */
4231 /* If last is set and trietype is not
4232 * NOTHING then we have found at least two
4233 * triable branch sequences in a row of a
4234 * similar trietype so we can turn them
4235 * into a trie. If/when we allow NOTHING to
4236 * start a trie sequence this condition
4237 * will be required, and it isn't expensive
4238 * so we leave it in for now. */
4239 if ( trietype && trietype != NOTHING )
4240 make_trie( pRExC_state,
4241 startbranch, first, cur, tail,
4242 count, trietype, depth+1 );
4243 last = NULL; /* note: we clear/update
4244 first, trietype etc below,
4245 so we dont do it here */
4249 && noper_next == tail
4252 /* noper is triable, so we can start a new
4256 trietype = noper_trietype;
4258 /* if we already saw a first but the
4259 * current node is not triable then we have
4260 * to reset the first information. */
4265 } /* end handle unmergable node */
4266 } /* loop over branches */
4267 DEBUG_TRIE_COMPILE_r({
4268 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4269 PerlIO_printf( Perl_debug_log,
4270 "%*s- %s (%d) <SCAN FINISHED>\n",
4272 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4275 if ( last && trietype ) {
4276 if ( trietype != NOTHING ) {
4277 /* the last branch of the sequence was part of
4278 * a trie, so we have to construct it here
4279 * outside of the loop */
4280 made= make_trie( pRExC_state, startbranch,
4281 first, scan, tail, count,
4282 trietype, depth+1 );
4283 #ifdef TRIE_STUDY_OPT
4284 if ( ((made == MADE_EXACT_TRIE &&
4285 startbranch == first)
4286 || ( first_non_open == first )) &&
4288 flags |= SCF_TRIE_RESTUDY;
4289 if ( startbranch == first
4292 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4297 /* at this point we know whatever we have is a
4298 * NOTHING sequence/branch AND if 'startbranch'
4299 * is 'first' then we can turn the whole thing
4302 if ( startbranch == first ) {
4304 /* the entire thing is a NOTHING sequence,
4305 * something like this: (?:|) So we can
4306 * turn it into a plain NOTHING op. */
4307 DEBUG_TRIE_COMPILE_r({
4308 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309 PerlIO_printf( Perl_debug_log,
4310 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4311 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4314 OP(startbranch)= NOTHING;
4315 NEXT_OFF(startbranch)= tail - startbranch;
4316 for ( opt= startbranch + 1; opt < tail ; opt++ )
4320 } /* end if ( last) */
4321 } /* TRIE_MAXBUF is non zero */
4326 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4327 scan = NEXTOPER(NEXTOPER(scan));
4328 } else /* single branch is optimized. */
4329 scan = NEXTOPER(scan);
4331 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4333 regnode *start = NULL;
4334 regnode *end = NULL;
4335 U32 my_recursed_depth= recursed_depth;
4338 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4339 /* Do setup, note this code has side effects beyond
4340 * the rest of this block. Specifically setting
4341 * RExC_recurse[] must happen at least once during
4343 if (OP(scan) == GOSUB) {
4345 RExC_recurse[ARG2L(scan)] = scan;
4346 start = RExC_open_parens[paren-1];
4347 end = RExC_close_parens[paren-1];
4349 start = RExC_rxi->program + 1;
4352 /* NOTE we MUST always execute the above code, even
4353 * if we do nothing with a GOSUB/GOSTART */
4355 ( flags & SCF_IN_DEFINE )
4358 (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4360 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4363 /* no need to do anything here if we are in a define. */
4364 /* or we are after some kind of infinite construct
4365 * so we can skip recursing into this item.
4366 * Since it is infinite we will not change the maxlen
4367 * or delta, and if we miss something that might raise
4368 * the minlen it will merely pessimise a little.
4370 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4371 * might result in a minlen of 1 and not of 4,
4372 * but this doesn't make us mismatch, just try a bit
4373 * harder than we should.
4375 scan= regnext(scan);
4382 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4384 /* it is quite possible that there are more efficient ways
4385 * to do this. We maintain a bitmap per level of recursion
4386 * of which patterns we have entered so we can detect if a
4387 * pattern creates a possible infinite loop. When we
4388 * recurse down a level we copy the previous levels bitmap
4389 * down. When we are at recursion level 0 we zero the top
4390 * level bitmap. It would be nice to implement a different
4391 * more efficient way of doing this. In particular the top
4392 * level bitmap may be unnecessary.
4394 if (!recursed_depth) {
4395 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4397 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4398 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4399 RExC_study_chunk_recursed_bytes, U8);
4401 /* we havent recursed into this paren yet, so recurse into it */
4402 DEBUG_STUDYDATA("set:", data,depth);
4403 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4404 my_recursed_depth= recursed_depth + 1;
4406 DEBUG_STUDYDATA("inf:", data,depth);
4407 /* some form of infinite recursion, assume infinite length
4409 if (flags & SCF_DO_SUBSTR) {
4410 scan_commit(pRExC_state, data, minlenp, is_inf);
4411 data->longest = &(data->longest_float);
4413 is_inf = is_inf_internal = 1;
4414 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4415 ssc_anything(data->start_class);
4416 flags &= ~SCF_DO_STCLASS;
4418 start= NULL; /* reset start so we dont recurse later on. */
4423 end = regnext(scan);
4426 scan_frame *newframe;
4428 if (!RExC_frame_last) {
4429 Newxz(newframe, 1, scan_frame);
4430 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4431 RExC_frame_head= newframe;
4433 } else if (!RExC_frame_last->next_frame) {
4434 Newxz(newframe,1,scan_frame);
4435 RExC_frame_last->next_frame= newframe;
4436 newframe->prev_frame= RExC_frame_last;
4439 newframe= RExC_frame_last->next_frame;
4441 RExC_frame_last= newframe;
4443 newframe->next_regnode = regnext(scan);
4444 newframe->last_regnode = last;
4445 newframe->stopparen = stopparen;
4446 newframe->prev_recursed_depth = recursed_depth;
4447 newframe->this_prev_frame= frame;
4449 DEBUG_STUDYDATA("frame-new:",data,depth);
4450 DEBUG_PEEP("fnew", scan, depth);
4457 recursed_depth= my_recursed_depth;
4462 else if (OP(scan) == EXACT) {
4463 SSize_t l = STR_LEN(scan);
4466 const U8 * const s = (U8*)STRING(scan);
4467 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4468 l = utf8_length(s, s + l);
4470 uc = *((U8*)STRING(scan));
4473 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4474 /* The code below prefers earlier match for fixed
4475 offset, later match for variable offset. */
4476 if (data->last_end == -1) { /* Update the start info. */
4477 data->last_start_min = data->pos_min;
4478 data->last_start_max = is_inf
4479 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4481 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4483 SvUTF8_on(data->last_found);
4485 SV * const sv = data->last_found;
4486 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4487 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4488 if (mg && mg->mg_len >= 0)
4489 mg->mg_len += utf8_length((U8*)STRING(scan),
4490 (U8*)STRING(scan)+STR_LEN(scan));
4492 data->last_end = data->pos_min + l;
4493 data->pos_min += l; /* As in the first entry. */
4494 data->flags &= ~SF_BEFORE_EOL;
4497 /* ANDing the code point leaves at most it, and not in locale, and
4498 * can't match null string */
4499 if (flags & SCF_DO_STCLASS_AND) {
4500 ssc_cp_and(data->start_class, uc);
4501 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4502 ssc_clear_locale(data->start_class);
4504 else if (flags & SCF_DO_STCLASS_OR) {
4505 ssc_add_cp(data->start_class, uc);
4506 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4508 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4509 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4511 flags &= ~SCF_DO_STCLASS;
4513 else if (PL_regkind[OP(scan)] == EXACT) {
4514 /* But OP != EXACT!, so is EXACTFish */
4515 SSize_t l = STR_LEN(scan);
4516 UV uc = *((U8*)STRING(scan));
4517 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4518 separate code points */
4519 const U8 * s = (U8*)STRING(scan);
4521 /* Search for fixed substrings supports EXACT only. */
4522 if (flags & SCF_DO_SUBSTR) {
4524 scan_commit(pRExC_state, data, minlenp, is_inf);
4527 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4528 l = utf8_length(s, s + l);
4530 if (unfolded_multi_char) {
4531 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4533 min += l - min_subtract;
4535 delta += min_subtract;
4536 if (flags & SCF_DO_SUBSTR) {
4537 data->pos_min += l - min_subtract;
4538 if (data->pos_min < 0) {
4541 data->pos_delta += min_subtract;
4543 data->longest = &(data->longest_float);
4547 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4548 ssc_clear_locale(data->start_class);
4553 /* We punt and assume can match anything if the node begins
4554 * with a multi-character fold. Things are complicated. For
4555 * example, /ffi/i could match any of:
4556 * "\N{LATIN SMALL LIGATURE FFI}"
4557 * "\N{LATIN SMALL LIGATURE FF}I"
4558 * "F\N{LATIN SMALL LIGATURE FI}"
4559 * plus several other things; and making sure we have all the
4560 * possibilities is hard. */
4561 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4563 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4567 /* Any Latin1 range character can potentially match any
4568 * other depending on the locale */
4569 if (OP(scan) == EXACTFL) {
4570 _invlist_union(EXACTF_invlist, PL_Latin1,
4574 /* But otherwise, it matches at least itself. We can
4575 * quickly tell if it has a distinct fold, and if so,
4576 * it matches that as well */
4577 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4578 if (IS_IN_SOME_FOLD_L1(uc)) {
4579 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4580 PL_fold_latin1[uc]);
4584 /* Some characters match above-Latin1 ones under /i. This
4585 * is true of EXACTFL ones when the locale is UTF-8 */
4586 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4587 && (! isASCII(uc) || (OP(scan) != EXACTFA
4588 && OP(scan) != EXACTFA_NO_TRIE)))
4590 add_above_Latin1_folds(pRExC_state,
4596 else { /* Pattern is UTF-8 */
4597 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4598 STRLEN foldlen = UTF8SKIP(s);
4599 const U8* e = s + STR_LEN(scan);
4602 /* The only code points that aren't folded in a UTF EXACTFish
4603 * node are are the problematic ones in EXACTFL nodes */
4604 if (OP(scan) == EXACTFL
4605 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4607 /* We need to check for the possibility that this EXACTFL
4608 * node begins with a multi-char fold. Therefore we fold
4609 * the first few characters of it so that we can make that
4614 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4616 *(d++) = (U8) toFOLD(*s);
4621 to_utf8_fold(s, d, &len);
4627 /* And set up so the code below that looks in this folded
4628 * buffer instead of the node's string */
4630 foldlen = UTF8SKIP(folded);
4634 /* When we reach here 's' points to the fold of the first
4635 * character(s) of the node; and 'e' points to far enough along
4636 * the folded string to be just past any possible multi-char
4637 * fold. 'foldlen' is the length in bytes of the first
4640 * Unlike the non-UTF-8 case, the macro for determining if a
4641 * string is a multi-char fold requires all the characters to
4642 * already be folded. This is because of all the complications
4643 * if not. Note that they are folded anyway, except in EXACTFL
4644 * nodes. Like the non-UTF case above, we punt if the node
4645 * begins with a multi-char fold */
4647 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4649 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4651 else { /* Single char fold */
4653 /* It matches all the things that fold to it, which are
4654 * found in PL_utf8_foldclosures (including itself) */
4655 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4656 if (! PL_utf8_foldclosures) {
4657 _load_PL_utf8_foldclosures();
4659 if ((listp = hv_fetch(PL_utf8_foldclosures,
4660 (char *) s, foldlen, FALSE)))
4662 AV* list = (AV*) *listp;
4664 for (k = 0; k <= av_tindex(list); k++) {
4665 SV** c_p = av_fetch(list, k, FALSE);
4671 /* /aa doesn't allow folds between ASCII and non- */
4672 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4673 && isASCII(c) != isASCII(uc))
4678 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4683 if (flags & SCF_DO_STCLASS_AND) {
4684 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685 ANYOF_POSIXL_ZERO(data->start_class);
4686 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4688 else if (flags & SCF_DO_STCLASS_OR) {
4689 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4690 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4692 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4693 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4695 flags &= ~SCF_DO_STCLASS;
4696 SvREFCNT_dec(EXACTF_invlist);
4698 else if (REGNODE_VARIES(OP(scan))) {
4699 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4700 I32 fl = 0, f = flags;
4701 regnode * const oscan = scan;
4702 regnode_ssc this_class;
4703 regnode_ssc *oclass = NULL;
4704 I32 next_is_eval = 0;
4706 switch (PL_regkind[OP(scan)]) {
4707 case WHILEM: /* End of (?:...)* . */
4708 scan = NEXTOPER(scan);
4711 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4712 next = NEXTOPER(scan);
4713 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4715 maxcount = REG_INFTY;
4716 next = regnext(scan);
4717 scan = NEXTOPER(scan);
4721 if (flags & SCF_DO_SUBSTR)
4726 if (flags & SCF_DO_STCLASS) {
4728 maxcount = REG_INFTY;
4729 next = regnext(scan);
4730 scan = NEXTOPER(scan);
4733 if (flags & SCF_DO_SUBSTR) {
4734 scan_commit(pRExC_state, data, minlenp, is_inf);
4735 /* Cannot extend fixed substrings */
4736 data->longest = &(data->longest_float);
4738 is_inf = is_inf_internal = 1;
4739 scan = regnext(scan);
4740 goto optimize_curly_tail;
4742 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4743 && (scan->flags == stopparen))
4748 mincount = ARG1(scan);
4749 maxcount = ARG2(scan);
4751 next = regnext(scan);
4752 if (OP(scan) == CURLYX) {
4753 I32 lp = (data ? *(data->last_closep) : 0);
4754 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4756 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4757 next_is_eval = (OP(scan) == EVAL);
4759 if (flags & SCF_DO_SUBSTR) {
4761 scan_commit(pRExC_state, data, minlenp, is_inf);
4762 /* Cannot extend fixed substrings */
4763 pos_before = data->pos_min;
4767 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4769 data->flags |= SF_IS_INF;
4771 if (flags & SCF_DO_STCLASS) {
4772 ssc_init(pRExC_state, &this_class);
4773 oclass = data->start_class;
4774 data->start_class = &this_class;
4775 f |= SCF_DO_STCLASS_AND;
4776 f &= ~SCF_DO_STCLASS_OR;
4778 /* Exclude from super-linear cache processing any {n,m}
4779 regops for which the combination of input pos and regex
4780 pos is not enough information to determine if a match
4783 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4784 regex pos at the \s*, the prospects for a match depend not
4785 only on the input position but also on how many (bar\s*)
4786 repeats into the {4,8} we are. */
4787 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4788 f &= ~SCF_WHILEM_VISITED_POS;
4790 /* This will finish on WHILEM, setting scan, or on NULL: */
4791 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4792 last, data, stopparen, recursed_depth, NULL,
4794 ? (f & ~SCF_DO_SUBSTR)
4798 if (flags & SCF_DO_STCLASS)
4799 data->start_class = oclass;
4800 if (mincount == 0 || minnext == 0) {
4801 if (flags & SCF_DO_STCLASS_OR) {
4802 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4804 else if (flags & SCF_DO_STCLASS_AND) {
4805 /* Switch to OR mode: cache the old value of
4806 * data->start_class */
4808 StructCopy(data->start_class, and_withp, regnode_ssc);
4809 flags &= ~SCF_DO_STCLASS_AND;
4810 StructCopy(&this_class, data->start_class, regnode_ssc);
4811 flags |= SCF_DO_STCLASS_OR;
4812 ANYOF_FLAGS(data->start_class)
4813 |= SSC_MATCHES_EMPTY_STRING;
4815 } else { /* Non-zero len */
4816 if (flags & SCF_DO_STCLASS_OR) {
4817 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4818 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4820 else if (flags & SCF_DO_STCLASS_AND)
4821 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4822 flags &= ~SCF_DO_STCLASS;
4824 if (!scan) /* It was not CURLYX, but CURLY. */
4826 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4827 /* ? quantifier ok, except for (?{ ... }) */
4828 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4829 && (minnext == 0) && (deltanext == 0)
4830 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4831 && maxcount <= REG_INFTY/3) /* Complement check for big
4834 /* Fatal warnings may leak the regexp without this: */
4835 SAVEFREESV(RExC_rx_sv);
4836 ckWARNreg(RExC_parse,
4837 "Quantifier unexpected on zero-length expression");
4838 (void)ReREFCNT_inc(RExC_rx_sv);
4841 min += minnext * mincount;
4842 is_inf_internal |= deltanext == SSize_t_MAX
4843 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4844 is_inf |= is_inf_internal;
4846 delta = SSize_t_MAX;
4848 delta += (minnext + deltanext) * maxcount
4849 - minnext * mincount;
4851 /* Try powerful optimization CURLYX => CURLYN. */
4852 if ( OP(oscan) == CURLYX && data
4853 && data->flags & SF_IN_PAR
4854 && !(data->flags & SF_HAS_EVAL)
4855 && !deltanext && minnext == 1 ) {
4856 /* Try to optimize to CURLYN. */
4857 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4858 regnode * const nxt1 = nxt;
4865 if (!REGNODE_SIMPLE(OP(nxt))
4866 && !(PL_regkind[OP(nxt)] == EXACT
4867 && STR_LEN(nxt) == 1))
4873 if (OP(nxt) != CLOSE)
4875 if (RExC_open_parens) {
4876 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4877 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4879 /* Now we know that nxt2 is the only contents: */
4880 oscan->flags = (U8)ARG(nxt);
4882 OP(nxt1) = NOTHING; /* was OPEN. */
4885 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4886 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4887 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4888 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4889 OP(nxt + 1) = OPTIMIZED; /* was count. */
4890 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4895 /* Try optimization CURLYX => CURLYM. */
4896 if ( OP(oscan) == CURLYX && data
4897 && !(data->flags & SF_HAS_PAR)
4898 && !(data->flags & SF_HAS_EVAL)
4899 && !deltanext /* atom is fixed width */
4900 && minnext != 0 /* CURLYM can't handle zero width */
4902 /* Nor characters whose fold at run-time may be
4903 * multi-character */
4904 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4906 /* XXXX How to optimize if data == 0? */
4907 /* Optimize to a simpler form. */
4908 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4912 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4913 && (OP(nxt2) != WHILEM))
4915 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4916 /* Need to optimize away parenths. */
4917 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4918 /* Set the parenth number. */
4919 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4921 oscan->flags = (U8)ARG(nxt);
4922 if (RExC_open_parens) {
4923 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4924 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4926 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4927 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4930 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4931 OP(nxt + 1) = OPTIMIZED; /* was count. */
4932 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4933 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4936 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4937 regnode *nnxt = regnext(nxt1);
4939 if (reg_off_by_arg[OP(nxt1)])
4940 ARG_SET(nxt1, nxt2 - nxt1);
4941 else if (nxt2 - nxt1 < U16_MAX)
4942 NEXT_OFF(nxt1) = nxt2 - nxt1;
4944 OP(nxt) = NOTHING; /* Cannot beautify */
4949 /* Optimize again: */
4950 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4951 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4956 else if ((OP(oscan) == CURLYX)
4957 && (flags & SCF_WHILEM_VISITED_POS)
4958 /* See the comment on a similar expression above.
4959 However, this time it's not a subexpression
4960 we care about, but the expression itself. */
4961 && (maxcount == REG_INFTY)
4962 && data && ++data->whilem_c < 16) {
4963 /* This stays as CURLYX, we can put the count/of pair. */
4964 /* Find WHILEM (as in regexec.c) */
4965 regnode *nxt = oscan + NEXT_OFF(oscan);
4967 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4969 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4970 | (RExC_whilem_seen << 4)); /* On WHILEM */
4972 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4974 if (flags & SCF_DO_SUBSTR) {
4975 SV *last_str = NULL;
4976 STRLEN last_chrs = 0;
4977 int counted = mincount != 0;
4979 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4981 SSize_t b = pos_before >= data->last_start_min
4982 ? pos_before : data->last_start_min;
4984 const char * const s = SvPV_const(data->last_found, l);
4985 SSize_t old = b - data->last_start_min;
4988 old = utf8_hop((U8*)s, old) - (U8*)s;
4990 /* Get the added string: */
4991 last_str = newSVpvn_utf8(s + old, l, UTF);
4992 last_chrs = UTF ? utf8_length((U8*)(s + old),
4993 (U8*)(s + old + l)) : l;
4994 if (deltanext == 0 && pos_before == b) {
4995 /* What was added is a constant string */
4998 SvGROW(last_str, (mincount * l) + 1);
4999 repeatcpy(SvPVX(last_str) + l,
5000 SvPVX_const(last_str), l,
5002 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5003 /* Add additional parts. */
5004 SvCUR_set(data->last_found,
5005 SvCUR(data->last_found) - l);
5006 sv_catsv(data->last_found, last_str);
5008 SV * sv = data->last_found;
5010 SvUTF8(sv) && SvMAGICAL(sv) ?
5011 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5012 if (mg && mg->mg_len >= 0)
5013 mg->mg_len += last_chrs * (mincount-1);
5015 last_chrs *= mincount;
5016 data->last_end += l * (mincount - 1);
5019 /* start offset must point into the last copy */
5020 data->last_start_min += minnext * (mincount - 1);
5021 data->last_start_max += is_inf ? SSize_t_MAX
5022 : (maxcount - 1) * (minnext + data->pos_delta);
5025 /* It is counted once already... */
5026 data->pos_min += minnext * (mincount - counted);
5028 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5029 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5030 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5031 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5033 if (deltanext != SSize_t_MAX)
5034 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5035 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5036 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5038 if (deltanext == SSize_t_MAX
5039 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5040 data->pos_delta = SSize_t_MAX;
5042 data->pos_delta += - counted * deltanext +
5043 (minnext + deltanext) * maxcount - minnext * mincount;
5044 if (mincount != maxcount) {
5045 /* Cannot extend fixed substrings found inside
5047 scan_commit(pRExC_state, data, minlenp, is_inf);
5048 if (mincount && last_str) {
5049 SV * const sv = data->last_found;
5050 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5051 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5055 sv_setsv(sv, last_str);
5056 data->last_end = data->pos_min;
5057 data->last_start_min = data->pos_min - last_chrs;
5058 data->last_start_max = is_inf
5060 : data->pos_min + data->pos_delta - last_chrs;
5062 data->longest = &(data->longest_float);
5064 SvREFCNT_dec(last_str);
5066 if (data && (fl & SF_HAS_EVAL))
5067 data->flags |= SF_HAS_EVAL;
5068 optimize_curly_tail:
5069 if (OP(oscan) != CURLYX) {
5070 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5072 NEXT_OFF(oscan) += NEXT_OFF(next);
5078 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5083 if (flags & SCF_DO_SUBSTR) {
5084 /* Cannot expect anything... */
5085 scan_commit(pRExC_state, data, minlenp, is_inf);
5086 data->longest = &(data->longest_float);
5088 is_inf = is_inf_internal = 1;
5089 if (flags & SCF_DO_STCLASS_OR) {
5090 if (OP(scan) == CLUMP) {
5091 /* Actually is any start char, but very few code points
5092 * aren't start characters */
5093 ssc_match_all_cp(data->start_class);
5096 ssc_anything(data->start_class);
5099 flags &= ~SCF_DO_STCLASS;
5103 else if (OP(scan) == LNBREAK) {
5104 if (flags & SCF_DO_STCLASS) {
5105 if (flags & SCF_DO_STCLASS_AND) {
5106 ssc_intersection(data->start_class,
5107 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5108 ssc_clear_locale(data->start_class);
5109 ANYOF_FLAGS(data->start_class)
5110 &= ~SSC_MATCHES_EMPTY_STRING;
5112 else if (flags & SCF_DO_STCLASS_OR) {
5113 ssc_union(data->start_class,
5114 PL_XPosix_ptrs[_CC_VERTSPACE],
5116 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5118 /* See commit msg for
5119 * 749e076fceedeb708a624933726e7989f2302f6a */
5120 ANYOF_FLAGS(data->start_class)
5121 &= ~SSC_MATCHES_EMPTY_STRING;
5123 flags &= ~SCF_DO_STCLASS;
5126 delta++; /* Because of the 2 char string cr-lf */
5127 if (flags & SCF_DO_SUBSTR) {
5128 /* Cannot expect anything... */
5129 scan_commit(pRExC_state, data, minlenp, is_inf);
5131 data->pos_delta += 1;
5132 data->longest = &(data->longest_float);
5135 else if (REGNODE_SIMPLE(OP(scan))) {
5137 if (flags & SCF_DO_SUBSTR) {
5138 scan_commit(pRExC_state, data, minlenp, is_inf);
5142 if (flags & SCF_DO_STCLASS) {
5144 SV* my_invlist = sv_2mortal(_new_invlist(0));
5147 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5148 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5150 /* Some of the logic below assumes that switching
5151 locale on will only add false positives. */
5156 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5161 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5162 ssc_match_all_cp(data->start_class);
5167 SV* REG_ANY_invlist = _new_invlist(2);
5168 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5170 if (flags & SCF_DO_STCLASS_OR) {
5171 ssc_union(data->start_class,
5173 TRUE /* TRUE => invert, hence all but \n
5177 else if (flags & SCF_DO_STCLASS_AND) {
5178 ssc_intersection(data->start_class,
5180 TRUE /* TRUE => invert */
5182 ssc_clear_locale(data->start_class);
5184 SvREFCNT_dec_NN(REG_ANY_invlist);
5189 if (flags & SCF_DO_STCLASS_AND)
5190 ssc_and(pRExC_state, data->start_class,
5191 (regnode_charclass *) scan);
5193 ssc_or(pRExC_state, data->start_class,
5194 (regnode_charclass *) scan);
5202 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5203 if (flags & SCF_DO_STCLASS_AND) {
5204 bool was_there = cBOOL(
5205 ANYOF_POSIXL_TEST(data->start_class,
5207 ANYOF_POSIXL_ZERO(data->start_class);
5208 if (was_there) { /* Do an AND */
5209 ANYOF_POSIXL_SET(data->start_class, namedclass);
5211 /* No individual code points can now match */
5212 data->start_class->invlist
5213 = sv_2mortal(_new_invlist(0));
5216 int complement = namedclass + ((invert) ? -1 : 1);
5218 assert(flags & SCF_DO_STCLASS_OR);
5220 /* If the complement of this class was already there,
5221 * the result is that they match all code points,
5222 * (\d + \D == everything). Remove the classes from
5223 * future consideration. Locale is not relevant in
5225 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5226 ssc_match_all_cp(data->start_class);
5227 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5228 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5230 else { /* The usual case; just add this class to the
5232 ANYOF_POSIXL_SET(data->start_class, namedclass);
5237 case NPOSIXA: /* For these, we always know the exact set of
5242 if (FLAGS(scan) == _CC_ASCII) {
5243 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
5246 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5247 PL_XPosix_ptrs[_CC_ASCII],
5258 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5260 /* NPOSIXD matches all upper Latin1 code points unless the
5261 * target string being matched is UTF-8, which is
5262 * unknowable until match time. Since we are going to
5263 * invert, we want to get rid of all of them so that the
5264 * inversion will match all */
5265 if (OP(scan) == NPOSIXD) {
5266 _invlist_subtract(my_invlist, PL_UpperLatin1,
5272 if (flags & SCF_DO_STCLASS_AND) {
5273 ssc_intersection(data->start_class, my_invlist, invert);
5274 ssc_clear_locale(data->start_class);
5277 assert(flags & SCF_DO_STCLASS_OR);
5278 ssc_union(data->start_class, my_invlist, invert);
5281 if (flags & SCF_DO_STCLASS_OR)
5282 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5283 flags &= ~SCF_DO_STCLASS;
5286 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5287 data->flags |= (OP(scan) == MEOL
5290 scan_commit(pRExC_state, data, minlenp, is_inf);
5293 else if ( PL_regkind[OP(scan)] == BRANCHJ
5294 /* Lookbehind, or need to calculate parens/evals/stclass: */
5295 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5296 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5298 if ( OP(scan) == UNLESSM &&
5300 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5301 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5304 regnode *upto= regnext(scan);
5306 DEBUG_STUDYDATA("OPFAIL",data,depth);
5308 /*DEBUG_PARSE_MSG("opfail");*/
5309 regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state);
5310 PerlIO_printf(Perl_debug_log,
5311 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5312 SvPV_nolen_const(RExC_mysv),
5313 (IV)REG_NODE_NUM(upto),
5318 NEXT_OFF(scan) = upto - scan;
5319 for (opt= scan + 1; opt < upto ; opt++)
5320 OP(opt) = OPTIMIZED;
5324 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5325 || OP(scan) == UNLESSM )
5327 /* Negative Lookahead/lookbehind
5328 In this case we can't do fixed string optimisation.
5331 SSize_t deltanext, minnext, fake = 0;
5336 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5338 data_fake.whilem_c = data->whilem_c;
5339 data_fake.last_closep = data->last_closep;
5342 data_fake.last_closep = &fake;
5343 data_fake.pos_delta = delta;
5344 if ( flags & SCF_DO_STCLASS && !scan->flags
5345 && OP(scan) == IFMATCH ) { /* Lookahead */
5346 ssc_init(pRExC_state, &intrnl);
5347 data_fake.start_class = &intrnl;
5348 f |= SCF_DO_STCLASS_AND;
5350 if (flags & SCF_WHILEM_VISITED_POS)
5351 f |= SCF_WHILEM_VISITED_POS;
5352 next = regnext(scan);
5353 nscan = NEXTOPER(NEXTOPER(scan));
5354 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5355 last, &data_fake, stopparen,
5356 recursed_depth, NULL, f, depth+1);
5359 FAIL("Variable length lookbehind not implemented");
5361 else if (minnext > (I32)U8_MAX) {
5362 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5365 scan->flags = (U8)minnext;
5368 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5370 if (data_fake.flags & SF_HAS_EVAL)
5371 data->flags |= SF_HAS_EVAL;
5372 data->whilem_c = data_fake.whilem_c;
5374 if (f & SCF_DO_STCLASS_AND) {
5375 if (flags & SCF_DO_STCLASS_OR) {
5376 /* OR before, AND after: ideally we would recurse with
5377 * data_fake to get the AND applied by study of the
5378 * remainder of the pattern, and then derecurse;
5379 * *** HACK *** for now just treat as "no information".
5380 * See [perl #56690].
5382 ssc_init(pRExC_state, data->start_class);
5384 /* AND before and after: combine and continue. These
5385 * assertions are zero-length, so can match an EMPTY
5387 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5388 ANYOF_FLAGS(data->start_class)
5389 |= SSC_MATCHES_EMPTY_STRING;
5393 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5395 /* Positive Lookahead/lookbehind
5396 In this case we can do fixed string optimisation,
5397 but we must be careful about it. Note in the case of
5398 lookbehind the positions will be offset by the minimum
5399 length of the pattern, something we won't know about
5400 until after the recurse.
5402 SSize_t deltanext, fake = 0;
5406 /* We use SAVEFREEPV so that when the full compile
5407 is finished perl will clean up the allocated
5408 minlens when it's all done. This way we don't
5409 have to worry about freeing them when we know
5410 they wont be used, which would be a pain.
5413 Newx( minnextp, 1, SSize_t );
5414 SAVEFREEPV(minnextp);
5417 StructCopy(data, &data_fake, scan_data_t);
5418 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5421 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5422 data_fake.last_found=newSVsv(data->last_found);
5426 data_fake.last_closep = &fake;
5427 data_fake.flags = 0;
5428 data_fake.pos_delta = delta;
5430 data_fake.flags |= SF_IS_INF;
5431 if ( flags & SCF_DO_STCLASS && !scan->flags
5432 && OP(scan) == IFMATCH ) { /* Lookahead */
5433 ssc_init(pRExC_state, &intrnl);
5434 data_fake.start_class = &intrnl;
5435 f |= SCF_DO_STCLASS_AND;
5437 if (flags & SCF_WHILEM_VISITED_POS)
5438 f |= SCF_WHILEM_VISITED_POS;
5439 next = regnext(scan);
5440 nscan = NEXTOPER(NEXTOPER(scan));
5442 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5443 &deltanext, last, &data_fake,
5444 stopparen, recursed_depth, NULL,
5448 FAIL("Variable length lookbehind not implemented");
5450 else if (*minnextp > (I32)U8_MAX) {
5451 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5454 scan->flags = (U8)*minnextp;
5459 if (f & SCF_DO_STCLASS_AND) {
5460 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5461 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5464 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5466 if (data_fake.flags & SF_HAS_EVAL)
5467 data->flags |= SF_HAS_EVAL;
5468 data->whilem_c = data_fake.whilem_c;
5469 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5470 if (RExC_rx->minlen<*minnextp)
5471 RExC_rx->minlen=*minnextp;
5472 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5473 SvREFCNT_dec_NN(data_fake.last_found);
5475 if ( data_fake.minlen_fixed != minlenp )
5477 data->offset_fixed= data_fake.offset_fixed;
5478 data->minlen_fixed= data_fake.minlen_fixed;
5479 data->lookbehind_fixed+= scan->flags;
5481 if ( data_fake.minlen_float != minlenp )
5483 data->minlen_float= data_fake.minlen_float;
5484 data->offset_float_min=data_fake.offset_float_min;
5485 data->offset_float_max=data_fake.offset_float_max;
5486 data->lookbehind_float+= scan->flags;
5493 else if (OP(scan) == OPEN) {
5494 if (stopparen != (I32)ARG(scan))
5497 else if (OP(scan) == CLOSE) {
5498 if (stopparen == (I32)ARG(scan)) {
5501 if ((I32)ARG(scan) == is_par) {
5502 next = regnext(scan);
5504 if ( next && (OP(next) != WHILEM) && next < last)
5505 is_par = 0; /* Disable optimization */
5508 *(data->last_closep) = ARG(scan);
5510 else if (OP(scan) == EVAL) {
5512 data->flags |= SF_HAS_EVAL;
5514 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5515 if (flags & SCF_DO_SUBSTR) {
5516 scan_commit(pRExC_state, data, minlenp, is_inf);
5517 flags &= ~SCF_DO_SUBSTR;
5519 if (data && OP(scan)==ACCEPT) {
5520 data->flags |= SCF_SEEN_ACCEPT;
5525 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5527 if (flags & SCF_DO_SUBSTR) {
5528 scan_commit(pRExC_state, data, minlenp, is_inf);
5529 data->longest = &(data->longest_float);
5531 is_inf = is_inf_internal = 1;
5532 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5533 ssc_anything(data->start_class);
5534 flags &= ~SCF_DO_STCLASS;
5536 else if (OP(scan) == GPOS) {
5537 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5538 !(delta || is_inf || (data && data->pos_delta)))
5540 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5541 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5542 if (RExC_rx->gofs < (STRLEN)min)
5543 RExC_rx->gofs = min;
5545 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5549 #ifdef TRIE_STUDY_OPT
5550 #ifdef FULL_TRIE_STUDY
5551 else if (PL_regkind[OP(scan)] == TRIE) {
5552 /* NOTE - There is similar code to this block above for handling
5553 BRANCH nodes on the initial study. If you change stuff here
5555 regnode *trie_node= scan;
5556 regnode *tail= regnext(scan);
5557 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5558 SSize_t max1 = 0, min1 = SSize_t_MAX;
5561 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5562 /* Cannot merge strings after this. */
5563 scan_commit(pRExC_state, data, minlenp, is_inf);
5565 if (flags & SCF_DO_STCLASS)
5566 ssc_init_zero(pRExC_state, &accum);
5572 const regnode *nextbranch= NULL;
5575 for ( word=1 ; word <= trie->wordcount ; word++)
5577 SSize_t deltanext=0, minnext=0, f = 0, fake;
5578 regnode_ssc this_class;
5580 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5582 data_fake.whilem_c = data->whilem_c;
5583 data_fake.last_closep = data->last_closep;
5586 data_fake.last_closep = &fake;
5587 data_fake.pos_delta = delta;
5588 if (flags & SCF_DO_STCLASS) {
5589 ssc_init(pRExC_state, &this_class);
5590 data_fake.start_class = &this_class;
5591 f = SCF_DO_STCLASS_AND;
5593 if (flags & SCF_WHILEM_VISITED_POS)
5594 f |= SCF_WHILEM_VISITED_POS;
5596 if (trie->jump[word]) {
5598 nextbranch = trie_node + trie->jump[0];
5599 scan= trie_node + trie->jump[word];
5600 /* We go from the jump point to the branch that follows
5601 it. Note this means we need the vestigal unused
5602 branches even though they arent otherwise used. */
5603 minnext = study_chunk(pRExC_state, &scan, minlenp,
5604 &deltanext, (regnode *)nextbranch, &data_fake,
5605 stopparen, recursed_depth, NULL, f,depth+1);
5607 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5608 nextbranch= regnext((regnode*)nextbranch);
5610 if (min1 > (SSize_t)(minnext + trie->minlen))
5611 min1 = minnext + trie->minlen;
5612 if (deltanext == SSize_t_MAX) {
5613 is_inf = is_inf_internal = 1;
5615 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5616 max1 = minnext + deltanext + trie->maxlen;
5618 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5620 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5621 if ( stopmin > min + min1)
5622 stopmin = min + min1;
5623 flags &= ~SCF_DO_SUBSTR;
5625 data->flags |= SCF_SEEN_ACCEPT;
5628 if (data_fake.flags & SF_HAS_EVAL)
5629 data->flags |= SF_HAS_EVAL;
5630 data->whilem_c = data_fake.whilem_c;
5632 if (flags & SCF_DO_STCLASS)
5633 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5636 if (flags & SCF_DO_SUBSTR) {
5637 data->pos_min += min1;
5638 data->pos_delta += max1 - min1;
5639 if (max1 != min1 || is_inf)
5640 data->longest = &(data->longest_float);
5643 delta += max1 - min1;
5644 if (flags & SCF_DO_STCLASS_OR) {
5645 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5647 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5648 flags &= ~SCF_DO_STCLASS;
5651 else if (flags & SCF_DO_STCLASS_AND) {
5653 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5654 flags &= ~SCF_DO_STCLASS;
5657 /* Switch to OR mode: cache the old value of
5658 * data->start_class */
5660 StructCopy(data->start_class, and_withp, regnode_ssc);
5661 flags &= ~SCF_DO_STCLASS_AND;
5662 StructCopy(&accum, data->start_class, regnode_ssc);
5663 flags |= SCF_DO_STCLASS_OR;
5670 else if (PL_regkind[OP(scan)] == TRIE) {
5671 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5674 min += trie->minlen;
5675 delta += (trie->maxlen - trie->minlen);
5676 flags &= ~SCF_DO_STCLASS; /* xxx */
5677 if (flags & SCF_DO_SUBSTR) {
5678 /* Cannot expect anything... */
5679 scan_commit(pRExC_state, data, minlenp, is_inf);
5680 data->pos_min += trie->minlen;
5681 data->pos_delta += (trie->maxlen - trie->minlen);
5682 if (trie->maxlen != trie->minlen)
5683 data->longest = &(data->longest_float);
5685 if (trie->jump) /* no more substrings -- for now /grr*/
5686 flags &= ~SCF_DO_SUBSTR;
5688 #endif /* old or new */
5689 #endif /* TRIE_STUDY_OPT */
5691 /* Else: zero-length, ignore. */
5692 scan = regnext(scan);
5694 /* If we are exiting a recursion we can unset its recursed bit
5695 * and allow ourselves to enter it again - no danger of an
5696 * infinite loop there.
5697 if (stopparen > -1 && recursed) {
5698 DEBUG_STUDYDATA("unset:", data,depth);
5699 PAREN_UNSET( recursed, stopparen);
5705 DEBUG_STUDYDATA("frame-end:",data,depth);
5706 DEBUG_PEEP("fend", scan, depth);
5708 /* restore previous context */
5709 last = frame->last_regnode;
5710 scan = frame->next_regnode;
5711 stopparen = frame->stopparen;
5712 recursed_depth = frame->prev_recursed_depth;
5714 RExC_frame_last = frame->prev_frame;
5715 frame = frame->this_prev_frame;
5716 goto fake_study_recurse;
5721 DEBUG_STUDYDATA("pre-fin:",data,depth);
5724 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5726 if (flags & SCF_DO_SUBSTR && is_inf)
5727 data->pos_delta = SSize_t_MAX - data->pos_min;
5728 if (is_par > (I32)U8_MAX)
5730 if (is_par && pars==1 && data) {
5731 data->flags |= SF_IN_PAR;
5732 data->flags &= ~SF_HAS_PAR;
5734 else if (pars && data) {
5735 data->flags |= SF_HAS_PAR;
5736 data->flags &= ~SF_IN_PAR;
5738 if (flags & SCF_DO_STCLASS_OR)
5739 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5740 if (flags & SCF_TRIE_RESTUDY)
5741 data->flags |= SCF_TRIE_RESTUDY;
5743 DEBUG_STUDYDATA("post-fin:",data,depth);
5746 SSize_t final_minlen= min < stopmin ? min : stopmin;
5748 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5749 RExC_maxlen = final_minlen + delta;
5751 return final_minlen;
5757 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5759 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5761 PERL_ARGS_ASSERT_ADD_DATA;
5763 Renewc(RExC_rxi->data,
5764 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5765 char, struct reg_data);
5767 Renew(RExC_rxi->data->what, count + n, U8);
5769 Newx(RExC_rxi->data->what, n, U8);
5770 RExC_rxi->data->count = count + n;
5771 Copy(s, RExC_rxi->data->what + count, n, U8);
5775 /*XXX: todo make this not included in a non debugging perl, but appears to be
5776 * used anyway there, in 'use re' */
5777 #ifndef PERL_IN_XSUB_RE
5779 Perl_reginitcolors(pTHX)
5781 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5783 char *t = savepv(s);
5787 t = strchr(t, '\t');
5793 PL_colors[i] = t = (char *)"";
5798 PL_colors[i++] = (char *)"";
5805 #ifdef TRIE_STUDY_OPT
5806 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5809 (data.flags & SCF_TRIE_RESTUDY) \
5817 #define CHECK_RESTUDY_GOTO_butfirst
5821 * pregcomp - compile a regular expression into internal code
5823 * Decides which engine's compiler to call based on the hint currently in
5827 #ifndef PERL_IN_XSUB_RE
5829 /* return the currently in-scope regex engine (or the default if none) */
5831 regexp_engine const *
5832 Perl_current_re_engine(pTHX)
5834 if (IN_PERL_COMPILETIME) {
5835 HV * const table = GvHV(PL_hintgv);
5838 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5839 return &PL_core_reg_engine;
5840 ptr = hv_fetchs(table, "regcomp", FALSE);
5841 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5842 return &PL_core_reg_engine;
5843 return INT2PTR(regexp_engine*,SvIV(*ptr));
5847 if (!PL_curcop->cop_hints_hash)
5848 return &PL_core_reg_engine;
5849 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5850 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5851 return &PL_core_reg_engine;
5852 return INT2PTR(regexp_engine*,SvIV(ptr));
5858 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5860 regexp_engine const *eng = current_re_engine();
5861 GET_RE_DEBUG_FLAGS_DECL;
5863 PERL_ARGS_ASSERT_PREGCOMP;
5865 /* Dispatch a request to compile a regexp to correct regexp engine. */
5867 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5870 return CALLREGCOMP_ENG(eng, pattern, flags);
5874 /* public(ish) entry point for the perl core's own regex compiling code.
5875 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5876 * pattern rather than a list of OPs, and uses the internal engine rather
5877 * than the current one */
5880 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5882 SV *pat = pattern; /* defeat constness! */
5883 PERL_ARGS_ASSERT_RE_COMPILE;
5884 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5885 #ifdef PERL_IN_XSUB_RE
5888 &PL_core_reg_engine,
5890 NULL, NULL, rx_flags, 0);
5894 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5895 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5896 * point to the realloced string and length.
5898 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5902 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5903 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5905 U8 *const src = (U8*)*pat_p;
5910 GET_RE_DEBUG_FLAGS_DECL;
5912 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5913 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5915 Newx(dst, *plen_p * 2 + 1, U8);
5918 while (s < *plen_p) {
5919 append_utf8_from_native_byte(src[s], &d);
5920 if (n < num_code_blocks) {
5921 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5922 pRExC_state->code_blocks[n].start = d - dst - 1;
5923 assert(*(d - 1) == '(');
5926 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5927 pRExC_state->code_blocks[n].end = d - dst - 1;
5928 assert(*(d - 1) == ')');
5937 *pat_p = (char*) dst;
5939 RExC_orig_utf8 = RExC_utf8 = 1;
5944 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5945 * while recording any code block indices, and handling overloading,
5946 * nested qr// objects etc. If pat is null, it will allocate a new
5947 * string, or just return the first arg, if there's only one.
5949 * Returns the malloced/updated pat.
5950 * patternp and pat_count is the array of SVs to be concatted;
5951 * oplist is the optional list of ops that generated the SVs;
5952 * recompile_p is a pointer to a boolean that will be set if
5953 * the regex will need to be recompiled.
5954 * delim, if non-null is an SV that will be inserted between each element
5958 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5959 SV *pat, SV ** const patternp, int pat_count,
5960 OP *oplist, bool *recompile_p, SV *delim)
5964 bool use_delim = FALSE;
5965 bool alloced = FALSE;
5967 /* if we know we have at least two args, create an empty string,
5968 * then concatenate args to that. For no args, return an empty string */
5969 if (!pat && pat_count != 1) {
5975 for (svp = patternp; svp < patternp + pat_count; svp++) {
5978 STRLEN orig_patlen = 0;
5980 SV *msv = use_delim ? delim : *svp;
5981 if (!msv) msv = &PL_sv_undef;
5983 /* if we've got a delimiter, we go round the loop twice for each
5984 * svp slot (except the last), using the delimiter the second
5993 if (SvTYPE(msv) == SVt_PVAV) {
5994 /* we've encountered an interpolated array within
5995 * the pattern, e.g. /...@a..../. Expand the list of elements,
5996 * then recursively append elements.
5997 * The code in this block is based on S_pushav() */
5999 AV *const av = (AV*)msv;
6000 const SSize_t maxarg = AvFILL(av) + 1;
6004 assert(oplist->op_type == OP_PADAV
6005 || oplist->op_type == OP_RV2AV);
6006 oplist = OP_SIBLING(oplist);
6009 if (SvRMAGICAL(av)) {
6012 Newx(array, maxarg, SV*);
6014 for (i=0; i < maxarg; i++) {
6015 SV ** const svp = av_fetch(av, i, FALSE);
6016 array[i] = svp ? *svp : &PL_sv_undef;
6020 array = AvARRAY(av);
6022 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6023 array, maxarg, NULL, recompile_p,
6025 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6031 /* we make the assumption here that each op in the list of
6032 * op_siblings maps to one SV pushed onto the stack,
6033 * except for code blocks, with have both an OP_NULL and
6035 * This allows us to match up the list of SVs against the
6036 * list of OPs to find the next code block.
6038 * Note that PUSHMARK PADSV PADSV ..
6040 * PADRANGE PADSV PADSV ..
6041 * so the alignment still works. */
6044 if (oplist->op_type == OP_NULL
6045 && (oplist->op_flags & OPf_SPECIAL))
6047 assert(n < pRExC_state->num_code_blocks);
6048 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6049 pRExC_state->code_blocks[n].block = oplist;
6050 pRExC_state->code_blocks[n].src_regex = NULL;
6053 oplist = OP_SIBLING(oplist); /* skip CONST */
6056 oplist = OP_SIBLING(oplist);;
6059 /* apply magic and QR overloading to arg */
6062 if (SvROK(msv) && SvAMAGIC(msv)) {
6063 SV *sv = AMG_CALLunary(msv, regexp_amg);
6067 if (SvTYPE(sv) != SVt_REGEXP)
6068 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6073 /* try concatenation overload ... */
6074 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6075 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6078 /* overloading involved: all bets are off over literal
6079 * code. Pretend we haven't seen it */
6080 pRExC_state->num_code_blocks -= n;
6084 /* ... or failing that, try "" overload */
6085 while (SvAMAGIC(msv)
6086 && (sv = AMG_CALLunary(msv, string_amg))
6090 && SvRV(msv) == SvRV(sv))
6095 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6099 /* this is a partially unrolled
6100 * sv_catsv_nomg(pat, msv);
6101 * that allows us to adjust code block indices if
6104 char *dst = SvPV_force_nomg(pat, dlen);
6106 if (SvUTF8(msv) && !SvUTF8(pat)) {
6107 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6108 sv_setpvn(pat, dst, dlen);
6111 sv_catsv_nomg(pat, msv);
6118 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6121 /* extract any code blocks within any embedded qr//'s */
6122 if (rx && SvTYPE(rx) == SVt_REGEXP
6123 && RX_ENGINE((REGEXP*)rx)->op_comp)
6126 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6127 if (ri->num_code_blocks) {
6129 /* the presence of an embedded qr// with code means
6130 * we should always recompile: the text of the
6131 * qr// may not have changed, but it may be a
6132 * different closure than last time */
6134 Renew(pRExC_state->code_blocks,
6135 pRExC_state->num_code_blocks + ri->num_code_blocks,
6136 struct reg_code_block);
6137 pRExC_state->num_code_blocks += ri->num_code_blocks;
6139 for (i=0; i < ri->num_code_blocks; i++) {
6140 struct reg_code_block *src, *dst;
6141 STRLEN offset = orig_patlen
6142 + ReANY((REGEXP *)rx)->pre_prefix;
6143 assert(n < pRExC_state->num_code_blocks);
6144 src = &ri->code_blocks[i];
6145 dst = &pRExC_state->code_blocks[n];
6146 dst->start = src->start + offset;
6147 dst->end = src->end + offset;
6148 dst->block = src->block;
6149 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6158 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6167 /* see if there are any run-time code blocks in the pattern.
6168 * False positives are allowed */
6171 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6172 char *pat, STRLEN plen)
6177 PERL_UNUSED_CONTEXT;
6179 for (s = 0; s < plen; s++) {
6180 if (n < pRExC_state->num_code_blocks
6181 && s == pRExC_state->code_blocks[n].start)
6183 s = pRExC_state->code_blocks[n].end;
6187 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6189 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6191 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6198 /* Handle run-time code blocks. We will already have compiled any direct
6199 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6200 * copy of it, but with any literal code blocks blanked out and
6201 * appropriate chars escaped; then feed it into
6203 * eval "qr'modified_pattern'"
6207 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6211 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6213 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6214 * and merge them with any code blocks of the original regexp.
6216 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6217 * instead, just save the qr and return FALSE; this tells our caller that
6218 * the original pattern needs upgrading to utf8.
6222 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6223 char *pat, STRLEN plen)
6227 GET_RE_DEBUG_FLAGS_DECL;
6229 if (pRExC_state->runtime_code_qr) {
6230 /* this is the second time we've been called; this should
6231 * only happen if the main pattern got upgraded to utf8
6232 * during compilation; re-use the qr we compiled first time
6233 * round (which should be utf8 too)
6235 qr = pRExC_state->runtime_code_qr;
6236 pRExC_state->runtime_code_qr = NULL;
6237 assert(RExC_utf8 && SvUTF8(qr));
6243 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6247 /* determine how many extra chars we need for ' and \ escaping */
6248 for (s = 0; s < plen; s++) {
6249 if (pat[s] == '\'' || pat[s] == '\\')
6253 Newx(newpat, newlen, char);
6255 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6257 for (s = 0; s < plen; s++) {
6258 if (n < pRExC_state->num_code_blocks
6259 && s == pRExC_state->code_blocks[n].start)
6261 /* blank out literal code block */
6262 assert(pat[s] == '(');
6263 while (s <= pRExC_state->code_blocks[n].end) {
6271 if (pat[s] == '\'' || pat[s] == '\\')
6276 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6280 PerlIO_printf(Perl_debug_log,
6281 "%sre-parsing pattern for runtime code:%s %s\n",
6282 PL_colors[4],PL_colors[5],newpat);
6285 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6290 PUSHSTACKi(PERLSI_REQUIRE);
6291 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6292 * parsing qr''; normally only q'' does this. It also alters
6294 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6295 SvREFCNT_dec_NN(sv);
6300 SV * const errsv = ERRSV;
6301 if (SvTRUE_NN(errsv))
6303 Safefree(pRExC_state->code_blocks);
6304 /* use croak_sv ? */
6305 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6308 assert(SvROK(qr_ref));
6310 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6311 /* the leaving below frees the tmp qr_ref.
6312 * Give qr a life of its own */
6320 if (!RExC_utf8 && SvUTF8(qr)) {
6321 /* first time through; the pattern got upgraded; save the
6322 * qr for the next time through */
6323 assert(!pRExC_state->runtime_code_qr);
6324 pRExC_state->runtime_code_qr = qr;
6329 /* extract any code blocks within the returned qr// */
6332 /* merge the main (r1) and run-time (r2) code blocks into one */
6334 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6335 struct reg_code_block *new_block, *dst;
6336 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6339 if (!r2->num_code_blocks) /* we guessed wrong */
6341 SvREFCNT_dec_NN(qr);
6346 r1->num_code_blocks + r2->num_code_blocks,
6347 struct reg_code_block);
6350 while ( i1 < r1->num_code_blocks
6351 || i2 < r2->num_code_blocks)
6353 struct reg_code_block *src;
6356 if (i1 == r1->num_code_blocks) {
6357 src = &r2->code_blocks[i2++];
6360 else if (i2 == r2->num_code_blocks)
6361 src = &r1->code_blocks[i1++];
6362 else if ( r1->code_blocks[i1].start
6363 < r2->code_blocks[i2].start)
6365 src = &r1->code_blocks[i1++];
6366 assert(src->end < r2->code_blocks[i2].start);
6369 assert( r1->code_blocks[i1].start
6370 > r2->code_blocks[i2].start);
6371 src = &r2->code_blocks[i2++];
6373 assert(src->end < r1->code_blocks[i1].start);
6376 assert(pat[src->start] == '(');
6377 assert(pat[src->end] == ')');
6378 dst->start = src->start;
6379 dst->end = src->end;
6380 dst->block = src->block;
6381 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6385 r1->num_code_blocks += r2->num_code_blocks;
6386 Safefree(r1->code_blocks);
6387 r1->code_blocks = new_block;
6390 SvREFCNT_dec_NN(qr);
6396 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6397 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6398 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6399 STRLEN longest_length, bool eol, bool meol)
6401 /* This is the common code for setting up the floating and fixed length
6402 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6403 * as to whether succeeded or not */
6408 if (! (longest_length
6409 || (eol /* Can't have SEOL and MULTI */
6410 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6412 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6413 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6418 /* copy the information about the longest from the reg_scan_data
6419 over to the program. */
6420 if (SvUTF8(sv_longest)) {
6421 *rx_utf8 = sv_longest;
6424 *rx_substr = sv_longest;
6427 /* end_shift is how many chars that must be matched that
6428 follow this item. We calculate it ahead of time as once the
6429 lookbehind offset is added in we lose the ability to correctly
6431 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6432 *rx_end_shift = ml - offset
6433 - longest_length + (SvTAIL(sv_longest) != 0)
6436 t = (eol/* Can't have SEOL and MULTI */
6437 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6438 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6444 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6445 * regular expression into internal code.
6446 * The pattern may be passed either as:
6447 * a list of SVs (patternp plus pat_count)
6448 * a list of OPs (expr)
6449 * If both are passed, the SV list is used, but the OP list indicates
6450 * which SVs are actually pre-compiled code blocks
6452 * The SVs in the list have magic and qr overloading applied to them (and
6453 * the list may be modified in-place with replacement SVs in the latter
6456 * If the pattern hasn't changed from old_re, then old_re will be
6459 * eng is the current engine. If that engine has an op_comp method, then
6460 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6461 * do the initial concatenation of arguments and pass on to the external
6464 * If is_bare_re is not null, set it to a boolean indicating whether the
6465 * arg list reduced (after overloading) to a single bare regex which has
6466 * been returned (i.e. /$qr/).
6468 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6470 * pm_flags contains the PMf_* flags, typically based on those from the
6471 * pm_flags field of the related PMOP. Currently we're only interested in
6472 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6474 * We can't allocate space until we know how big the compiled form will be,
6475 * but we can't compile it (and thus know how big it is) until we've got a
6476 * place to put the code. So we cheat: we compile it twice, once with code
6477 * generation turned off and size counting turned on, and once "for real".
6478 * This also means that we don't allocate space until we are sure that the
6479 * thing really will compile successfully, and we never have to move the
6480 * code and thus invalidate pointers into it. (Note that it has to be in
6481 * one piece because free() must be able to free it all.) [NB: not true in perl]
6483 * Beware that the optimization-preparation code in here knows about some
6484 * of the structure of the compiled regexp. [I'll say.]
6488 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6489 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6490 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6494 regexp_internal *ri;
6502 SV *code_blocksv = NULL;
6503 SV** new_patternp = patternp;
6505 /* these are all flags - maybe they should be turned
6506 * into a single int with different bit masks */
6507 I32 sawlookahead = 0;
6512 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6514 bool runtime_code = 0;
6516 RExC_state_t RExC_state;
6517 RExC_state_t * const pRExC_state = &RExC_state;
6518 #ifdef TRIE_STUDY_OPT
6520 RExC_state_t copyRExC_state;
6522 GET_RE_DEBUG_FLAGS_DECL;
6524 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6526 DEBUG_r(if (!PL_colorset) reginitcolors());
6528 #ifndef PERL_IN_XSUB_RE
6529 /* Initialize these here instead of as-needed, as is quick and avoids
6530 * having to test them each time otherwise */
6531 if (! PL_AboveLatin1) {
6532 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6533 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6534 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6535 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6536 PL_HasMultiCharFold =
6537 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6539 /* This is calculated here, because the Perl program that generates the
6540 * static global ones doesn't currently have access to
6541 * NUM_ANYOF_CODE_POINTS */
6542 PL_InBitmap = _new_invlist(2);
6543 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6544 NUM_ANYOF_CODE_POINTS - 1);
6548 pRExC_state->code_blocks = NULL;
6549 pRExC_state->num_code_blocks = 0;
6552 *is_bare_re = FALSE;
6554 if (expr && (expr->op_type == OP_LIST ||
6555 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6556 /* allocate code_blocks if needed */
6560 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6561 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6562 ncode++; /* count of DO blocks */
6564 pRExC_state->num_code_blocks = ncode;
6565 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6570 /* compile-time pattern with just OP_CONSTs and DO blocks */
6575 /* find how many CONSTs there are */
6578 if (expr->op_type == OP_CONST)
6581 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6582 if (o->op_type == OP_CONST)
6586 /* fake up an SV array */
6588 assert(!new_patternp);
6589 Newx(new_patternp, n, SV*);
6590 SAVEFREEPV(new_patternp);
6594 if (expr->op_type == OP_CONST)
6595 new_patternp[n] = cSVOPx_sv(expr);
6597 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6598 if (o->op_type == OP_CONST)
6599 new_patternp[n++] = cSVOPo_sv;
6604 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6605 "Assembling pattern from %d elements%s\n", pat_count,
6606 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6608 /* set expr to the first arg op */
6610 if (pRExC_state->num_code_blocks
6611 && expr->op_type != OP_CONST)
6613 expr = cLISTOPx(expr)->op_first;
6614 assert( expr->op_type == OP_PUSHMARK
6615 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6616 || expr->op_type == OP_PADRANGE);
6617 expr = OP_SIBLING(expr);
6620 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6621 expr, &recompile, NULL);
6623 /* handle bare (possibly after overloading) regex: foo =~ $re */
6628 if (SvTYPE(re) == SVt_REGEXP) {
6632 Safefree(pRExC_state->code_blocks);
6633 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6634 "Precompiled pattern%s\n",
6635 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6641 exp = SvPV_nomg(pat, plen);
6643 if (!eng->op_comp) {
6644 if ((SvUTF8(pat) && IN_BYTES)
6645 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6647 /* make a temporary copy; either to convert to bytes,
6648 * or to avoid repeating get-magic / overloaded stringify */
6649 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6650 (IN_BYTES ? 0 : SvUTF8(pat)));
6652 Safefree(pRExC_state->code_blocks);
6653 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6656 /* ignore the utf8ness if the pattern is 0 length */
6657 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6658 RExC_uni_semantics = 0;
6659 RExC_contains_locale = 0;
6660 RExC_contains_i = 0;
6661 pRExC_state->runtime_code_qr = NULL;
6662 RExC_frame_head= NULL;
6663 RExC_frame_last= NULL;
6664 RExC_frame_count= 0;
6667 RExC_mysv1= sv_newmortal();
6668 RExC_mysv2= sv_newmortal();
6671 SV *dsv= sv_newmortal();
6672 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6673 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6674 PL_colors[4],PL_colors[5],s);
6678 /* we jump here if we upgrade the pattern to utf8 and have to
6681 if ((pm_flags & PMf_USE_RE_EVAL)
6682 /* this second condition covers the non-regex literal case,
6683 * i.e. $foo =~ '(?{})'. */
6684 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6686 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6688 /* return old regex if pattern hasn't changed */
6689 /* XXX: note in the below we have to check the flags as well as the
6692 * Things get a touch tricky as we have to compare the utf8 flag
6693 * independently from the compile flags. */
6697 && !!RX_UTF8(old_re) == !!RExC_utf8
6698 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6699 && RX_PRECOMP(old_re)
6700 && RX_PRELEN(old_re) == plen
6701 && memEQ(RX_PRECOMP(old_re), exp, plen)
6702 && !runtime_code /* with runtime code, always recompile */ )
6704 Safefree(pRExC_state->code_blocks);
6708 rx_flags = orig_rx_flags;
6710 if (rx_flags & PMf_FOLD) {
6711 RExC_contains_i = 1;
6713 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6715 /* Set to use unicode semantics if the pattern is in utf8 and has the
6716 * 'depends' charset specified, as it means unicode when utf8 */
6717 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6721 RExC_flags = rx_flags;
6722 RExC_pm_flags = pm_flags;
6725 if (TAINTING_get && TAINT_get)
6726 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6728 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6729 /* whoops, we have a non-utf8 pattern, whilst run-time code
6730 * got compiled as utf8. Try again with a utf8 pattern */
6731 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6732 pRExC_state->num_code_blocks);
6733 goto redo_first_pass;
6736 assert(!pRExC_state->runtime_code_qr);
6742 RExC_in_lookbehind = 0;
6743 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6745 RExC_override_recoding = 0;
6746 RExC_in_multi_char_class = 0;
6748 /* First pass: determine size, legality. */
6751 RExC_end = exp + plen;
6756 RExC_emit = (regnode *) &RExC_emit_dummy;
6757 RExC_whilem_seen = 0;
6758 RExC_open_parens = NULL;
6759 RExC_close_parens = NULL;
6761 RExC_paren_names = NULL;
6763 RExC_paren_name_list = NULL;
6765 RExC_recurse = NULL;
6766 RExC_study_chunk_recursed = NULL;
6767 RExC_study_chunk_recursed_bytes= 0;
6768 RExC_recurse_count = 0;
6769 pRExC_state->code_index = 0;
6771 #if 0 /* REGC() is (currently) a NOP at the first pass.
6772 * Clever compilers notice this and complain. --jhi */
6773 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6776 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6778 RExC_lastparse=NULL;
6780 /* reg may croak on us, not giving us a chance to free
6781 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6782 need it to survive as long as the regexp (qr/(?{})/).
6783 We must check that code_blocksv is not already set, because we may
6784 have jumped back to restart the sizing pass. */
6785 if (pRExC_state->code_blocks && !code_blocksv) {
6786 code_blocksv = newSV_type(SVt_PV);
6787 SAVEFREESV(code_blocksv);
6788 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6789 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6791 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6792 /* It's possible to write a regexp in ascii that represents Unicode
6793 codepoints outside of the byte range, such as via \x{100}. If we
6794 detect such a sequence we have to convert the entire pattern to utf8
6795 and then recompile, as our sizing calculation will have been based
6796 on 1 byte == 1 character, but we will need to use utf8 to encode
6797 at least some part of the pattern, and therefore must convert the whole
6800 if (flags & RESTART_UTF8) {
6801 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6802 pRExC_state->num_code_blocks);
6803 goto redo_first_pass;
6805 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6808 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6811 PerlIO_printf(Perl_debug_log,
6812 "Required size %"IVdf" nodes\n"
6813 "Starting second pass (creation)\n",
6816 RExC_lastparse=NULL;
6819 /* The first pass could have found things that force Unicode semantics */
6820 if ((RExC_utf8 || RExC_uni_semantics)
6821 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6823 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6826 /* Small enough for pointer-storage convention?
6827 If extralen==0, this means that we will not need long jumps. */
6828 if (RExC_size >= 0x10000L && RExC_extralen)
6829 RExC_size += RExC_extralen;
6832 if (RExC_whilem_seen > 15)
6833 RExC_whilem_seen = 15;
6835 /* Allocate space and zero-initialize. Note, the two step process
6836 of zeroing when in debug mode, thus anything assigned has to
6837 happen after that */
6838 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6840 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6841 char, regexp_internal);
6842 if ( r == NULL || ri == NULL )
6843 FAIL("Regexp out of space");
6845 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6846 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6849 /* bulk initialize base fields with 0. */
6850 Zero(ri, sizeof(regexp_internal), char);
6853 /* non-zero initialization begins here */
6856 r->extflags = rx_flags;
6857 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6859 if (pm_flags & PMf_IS_QR) {
6860 ri->code_blocks = pRExC_state->code_blocks;
6861 ri->num_code_blocks = pRExC_state->num_code_blocks;
6866 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6867 if (pRExC_state->code_blocks[n].src_regex)
6868 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6869 SAVEFREEPV(pRExC_state->code_blocks);
6873 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6874 bool has_charset = (get_regex_charset(r->extflags)
6875 != REGEX_DEPENDS_CHARSET);
6877 /* The caret is output if there are any defaults: if not all the STD
6878 * flags are set, or if no character set specifier is needed */
6880 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6882 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6883 == REG_RUN_ON_COMMENT_SEEN);
6884 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6885 >> RXf_PMf_STD_PMMOD_SHIFT);
6886 const char *fptr = STD_PAT_MODS; /*"msix"*/
6888 /* Allocate for the worst case, which is all the std flags are turned
6889 * on. If more precision is desired, we could do a population count of
6890 * the flags set. This could be done with a small lookup table, or by
6891 * shifting, masking and adding, or even, when available, assembly
6892 * language for a machine-language population count.
6893 * We never output a minus, as all those are defaults, so are
6894 * covered by the caret */
6895 const STRLEN wraplen = plen + has_p + has_runon
6896 + has_default /* If needs a caret */
6898 /* If needs a character set specifier */
6899 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6900 + (sizeof(STD_PAT_MODS) - 1)
6901 + (sizeof("(?:)") - 1);
6903 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6904 r->xpv_len_u.xpvlenu_pv = p;
6906 SvFLAGS(rx) |= SVf_UTF8;
6909 /* If a default, cover it using the caret */
6911 *p++= DEFAULT_PAT_MOD;
6915 const char* const name = get_regex_charset_name(r->extflags, &len);
6916 Copy(name, p, len, char);
6920 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6923 while((ch = *fptr++)) {
6931 Copy(RExC_precomp, p, plen, char);
6932 assert ((RX_WRAPPED(rx) - p) < 16);
6933 r->pre_prefix = p - RX_WRAPPED(rx);
6939 SvCUR_set(rx, p - RX_WRAPPED(rx));
6943 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6945 /* setup various meta data about recursion, this all requires
6946 * RExC_npar to be correctly set, and a bit later on we clear it */
6947 if (RExC_seen & REG_RECURSE_SEEN) {
6948 Newxz(RExC_open_parens, RExC_npar,regnode *);
6949 SAVEFREEPV(RExC_open_parens);
6950 Newxz(RExC_close_parens,RExC_npar,regnode *);
6951 SAVEFREEPV(RExC_close_parens);
6953 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6954 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6955 * So its 1 if there are no parens. */
6956 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6957 ((RExC_npar & 0x07) != 0);
6958 Newx(RExC_study_chunk_recursed,
6959 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6960 SAVEFREEPV(RExC_study_chunk_recursed);
6963 /* Useful during FAIL. */
6964 #ifdef RE_TRACK_PATTERN_OFFSETS
6965 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6966 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6967 "%s %"UVuf" bytes for offset annotations.\n",
6968 ri->u.offsets ? "Got" : "Couldn't get",
6969 (UV)((2*RExC_size+1) * sizeof(U32))));
6971 SetProgLen(ri,RExC_size);
6976 /* Second pass: emit code. */
6977 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6978 RExC_pm_flags = pm_flags;
6980 RExC_end = exp + plen;
6983 RExC_emit_start = ri->program;
6984 RExC_emit = ri->program;
6985 RExC_emit_bound = ri->program + RExC_size + 1;
6986 pRExC_state->code_index = 0;
6988 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6989 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6991 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6993 /* XXXX To minimize changes to RE engine we always allocate
6994 3-units-long substrs field. */
6995 Newx(r->substrs, 1, struct reg_substr_data);
6996 if (RExC_recurse_count) {
6997 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6998 SAVEFREEPV(RExC_recurse);
7002 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7004 RExC_study_chunk_recursed_count= 0;
7006 Zero(r->substrs, 1, struct reg_substr_data);
7007 if (RExC_study_chunk_recursed) {
7008 Zero(RExC_study_chunk_recursed,
7009 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7013 #ifdef TRIE_STUDY_OPT
7015 StructCopy(&zero_scan_data, &data, scan_data_t);
7016 copyRExC_state = RExC_state;
7019 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7021 RExC_state = copyRExC_state;
7022 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7023 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7025 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7026 StructCopy(&zero_scan_data, &data, scan_data_t);
7029 StructCopy(&zero_scan_data, &data, scan_data_t);
7032 /* Dig out information for optimizations. */
7033 r->extflags = RExC_flags; /* was pm_op */
7034 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7037 SvUTF8_on(rx); /* Unicode in it? */
7038 ri->regstclass = NULL;
7039 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
7040 r->intflags |= PREGf_NAUGHTY;
7041 scan = ri->program + 1; /* First BRANCH. */
7043 /* testing for BRANCH here tells us whether there is "must appear"
7044 data in the pattern. If there is then we can use it for optimisations */
7045 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7048 STRLEN longest_float_length, longest_fixed_length;
7049 regnode_ssc ch_class; /* pointed to by data */
7051 SSize_t last_close = 0; /* pointed to by data */
7052 regnode *first= scan;
7053 regnode *first_next= regnext(first);
7055 * Skip introductions and multiplicators >= 1
7056 * so that we can extract the 'meat' of the pattern that must
7057 * match in the large if() sequence following.
7058 * NOTE that EXACT is NOT covered here, as it is normally
7059 * picked up by the optimiser separately.
7061 * This is unfortunate as the optimiser isnt handling lookahead
7062 * properly currently.
7065 while ((OP(first) == OPEN && (sawopen = 1)) ||
7066 /* An OR of *one* alternative - should not happen now. */
7067 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7068 /* for now we can't handle lookbehind IFMATCH*/
7069 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7070 (OP(first) == PLUS) ||
7071 (OP(first) == MINMOD) ||
7072 /* An {n,m} with n>0 */
7073 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7074 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7077 * the only op that could be a regnode is PLUS, all the rest
7078 * will be regnode_1 or regnode_2.
7080 * (yves doesn't think this is true)
7082 if (OP(first) == PLUS)
7085 if (OP(first) == MINMOD)
7087 first += regarglen[OP(first)];
7089 first = NEXTOPER(first);
7090 first_next= regnext(first);
7093 /* Starting-point info. */
7095 DEBUG_PEEP("first:",first,0);
7096 /* Ignore EXACT as we deal with it later. */
7097 if (PL_regkind[OP(first)] == EXACT) {
7098 if (OP(first) == EXACT)
7099 NOOP; /* Empty, get anchored substr later. */
7101 ri->regstclass = first;
7104 else if (PL_regkind[OP(first)] == TRIE &&
7105 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7107 /* this can happen only on restudy */
7108 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7111 else if (REGNODE_SIMPLE(OP(first)))
7112 ri->regstclass = first;
7113 else if (PL_regkind[OP(first)] == BOUND ||
7114 PL_regkind[OP(first)] == NBOUND)
7115 ri->regstclass = first;
7116 else if (PL_regkind[OP(first)] == BOL) {
7117 r->intflags |= (OP(first) == MBOL
7120 first = NEXTOPER(first);
7123 else if (OP(first) == GPOS) {
7124 r->intflags |= PREGf_ANCH_GPOS;
7125 first = NEXTOPER(first);
7128 else if ((!sawopen || !RExC_sawback) &&
7130 (OP(first) == STAR &&
7131 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7132 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7134 /* turn .* into ^.* with an implied $*=1 */
7136 (OP(NEXTOPER(first)) == REG_ANY)
7139 r->intflags |= (type | PREGf_IMPLICIT);
7140 first = NEXTOPER(first);
7143 if (sawplus && !sawminmod && !sawlookahead
7144 && (!sawopen || !RExC_sawback)
7145 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7146 /* x+ must match at the 1st pos of run of x's */
7147 r->intflags |= PREGf_SKIP;
7149 /* Scan is after the zeroth branch, first is atomic matcher. */
7150 #ifdef TRIE_STUDY_OPT
7153 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7154 (IV)(first - scan + 1))
7158 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7159 (IV)(first - scan + 1))
7165 * If there's something expensive in the r.e., find the
7166 * longest literal string that must appear and make it the
7167 * regmust. Resolve ties in favor of later strings, since
7168 * the regstart check works with the beginning of the r.e.
7169 * and avoiding duplication strengthens checking. Not a
7170 * strong reason, but sufficient in the absence of others.
7171 * [Now we resolve ties in favor of the earlier string if
7172 * it happens that c_offset_min has been invalidated, since the
7173 * earlier string may buy us something the later one won't.]
7176 data.longest_fixed = newSVpvs("");
7177 data.longest_float = newSVpvs("");
7178 data.last_found = newSVpvs("");
7179 data.longest = &(data.longest_fixed);
7180 ENTER_with_name("study_chunk");
7181 SAVEFREESV(data.longest_fixed);
7182 SAVEFREESV(data.longest_float);
7183 SAVEFREESV(data.last_found);
7185 if (!ri->regstclass) {
7186 ssc_init(pRExC_state, &ch_class);
7187 data.start_class = &ch_class;
7188 stclass_flag = SCF_DO_STCLASS_AND;
7189 } else /* XXXX Check for BOUND? */
7191 data.last_closep = &last_close;
7194 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7195 scan + RExC_size, /* Up to end */
7197 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7198 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7202 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7205 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7206 && data.last_start_min == 0 && data.last_end > 0
7207 && !RExC_seen_zerolen
7208 && !(RExC_seen & REG_VERBARG_SEEN)
7209 && !(RExC_seen & REG_GPOS_SEEN)
7211 r->extflags |= RXf_CHECK_ALL;
7213 scan_commit(pRExC_state, &data,&minlen,0);
7215 longest_float_length = CHR_SVLEN(data.longest_float);
7217 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7218 && data.offset_fixed == data.offset_float_min
7219 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7220 && S_setup_longest (aTHX_ pRExC_state,
7224 &(r->float_end_shift),
7225 data.lookbehind_float,
7226 data.offset_float_min,
7228 longest_float_length,
7229 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7230 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7232 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7233 r->float_max_offset = data.offset_float_max;
7234 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7235 r->float_max_offset -= data.lookbehind_float;
7236 SvREFCNT_inc_simple_void_NN(data.longest_float);
7239 r->float_substr = r->float_utf8 = NULL;
7240 longest_float_length = 0;
7243 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7245 if (S_setup_longest (aTHX_ pRExC_state,
7247 &(r->anchored_utf8),
7248 &(r->anchored_substr),
7249 &(r->anchored_end_shift),
7250 data.lookbehind_fixed,
7253 longest_fixed_length,
7254 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7255 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7257 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7258 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7261 r->anchored_substr = r->anchored_utf8 = NULL;
7262 longest_fixed_length = 0;
7264 LEAVE_with_name("study_chunk");
7267 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7268 ri->regstclass = NULL;
7270 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7272 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7273 && is_ssc_worth_it(pRExC_state, data.start_class))
7275 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7277 ssc_finalize(pRExC_state, data.start_class);
7279 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7280 StructCopy(data.start_class,
7281 (regnode_ssc*)RExC_rxi->data->data[n],
7283 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7284 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7285 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7286 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7287 PerlIO_printf(Perl_debug_log,
7288 "synthetic stclass \"%s\".\n",
7289 SvPVX_const(sv));});
7290 data.start_class = NULL;
7293 /* A temporary algorithm prefers floated substr to fixed one to dig
7295 if (longest_fixed_length > longest_float_length) {
7296 r->substrs->check_ix = 0;
7297 r->check_end_shift = r->anchored_end_shift;
7298 r->check_substr = r->anchored_substr;
7299 r->check_utf8 = r->anchored_utf8;
7300 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7301 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7302 r->intflags |= PREGf_NOSCAN;
7305 r->substrs->check_ix = 1;
7306 r->check_end_shift = r->float_end_shift;
7307 r->check_substr = r->float_substr;
7308 r->check_utf8 = r->float_utf8;
7309 r->check_offset_min = r->float_min_offset;
7310 r->check_offset_max = r->float_max_offset;
7312 if ((r->check_substr || r->check_utf8) ) {
7313 r->extflags |= RXf_USE_INTUIT;
7314 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7315 r->extflags |= RXf_INTUIT_TAIL;
7317 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7319 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7320 if ( (STRLEN)minlen < longest_float_length )
7321 minlen= longest_float_length;
7322 if ( (STRLEN)minlen < longest_fixed_length )
7323 minlen= longest_fixed_length;
7327 /* Several toplevels. Best we can is to set minlen. */
7329 regnode_ssc ch_class;
7330 SSize_t last_close = 0;
7332 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7334 scan = ri->program + 1;
7335 ssc_init(pRExC_state, &ch_class);
7336 data.start_class = &ch_class;
7337 data.last_closep = &last_close;
7340 minlen = study_chunk(pRExC_state,
7341 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7342 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7343 ? SCF_TRIE_DOING_RESTUDY
7347 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7349 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7350 = r->float_substr = r->float_utf8 = NULL;
7352 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7353 && is_ssc_worth_it(pRExC_state, data.start_class))
7355 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7357 ssc_finalize(pRExC_state, data.start_class);
7359 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7360 StructCopy(data.start_class,
7361 (regnode_ssc*)RExC_rxi->data->data[n],
7363 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7364 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7365 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7366 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7367 PerlIO_printf(Perl_debug_log,
7368 "synthetic stclass \"%s\".\n",
7369 SvPVX_const(sv));});
7370 data.start_class = NULL;
7374 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7375 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7376 r->maxlen = REG_INFTY;
7379 r->maxlen = RExC_maxlen;
7382 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7383 the "real" pattern. */
7385 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7386 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7388 r->minlenret = minlen;
7389 if (r->minlen < minlen)
7392 if (RExC_seen & REG_GPOS_SEEN)
7393 r->intflags |= PREGf_GPOS_SEEN;
7394 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7395 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7397 if (pRExC_state->num_code_blocks)
7398 r->extflags |= RXf_EVAL_SEEN;
7399 if (RExC_seen & REG_CANY_SEEN)
7400 r->intflags |= PREGf_CANY_SEEN;
7401 if (RExC_seen & REG_VERBARG_SEEN)
7403 r->intflags |= PREGf_VERBARG_SEEN;
7404 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7406 if (RExC_seen & REG_CUTGROUP_SEEN)
7407 r->intflags |= PREGf_CUTGROUP_SEEN;
7408 if (pm_flags & PMf_USE_RE_EVAL)
7409 r->intflags |= PREGf_USE_RE_EVAL;
7410 if (RExC_paren_names)
7411 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7413 RXp_PAREN_NAMES(r) = NULL;
7415 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7416 * so it can be used in pp.c */
7417 if (r->intflags & PREGf_ANCH)
7418 r->extflags |= RXf_IS_ANCHORED;
7422 /* this is used to identify "special" patterns that might result
7423 * in Perl NOT calling the regex engine and instead doing the match "itself",
7424 * particularly special cases in split//. By having the regex compiler
7425 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7426 * we avoid weird issues with equivalent patterns resulting in different behavior,
7427 * AND we allow non Perl engines to get the same optimizations by the setting the
7428 * flags appropriately - Yves */
7429 regnode *first = ri->program + 1;
7431 regnode *next = NEXTOPER(first);
7434 if (PL_regkind[fop] == NOTHING && nop == END)
7435 r->extflags |= RXf_NULL;
7436 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7437 /* when fop is SBOL first->flags will be true only when it was
7438 * produced by parsing /\A/, and not when parsing /^/. This is
7439 * very important for the split code as there we want to
7440 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7441 * See rt #122761 for more details. -- Yves */
7442 r->extflags |= RXf_START_ONLY;
7443 else if (fop == PLUS
7444 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7445 && OP(regnext(first)) == END)
7446 r->extflags |= RXf_WHITE;
7447 else if ( r->extflags & RXf_SPLIT
7449 && STR_LEN(first) == 1
7450 && *(STRING(first)) == ' '
7451 && OP(regnext(first)) == END )
7452 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7456 if (RExC_contains_locale) {
7457 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7461 if (RExC_paren_names) {
7462 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7463 ri->data->data[ri->name_list_idx]
7464 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7467 ri->name_list_idx = 0;
7469 if (RExC_recurse_count) {
7470 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7471 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7472 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7475 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7476 /* assume we don't need to swap parens around before we match */
7478 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7479 (unsigned long)RExC_study_chunk_recursed_count);
7483 PerlIO_printf(Perl_debug_log,"Final program:\n");
7486 #ifdef RE_TRACK_PATTERN_OFFSETS
7487 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7488 const STRLEN len = ri->u.offsets[0];
7490 GET_RE_DEBUG_FLAGS_DECL;
7491 PerlIO_printf(Perl_debug_log,
7492 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7493 for (i = 1; i <= len; i++) {
7494 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7495 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7496 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7498 PerlIO_printf(Perl_debug_log, "\n");
7503 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7504 * by setting the regexp SV to readonly-only instead. If the
7505 * pattern's been recompiled, the USEDness should remain. */
7506 if (old_re && SvREADONLY(old_re))
7514 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7517 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7519 PERL_UNUSED_ARG(value);
7521 if (flags & RXapif_FETCH) {
7522 return reg_named_buff_fetch(rx, key, flags);
7523 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7524 Perl_croak_no_modify();
7526 } else if (flags & RXapif_EXISTS) {
7527 return reg_named_buff_exists(rx, key, flags)
7530 } else if (flags & RXapif_REGNAMES) {
7531 return reg_named_buff_all(rx, flags);
7532 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7533 return reg_named_buff_scalar(rx, flags);
7535 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7541 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7544 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7545 PERL_UNUSED_ARG(lastkey);
7547 if (flags & RXapif_FIRSTKEY)
7548 return reg_named_buff_firstkey(rx, flags);
7549 else if (flags & RXapif_NEXTKEY)
7550 return reg_named_buff_nextkey(rx, flags);
7552 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7559 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7562 AV *retarray = NULL;
7564 struct regexp *const rx = ReANY(r);
7566 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7568 if (flags & RXapif_ALL)
7571 if (rx && RXp_PAREN_NAMES(rx)) {
7572 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7575 SV* sv_dat=HeVAL(he_str);
7576 I32 *nums=(I32*)SvPVX(sv_dat);
7577 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7578 if ((I32)(rx->nparens) >= nums[i]
7579 && rx->offs[nums[i]].start != -1
7580 && rx->offs[nums[i]].end != -1)
7583 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7588 ret = newSVsv(&PL_sv_undef);
7591 av_push(retarray, ret);
7594 return newRV_noinc(MUTABLE_SV(retarray));
7601 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7604 struct regexp *const rx = ReANY(r);
7606 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7608 if (rx && RXp_PAREN_NAMES(rx)) {
7609 if (flags & RXapif_ALL) {
7610 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7612 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7614 SvREFCNT_dec_NN(sv);
7626 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7628 struct regexp *const rx = ReANY(r);
7630 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7632 if ( rx && RXp_PAREN_NAMES(rx) ) {
7633 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7635 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7642 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7644 struct regexp *const rx = ReANY(r);
7645 GET_RE_DEBUG_FLAGS_DECL;
7647 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7649 if (rx && RXp_PAREN_NAMES(rx)) {
7650 HV *hv = RXp_PAREN_NAMES(rx);
7652 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7655 SV* sv_dat = HeVAL(temphe);
7656 I32 *nums = (I32*)SvPVX(sv_dat);
7657 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7658 if ((I32)(rx->lastparen) >= nums[i] &&
7659 rx->offs[nums[i]].start != -1 &&
7660 rx->offs[nums[i]].end != -1)
7666 if (parno || flags & RXapif_ALL) {
7667 return newSVhek(HeKEY_hek(temphe));
7675 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7680 struct regexp *const rx = ReANY(r);
7682 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7684 if (rx && RXp_PAREN_NAMES(rx)) {
7685 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7686 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7687 } else if (flags & RXapif_ONE) {
7688 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7689 av = MUTABLE_AV(SvRV(ret));
7690 length = av_tindex(av);
7691 SvREFCNT_dec_NN(ret);
7692 return newSViv(length + 1);
7694 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7699 return &PL_sv_undef;
7703 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7705 struct regexp *const rx = ReANY(r);
7708 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7710 if (rx && RXp_PAREN_NAMES(rx)) {
7711 HV *hv= RXp_PAREN_NAMES(rx);
7713 (void)hv_iterinit(hv);
7714 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7717 SV* sv_dat = HeVAL(temphe);
7718 I32 *nums = (I32*)SvPVX(sv_dat);
7719 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7720 if ((I32)(rx->lastparen) >= nums[i] &&
7721 rx->offs[nums[i]].start != -1 &&
7722 rx->offs[nums[i]].end != -1)
7728 if (parno || flags & RXapif_ALL) {
7729 av_push(av, newSVhek(HeKEY_hek(temphe)));
7734 return newRV_noinc(MUTABLE_SV(av));
7738 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7741 struct regexp *const rx = ReANY(r);
7747 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7749 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7750 || n == RX_BUFF_IDX_CARET_FULLMATCH
7751 || n == RX_BUFF_IDX_CARET_POSTMATCH
7754 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7756 /* on something like
7759 * the KEEPCOPY is set on the PMOP rather than the regex */
7760 if (PL_curpm && r == PM_GETRE(PL_curpm))
7761 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7770 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7771 /* no need to distinguish between them any more */
7772 n = RX_BUFF_IDX_FULLMATCH;
7774 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7775 && rx->offs[0].start != -1)
7777 /* $`, ${^PREMATCH} */
7778 i = rx->offs[0].start;
7782 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7783 && rx->offs[0].end != -1)
7785 /* $', ${^POSTMATCH} */
7786 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7787 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7790 if ( 0 <= n && n <= (I32)rx->nparens &&
7791 (s1 = rx->offs[n].start) != -1 &&
7792 (t1 = rx->offs[n].end) != -1)
7794 /* $&, ${^MATCH}, $1 ... */
7796 s = rx->subbeg + s1 - rx->suboffset;
7801 assert(s >= rx->subbeg);
7802 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7804 #ifdef NO_TAINT_SUPPORT
7805 sv_setpvn(sv, s, i);
7807 const int oldtainted = TAINT_get;
7809 sv_setpvn(sv, s, i);
7810 TAINT_set(oldtainted);
7812 if ( (rx->intflags & PREGf_CANY_SEEN)
7813 ? (RXp_MATCH_UTF8(rx)
7814 && (!i || is_utf8_string((U8*)s, i)))
7815 : (RXp_MATCH_UTF8(rx)) )
7822 if (RXp_MATCH_TAINTED(rx)) {
7823 if (SvTYPE(sv) >= SVt_PVMG) {
7824 MAGIC* const mg = SvMAGIC(sv);
7827 SvMAGIC_set(sv, mg->mg_moremagic);
7829 if ((mgt = SvMAGIC(sv))) {
7830 mg->mg_moremagic = mgt;
7831 SvMAGIC_set(sv, mg);
7842 sv_setsv(sv,&PL_sv_undef);
7848 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7849 SV const * const value)
7851 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7853 PERL_UNUSED_ARG(rx);
7854 PERL_UNUSED_ARG(paren);
7855 PERL_UNUSED_ARG(value);
7858 Perl_croak_no_modify();
7862 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7865 struct regexp *const rx = ReANY(r);
7869 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7871 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7872 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7873 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7876 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7878 /* on something like
7881 * the KEEPCOPY is set on the PMOP rather than the regex */
7882 if (PL_curpm && r == PM_GETRE(PL_curpm))
7883 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7889 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7891 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7892 case RX_BUFF_IDX_PREMATCH: /* $` */
7893 if (rx->offs[0].start != -1) {
7894 i = rx->offs[0].start;
7903 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7904 case RX_BUFF_IDX_POSTMATCH: /* $' */
7905 if (rx->offs[0].end != -1) {
7906 i = rx->sublen - rx->offs[0].end;
7908 s1 = rx->offs[0].end;
7915 default: /* $& / ${^MATCH}, $1, $2, ... */
7916 if (paren <= (I32)rx->nparens &&
7917 (s1 = rx->offs[paren].start) != -1 &&
7918 (t1 = rx->offs[paren].end) != -1)
7924 if (ckWARN(WARN_UNINITIALIZED))
7925 report_uninit((const SV *)sv);
7930 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7931 const char * const s = rx->subbeg - rx->suboffset + s1;
7936 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7943 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7945 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7946 PERL_UNUSED_ARG(rx);
7950 return newSVpvs("Regexp");
7953 /* Scans the name of a named buffer from the pattern.
7954 * If flags is REG_RSN_RETURN_NULL returns null.
7955 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7956 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7957 * to the parsed name as looked up in the RExC_paren_names hash.
7958 * If there is an error throws a vFAIL().. type exception.
7961 #define REG_RSN_RETURN_NULL 0
7962 #define REG_RSN_RETURN_NAME 1
7963 #define REG_RSN_RETURN_DATA 2
7966 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7968 char *name_start = RExC_parse;
7970 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7972 assert (RExC_parse <= RExC_end);
7973 if (RExC_parse == RExC_end) NOOP;
7974 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7975 /* skip IDFIRST by using do...while */
7978 RExC_parse += UTF8SKIP(RExC_parse);
7979 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7983 } while (isWORDCHAR(*RExC_parse));
7985 RExC_parse++; /* so the <- from the vFAIL is after the offending
7987 vFAIL("Group name must start with a non-digit word character");
7991 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7992 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7993 if ( flags == REG_RSN_RETURN_NAME)
7995 else if (flags==REG_RSN_RETURN_DATA) {
7998 if ( ! sv_name ) /* should not happen*/
7999 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8000 if (RExC_paren_names)
8001 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8003 sv_dat = HeVAL(he_str);
8005 vFAIL("Reference to nonexistent named group");
8009 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8010 (unsigned long) flags);
8012 assert(0); /* NOT REACHED */
8017 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8019 if (RExC_lastparse!=RExC_parse) { \
8020 PerlIO_printf(Perl_debug_log, "%s", \
8021 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8022 RExC_end - RExC_parse, 16, \
8024 PERL_PV_ESCAPE_UNI_DETECT | \
8025 PERL_PV_PRETTY_ELLIPSES | \
8026 PERL_PV_PRETTY_LTGT | \
8027 PERL_PV_ESCAPE_RE | \
8028 PERL_PV_PRETTY_EXACTSIZE \
8032 PerlIO_printf(Perl_debug_log,"%16s",""); \
8035 num = RExC_size + 1; \
8037 num=REG_NODE_NUM(RExC_emit); \
8038 if (RExC_lastnum!=num) \
8039 PerlIO_printf(Perl_debug_log,"|%4d",num); \
8041 PerlIO_printf(Perl_debug_log,"|%4s",""); \
8042 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
8043 (int)((depth*2)), "", \
8047 RExC_lastparse=RExC_parse; \
8052 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8053 DEBUG_PARSE_MSG((funcname)); \
8054 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
8056 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
8057 DEBUG_PARSE_MSG((funcname)); \
8058 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
8061 /* This section of code defines the inversion list object and its methods. The
8062 * interfaces are highly subject to change, so as much as possible is static to
8063 * this file. An inversion list is here implemented as a malloc'd C UV array
8064 * as an SVt_INVLIST scalar.
8066 * An inversion list for Unicode is an array of code points, sorted by ordinal
8067 * number. The zeroth element is the first code point in the list. The 1th
8068 * element is the first element beyond that not in the list. In other words,
8069 * the first range is
8070 * invlist[0]..(invlist[1]-1)
8071 * The other ranges follow. Thus every element whose index is divisible by two
8072 * marks the beginning of a range that is in the list, and every element not
8073 * divisible by two marks the beginning of a range not in the list. A single
8074 * element inversion list that contains the single code point N generally
8075 * consists of two elements
8078 * (The exception is when N is the highest representable value on the
8079 * machine, in which case the list containing just it would be a single
8080 * element, itself. By extension, if the last range in the list extends to
8081 * infinity, then the first element of that range will be in the inversion list
8082 * at a position that is divisible by two, and is the final element in the
8084 * Taking the complement (inverting) an inversion list is quite simple, if the
8085 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8086 * This implementation reserves an element at the beginning of each inversion
8087 * list to always contain 0; there is an additional flag in the header which
8088 * indicates if the list begins at the 0, or is offset to begin at the next
8091 * More about inversion lists can be found in "Unicode Demystified"
8092 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8093 * More will be coming when functionality is added later.
8095 * The inversion list data structure is currently implemented as an SV pointing
8096 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8097 * array of UV whose memory management is automatically handled by the existing
8098 * facilities for SV's.
8100 * Some of the methods should always be private to the implementation, and some
8101 * should eventually be made public */
8103 /* The header definitions are in F<inline_invlist.c> */
8105 PERL_STATIC_INLINE UV*
8106 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8108 /* Returns a pointer to the first element in the inversion list's array.
8109 * This is called upon initialization of an inversion list. Where the
8110 * array begins depends on whether the list has the code point U+0000 in it
8111 * or not. The other parameter tells it whether the code that follows this
8112 * call is about to put a 0 in the inversion list or not. The first
8113 * element is either the element reserved for 0, if TRUE, or the element
8114 * after it, if FALSE */
8116 bool* offset = get_invlist_offset_addr(invlist);
8117 UV* zero_addr = (UV *) SvPVX(invlist);
8119 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8122 assert(! _invlist_len(invlist));
8126 /* 1^1 = 0; 1^0 = 1 */
8127 *offset = 1 ^ will_have_0;
8128 return zero_addr + *offset;
8131 PERL_STATIC_INLINE UV*
8132 S_invlist_array(SV* const invlist)
8134 /* Returns the pointer to the inversion list's array. Every time the
8135 * length changes, this needs to be called in case malloc or realloc moved
8138 PERL_ARGS_ASSERT_INVLIST_ARRAY;
8140 /* Must not be empty. If these fail, you probably didn't check for <len>
8141 * being non-zero before trying to get the array */
8142 assert(_invlist_len(invlist));
8144 /* The very first element always contains zero, The array begins either
8145 * there, or if the inversion list is offset, at the element after it.
8146 * The offset header field determines which; it contains 0 or 1 to indicate
8147 * how much additionally to add */
8148 assert(0 == *(SvPVX(invlist)));
8149 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8152 PERL_STATIC_INLINE void
8153 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8155 /* Sets the current number of elements stored in the inversion list.
8156 * Updates SvCUR correspondingly */
8157 PERL_UNUSED_CONTEXT;
8158 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8160 assert(SvTYPE(invlist) == SVt_INVLIST);
8165 : TO_INTERNAL_SIZE(len + offset));
8166 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8169 PERL_STATIC_INLINE IV*
8170 S_get_invlist_previous_index_addr(SV* invlist)
8172 /* Return the address of the IV that is reserved to hold the cached index
8174 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8176 assert(SvTYPE(invlist) == SVt_INVLIST);
8178 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8181 PERL_STATIC_INLINE IV
8182 S_invlist_previous_index(SV* const invlist)
8184 /* Returns cached index of previous search */
8186 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8188 return *get_invlist_previous_index_addr(invlist);
8191 PERL_STATIC_INLINE void
8192 S_invlist_set_previous_index(SV* const invlist, const IV index)
8194 /* Caches <index> for later retrieval */
8196 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8198 assert(index == 0 || index < (int) _invlist_len(invlist));
8200 *get_invlist_previous_index_addr(invlist) = index;
8203 PERL_STATIC_INLINE UV
8204 S_invlist_max(SV* const invlist)
8206 /* Returns the maximum number of elements storable in the inversion list's
8207 * array, without having to realloc() */
8209 PERL_ARGS_ASSERT_INVLIST_MAX;
8211 assert(SvTYPE(invlist) == SVt_INVLIST);
8213 /* Assumes worst case, in which the 0 element is not counted in the
8214 * inversion list, so subtracts 1 for that */
8215 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8216 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8217 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8220 #ifndef PERL_IN_XSUB_RE
8222 Perl__new_invlist(pTHX_ IV initial_size)
8225 /* Return a pointer to a newly constructed inversion list, with enough
8226 * space to store 'initial_size' elements. If that number is negative, a
8227 * system default is used instead */
8231 if (initial_size < 0) {
8235 /* Allocate the initial space */
8236 new_list = newSV_type(SVt_INVLIST);
8238 /* First 1 is in case the zero element isn't in the list; second 1 is for
8240 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8241 invlist_set_len(new_list, 0, 0);
8243 /* Force iterinit() to be used to get iteration to work */
8244 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8246 *get_invlist_previous_index_addr(new_list) = 0;
8252 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8254 /* Return a pointer to a newly constructed inversion list, initialized to
8255 * point to <list>, which has to be in the exact correct inversion list
8256 * form, including internal fields. Thus this is a dangerous routine that
8257 * should not be used in the wrong hands. The passed in 'list' contains
8258 * several header fields at the beginning that are not part of the
8259 * inversion list body proper */
8261 const STRLEN length = (STRLEN) list[0];
8262 const UV version_id = list[1];
8263 const bool offset = cBOOL(list[2]);
8264 #define HEADER_LENGTH 3
8265 /* If any of the above changes in any way, you must change HEADER_LENGTH
8266 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8267 * perl -E 'say int(rand 2**31-1)'
8269 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8270 data structure type, so that one being
8271 passed in can be validated to be an
8272 inversion list of the correct vintage.
8275 SV* invlist = newSV_type(SVt_INVLIST);
8277 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8279 if (version_id != INVLIST_VERSION_ID) {
8280 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8283 /* The generated array passed in includes header elements that aren't part
8284 * of the list proper, so start it just after them */
8285 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8287 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8288 shouldn't touch it */
8290 *(get_invlist_offset_addr(invlist)) = offset;
8292 /* The 'length' passed to us is the physical number of elements in the
8293 * inversion list. But if there is an offset the logical number is one
8295 invlist_set_len(invlist, length - offset, offset);
8297 invlist_set_previous_index(invlist, 0);
8299 /* Initialize the iteration pointer. */
8300 invlist_iterfinish(invlist);
8302 SvREADONLY_on(invlist);
8306 #endif /* ifndef PERL_IN_XSUB_RE */
8309 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8311 /* Grow the maximum size of an inversion list */
8313 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8315 assert(SvTYPE(invlist) == SVt_INVLIST);
8317 /* Add one to account for the zero element at the beginning which may not
8318 * be counted by the calling parameters */
8319 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8322 PERL_STATIC_INLINE void
8323 S_invlist_trim(SV* const invlist)
8325 PERL_ARGS_ASSERT_INVLIST_TRIM;
8327 assert(SvTYPE(invlist) == SVt_INVLIST);
8329 /* Change the length of the inversion list to how many entries it currently
8331 SvPV_shrink_to_cur((SV *) invlist);
8335 S__append_range_to_invlist(pTHX_ SV* const invlist,
8336 const UV start, const UV end)
8338 /* Subject to change or removal. Append the range from 'start' to 'end' at
8339 * the end of the inversion list. The range must be above any existing
8343 UV max = invlist_max(invlist);
8344 UV len = _invlist_len(invlist);
8347 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8349 if (len == 0) { /* Empty lists must be initialized */
8350 offset = start != 0;
8351 array = _invlist_array_init(invlist, ! offset);
8354 /* Here, the existing list is non-empty. The current max entry in the
8355 * list is generally the first value not in the set, except when the
8356 * set extends to the end of permissible values, in which case it is
8357 * the first entry in that final set, and so this call is an attempt to
8358 * append out-of-order */
8360 UV final_element = len - 1;
8361 array = invlist_array(invlist);
8362 if (array[final_element] > start
8363 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8365 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8366 array[final_element], start,
8367 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8370 /* Here, it is a legal append. If the new range begins with the first
8371 * value not in the set, it is extending the set, so the new first
8372 * value not in the set is one greater than the newly extended range.
8374 offset = *get_invlist_offset_addr(invlist);
8375 if (array[final_element] == start) {
8376 if (end != UV_MAX) {
8377 array[final_element] = end + 1;
8380 /* But if the end is the maximum representable on the machine,
8381 * just let the range that this would extend to have no end */
8382 invlist_set_len(invlist, len - 1, offset);
8388 /* Here the new range doesn't extend any existing set. Add it */
8390 len += 2; /* Includes an element each for the start and end of range */
8392 /* If wll overflow the existing space, extend, which may cause the array to
8395 invlist_extend(invlist, len);
8397 /* Have to set len here to avoid assert failure in invlist_array() */
8398 invlist_set_len(invlist, len, offset);
8400 array = invlist_array(invlist);
8403 invlist_set_len(invlist, len, offset);
8406 /* The next item on the list starts the range, the one after that is
8407 * one past the new range. */
8408 array[len - 2] = start;
8409 if (end != UV_MAX) {
8410 array[len - 1] = end + 1;
8413 /* But if the end is the maximum representable on the machine, just let
8414 * the range have no end */
8415 invlist_set_len(invlist, len - 1, offset);
8419 #ifndef PERL_IN_XSUB_RE
8422 Perl__invlist_search(SV* const invlist, const UV cp)
8424 /* Searches the inversion list for the entry that contains the input code
8425 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8426 * return value is the index into the list's array of the range that
8431 IV high = _invlist_len(invlist);
8432 const IV highest_element = high - 1;
8435 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8437 /* If list is empty, return failure. */
8442 /* (We can't get the array unless we know the list is non-empty) */
8443 array = invlist_array(invlist);
8445 mid = invlist_previous_index(invlist);
8446 assert(mid >=0 && mid <= highest_element);
8448 /* <mid> contains the cache of the result of the previous call to this
8449 * function (0 the first time). See if this call is for the same result,
8450 * or if it is for mid-1. This is under the theory that calls to this
8451 * function will often be for related code points that are near each other.
8452 * And benchmarks show that caching gives better results. We also test
8453 * here if the code point is within the bounds of the list. These tests
8454 * replace others that would have had to be made anyway to make sure that
8455 * the array bounds were not exceeded, and these give us extra information
8456 * at the same time */
8457 if (cp >= array[mid]) {
8458 if (cp >= array[highest_element]) {
8459 return highest_element;
8462 /* Here, array[mid] <= cp < array[highest_element]. This means that
8463 * the final element is not the answer, so can exclude it; it also
8464 * means that <mid> is not the final element, so can refer to 'mid + 1'
8466 if (cp < array[mid + 1]) {
8472 else { /* cp < aray[mid] */
8473 if (cp < array[0]) { /* Fail if outside the array */
8477 if (cp >= array[mid - 1]) {
8482 /* Binary search. What we are looking for is <i> such that
8483 * array[i] <= cp < array[i+1]
8484 * The loop below converges on the i+1. Note that there may not be an
8485 * (i+1)th element in the array, and things work nonetheless */
8486 while (low < high) {
8487 mid = (low + high) / 2;
8488 assert(mid <= highest_element);
8489 if (array[mid] <= cp) { /* cp >= array[mid] */
8492 /* We could do this extra test to exit the loop early.
8493 if (cp < array[low]) {
8498 else { /* cp < array[mid] */
8505 invlist_set_previous_index(invlist, high);
8510 Perl__invlist_populate_swatch(SV* const invlist,
8511 const UV start, const UV end, U8* swatch)
8513 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8514 * but is used when the swash has an inversion list. This makes this much
8515 * faster, as it uses a binary search instead of a linear one. This is
8516 * intimately tied to that function, and perhaps should be in utf8.c,
8517 * except it is intimately tied to inversion lists as well. It assumes
8518 * that <swatch> is all 0's on input */
8521 const IV len = _invlist_len(invlist);
8525 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8527 if (len == 0) { /* Empty inversion list */
8531 array = invlist_array(invlist);
8533 /* Find which element it is */
8534 i = _invlist_search(invlist, start);
8536 /* We populate from <start> to <end> */
8537 while (current < end) {
8540 /* The inversion list gives the results for every possible code point
8541 * after the first one in the list. Only those ranges whose index is
8542 * even are ones that the inversion list matches. For the odd ones,
8543 * and if the initial code point is not in the list, we have to skip
8544 * forward to the next element */
8545 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8547 if (i >= len) { /* Finished if beyond the end of the array */
8551 if (current >= end) { /* Finished if beyond the end of what we
8553 if (LIKELY(end < UV_MAX)) {
8557 /* We get here when the upper bound is the maximum
8558 * representable on the machine, and we are looking for just
8559 * that code point. Have to special case it */
8561 goto join_end_of_list;
8564 assert(current >= start);
8566 /* The current range ends one below the next one, except don't go past
8569 upper = (i < len && array[i] < end) ? array[i] : end;
8571 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8572 * for each code point in it */
8573 for (; current < upper; current++) {
8574 const STRLEN offset = (STRLEN)(current - start);
8575 swatch[offset >> 3] |= 1 << (offset & 7);
8580 /* Quit if at the end of the list */
8583 /* But first, have to deal with the highest possible code point on
8584 * the platform. The previous code assumes that <end> is one
8585 * beyond where we want to populate, but that is impossible at the
8586 * platform's infinity, so have to handle it specially */
8587 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8589 const STRLEN offset = (STRLEN)(end - start);
8590 swatch[offset >> 3] |= 1 << (offset & 7);
8595 /* Advance to the next range, which will be for code points not in the
8604 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8605 const bool complement_b, SV** output)
8607 /* Take the union of two inversion lists and point <output> to it. *output
8608 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8609 * the reference count to that list will be decremented if not already a
8610 * temporary (mortal); otherwise *output will be made correspondingly
8611 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8612 * second list is returned. If <complement_b> is TRUE, the union is taken
8613 * of the complement (inversion) of <b> instead of b itself.
8615 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8616 * Richard Gillam, published by Addison-Wesley, and explained at some
8617 * length there. The preface says to incorporate its examples into your
8618 * code at your own risk.
8620 * The algorithm is like a merge sort.
8622 * XXX A potential performance improvement is to keep track as we go along
8623 * if only one of the inputs contributes to the result, meaning the other
8624 * is a subset of that one. In that case, we can skip the final copy and
8625 * return the larger of the input lists, but then outside code might need
8626 * to keep track of whether to free the input list or not */
8628 const UV* array_a; /* a's array */
8630 UV len_a; /* length of a's array */
8633 SV* u; /* the resulting union */
8637 UV i_a = 0; /* current index into a's array */
8641 /* running count, as explained in the algorithm source book; items are
8642 * stopped accumulating and are output when the count changes to/from 0.
8643 * The count is incremented when we start a range that's in the set, and
8644 * decremented when we start a range that's not in the set. So its range
8645 * is 0 to 2. Only when the count is zero is something not in the set.
8649 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8652 /* If either one is empty, the union is the other one */
8653 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8654 bool make_temp = FALSE; /* Should we mortalize the result? */
8658 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8664 *output = invlist_clone(b);
8666 _invlist_invert(*output);
8668 } /* else *output already = b; */
8671 sv_2mortal(*output);
8675 else if ((len_b = _invlist_len(b)) == 0) {
8676 bool make_temp = FALSE;
8678 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8683 /* The complement of an empty list is a list that has everything in it,
8684 * so the union with <a> includes everything too */
8687 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8691 *output = _new_invlist(1);
8692 _append_range_to_invlist(*output, 0, UV_MAX);
8694 else if (*output != a) {
8695 *output = invlist_clone(a);
8697 /* else *output already = a; */
8700 sv_2mortal(*output);
8705 /* Here both lists exist and are non-empty */
8706 array_a = invlist_array(a);
8707 array_b = invlist_array(b);
8709 /* If are to take the union of 'a' with the complement of b, set it
8710 * up so are looking at b's complement. */
8713 /* To complement, we invert: if the first element is 0, remove it. To
8714 * do this, we just pretend the array starts one later */
8715 if (array_b[0] == 0) {
8721 /* But if the first element is not zero, we pretend the list starts
8722 * at the 0 that is always stored immediately before the array. */
8728 /* Size the union for the worst case: that the sets are completely
8730 u = _new_invlist(len_a + len_b);
8732 /* Will contain U+0000 if either component does */
8733 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8734 || (len_b > 0 && array_b[0] == 0));
8736 /* Go through each list item by item, stopping when exhausted one of
8738 while (i_a < len_a && i_b < len_b) {
8739 UV cp; /* The element to potentially add to the union's array */
8740 bool cp_in_set; /* is it in the the input list's set or not */
8742 /* We need to take one or the other of the two inputs for the union.
8743 * Since we are merging two sorted lists, we take the smaller of the
8744 * next items. In case of a tie, we take the one that is in its set
8745 * first. If we took one not in the set first, it would decrement the
8746 * count, possibly to 0 which would cause it to be output as ending the
8747 * range, and the next time through we would take the same number, and
8748 * output it again as beginning the next range. By doing it the
8749 * opposite way, there is no possibility that the count will be
8750 * momentarily decremented to 0, and thus the two adjoining ranges will
8751 * be seamlessly merged. (In a tie and both are in the set or both not
8752 * in the set, it doesn't matter which we take first.) */
8753 if (array_a[i_a] < array_b[i_b]
8754 || (array_a[i_a] == array_b[i_b]
8755 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8757 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8761 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8762 cp = array_b[i_b++];
8765 /* Here, have chosen which of the two inputs to look at. Only output
8766 * if the running count changes to/from 0, which marks the
8767 * beginning/end of a range in that's in the set */
8770 array_u[i_u++] = cp;
8777 array_u[i_u++] = cp;
8782 /* Here, we are finished going through at least one of the lists, which
8783 * means there is something remaining in at most one. We check if the list
8784 * that hasn't been exhausted is positioned such that we are in the middle
8785 * of a range in its set or not. (i_a and i_b point to the element beyond
8786 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8787 * is potentially more to output.
8788 * There are four cases:
8789 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8790 * in the union is entirely from the non-exhausted set.
8791 * 2) Both were in their sets, count is 2. Nothing further should
8792 * be output, as everything that remains will be in the exhausted
8793 * list's set, hence in the union; decrementing to 1 but not 0 insures
8795 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8796 * Nothing further should be output because the union includes
8797 * everything from the exhausted set. Not decrementing ensures that.
8798 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8799 * decrementing to 0 insures that we look at the remainder of the
8800 * non-exhausted set */
8801 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8802 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8807 /* The final length is what we've output so far, plus what else is about to
8808 * be output. (If 'count' is non-zero, then the input list we exhausted
8809 * has everything remaining up to the machine's limit in its set, and hence
8810 * in the union, so there will be no further output. */
8813 /* At most one of the subexpressions will be non-zero */
8814 len_u += (len_a - i_a) + (len_b - i_b);
8817 /* Set result to final length, which can change the pointer to array_u, so
8819 if (len_u != _invlist_len(u)) {
8820 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8822 array_u = invlist_array(u);
8825 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8826 * the other) ended with everything above it not in its set. That means
8827 * that the remaining part of the union is precisely the same as the
8828 * non-exhausted list, so can just copy it unchanged. (If both list were
8829 * exhausted at the same time, then the operations below will be both 0.)
8832 IV copy_count; /* At most one will have a non-zero copy count */
8833 if ((copy_count = len_a - i_a) > 0) {
8834 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8836 else if ((copy_count = len_b - i_b) > 0) {
8837 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8841 /* We may be removing a reference to one of the inputs. If so, the output
8842 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8843 * count decremented) */
8844 if (a == *output || b == *output) {
8845 assert(! invlist_is_iterating(*output));
8846 if ((SvTEMP(*output))) {
8850 SvREFCNT_dec_NN(*output);
8860 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8861 const bool complement_b, SV** i)
8863 /* Take the intersection of two inversion lists and point <i> to it. *i
8864 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8865 * the reference count to that list will be decremented if not already a
8866 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8867 * The first list, <a>, may be NULL, in which case an empty list is
8868 * returned. If <complement_b> is TRUE, the result will be the
8869 * intersection of <a> and the complement (or inversion) of <b> instead of
8872 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8873 * Richard Gillam, published by Addison-Wesley, and explained at some
8874 * length there. The preface says to incorporate its examples into your
8875 * code at your own risk. In fact, it had bugs
8877 * The algorithm is like a merge sort, and is essentially the same as the
8881 const UV* array_a; /* a's array */
8883 UV len_a; /* length of a's array */
8886 SV* r; /* the resulting intersection */
8890 UV i_a = 0; /* current index into a's array */
8894 /* running count, as explained in the algorithm source book; items are
8895 * stopped accumulating and are output when the count changes to/from 2.
8896 * The count is incremented when we start a range that's in the set, and
8897 * decremented when we start a range that's not in the set. So its range
8898 * is 0 to 2. Only when the count is 2 is something in the intersection.
8902 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8905 /* Special case if either one is empty */
8906 len_a = (a == NULL) ? 0 : _invlist_len(a);
8907 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8908 bool make_temp = FALSE;
8910 if (len_a != 0 && complement_b) {
8912 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8913 * be empty. Here, also we are using 'b's complement, which hence
8914 * must be every possible code point. Thus the intersection is
8918 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8923 *i = invlist_clone(a);
8925 /* else *i is already 'a' */
8933 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8934 * intersection must be empty */
8936 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8941 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8945 *i = _new_invlist(0);
8953 /* Here both lists exist and are non-empty */
8954 array_a = invlist_array(a);
8955 array_b = invlist_array(b);
8957 /* If are to take the intersection of 'a' with the complement of b, set it
8958 * up so are looking at b's complement. */
8961 /* To complement, we invert: if the first element is 0, remove it. To
8962 * do this, we just pretend the array starts one later */
8963 if (array_b[0] == 0) {
8969 /* But if the first element is not zero, we pretend the list starts
8970 * at the 0 that is always stored immediately before the array. */
8976 /* Size the intersection for the worst case: that the intersection ends up
8977 * fragmenting everything to be completely disjoint */
8978 r= _new_invlist(len_a + len_b);
8980 /* Will contain U+0000 iff both components do */
8981 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8982 && len_b > 0 && array_b[0] == 0);
8984 /* Go through each list item by item, stopping when exhausted one of
8986 while (i_a < len_a && i_b < len_b) {
8987 UV cp; /* The element to potentially add to the intersection's
8989 bool cp_in_set; /* Is it in the input list's set or not */
8991 /* We need to take one or the other of the two inputs for the
8992 * intersection. Since we are merging two sorted lists, we take the
8993 * smaller of the next items. In case of a tie, we take the one that
8994 * is not in its set first (a difference from the union algorithm). If
8995 * we took one in the set first, it would increment the count, possibly
8996 * to 2 which would cause it to be output as starting a range in the
8997 * intersection, and the next time through we would take that same
8998 * number, and output it again as ending the set. By doing it the
8999 * opposite of this, there is no possibility that the count will be
9000 * momentarily incremented to 2. (In a tie and both are in the set or
9001 * both not in the set, it doesn't matter which we take first.) */
9002 if (array_a[i_a] < array_b[i_b]
9003 || (array_a[i_a] == array_b[i_b]
9004 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9006 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9010 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9014 /* Here, have chosen which of the two inputs to look at. Only output
9015 * if the running count changes to/from 2, which marks the
9016 * beginning/end of a range that's in the intersection */
9020 array_r[i_r++] = cp;
9025 array_r[i_r++] = cp;
9031 /* Here, we are finished going through at least one of the lists, which
9032 * means there is something remaining in at most one. We check if the list
9033 * that has been exhausted is positioned such that we are in the middle
9034 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
9035 * the ones we care about.) There are four cases:
9036 * 1) Both weren't in their sets, count is 0, and remains 0. There's
9037 * nothing left in the intersection.
9038 * 2) Both were in their sets, count is 2 and perhaps is incremented to
9039 * above 2. What should be output is exactly that which is in the
9040 * non-exhausted set, as everything it has is also in the intersection
9041 * set, and everything it doesn't have can't be in the intersection
9042 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9043 * gets incremented to 2. Like the previous case, the intersection is
9044 * everything that remains in the non-exhausted set.
9045 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9046 * remains 1. And the intersection has nothing more. */
9047 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9048 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9053 /* The final length is what we've output so far plus what else is in the
9054 * intersection. At most one of the subexpressions below will be non-zero
9058 len_r += (len_a - i_a) + (len_b - i_b);
9061 /* Set result to final length, which can change the pointer to array_r, so
9063 if (len_r != _invlist_len(r)) {
9064 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9066 array_r = invlist_array(r);
9069 /* Finish outputting any remaining */
9070 if (count >= 2) { /* At most one will have a non-zero copy count */
9072 if ((copy_count = len_a - i_a) > 0) {
9073 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9075 else if ((copy_count = len_b - i_b) > 0) {
9076 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9080 /* We may be removing a reference to one of the inputs. If so, the output
9081 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
9082 * count decremented) */
9083 if (a == *i || b == *i) {
9084 assert(! invlist_is_iterating(*i));
9089 SvREFCNT_dec_NN(*i);
9099 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9101 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9102 * set. A pointer to the inversion list is returned. This may actually be
9103 * a new list, in which case the passed in one has been destroyed. The
9104 * passed in inversion list can be NULL, in which case a new one is created
9105 * with just the one range in it */
9110 if (invlist == NULL) {
9111 invlist = _new_invlist(2);
9115 len = _invlist_len(invlist);
9118 /* If comes after the final entry actually in the list, can just append it
9121 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9122 && start >= invlist_array(invlist)[len - 1]))
9124 _append_range_to_invlist(invlist, start, end);
9128 /* Here, can't just append things, create and return a new inversion list
9129 * which is the union of this range and the existing inversion list */
9130 range_invlist = _new_invlist(2);
9131 _append_range_to_invlist(range_invlist, start, end);
9133 _invlist_union(invlist, range_invlist, &invlist);
9135 /* The temporary can be freed */
9136 SvREFCNT_dec_NN(range_invlist);
9142 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9143 UV** other_elements_ptr)
9145 /* Create and return an inversion list whose contents are to be populated
9146 * by the caller. The caller gives the number of elements (in 'size') and
9147 * the very first element ('element0'). This function will set
9148 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9151 * Obviously there is some trust involved that the caller will properly
9152 * fill in the other elements of the array.
9154 * (The first element needs to be passed in, as the underlying code does
9155 * things differently depending on whether it is zero or non-zero) */
9157 SV* invlist = _new_invlist(size);
9160 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9162 _append_range_to_invlist(invlist, element0, element0);
9163 offset = *get_invlist_offset_addr(invlist);
9165 invlist_set_len(invlist, size, offset);
9166 *other_elements_ptr = invlist_array(invlist) + 1;
9172 PERL_STATIC_INLINE SV*
9173 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9174 return _add_range_to_invlist(invlist, cp, cp);
9177 #ifndef PERL_IN_XSUB_RE
9179 Perl__invlist_invert(pTHX_ SV* const invlist)
9181 /* Complement the input inversion list. This adds a 0 if the list didn't
9182 * have a zero; removes it otherwise. As described above, the data
9183 * structure is set up so that this is very efficient */
9185 PERL_ARGS_ASSERT__INVLIST_INVERT;
9187 assert(! invlist_is_iterating(invlist));
9189 /* The inverse of matching nothing is matching everything */
9190 if (_invlist_len(invlist) == 0) {
9191 _append_range_to_invlist(invlist, 0, UV_MAX);
9195 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9200 PERL_STATIC_INLINE SV*
9201 S_invlist_clone(pTHX_ SV* const invlist)
9204 /* Return a new inversion list that is a copy of the input one, which is
9205 * unchanged. The new list will not be mortal even if the old one was. */
9207 /* Need to allocate extra space to accommodate Perl's addition of a
9208 * trailing NUL to SvPV's, since it thinks they are always strings */
9209 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9210 STRLEN physical_length = SvCUR(invlist);
9211 bool offset = *(get_invlist_offset_addr(invlist));
9213 PERL_ARGS_ASSERT_INVLIST_CLONE;
9215 *(get_invlist_offset_addr(new_invlist)) = offset;
9216 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9217 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9222 PERL_STATIC_INLINE STRLEN*
9223 S_get_invlist_iter_addr(SV* invlist)
9225 /* Return the address of the UV that contains the current iteration
9228 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9230 assert(SvTYPE(invlist) == SVt_INVLIST);
9232 return &(((XINVLIST*) SvANY(invlist))->iterator);
9235 PERL_STATIC_INLINE void
9236 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9238 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9240 *get_invlist_iter_addr(invlist) = 0;
9243 PERL_STATIC_INLINE void
9244 S_invlist_iterfinish(SV* invlist)
9246 /* Terminate iterator for invlist. This is to catch development errors.
9247 * Any iteration that is interrupted before completed should call this
9248 * function. Functions that add code points anywhere else but to the end
9249 * of an inversion list assert that they are not in the middle of an
9250 * iteration. If they were, the addition would make the iteration
9251 * problematical: if the iteration hadn't reached the place where things
9252 * were being added, it would be ok */
9254 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9256 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9260 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9262 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9263 * This call sets in <*start> and <*end>, the next range in <invlist>.
9264 * Returns <TRUE> if successful and the next call will return the next
9265 * range; <FALSE> if was already at the end of the list. If the latter,
9266 * <*start> and <*end> are unchanged, and the next call to this function
9267 * will start over at the beginning of the list */
9269 STRLEN* pos = get_invlist_iter_addr(invlist);
9270 UV len = _invlist_len(invlist);
9273 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9276 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9280 array = invlist_array(invlist);
9282 *start = array[(*pos)++];
9288 *end = array[(*pos)++] - 1;
9294 PERL_STATIC_INLINE bool
9295 S_invlist_is_iterating(SV* const invlist)
9297 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9299 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9302 PERL_STATIC_INLINE UV
9303 S_invlist_highest(SV* const invlist)
9305 /* Returns the highest code point that matches an inversion list. This API
9306 * has an ambiguity, as it returns 0 under either the highest is actually
9307 * 0, or if the list is empty. If this distinction matters to you, check
9308 * for emptiness before calling this function */
9310 UV len = _invlist_len(invlist);
9313 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9319 array = invlist_array(invlist);
9321 /* The last element in the array in the inversion list always starts a
9322 * range that goes to infinity. That range may be for code points that are
9323 * matched in the inversion list, or it may be for ones that aren't
9324 * matched. In the latter case, the highest code point in the set is one
9325 * less than the beginning of this range; otherwise it is the final element
9326 * of this range: infinity */
9327 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9329 : array[len - 1] - 1;
9332 #ifndef PERL_IN_XSUB_RE
9334 Perl__invlist_contents(pTHX_ SV* const invlist)
9336 /* Get the contents of an inversion list into a string SV so that they can
9337 * be printed out. It uses the format traditionally done for debug tracing
9341 SV* output = newSVpvs("\n");
9343 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9345 assert(! invlist_is_iterating(invlist));
9347 invlist_iterinit(invlist);
9348 while (invlist_iternext(invlist, &start, &end)) {
9349 if (end == UV_MAX) {
9350 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9352 else if (end != start) {
9353 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9357 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9365 #ifndef PERL_IN_XSUB_RE
9367 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9368 const char * const indent, SV* const invlist)
9370 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9371 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9372 * the string 'indent'. The output looks like this:
9373 [0] 0x000A .. 0x000D
9375 [4] 0x2028 .. 0x2029
9376 [6] 0x3104 .. INFINITY
9377 * This means that the first range of code points matched by the list are
9378 * 0xA through 0xD; the second range contains only the single code point
9379 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9380 * are used to define each range (except if the final range extends to
9381 * infinity, only a single element is needed). The array index of the
9382 * first element for the corresponding range is given in brackets. */
9387 PERL_ARGS_ASSERT__INVLIST_DUMP;
9389 if (invlist_is_iterating(invlist)) {
9390 Perl_dump_indent(aTHX_ level, file,
9391 "%sCan't dump inversion list because is in middle of iterating\n",
9396 invlist_iterinit(invlist);
9397 while (invlist_iternext(invlist, &start, &end)) {
9398 if (end == UV_MAX) {
9399 Perl_dump_indent(aTHX_ level, file,
9400 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9401 indent, (UV)count, start);
9403 else if (end != start) {
9404 Perl_dump_indent(aTHX_ level, file,
9405 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9406 indent, (UV)count, start, end);
9409 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9410 indent, (UV)count, start);
9417 Perl__load_PL_utf8_foldclosures (pTHX)
9419 assert(! PL_utf8_foldclosures);
9421 /* If the folds haven't been read in, call a fold function
9423 if (! PL_utf8_tofold) {
9424 U8 dummy[UTF8_MAXBYTES_CASE+1];
9426 /* This string is just a short named one above \xff */
9427 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9428 assert(PL_utf8_tofold); /* Verify that worked */
9430 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9434 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9436 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9438 /* Return a boolean as to if the two passed in inversion lists are
9439 * identical. The final argument, if TRUE, says to take the complement of
9440 * the second inversion list before doing the comparison */
9442 const UV* array_a = invlist_array(a);
9443 const UV* array_b = invlist_array(b);
9444 UV len_a = _invlist_len(a);
9445 UV len_b = _invlist_len(b);
9447 UV i = 0; /* current index into the arrays */
9448 bool retval = TRUE; /* Assume are identical until proven otherwise */
9450 PERL_ARGS_ASSERT__INVLISTEQ;
9452 /* If are to compare 'a' with the complement of b, set it
9453 * up so are looking at b's complement. */
9456 /* The complement of nothing is everything, so <a> would have to have
9457 * just one element, starting at zero (ending at infinity) */
9459 return (len_a == 1 && array_a[0] == 0);
9461 else if (array_b[0] == 0) {
9463 /* Otherwise, to complement, we invert. Here, the first element is
9464 * 0, just remove it. To do this, we just pretend the array starts
9472 /* But if the first element is not zero, we pretend the list starts
9473 * at the 0 that is always stored immediately before the array. */
9479 /* Make sure that the lengths are the same, as well as the final element
9480 * before looping through the remainder. (Thus we test the length, final,
9481 * and first elements right off the bat) */
9482 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9485 else for (i = 0; i < len_a - 1; i++) {
9486 if (array_a[i] != array_b[i]) {
9496 #undef HEADER_LENGTH
9497 #undef TO_INTERNAL_SIZE
9498 #undef FROM_INTERNAL_SIZE
9499 #undef INVLIST_VERSION_ID
9501 /* End of inversion list object */
9504 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9506 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9507 * constructs, and updates RExC_flags with them. On input, RExC_parse
9508 * should point to the first flag; it is updated on output to point to the
9509 * final ')' or ':'. There needs to be at least one flag, or this will
9512 /* for (?g), (?gc), and (?o) warnings; warning
9513 about (?c) will warn about (?g) -- japhy */
9515 #define WASTED_O 0x01
9516 #define WASTED_G 0x02
9517 #define WASTED_C 0x04
9518 #define WASTED_GC (WASTED_G|WASTED_C)
9519 I32 wastedflags = 0x00;
9520 U32 posflags = 0, negflags = 0;
9521 U32 *flagsp = &posflags;
9522 char has_charset_modifier = '\0';
9524 bool has_use_defaults = FALSE;
9525 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9526 int x_mod_count = 0;
9528 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9530 /* '^' as an initial flag sets certain defaults */
9531 if (UCHARAT(RExC_parse) == '^') {
9533 has_use_defaults = TRUE;
9534 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9535 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9536 ? REGEX_UNICODE_CHARSET
9537 : REGEX_DEPENDS_CHARSET);
9540 cs = get_regex_charset(RExC_flags);
9541 if (cs == REGEX_DEPENDS_CHARSET
9542 && (RExC_utf8 || RExC_uni_semantics))
9544 cs = REGEX_UNICODE_CHARSET;
9547 while (*RExC_parse) {
9548 /* && strchr("iogcmsx", *RExC_parse) */
9549 /* (?g), (?gc) and (?o) are useless here
9550 and must be globally applied -- japhy */
9551 switch (*RExC_parse) {
9553 /* Code for the imsx flags */
9554 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9556 case LOCALE_PAT_MOD:
9557 if (has_charset_modifier) {
9558 goto excess_modifier;
9560 else if (flagsp == &negflags) {
9563 cs = REGEX_LOCALE_CHARSET;
9564 has_charset_modifier = LOCALE_PAT_MOD;
9566 case UNICODE_PAT_MOD:
9567 if (has_charset_modifier) {
9568 goto excess_modifier;
9570 else if (flagsp == &negflags) {
9573 cs = REGEX_UNICODE_CHARSET;
9574 has_charset_modifier = UNICODE_PAT_MOD;
9576 case ASCII_RESTRICT_PAT_MOD:
9577 if (flagsp == &negflags) {
9580 if (has_charset_modifier) {
9581 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9582 goto excess_modifier;
9584 /* Doubled modifier implies more restricted */
9585 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9588 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9590 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9592 case DEPENDS_PAT_MOD:
9593 if (has_use_defaults) {
9594 goto fail_modifiers;
9596 else if (flagsp == &negflags) {
9599 else if (has_charset_modifier) {
9600 goto excess_modifier;
9603 /* The dual charset means unicode semantics if the
9604 * pattern (or target, not known until runtime) are
9605 * utf8, or something in the pattern indicates unicode
9607 cs = (RExC_utf8 || RExC_uni_semantics)
9608 ? REGEX_UNICODE_CHARSET
9609 : REGEX_DEPENDS_CHARSET;
9610 has_charset_modifier = DEPENDS_PAT_MOD;
9614 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9615 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9617 else if (has_charset_modifier == *(RExC_parse - 1)) {
9618 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9622 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9627 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9630 case ONCE_PAT_MOD: /* 'o' */
9631 case GLOBAL_PAT_MOD: /* 'g' */
9632 if (PASS2 && ckWARN(WARN_REGEXP)) {
9633 const I32 wflagbit = *RExC_parse == 'o'
9636 if (! (wastedflags & wflagbit) ) {
9637 wastedflags |= wflagbit;
9638 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9641 "Useless (%s%c) - %suse /%c modifier",
9642 flagsp == &negflags ? "?-" : "?",
9644 flagsp == &negflags ? "don't " : "",
9651 case CONTINUE_PAT_MOD: /* 'c' */
9652 if (PASS2 && ckWARN(WARN_REGEXP)) {
9653 if (! (wastedflags & WASTED_C) ) {
9654 wastedflags |= WASTED_GC;
9655 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9658 "Useless (%sc) - %suse /gc modifier",
9659 flagsp == &negflags ? "?-" : "?",
9660 flagsp == &negflags ? "don't " : ""
9665 case KEEPCOPY_PAT_MOD: /* 'p' */
9666 if (flagsp == &negflags) {
9668 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9670 *flagsp |= RXf_PMf_KEEPCOPY;
9674 /* A flag is a default iff it is following a minus, so
9675 * if there is a minus, it means will be trying to
9676 * re-specify a default which is an error */
9677 if (has_use_defaults || flagsp == &negflags) {
9678 goto fail_modifiers;
9681 wastedflags = 0; /* reset so (?g-c) warns twice */
9685 RExC_flags |= posflags;
9686 RExC_flags &= ~negflags;
9687 set_regex_charset(&RExC_flags, cs);
9688 if (RExC_flags & RXf_PMf_FOLD) {
9689 RExC_contains_i = 1;
9692 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9698 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9699 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9700 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9701 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9709 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9714 - reg - regular expression, i.e. main body or parenthesized thing
9716 * Caller must absorb opening parenthesis.
9718 * Combining parenthesis handling with the base level of regular expression
9719 * is a trifle forced, but the need to tie the tails of the branches to what
9720 * follows makes it hard to avoid.
9722 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9724 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9726 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9729 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9730 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9731 needs to be restarted.
9732 Otherwise would only return NULL if regbranch() returns NULL, which
9735 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9736 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9737 * 2 is like 1, but indicates that nextchar() has been called to advance
9738 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9739 * this flag alerts us to the need to check for that */
9741 regnode *ret; /* Will be the head of the group. */
9744 regnode *ender = NULL;
9747 U32 oregflags = RExC_flags;
9748 bool have_branch = 0;
9750 I32 freeze_paren = 0;
9751 I32 after_freeze = 0;
9752 I32 num; /* numeric backreferences */
9754 char * parse_start = RExC_parse; /* MJD */
9755 char * const oregcomp_parse = RExC_parse;
9757 GET_RE_DEBUG_FLAGS_DECL;
9759 PERL_ARGS_ASSERT_REG;
9760 DEBUG_PARSE("reg ");
9762 *flagp = 0; /* Tentatively. */
9765 /* Make an OPEN node, if parenthesized. */
9768 /* Under /x, space and comments can be gobbled up between the '(' and
9769 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9770 * intervening space, as the sequence is a token, and a token should be
9772 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9774 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9775 char *start_verb = RExC_parse;
9776 STRLEN verb_len = 0;
9777 char *start_arg = NULL;
9778 unsigned char op = 0;
9780 int internal_argval = 0; /* internal_argval is only useful if
9783 if (has_intervening_patws) {
9785 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9787 while ( *RExC_parse && *RExC_parse != ')' ) {
9788 if ( *RExC_parse == ':' ) {
9789 start_arg = RExC_parse + 1;
9795 verb_len = RExC_parse - start_verb;
9798 while ( *RExC_parse && *RExC_parse != ')' )
9800 if ( *RExC_parse != ')' )
9801 vFAIL("Unterminated verb pattern argument");
9802 if ( RExC_parse == start_arg )
9805 if ( *RExC_parse != ')' )
9806 vFAIL("Unterminated verb pattern");
9809 switch ( *start_verb ) {
9810 case 'A': /* (*ACCEPT) */
9811 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9813 internal_argval = RExC_nestroot;
9816 case 'C': /* (*COMMIT) */
9817 if ( memEQs(start_verb,verb_len,"COMMIT") )
9820 case 'F': /* (*FAIL) */
9821 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9826 case ':': /* (*:NAME) */
9827 case 'M': /* (*MARK:NAME) */
9828 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9833 case 'P': /* (*PRUNE) */
9834 if ( memEQs(start_verb,verb_len,"PRUNE") )
9837 case 'S': /* (*SKIP) */
9838 if ( memEQs(start_verb,verb_len,"SKIP") )
9841 case 'T': /* (*THEN) */
9842 /* [19:06] <TimToady> :: is then */
9843 if ( memEQs(start_verb,verb_len,"THEN") ) {
9845 RExC_seen |= REG_CUTGROUP_SEEN;
9850 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9852 "Unknown verb pattern '%"UTF8f"'",
9853 UTF8fARG(UTF, verb_len, start_verb));
9856 if ( start_arg && internal_argval ) {
9857 vFAIL3("Verb pattern '%.*s' may not have an argument",
9858 verb_len, start_verb);
9859 } else if ( argok < 0 && !start_arg ) {
9860 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9861 verb_len, start_verb);
9863 ret = reganode(pRExC_state, op, internal_argval);
9864 if ( ! internal_argval && ! SIZE_ONLY ) {
9866 SV *sv = newSVpvn( start_arg,
9867 RExC_parse - start_arg);
9868 ARG(ret) = add_data( pRExC_state,
9870 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9877 if (!internal_argval)
9878 RExC_seen |= REG_VERBARG_SEEN;
9879 } else if ( start_arg ) {
9880 vFAIL3("Verb pattern '%.*s' may not have an argument",
9881 verb_len, start_verb);
9883 ret = reg_node(pRExC_state, op);
9885 nextchar(pRExC_state);
9888 else if (*RExC_parse == '?') { /* (?...) */
9889 bool is_logical = 0;
9890 const char * const seqstart = RExC_parse;
9891 const char * endptr;
9892 if (has_intervening_patws) {
9894 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9898 paren = *RExC_parse++;
9899 ret = NULL; /* For look-ahead/behind. */
9902 case 'P': /* (?P...) variants for those used to PCRE/Python */
9903 paren = *RExC_parse++;
9904 if ( paren == '<') /* (?P<...>) named capture */
9906 else if (paren == '>') { /* (?P>name) named recursion */
9907 goto named_recursion;
9909 else if (paren == '=') { /* (?P=...) named backref */
9910 /* this pretty much dupes the code for \k<NAME> in
9911 * regatom(), if you change this make sure you change that
9913 char* name_start = RExC_parse;
9915 SV *sv_dat = reg_scan_name(pRExC_state,
9916 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9917 if (RExC_parse == name_start || *RExC_parse != ')')
9918 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9919 vFAIL2("Sequence %.3s... not terminated",parse_start);
9922 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9923 RExC_rxi->data->data[num]=(void*)sv_dat;
9924 SvREFCNT_inc_simple_void(sv_dat);
9927 ret = reganode(pRExC_state,
9930 : (ASCII_FOLD_RESTRICTED)
9932 : (AT_LEAST_UNI_SEMANTICS)
9940 Set_Node_Offset(ret, parse_start+1);
9941 Set_Node_Cur_Length(ret, parse_start);
9943 nextchar(pRExC_state);
9947 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9948 vFAIL3("Sequence (%.*s...) not recognized",
9949 RExC_parse-seqstart, seqstart);
9951 case '<': /* (?<...) */
9952 if (*RExC_parse == '!')
9954 else if (*RExC_parse != '=')
9960 case '\'': /* (?'...') */
9961 name_start= RExC_parse;
9962 svname = reg_scan_name(pRExC_state,
9963 SIZE_ONLY /* reverse test from the others */
9964 ? REG_RSN_RETURN_NAME
9965 : REG_RSN_RETURN_NULL);
9966 if (RExC_parse == name_start || *RExC_parse != paren)
9967 vFAIL2("Sequence (?%c... not terminated",
9968 paren=='>' ? '<' : paren);
9972 if (!svname) /* shouldn't happen */
9974 "panic: reg_scan_name returned NULL");
9975 if (!RExC_paren_names) {
9976 RExC_paren_names= newHV();
9977 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9979 RExC_paren_name_list= newAV();
9980 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9983 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9985 sv_dat = HeVAL(he_str);
9987 /* croak baby croak */
9989 "panic: paren_name hash element allocation failed");
9990 } else if ( SvPOK(sv_dat) ) {
9991 /* (?|...) can mean we have dupes so scan to check
9992 its already been stored. Maybe a flag indicating
9993 we are inside such a construct would be useful,
9994 but the arrays are likely to be quite small, so
9995 for now we punt -- dmq */
9996 IV count = SvIV(sv_dat);
9997 I32 *pv = (I32*)SvPVX(sv_dat);
9999 for ( i = 0 ; i < count ; i++ ) {
10000 if ( pv[i] == RExC_npar ) {
10006 pv = (I32*)SvGROW(sv_dat,
10007 SvCUR(sv_dat) + sizeof(I32)+1);
10008 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10009 pv[count] = RExC_npar;
10010 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10013 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10014 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10017 SvIV_set(sv_dat, 1);
10020 /* Yes this does cause a memory leak in debugging Perls
10022 if (!av_store(RExC_paren_name_list,
10023 RExC_npar, SvREFCNT_inc(svname)))
10024 SvREFCNT_dec_NN(svname);
10027 /*sv_dump(sv_dat);*/
10029 nextchar(pRExC_state);
10031 goto capturing_parens;
10033 RExC_seen |= REG_LOOKBEHIND_SEEN;
10034 RExC_in_lookbehind++;
10037 case '=': /* (?=...) */
10038 RExC_seen_zerolen++;
10040 case '!': /* (?!...) */
10041 RExC_seen_zerolen++;
10042 if (*RExC_parse == ')') {
10043 ret=reg_node(pRExC_state, OPFAIL);
10044 nextchar(pRExC_state);
10048 case '|': /* (?|...) */
10049 /* branch reset, behave like a (?:...) except that
10050 buffers in alternations share the same numbers */
10052 after_freeze = freeze_paren = RExC_npar;
10054 case ':': /* (?:...) */
10055 case '>': /* (?>...) */
10057 case '$': /* (?$...) */
10058 case '@': /* (?@...) */
10059 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10061 case '0' : /* (?0) */
10062 case 'R' : /* (?R) */
10063 if (*RExC_parse != ')')
10064 FAIL("Sequence (?R) not terminated");
10065 ret = reg_node(pRExC_state, GOSTART);
10066 RExC_seen |= REG_GOSTART_SEEN;
10067 *flagp |= POSTPONED;
10068 nextchar(pRExC_state);
10071 /* named and numeric backreferences */
10072 case '&': /* (?&NAME) */
10073 parse_start = RExC_parse - 1;
10076 SV *sv_dat = reg_scan_name(pRExC_state,
10077 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10078 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10080 if (RExC_parse == RExC_end || *RExC_parse != ')')
10081 vFAIL("Sequence (?&... not terminated");
10082 goto gen_recurse_regop;
10083 assert(0); /* NOT REACHED */
10085 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10087 vFAIL("Illegal pattern");
10089 goto parse_recursion;
10091 case '-': /* (?-1) */
10092 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10093 RExC_parse--; /* rewind to let it be handled later */
10097 case '1': case '2': case '3': case '4': /* (?1) */
10098 case '5': case '6': case '7': case '8': case '9':
10102 bool is_neg = FALSE;
10103 parse_start = RExC_parse - 1; /* MJD */
10104 if (*RExC_parse == '-') {
10108 num = grok_atou(RExC_parse, &endptr);
10110 RExC_parse = (char*)endptr;
10112 /* Some limit for num? */
10116 if (*RExC_parse!=')')
10117 vFAIL("Expecting close bracket");
10120 if ( paren == '-' ) {
10122 Diagram of capture buffer numbering.
10123 Top line is the normal capture buffer numbers
10124 Bottom line is the negative indexing as from
10128 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10132 num = RExC_npar + num;
10135 vFAIL("Reference to nonexistent group");
10137 } else if ( paren == '+' ) {
10138 num = RExC_npar + num - 1;
10141 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10143 if (num > (I32)RExC_rx->nparens) {
10145 vFAIL("Reference to nonexistent group");
10147 RExC_recurse_count++;
10148 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10149 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10150 22, "| |", 1 + depth * 2, "",
10151 (UV)ARG(ret), (IV)ARG2L(ret)));
10153 RExC_seen |= REG_RECURSE_SEEN;
10154 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10155 Set_Node_Offset(ret, parse_start); /* MJD */
10157 *flagp |= POSTPONED;
10158 nextchar(pRExC_state);
10161 assert(0); /* NOT REACHED */
10163 case '?': /* (??...) */
10165 if (*RExC_parse != '{') {
10167 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10169 "Sequence (%"UTF8f"...) not recognized",
10170 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10173 *flagp |= POSTPONED;
10174 paren = *RExC_parse++;
10176 case '{': /* (?{...}) */
10179 struct reg_code_block *cb;
10181 RExC_seen_zerolen++;
10183 if ( !pRExC_state->num_code_blocks
10184 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10185 || pRExC_state->code_blocks[pRExC_state->code_index].start
10186 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10189 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10190 FAIL("panic: Sequence (?{...}): no code block found\n");
10191 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10193 /* this is a pre-compiled code block (?{...}) */
10194 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10195 RExC_parse = RExC_start + cb->end;
10198 if (cb->src_regex) {
10199 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10200 RExC_rxi->data->data[n] =
10201 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10202 RExC_rxi->data->data[n+1] = (void*)o;
10205 n = add_data(pRExC_state,
10206 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10207 RExC_rxi->data->data[n] = (void*)o;
10210 pRExC_state->code_index++;
10211 nextchar(pRExC_state);
10215 ret = reg_node(pRExC_state, LOGICAL);
10217 eval = reg2Lanode(pRExC_state, EVAL,
10220 /* for later propagation into (??{})
10222 RExC_flags & RXf_PMf_COMPILETIME
10227 REGTAIL(pRExC_state, ret, eval);
10228 /* deal with the length of this later - MJD */
10231 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10232 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10233 Set_Node_Offset(ret, parse_start);
10236 case '(': /* (?(?{...})...) and (?(?=...)...) */
10239 const int DEFINE_len = sizeof("DEFINE") - 1;
10240 if (RExC_parse[0] == '?') { /* (?(?...)) */
10241 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10242 || RExC_parse[1] == '<'
10243 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10247 ret = reg_node(pRExC_state, LOGICAL);
10251 tail = reg(pRExC_state, 1, &flag, depth+1);
10252 if (flag & RESTART_UTF8) {
10253 *flagp = RESTART_UTF8;
10256 REGTAIL(pRExC_state, ret, tail);
10259 /* Fall through to ‘Unknown switch condition’ at the
10260 end of the if/else chain. */
10262 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10263 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10265 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10266 char *name_start= RExC_parse++;
10268 SV *sv_dat=reg_scan_name(pRExC_state,
10269 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10270 if (RExC_parse == name_start || *RExC_parse != ch)
10271 vFAIL2("Sequence (?(%c... not terminated",
10272 (ch == '>' ? '<' : ch));
10275 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10276 RExC_rxi->data->data[num]=(void*)sv_dat;
10277 SvREFCNT_inc_simple_void(sv_dat);
10279 ret = reganode(pRExC_state,NGROUPP,num);
10280 goto insert_if_check_paren;
10282 else if (strnEQ(RExC_parse, "DEFINE",
10283 MIN(DEFINE_len, RExC_end - RExC_parse)))
10285 ret = reganode(pRExC_state,DEFINEP,0);
10286 RExC_parse += DEFINE_len;
10288 goto insert_if_check_paren;
10290 else if (RExC_parse[0] == 'R') {
10293 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10294 parno = grok_atou(RExC_parse, &endptr);
10296 RExC_parse = (char*)endptr;
10297 } else if (RExC_parse[0] == '&') {
10300 sv_dat = reg_scan_name(pRExC_state,
10302 ? REG_RSN_RETURN_NULL
10303 : REG_RSN_RETURN_DATA);
10304 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10306 ret = reganode(pRExC_state,INSUBP,parno);
10307 goto insert_if_check_paren;
10309 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10313 parno = grok_atou(RExC_parse, &endptr);
10315 RExC_parse = (char*)endptr;
10316 ret = reganode(pRExC_state, GROUPP, parno);
10318 insert_if_check_paren:
10319 if (*(tmp = nextchar(pRExC_state)) != ')') {
10320 /* nextchar also skips comments, so undo its work
10321 * and skip over the the next character.
10324 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10325 vFAIL("Switch condition not recognized");
10328 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10329 br = regbranch(pRExC_state, &flags, 1,depth+1);
10331 if (flags & RESTART_UTF8) {
10332 *flagp = RESTART_UTF8;
10335 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10338 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10340 c = *nextchar(pRExC_state);
10341 if (flags&HASWIDTH)
10342 *flagp |= HASWIDTH;
10345 vFAIL("(?(DEFINE)....) does not allow branches");
10347 /* Fake one for optimizer. */
10348 lastbr = reganode(pRExC_state, IFTHEN, 0);
10350 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10351 if (flags & RESTART_UTF8) {
10352 *flagp = RESTART_UTF8;
10355 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10358 REGTAIL(pRExC_state, ret, lastbr);
10359 if (flags&HASWIDTH)
10360 *flagp |= HASWIDTH;
10361 c = *nextchar(pRExC_state);
10366 if (RExC_parse>RExC_end)
10367 vFAIL("Switch (?(condition)... not terminated");
10369 vFAIL("Switch (?(condition)... contains too many branches");
10371 ender = reg_node(pRExC_state, TAIL);
10372 REGTAIL(pRExC_state, br, ender);
10374 REGTAIL(pRExC_state, lastbr, ender);
10375 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10378 REGTAIL(pRExC_state, ret, ender);
10379 RExC_size++; /* XXX WHY do we need this?!!
10380 For large programs it seems to be required
10381 but I can't figure out why. -- dmq*/
10384 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10385 vFAIL("Unknown switch condition (?(...))");
10387 case '[': /* (?[ ... ]) */
10388 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10391 RExC_parse--; /* for vFAIL to print correctly */
10392 vFAIL("Sequence (? incomplete");
10394 default: /* e.g., (?i) */
10397 parse_lparen_question_flags(pRExC_state);
10398 if (UCHARAT(RExC_parse) != ':') {
10399 nextchar(pRExC_state);
10404 nextchar(pRExC_state);
10414 ret = reganode(pRExC_state, OPEN, parno);
10416 if (!RExC_nestroot)
10417 RExC_nestroot = parno;
10418 if (RExC_seen & REG_RECURSE_SEEN
10419 && !RExC_open_parens[parno-1])
10421 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10422 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10423 22, "| |", 1+2 * depth, "",
10424 (IV)parno, REG_NODE_NUM(ret)));
10425 RExC_open_parens[parno-1]= ret;
10428 Set_Node_Length(ret, 1); /* MJD */
10429 Set_Node_Offset(ret, RExC_parse); /* MJD */
10437 /* Pick up the branches, linking them together. */
10438 parse_start = RExC_parse; /* MJD */
10439 br = regbranch(pRExC_state, &flags, 1,depth+1);
10441 /* branch_len = (paren != 0); */
10444 if (flags & RESTART_UTF8) {
10445 *flagp = RESTART_UTF8;
10448 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10450 if (*RExC_parse == '|') {
10451 if (!SIZE_ONLY && RExC_extralen) {
10452 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10455 reginsert(pRExC_state, BRANCH, br, depth+1);
10456 Set_Node_Length(br, paren != 0);
10457 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10461 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10463 else if (paren == ':') {
10464 *flagp |= flags&SIMPLE;
10466 if (is_open) { /* Starts with OPEN. */
10467 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10469 else if (paren != '?') /* Not Conditional */
10471 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10473 while (*RExC_parse == '|') {
10474 if (!SIZE_ONLY && RExC_extralen) {
10475 ender = reganode(pRExC_state, LONGJMP,0);
10477 /* Append to the previous. */
10478 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10481 RExC_extralen += 2; /* Account for LONGJMP. */
10482 nextchar(pRExC_state);
10483 if (freeze_paren) {
10484 if (RExC_npar > after_freeze)
10485 after_freeze = RExC_npar;
10486 RExC_npar = freeze_paren;
10488 br = regbranch(pRExC_state, &flags, 0, depth+1);
10491 if (flags & RESTART_UTF8) {
10492 *flagp = RESTART_UTF8;
10495 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10497 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10499 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10502 if (have_branch || paren != ':') {
10503 /* Make a closing node, and hook it on the end. */
10506 ender = reg_node(pRExC_state, TAIL);
10509 ender = reganode(pRExC_state, CLOSE, parno);
10510 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10511 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10512 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10513 22, "| |", 1+2 * depth, "", (IV)parno, REG_NODE_NUM(ender)));
10514 RExC_close_parens[parno-1]= ender;
10515 if (RExC_nestroot == parno)
10518 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10519 Set_Node_Length(ender,1); /* MJD */
10525 *flagp &= ~HASWIDTH;
10528 ender = reg_node(pRExC_state, SUCCEED);
10531 ender = reg_node(pRExC_state, END);
10533 assert(!RExC_opend); /* there can only be one! */
10534 RExC_opend = ender;
10538 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10539 DEBUG_PARSE_MSG("lsbr");
10540 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10541 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10542 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10543 SvPV_nolen_const(RExC_mysv1),
10544 (IV)REG_NODE_NUM(lastbr),
10545 SvPV_nolen_const(RExC_mysv2),
10546 (IV)REG_NODE_NUM(ender),
10547 (IV)(ender - lastbr)
10550 REGTAIL(pRExC_state, lastbr, ender);
10552 if (have_branch && !SIZE_ONLY) {
10553 char is_nothing= 1;
10555 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10557 /* Hook the tails of the branches to the closing node. */
10558 for (br = ret; br; br = regnext(br)) {
10559 const U8 op = PL_regkind[OP(br)];
10560 if (op == BRANCH) {
10561 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10562 if ( OP(NEXTOPER(br)) != NOTHING
10563 || regnext(NEXTOPER(br)) != ender)
10566 else if (op == BRANCHJ) {
10567 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10568 /* for now we always disable this optimisation * /
10569 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10570 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10576 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10577 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10578 DEBUG_PARSE_MSG("NADA");
10579 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10580 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10581 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10582 SvPV_nolen_const(RExC_mysv1),
10583 (IV)REG_NODE_NUM(ret),
10584 SvPV_nolen_const(RExC_mysv2),
10585 (IV)REG_NODE_NUM(ender),
10590 if (OP(ender) == TAIL) {
10595 for ( opt= br + 1; opt < ender ; opt++ )
10596 OP(opt)= OPTIMIZED;
10597 NEXT_OFF(br)= ender - br;
10605 static const char parens[] = "=!<,>";
10607 if (paren && (p = strchr(parens, paren))) {
10608 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10609 int flag = (p - parens) > 1;
10612 node = SUSPEND, flag = 0;
10613 reginsert(pRExC_state, node,ret, depth+1);
10614 Set_Node_Cur_Length(ret, parse_start);
10615 Set_Node_Offset(ret, parse_start + 1);
10617 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10621 /* Check for proper termination. */
10623 /* restore original flags, but keep (?p) */
10624 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10625 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10626 RExC_parse = oregcomp_parse;
10627 vFAIL("Unmatched (");
10630 else if (!paren && RExC_parse < RExC_end) {
10631 if (*RExC_parse == ')') {
10633 vFAIL("Unmatched )");
10636 FAIL("Junk on end of regexp"); /* "Can't happen". */
10637 assert(0); /* NOTREACHED */
10640 if (RExC_in_lookbehind) {
10641 RExC_in_lookbehind--;
10643 if (after_freeze > RExC_npar)
10644 RExC_npar = after_freeze;
10649 - regbranch - one alternative of an | operator
10651 * Implements the concatenation operator.
10653 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10657 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10660 regnode *chain = NULL;
10662 I32 flags = 0, c = 0;
10663 GET_RE_DEBUG_FLAGS_DECL;
10665 PERL_ARGS_ASSERT_REGBRANCH;
10667 DEBUG_PARSE("brnc");
10672 if (!SIZE_ONLY && RExC_extralen)
10673 ret = reganode(pRExC_state, BRANCHJ,0);
10675 ret = reg_node(pRExC_state, BRANCH);
10676 Set_Node_Length(ret, 1);
10680 if (!first && SIZE_ONLY)
10681 RExC_extralen += 1; /* BRANCHJ */
10683 *flagp = WORST; /* Tentatively. */
10686 nextchar(pRExC_state);
10687 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10688 flags &= ~TRYAGAIN;
10689 latest = regpiece(pRExC_state, &flags,depth+1);
10690 if (latest == NULL) {
10691 if (flags & TRYAGAIN)
10693 if (flags & RESTART_UTF8) {
10694 *flagp = RESTART_UTF8;
10697 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10699 else if (ret == NULL)
10701 *flagp |= flags&(HASWIDTH|POSTPONED);
10702 if (chain == NULL) /* First piece. */
10703 *flagp |= flags&SPSTART;
10706 REGTAIL(pRExC_state, chain, latest);
10711 if (chain == NULL) { /* Loop ran zero times. */
10712 chain = reg_node(pRExC_state, NOTHING);
10717 *flagp |= flags&SIMPLE;
10724 - regpiece - something followed by possible [*+?]
10726 * Note that the branching code sequences used for ? and the general cases
10727 * of * and + are somewhat optimized: they use the same NOTHING node as
10728 * both the endmarker for their branch list and the body of the last branch.
10729 * It might seem that this node could be dispensed with entirely, but the
10730 * endmarker role is not redundant.
10732 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10734 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10738 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10744 const char * const origparse = RExC_parse;
10746 I32 max = REG_INFTY;
10747 #ifdef RE_TRACK_PATTERN_OFFSETS
10750 const char *maxpos = NULL;
10752 /* Save the original in case we change the emitted regop to a FAIL. */
10753 regnode * const orig_emit = RExC_emit;
10755 GET_RE_DEBUG_FLAGS_DECL;
10757 PERL_ARGS_ASSERT_REGPIECE;
10759 DEBUG_PARSE("piec");
10761 ret = regatom(pRExC_state, &flags,depth+1);
10763 if (flags & (TRYAGAIN|RESTART_UTF8))
10764 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10766 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10772 if (op == '{' && regcurly(RExC_parse)) {
10774 #ifdef RE_TRACK_PATTERN_OFFSETS
10775 parse_start = RExC_parse; /* MJD */
10777 next = RExC_parse + 1;
10778 while (isDIGIT(*next) || *next == ',') {
10779 if (*next == ',') {
10787 if (*next == '}') { /* got one */
10788 const char* endptr;
10792 min = grok_atou(RExC_parse, &endptr);
10793 if (*maxpos == ',')
10796 maxpos = RExC_parse;
10797 max = grok_atou(maxpos, &endptr);
10798 if (!max && *maxpos != '0')
10799 max = REG_INFTY; /* meaning "infinity" */
10800 else if (max >= REG_INFTY)
10801 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10803 nextchar(pRExC_state);
10804 if (max < min) { /* If can't match, warn and optimize to fail
10808 /* We can't back off the size because we have to reserve
10809 * enough space for all the things we are about to throw
10810 * away, but we can shrink it by the ammount we are about
10811 * to re-use here */
10812 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10815 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10816 RExC_emit = orig_emit;
10818 ret = reg_node(pRExC_state, OPFAIL);
10821 else if (min == max
10822 && RExC_parse < RExC_end
10823 && (*RExC_parse == '?' || *RExC_parse == '+'))
10826 ckWARN2reg(RExC_parse + 1,
10827 "Useless use of greediness modifier '%c'",
10830 /* Absorb the modifier, so later code doesn't see nor use
10832 nextchar(pRExC_state);
10836 if ((flags&SIMPLE)) {
10837 RExC_naughty += 2 + RExC_naughty / 2;
10838 reginsert(pRExC_state, CURLY, ret, depth+1);
10839 Set_Node_Offset(ret, parse_start+1); /* MJD */
10840 Set_Node_Cur_Length(ret, parse_start);
10843 regnode * const w = reg_node(pRExC_state, WHILEM);
10846 REGTAIL(pRExC_state, ret, w);
10847 if (!SIZE_ONLY && RExC_extralen) {
10848 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10849 reginsert(pRExC_state, NOTHING,ret, depth+1);
10850 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10852 reginsert(pRExC_state, CURLYX,ret, depth+1);
10854 Set_Node_Offset(ret, parse_start+1);
10855 Set_Node_Length(ret,
10856 op == '{' ? (RExC_parse - parse_start) : 1);
10858 if (!SIZE_ONLY && RExC_extralen)
10859 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10860 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10862 RExC_whilem_seen++, RExC_extralen += 3;
10863 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10870 *flagp |= HASWIDTH;
10872 ARG1_SET(ret, (U16)min);
10873 ARG2_SET(ret, (U16)max);
10875 if (max == REG_INFTY)
10876 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10882 if (!ISMULT1(op)) {
10887 #if 0 /* Now runtime fix should be reliable. */
10889 /* if this is reinstated, don't forget to put this back into perldiag:
10891 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10893 (F) The part of the regexp subject to either the * or + quantifier
10894 could match an empty string. The {#} shows in the regular
10895 expression about where the problem was discovered.
10899 if (!(flags&HASWIDTH) && op != '?')
10900 vFAIL("Regexp *+ operand could be empty");
10903 #ifdef RE_TRACK_PATTERN_OFFSETS
10904 parse_start = RExC_parse;
10906 nextchar(pRExC_state);
10908 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10910 if (op == '*' && (flags&SIMPLE)) {
10911 reginsert(pRExC_state, STAR, ret, depth+1);
10914 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10916 else if (op == '*') {
10920 else if (op == '+' && (flags&SIMPLE)) {
10921 reginsert(pRExC_state, PLUS, ret, depth+1);
10924 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10926 else if (op == '+') {
10930 else if (op == '?') {
10935 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10936 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10937 ckWARN2reg(RExC_parse,
10938 "%"UTF8f" matches null string many times",
10939 UTF8fARG(UTF, (RExC_parse >= origparse
10940 ? RExC_parse - origparse
10943 (void)ReREFCNT_inc(RExC_rx_sv);
10946 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10947 nextchar(pRExC_state);
10948 reginsert(pRExC_state, MINMOD, ret, depth+1);
10949 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10952 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10954 nextchar(pRExC_state);
10955 ender = reg_node(pRExC_state, SUCCEED);
10956 REGTAIL(pRExC_state, ret, ender);
10957 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10959 ender = reg_node(pRExC_state, TAIL);
10960 REGTAIL(pRExC_state, ret, ender);
10963 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10965 vFAIL("Nested quantifiers");
10972 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10973 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10977 /* This is expected to be called by a parser routine that has recognized '\N'
10978 and needs to handle the rest. RExC_parse is expected to point at the first
10979 char following the N at the time of the call. On successful return,
10980 RExC_parse has been updated to point to just after the sequence identified
10981 by this routine, <*flagp> has been updated, and the non-NULL input pointers
10982 have been set appropriately.
10984 The typical case for this is \N{some character name}. This is usually
10985 called while parsing the input, filling in or ready to fill in an EXACTish
10986 node, and the code point for the character should be returned, so that it
10987 can be added to the node, and parsing continued with the next input
10988 character. But it may be that instead of a single character the \N{}
10989 expands to more than one, a named sequence. In this case any following
10990 quantifier applies to the whole sequence, and it is easier, given the code
10991 structure that calls this, to handle it from a different area of the code.
10992 For this reason, the input parameters can be set so that it returns valid
10993 only on one or the other of these cases.
10995 Another possibility is for the input to be an empty \N{}, which for
10996 backwards compatibility we accept, but generate a NOTHING node which should
10997 later get optimized out. This is handled from the area of code which can
10998 handle a named sequence, so if called with the parameters for the other, it
11001 Still another possibility is for the \N to mean [^\n], and not a single
11002 character or explicit sequence at all. This is determined by context.
11003 Again, this is handled from the area of code which can handle a named
11004 sequence, so if called with the parameters for the other, it also fails.
11006 And the final possibility is for the \N to be called from within a bracketed
11007 character class. In this case the [^\n] meaning makes no sense, and so is
11008 an error. Other anomalous situations are left to the calling code to handle.
11010 For non-single-quoted regexes, the tokenizer has attempted to decide which
11011 of the above applies, and in the case of a named sequence, has converted it
11012 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11013 where c1... are the characters in the sequence. For single-quoted regexes,
11014 the tokenizer passes the \N sequence through unchanged; this code will not
11015 attempt to determine this nor expand those, instead raising a syntax error.
11016 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11017 or there is no '}', it signals that this \N occurrence means to match a
11018 non-newline. (This mostly was done because of [perl #56444].)
11020 The API is somewhat convoluted due to historical and the above reasons.
11022 The function raises an error (via vFAIL), and doesn't return for various
11023 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
11024 it returns a count of how many characters were accounted for by it. (This
11025 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11026 points in the sequence. It sets <node_p>, <valuep>, and/or
11027 <substitute_parse> on success.
11029 If <valuep> is non-null, it means the caller can accept an input sequence
11030 consisting of a just a single code point; <*valuep> is set to the value
11031 of the only or first code point in the input.
11033 If <substitute_parse> is non-null, it means the caller can accept an input
11034 sequence consisting of one or more code points; <*substitute_parse> is a
11035 newly created mortal SV* in this case, containing \x{} escapes representing
11038 Both <valuep> and <substitute_parse> can be non-NULL.
11040 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
11041 that the caller can accept any legal sequence other than a single code
11042 point. To wit, <*node_p> is set as follows:
11043 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11044 2) \N{}: points to a new NOTHING node; return is 0
11045 3) otherwise: points to a new EXACT node containing the resolved
11046 string; return is the number of code points in the
11047 string. This will never be 1.
11048 Note that failure is returned for single code point sequences if <valuep> is
11049 null and <node_p> is not.
11052 char * endbrace; /* '}' following the name */
11054 char *endchar; /* Points to '.' or '}' ending cur char in the input
11056 bool has_multiple_chars; /* true if the input stream contains a sequence of
11057 more than one character */
11058 bool in_char_class = substitute_parse != NULL;
11059 STRLEN count = 0; /* Number of characters in this sequence */
11061 GET_RE_DEBUG_FLAGS_DECL;
11063 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11065 GET_RE_DEBUG_FLAGS;
11067 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
11068 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11070 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11071 * modifier. The other meaning does not, so use a temporary until we find
11072 * out which we are being called with */
11073 p = (RExC_flags & RXf_PMf_EXTENDED)
11074 ? regpatws(pRExC_state, RExC_parse,
11075 TRUE) /* means recognize comments */
11078 /* Disambiguate between \N meaning a named character versus \N meaning
11079 * [^\n]. The former is assumed when it can't be the latter. */
11080 if (*p != '{' || regcurly(p)) {
11083 /* no bare \N allowed in a charclass */
11084 if (in_char_class) {
11085 vFAIL("\\N in a character class must be a named character: \\N{...}");
11087 return (STRLEN) -1;
11089 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11091 nextchar(pRExC_state);
11092 *node_p = reg_node(pRExC_state, REG_ANY);
11093 *flagp |= HASWIDTH|SIMPLE;
11095 Set_Node_Length(*node_p, 1); /* MJD */
11099 /* Here, we have decided it should be a named character or sequence */
11101 /* The test above made sure that the next real character is a '{', but
11102 * under the /x modifier, it could be separated by space (or a comment and
11103 * \n) and this is not allowed (for consistency with \x{...} and the
11104 * tokenizer handling of \N{NAME}). */
11105 if (*RExC_parse != '{') {
11106 vFAIL("Missing braces on \\N{}");
11109 RExC_parse++; /* Skip past the '{' */
11111 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11112 || ! (endbrace == RExC_parse /* nothing between the {} */
11113 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
11115 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
11118 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11119 vFAIL("\\N{NAME} must be resolved by the lexer");
11122 if (endbrace == RExC_parse) { /* empty: \N{} */
11124 *node_p = reg_node(pRExC_state,NOTHING);
11126 else if (! in_char_class) {
11127 return (STRLEN) -1;
11129 nextchar(pRExC_state);
11133 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11134 RExC_parse += 2; /* Skip past the 'U+' */
11136 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11138 /* Code points are separated by dots. If none, there is only one code
11139 * point, and is terminated by the brace */
11140 has_multiple_chars = (endchar < endbrace);
11142 /* We get the first code point if we want it, and either there is only one,
11143 * or we can accept both cases of one and more than one */
11144 if (valuep && (substitute_parse || ! has_multiple_chars)) {
11145 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11146 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11147 | PERL_SCAN_DISALLOW_PREFIX
11149 /* No errors in the first pass (See [perl
11150 * #122671].) We let the code below find the
11151 * errors when there are multiple chars. */
11152 | ((SIZE_ONLY || has_multiple_chars)
11153 ? PERL_SCAN_SILENT_ILLDIGIT
11156 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11158 /* The tokenizer should have guaranteed validity, but it's possible to
11159 * bypass it by using single quoting, so check. Don't do the check
11160 * here when there are multiple chars; we do it below anyway. */
11161 if (! has_multiple_chars) {
11162 if (length_of_hex == 0
11163 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11165 RExC_parse += length_of_hex; /* Includes all the valid */
11166 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11167 ? UTF8SKIP(RExC_parse)
11169 /* Guard against malformed utf8 */
11170 if (RExC_parse >= endchar) {
11171 RExC_parse = endchar;
11173 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11176 RExC_parse = endbrace + 1;
11181 /* Here, we should have already handled the case where a single character
11182 * is expected and found. So it is a failure if we aren't expecting
11183 * multiple chars and got them; or didn't get them but wanted them. We
11184 * fail without advancing the parse, so that the caller can try again with
11185 * different acceptance criteria */
11186 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11188 return (STRLEN) -1;
11193 /* What is done here is to convert this to a sub-pattern of the form
11194 * \x{char1}\x{char2}...
11195 * and then either return it in <*substitute_parse> if non-null; or
11196 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
11197 * way, it retains its atomicness, while not having to worry about
11198 * special handling that some code points may have. toke.c has
11199 * converted the original Unicode values to native, so that we can just
11200 * pass on the hex values unchanged. We do have to set a flag to keep
11201 * recoding from happening in the recursion */
11205 char *orig_end = RExC_end;
11208 if (substitute_parse) {
11209 *substitute_parse = newSVpvs("");
11212 substitute_parse = &dummy;
11213 *substitute_parse = newSVpvs("?:");
11215 *substitute_parse = sv_2mortal(*substitute_parse);
11217 while (RExC_parse < endbrace) {
11219 /* Convert to notation the rest of the code understands */
11220 sv_catpv(*substitute_parse, "\\x{");
11221 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11222 sv_catpv(*substitute_parse, "}");
11224 /* Point to the beginning of the next character in the sequence. */
11225 RExC_parse = endchar + 1;
11226 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11230 if (! in_char_class) {
11231 sv_catpv(*substitute_parse, ")");
11234 RExC_parse = SvPV(*substitute_parse, len);
11236 /* Don't allow empty number */
11237 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11238 RExC_parse = endbrace;
11239 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11241 RExC_end = RExC_parse + len;
11243 /* The values are Unicode, and therefore not subject to recoding */
11244 RExC_override_recoding = 1;
11247 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11248 if (flags & RESTART_UTF8) {
11249 *flagp = RESTART_UTF8;
11250 return (STRLEN) -1;
11252 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11255 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11258 RExC_parse = endbrace;
11259 RExC_end = orig_end;
11260 RExC_override_recoding = 0;
11262 nextchar(pRExC_state);
11272 * It returns the code point in utf8 for the value in *encp.
11273 * value: a code value in the source encoding
11274 * encp: a pointer to an Encode object
11276 * If the result from Encode is not a single character,
11277 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11280 S_reg_recode(pTHX_ const char value, SV **encp)
11283 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11284 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11285 const STRLEN newlen = SvCUR(sv);
11286 UV uv = UNICODE_REPLACEMENT;
11288 PERL_ARGS_ASSERT_REG_RECODE;
11292 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11295 if (!newlen || numlen != newlen) {
11296 uv = UNICODE_REPLACEMENT;
11302 PERL_STATIC_INLINE U8
11303 S_compute_EXACTish(RExC_state_t *pRExC_state)
11307 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11313 op = get_regex_charset(RExC_flags);
11314 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11315 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11316 been, so there is no hole */
11319 return op + EXACTF;
11322 PERL_STATIC_INLINE void
11323 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11324 regnode *node, I32* flagp, STRLEN len, UV code_point,
11327 /* This knows the details about sizing an EXACTish node, setting flags for
11328 * it (by setting <*flagp>, and potentially populating it with a single
11331 * If <len> (the length in bytes) is non-zero, this function assumes that
11332 * the node has already been populated, and just does the sizing. In this
11333 * case <code_point> should be the final code point that has already been
11334 * placed into the node. This value will be ignored except that under some
11335 * circumstances <*flagp> is set based on it.
11337 * If <len> is zero, the function assumes that the node is to contain only
11338 * the single character given by <code_point> and calculates what <len>
11339 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11340 * additionally will populate the node's STRING with <code_point> or its
11343 * In both cases <*flagp> is appropriately set
11345 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11346 * 255, must be folded (the former only when the rules indicate it can
11349 * When it does the populating, it looks at the flag 'downgradable'. If
11350 * true with a node that folds, it checks if the single code point
11351 * participates in a fold, and if not downgrades the node to an EXACT.
11352 * This helps the optimizer */
11354 bool len_passed_in = cBOOL(len != 0);
11355 U8 character[UTF8_MAXBYTES_CASE+1];
11357 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11359 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11360 * sizing difference, and is extra work that is thrown away */
11361 if (downgradable && ! PASS2) {
11362 downgradable = FALSE;
11365 if (! len_passed_in) {
11367 if (UVCHR_IS_INVARIANT(code_point)) {
11368 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11369 *character = (U8) code_point;
11371 else { /* Here is /i and not /l. (toFOLD() is defined on just
11372 ASCII, which isn't the same thing as INVARIANT on
11373 EBCDIC, but it works there, as the extra invariants
11374 fold to themselves) */
11375 *character = toFOLD((U8) code_point);
11377 /* We can downgrade to an EXACT node if this character
11378 * isn't a folding one. Note that this assumes that
11379 * nothing above Latin1 folds to some other invariant than
11380 * one of these alphabetics; otherwise we would also have
11382 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11383 * || ASCII_FOLD_RESTRICTED))
11385 if (downgradable && PL_fold[code_point] == code_point) {
11391 else if (FOLD && (! LOC
11392 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11393 { /* Folding, and ok to do so now */
11394 UV folded = _to_uni_fold_flags(
11398 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11399 ? FOLD_FLAGS_NOMIX_ASCII
11402 && folded == code_point /* This quickly rules out many
11403 cases, avoiding the
11404 _invlist_contains_cp() overhead
11406 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11411 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11413 /* Not folding this cp, and can output it directly */
11414 *character = UTF8_TWO_BYTE_HI(code_point);
11415 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11419 uvchr_to_utf8( character, code_point);
11420 len = UTF8SKIP(character);
11422 } /* Else pattern isn't UTF8. */
11424 *character = (U8) code_point;
11426 } /* Else is folded non-UTF8 */
11427 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11429 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11430 * comments at join_exact()); */
11431 *character = (U8) code_point;
11434 /* Can turn into an EXACT node if we know the fold at compile time,
11435 * and it folds to itself and doesn't particpate in other folds */
11438 && PL_fold_latin1[code_point] == code_point
11439 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11440 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11444 } /* else is Sharp s. May need to fold it */
11445 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11447 *(character + 1) = 's';
11451 *character = LATIN_SMALL_LETTER_SHARP_S;
11457 RExC_size += STR_SZ(len);
11460 RExC_emit += STR_SZ(len);
11461 STR_LEN(node) = len;
11462 if (! len_passed_in) {
11463 Copy((char *) character, STRING(node), len, char);
11467 *flagp |= HASWIDTH;
11469 /* A single character node is SIMPLE, except for the special-cased SHARP S
11471 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11472 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11473 || ! FOLD || ! DEPENDS_SEMANTICS))
11478 /* The OP may not be well defined in PASS1 */
11479 if (PASS2 && OP(node) == EXACTFL) {
11480 RExC_contains_locale = 1;
11485 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11486 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11489 S_backref_value(char *p)
11491 const char* endptr;
11492 UV val = grok_atou(p, &endptr);
11493 if (endptr == p || endptr == NULL || val > I32_MAX)
11500 - regatom - the lowest level
11502 Try to identify anything special at the start of the pattern. If there
11503 is, then handle it as required. This may involve generating a single regop,
11504 such as for an assertion; or it may involve recursing, such as to
11505 handle a () structure.
11507 If the string doesn't start with something special then we gobble up
11508 as much literal text as we can.
11510 Once we have been able to handle whatever type of thing started the
11511 sequence, we return.
11513 Note: we have to be careful with escapes, as they can be both literal
11514 and special, and in the case of \10 and friends, context determines which.
11516 A summary of the code structure is:
11518 switch (first_byte) {
11519 cases for each special:
11520 handle this special;
11523 switch (2nd byte) {
11524 cases for each unambiguous special:
11525 handle this special;
11527 cases for each ambigous special/literal:
11529 if (special) handle here
11531 default: // unambiguously literal:
11534 default: // is a literal char
11537 create EXACTish node for literal;
11538 while (more input and node isn't full) {
11539 switch (input_byte) {
11540 cases for each special;
11541 make sure parse pointer is set so that the next call to
11542 regatom will see this special first
11543 goto loopdone; // EXACTish node terminated by prev. char
11545 append char to EXACTISH node;
11547 get next input byte;
11551 return the generated node;
11553 Specifically there are two separate switches for handling
11554 escape sequences, with the one for handling literal escapes requiring
11555 a dummy entry for all of the special escapes that are actually handled
11558 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11560 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11562 Otherwise does not return NULL.
11566 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11568 regnode *ret = NULL;
11570 char *parse_start = RExC_parse;
11575 GET_RE_DEBUG_FLAGS_DECL;
11577 *flagp = WORST; /* Tentatively. */
11579 DEBUG_PARSE("atom");
11581 PERL_ARGS_ASSERT_REGATOM;
11584 switch ((U8)*RExC_parse) {
11586 RExC_seen_zerolen++;
11587 nextchar(pRExC_state);
11588 if (RExC_flags & RXf_PMf_MULTILINE)
11589 ret = reg_node(pRExC_state, MBOL);
11591 ret = reg_node(pRExC_state, SBOL);
11592 Set_Node_Length(ret, 1); /* MJD */
11595 nextchar(pRExC_state);
11597 RExC_seen_zerolen++;
11598 if (RExC_flags & RXf_PMf_MULTILINE)
11599 ret = reg_node(pRExC_state, MEOL);
11601 ret = reg_node(pRExC_state, SEOL);
11602 Set_Node_Length(ret, 1); /* MJD */
11605 nextchar(pRExC_state);
11606 if (RExC_flags & RXf_PMf_SINGLELINE)
11607 ret = reg_node(pRExC_state, SANY);
11609 ret = reg_node(pRExC_state, REG_ANY);
11610 *flagp |= HASWIDTH|SIMPLE;
11612 Set_Node_Length(ret, 1); /* MJD */
11616 char * const oregcomp_parse = ++RExC_parse;
11617 ret = regclass(pRExC_state, flagp,depth+1,
11618 FALSE, /* means parse the whole char class */
11619 TRUE, /* allow multi-char folds */
11620 FALSE, /* don't silence non-portable warnings. */
11622 if (*RExC_parse != ']') {
11623 RExC_parse = oregcomp_parse;
11624 vFAIL("Unmatched [");
11627 if (*flagp & RESTART_UTF8)
11629 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11632 nextchar(pRExC_state);
11633 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11637 nextchar(pRExC_state);
11638 ret = reg(pRExC_state, 2, &flags,depth+1);
11640 if (flags & TRYAGAIN) {
11641 if (RExC_parse == RExC_end) {
11642 /* Make parent create an empty node if needed. */
11643 *flagp |= TRYAGAIN;
11648 if (flags & RESTART_UTF8) {
11649 *flagp = RESTART_UTF8;
11652 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11655 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11659 if (flags & TRYAGAIN) {
11660 *flagp |= TRYAGAIN;
11663 vFAIL("Internal urp");
11664 /* Supposed to be caught earlier. */
11670 vFAIL("Quantifier follows nothing");
11675 This switch handles escape sequences that resolve to some kind
11676 of special regop and not to literal text. Escape sequnces that
11677 resolve to literal text are handled below in the switch marked
11680 Every entry in this switch *must* have a corresponding entry
11681 in the literal escape switch. However, the opposite is not
11682 required, as the default for this switch is to jump to the
11683 literal text handling code.
11685 switch ((U8)*++RExC_parse) {
11686 /* Special Escapes */
11688 RExC_seen_zerolen++;
11689 ret = reg_node(pRExC_state, SBOL);
11690 /* SBOL is shared with /^/ so we set the flags so we can tell
11691 * /\A/ from /^/ in split. We check ret because first pass we
11692 * have no regop struct to set the flags on. */
11696 goto finish_meta_pat;
11698 ret = reg_node(pRExC_state, GPOS);
11699 RExC_seen |= REG_GPOS_SEEN;
11701 goto finish_meta_pat;
11703 RExC_seen_zerolen++;
11704 ret = reg_node(pRExC_state, KEEPS);
11706 /* XXX:dmq : disabling in-place substitution seems to
11707 * be necessary here to avoid cases of memory corruption, as
11708 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11710 RExC_seen |= REG_LOOKBEHIND_SEEN;
11711 goto finish_meta_pat;
11713 ret = reg_node(pRExC_state, SEOL);
11715 RExC_seen_zerolen++; /* Do not optimize RE away */
11716 goto finish_meta_pat;
11718 ret = reg_node(pRExC_state, EOS);
11720 RExC_seen_zerolen++; /* Do not optimize RE away */
11721 goto finish_meta_pat;
11723 ret = reg_node(pRExC_state, CANY);
11724 RExC_seen |= REG_CANY_SEEN;
11725 *flagp |= HASWIDTH|SIMPLE;
11727 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11729 goto finish_meta_pat;
11731 ret = reg_node(pRExC_state, CLUMP);
11732 *flagp |= HASWIDTH;
11733 goto finish_meta_pat;
11739 arg = ANYOF_WORDCHAR;
11743 RExC_seen_zerolen++;
11744 RExC_seen |= REG_LOOKBEHIND_SEEN;
11745 op = BOUND + get_regex_charset(RExC_flags);
11746 if (op > BOUNDA) { /* /aa is same as /a */
11749 else if (op == BOUNDL) {
11750 RExC_contains_locale = 1;
11752 ret = reg_node(pRExC_state, op);
11753 FLAGS(ret) = get_regex_charset(RExC_flags);
11755 if ((U8) *(RExC_parse + 1) == '{') {
11756 /* diag_listed_as: Use "%s" instead of "%s" */
11757 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11759 goto finish_meta_pat;
11761 RExC_seen_zerolen++;
11762 RExC_seen |= REG_LOOKBEHIND_SEEN;
11763 op = NBOUND + get_regex_charset(RExC_flags);
11764 if (op > NBOUNDA) { /* /aa is same as /a */
11767 else if (op == NBOUNDL) {
11768 RExC_contains_locale = 1;
11770 ret = reg_node(pRExC_state, op);
11771 FLAGS(ret) = get_regex_charset(RExC_flags);
11773 if ((U8) *(RExC_parse + 1) == '{') {
11774 /* diag_listed_as: Use "%s" instead of "%s" */
11775 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11777 goto finish_meta_pat;
11787 ret = reg_node(pRExC_state, LNBREAK);
11788 *flagp |= HASWIDTH|SIMPLE;
11789 goto finish_meta_pat;
11797 goto join_posix_op_known;
11803 arg = ANYOF_VERTWS;
11805 goto join_posix_op_known;
11815 op = POSIXD + get_regex_charset(RExC_flags);
11816 if (op > POSIXA) { /* /aa is same as /a */
11819 else if (op == POSIXL) {
11820 RExC_contains_locale = 1;
11823 join_posix_op_known:
11826 op += NPOSIXD - POSIXD;
11829 ret = reg_node(pRExC_state, op);
11831 FLAGS(ret) = namedclass_to_classnum(arg);
11834 *flagp |= HASWIDTH|SIMPLE;
11838 nextchar(pRExC_state);
11839 Set_Node_Length(ret, 2); /* MJD */
11845 char* parse_start = RExC_parse - 2;
11850 ret = regclass(pRExC_state, flagp,depth+1,
11851 TRUE, /* means just parse this element */
11852 FALSE, /* don't allow multi-char folds */
11853 FALSE, /* don't silence non-portable warnings.
11854 It would be a bug if these returned
11857 /* regclass() can only return RESTART_UTF8 if multi-char folds
11860 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11865 Set_Node_Offset(ret, parse_start + 2);
11866 Set_Node_Cur_Length(ret, parse_start);
11867 nextchar(pRExC_state);
11871 /* Handle \N and \N{NAME} with multiple code points here and not
11872 * below because it can be multicharacter. join_exact() will join
11873 * them up later on. Also this makes sure that things like
11874 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11875 * The options to the grok function call causes it to fail if the
11876 * sequence is just a single code point. We then go treat it as
11877 * just another character in the current EXACT node, and hence it
11878 * gets uniform treatment with all the other characters. The
11879 * special treatment for quantifiers is not needed for such single
11880 * character sequences */
11882 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11885 if (*flagp & RESTART_UTF8)
11891 case 'k': /* Handle \k<NAME> and \k'NAME' */
11894 char ch= RExC_parse[1];
11895 if (ch != '<' && ch != '\'' && ch != '{') {
11897 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11898 vFAIL2("Sequence %.2s... not terminated",parse_start);
11900 /* this pretty much dupes the code for (?P=...) in reg(), if
11901 you change this make sure you change that */
11902 char* name_start = (RExC_parse += 2);
11904 SV *sv_dat = reg_scan_name(pRExC_state,
11905 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11906 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11907 if (RExC_parse == name_start || *RExC_parse != ch)
11908 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11909 vFAIL2("Sequence %.3s... not terminated",parse_start);
11912 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11913 RExC_rxi->data->data[num]=(void*)sv_dat;
11914 SvREFCNT_inc_simple_void(sv_dat);
11918 ret = reganode(pRExC_state,
11921 : (ASCII_FOLD_RESTRICTED)
11923 : (AT_LEAST_UNI_SEMANTICS)
11929 *flagp |= HASWIDTH;
11931 /* override incorrect value set in reganode MJD */
11932 Set_Node_Offset(ret, parse_start+1);
11933 Set_Node_Cur_Length(ret, parse_start);
11934 nextchar(pRExC_state);
11940 case '1': case '2': case '3': case '4':
11941 case '5': case '6': case '7': case '8': case '9':
11946 if (*RExC_parse == 'g') {
11950 if (*RExC_parse == '{') {
11954 if (*RExC_parse == '-') {
11958 if (hasbrace && !isDIGIT(*RExC_parse)) {
11959 if (isrel) RExC_parse--;
11961 goto parse_named_seq;
11964 num = S_backref_value(RExC_parse);
11966 vFAIL("Reference to invalid group 0");
11967 else if (num == I32_MAX) {
11968 if (isDIGIT(*RExC_parse))
11969 vFAIL("Reference to nonexistent group");
11971 vFAIL("Unterminated \\g... pattern");
11975 num = RExC_npar - num;
11977 vFAIL("Reference to nonexistent or unclosed group");
11981 num = S_backref_value(RExC_parse);
11982 /* bare \NNN might be backref or octal - if it is larger than or equal
11983 * RExC_npar then it is assumed to be and octal escape.
11984 * Note RExC_npar is +1 from the actual number of parens*/
11985 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11986 && *RExC_parse != '8' && *RExC_parse != '9'))
11988 /* Probably a character specified in octal, e.g. \35 */
11993 /* at this point RExC_parse definitely points to a backref
11996 #ifdef RE_TRACK_PATTERN_OFFSETS
11997 char * const parse_start = RExC_parse - 1; /* MJD */
11999 while (isDIGIT(*RExC_parse))
12002 if (*RExC_parse != '}')
12003 vFAIL("Unterminated \\g{...} pattern");
12007 if (num > (I32)RExC_rx->nparens)
12008 vFAIL("Reference to nonexistent group");
12011 ret = reganode(pRExC_state,
12014 : (ASCII_FOLD_RESTRICTED)
12016 : (AT_LEAST_UNI_SEMANTICS)
12022 *flagp |= HASWIDTH;
12024 /* override incorrect value set in reganode MJD */
12025 Set_Node_Offset(ret, parse_start+1);
12026 Set_Node_Cur_Length(ret, parse_start);
12028 nextchar(pRExC_state);
12033 if (RExC_parse >= RExC_end)
12034 FAIL("Trailing \\");
12037 /* Do not generate "unrecognized" warnings here, we fall
12038 back into the quick-grab loop below */
12045 if (RExC_flags & RXf_PMf_EXTENDED) {
12046 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12047 if (RExC_parse < RExC_end)
12054 parse_start = RExC_parse - 1;
12063 #define MAX_NODE_STRING_SIZE 127
12064 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12066 U8 upper_parse = MAX_NODE_STRING_SIZE;
12067 U8 node_type = compute_EXACTish(pRExC_state);
12068 bool next_is_quantifier;
12069 char * oldp = NULL;
12071 /* We can convert EXACTF nodes to EXACTFU if they contain only
12072 * characters that match identically regardless of the target
12073 * string's UTF8ness. The reason to do this is that EXACTF is not
12074 * trie-able, EXACTFU is.
12076 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12077 * contain only above-Latin1 characters (hence must be in UTF8),
12078 * which don't participate in folds with Latin1-range characters,
12079 * as the latter's folds aren't known until runtime. (We don't
12080 * need to figure this out until pass 2) */
12081 bool maybe_exactfu = PASS2
12082 && (node_type == EXACTF || node_type == EXACTFL);
12084 /* If a folding node contains only code points that don't
12085 * participate in folds, it can be changed into an EXACT node,
12086 * which allows the optimizer more things to look for */
12089 ret = reg_node(pRExC_state, node_type);
12091 /* In pass1, folded, we use a temporary buffer instead of the
12092 * actual node, as the node doesn't exist yet */
12093 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12099 /* We do the EXACTFish to EXACT node only if folding. (And we
12100 * don't need to figure this out until pass 2) */
12101 maybe_exact = FOLD && PASS2;
12103 /* XXX The node can hold up to 255 bytes, yet this only goes to
12104 * 127. I (khw) do not know why. Keeping it somewhat less than
12105 * 255 allows us to not have to worry about overflow due to
12106 * converting to utf8 and fold expansion, but that value is
12107 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12108 * split up by this limit into a single one using the real max of
12109 * 255. Even at 127, this breaks under rare circumstances. If
12110 * folding, we do not want to split a node at a character that is a
12111 * non-final in a multi-char fold, as an input string could just
12112 * happen to want to match across the node boundary. The join
12113 * would solve that problem if the join actually happens. But a
12114 * series of more than two nodes in a row each of 127 would cause
12115 * the first join to succeed to get to 254, but then there wouldn't
12116 * be room for the next one, which could at be one of those split
12117 * multi-char folds. I don't know of any fool-proof solution. One
12118 * could back off to end with only a code point that isn't such a
12119 * non-final, but it is possible for there not to be any in the
12121 for (p = RExC_parse - 1;
12122 len < upper_parse && p < RExC_end;
12127 if (RExC_flags & RXf_PMf_EXTENDED)
12128 p = regpatws(pRExC_state, p,
12129 TRUE); /* means recognize comments */
12140 /* Literal Escapes Switch
12142 This switch is meant to handle escape sequences that
12143 resolve to a literal character.
12145 Every escape sequence that represents something
12146 else, like an assertion or a char class, is handled
12147 in the switch marked 'Special Escapes' above in this
12148 routine, but also has an entry here as anything that
12149 isn't explicitly mentioned here will be treated as
12150 an unescaped equivalent literal.
12153 switch ((U8)*++p) {
12154 /* These are all the special escapes. */
12155 case 'A': /* Start assertion */
12156 case 'b': case 'B': /* Word-boundary assertion*/
12157 case 'C': /* Single char !DANGEROUS! */
12158 case 'd': case 'D': /* digit class */
12159 case 'g': case 'G': /* generic-backref, pos assertion */
12160 case 'h': case 'H': /* HORIZWS */
12161 case 'k': case 'K': /* named backref, keep marker */
12162 case 'p': case 'P': /* Unicode property */
12163 case 'R': /* LNBREAK */
12164 case 's': case 'S': /* space class */
12165 case 'v': case 'V': /* VERTWS */
12166 case 'w': case 'W': /* word class */
12167 case 'X': /* eXtended Unicode "combining
12168 character sequence" */
12169 case 'z': case 'Z': /* End of line/string assertion */
12173 /* Anything after here is an escape that resolves to a
12174 literal. (Except digits, which may or may not)
12180 case 'N': /* Handle a single-code point named character. */
12181 /* The options cause it to fail if a multiple code
12182 * point sequence. Handle those in the switch() above
12184 RExC_parse = p + 1;
12185 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12191 if (*flagp & RESTART_UTF8)
12192 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12193 RExC_parse = p = oldp;
12197 if (ender > 0xff) {
12214 ender = ESC_NATIVE;
12224 const char* error_msg;
12226 bool valid = grok_bslash_o(&p,
12229 PASS2, /* out warnings */
12230 FALSE, /* not strict */
12231 TRUE, /* Output warnings
12236 RExC_parse = p; /* going to die anyway; point
12237 to exact spot of failure */
12241 if (PL_encoding && ender < 0x100) {
12242 goto recode_encoding;
12244 if (ender > 0xff) {
12251 UV result = UV_MAX; /* initialize to erroneous
12253 const char* error_msg;
12255 bool valid = grok_bslash_x(&p,
12258 PASS2, /* out warnings */
12259 FALSE, /* not strict */
12260 TRUE, /* Output warnings
12265 RExC_parse = p; /* going to die anyway; point
12266 to exact spot of failure */
12271 if (PL_encoding && ender < 0x100) {
12272 goto recode_encoding;
12274 if (ender > 0xff) {
12281 ender = grok_bslash_c(*p++, PASS2);
12283 case '8': case '9': /* must be a backreference */
12286 case '1': case '2': case '3':case '4':
12287 case '5': case '6': case '7':
12288 /* When we parse backslash escapes there is ambiguity
12289 * between backreferences and octal escapes. Any escape
12290 * from \1 - \9 is a backreference, any multi-digit
12291 * escape which does not start with 0 and which when
12292 * evaluated as decimal could refer to an already
12293 * parsed capture buffer is a backslash. Anything else
12296 * Note this implies that \118 could be interpreted as
12297 * 118 OR as "\11" . "8" depending on whether there
12298 * were 118 capture buffers defined already in the
12301 /* NOTE, RExC_npar is 1 more than the actual number of
12302 * parens we have seen so far, hence the < RExC_npar below. */
12304 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12305 { /* Not to be treated as an octal constant, go
12313 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12315 ender = grok_oct(p, &numlen, &flags, NULL);
12316 if (ender > 0xff) {
12320 if (PASS2 /* like \08, \178 */
12323 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12325 reg_warn_non_literal_string(
12327 form_short_octal_warning(p, numlen));
12330 if (PL_encoding && ender < 0x100)
12331 goto recode_encoding;
12334 if (! RExC_override_recoding) {
12335 SV* enc = PL_encoding;
12336 ender = reg_recode((const char)(U8)ender, &enc);
12338 ckWARNreg(p, "Invalid escape in the specified encoding");
12344 FAIL("Trailing \\");
12347 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12348 /* Include any { following the alpha to emphasize
12349 * that it could be part of an escape at some point
12351 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12352 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12354 goto normal_default;
12355 } /* End of switch on '\' */
12358 /* Currently we don't warn when the lbrace is at the start
12359 * of a construct. This catches it in the middle of a
12360 * literal string, or when its the first thing after
12361 * something like "\b" */
12363 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12365 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12368 default: /* A literal character */
12370 if (UTF8_IS_START(*p) && UTF) {
12372 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12373 &numlen, UTF8_ALLOW_DEFAULT);
12379 } /* End of switch on the literal */
12381 /* Here, have looked at the literal character and <ender>
12382 * contains its ordinal, <p> points to the character after it
12385 if ( RExC_flags & RXf_PMf_EXTENDED)
12386 p = regpatws(pRExC_state, p,
12387 TRUE); /* means recognize comments */
12389 /* If the next thing is a quantifier, it applies to this
12390 * character only, which means that this character has to be in
12391 * its own node and can't just be appended to the string in an
12392 * existing node, so if there are already other characters in
12393 * the node, close the node with just them, and set up to do
12394 * this character again next time through, when it will be the
12395 * only thing in its new node */
12396 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12402 if (! FOLD /* The simple case, just append the literal */
12403 || (LOC /* Also don't fold for tricky chars under /l */
12404 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12407 const STRLEN unilen = reguni(pRExC_state, ender, s);
12413 /* The loop increments <len> each time, as all but this
12414 * path (and one other) through it add a single byte to
12415 * the EXACTish node. But this one has changed len to
12416 * be the correct final value, so subtract one to
12417 * cancel out the increment that follows */
12421 REGC((char)ender, s++);
12424 /* Can get here if folding only if is one of the /l
12425 * characters whose fold depends on the locale. The
12426 * occurrence of any of these indicate that we can't
12427 * simplify things */
12429 maybe_exact = FALSE;
12430 maybe_exactfu = FALSE;
12435 /* See comments for join_exact() as to why we fold this
12436 * non-UTF at compile time */
12437 || (node_type == EXACTFU
12438 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12440 /* Here, are folding and are not UTF-8 encoded; therefore
12441 * the character must be in the range 0-255, and is not /l
12442 * (Not /l because we already handled these under /l in
12443 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12444 if (IS_IN_SOME_FOLD_L1(ender)) {
12445 maybe_exact = FALSE;
12447 /* See if the character's fold differs between /d and
12448 * /u. This includes the multi-char fold SHARP S to
12451 && (PL_fold[ender] != PL_fold_latin1[ender]
12452 || ender == LATIN_SMALL_LETTER_SHARP_S
12454 && isALPHA_FOLD_EQ(ender, 's')
12455 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12457 maybe_exactfu = FALSE;
12461 /* Even when folding, we store just the input character, as
12462 * we have an array that finds its fold quickly */
12463 *(s++) = (char) ender;
12465 else { /* FOLD and UTF */
12466 /* Unlike the non-fold case, we do actually have to
12467 * calculate the results here in pass 1. This is for two
12468 * reasons, the folded length may be longer than the
12469 * unfolded, and we have to calculate how many EXACTish
12470 * nodes it will take; and we may run out of room in a node
12471 * in the middle of a potential multi-char fold, and have
12472 * to back off accordingly. (Hence we can't use REGC for
12473 * the simple case just below.) */
12476 if (isASCII_uni(ender)) {
12477 folded = toFOLD(ender);
12478 *(s)++ = (U8) folded;
12483 folded = _to_uni_fold_flags(
12487 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12488 ? FOLD_FLAGS_NOMIX_ASCII
12492 /* The loop increments <len> each time, as all but this
12493 * path (and one other) through it add a single byte to
12494 * the EXACTish node. But this one has changed len to
12495 * be the correct final value, so subtract one to
12496 * cancel out the increment that follows */
12497 len += foldlen - 1;
12499 /* If this node only contains non-folding code points so
12500 * far, see if this new one is also non-folding */
12502 if (folded != ender) {
12503 maybe_exact = FALSE;
12506 /* Here the fold is the original; we have to check
12507 * further to see if anything folds to it */
12508 if (_invlist_contains_cp(PL_utf8_foldable,
12511 maybe_exact = FALSE;
12518 if (next_is_quantifier) {
12520 /* Here, the next input is a quantifier, and to get here,
12521 * the current character is the only one in the node.
12522 * Also, here <len> doesn't include the final byte for this
12528 } /* End of loop through literal characters */
12530 /* Here we have either exhausted the input or ran out of room in
12531 * the node. (If we encountered a character that can't be in the
12532 * node, transfer is made directly to <loopdone>, and so we
12533 * wouldn't have fallen off the end of the loop.) In the latter
12534 * case, we artificially have to split the node into two, because
12535 * we just don't have enough space to hold everything. This
12536 * creates a problem if the final character participates in a
12537 * multi-character fold in the non-final position, as a match that
12538 * should have occurred won't, due to the way nodes are matched,
12539 * and our artificial boundary. So back off until we find a non-
12540 * problematic character -- one that isn't at the beginning or
12541 * middle of such a fold. (Either it doesn't participate in any
12542 * folds, or appears only in the final position of all the folds it
12543 * does participate in.) A better solution with far fewer false
12544 * positives, and that would fill the nodes more completely, would
12545 * be to actually have available all the multi-character folds to
12546 * test against, and to back-off only far enough to be sure that
12547 * this node isn't ending with a partial one. <upper_parse> is set
12548 * further below (if we need to reparse the node) to include just
12549 * up through that final non-problematic character that this code
12550 * identifies, so when it is set to less than the full node, we can
12551 * skip the rest of this */
12552 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12554 const STRLEN full_len = len;
12556 assert(len >= MAX_NODE_STRING_SIZE);
12558 /* Here, <s> points to the final byte of the final character.
12559 * Look backwards through the string until find a non-
12560 * problematic character */
12564 /* This has no multi-char folds to non-UTF characters */
12565 if (ASCII_FOLD_RESTRICTED) {
12569 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12573 if (! PL_NonL1NonFinalFold) {
12574 PL_NonL1NonFinalFold = _new_invlist_C_array(
12575 NonL1_Perl_Non_Final_Folds_invlist);
12578 /* Point to the first byte of the final character */
12579 s = (char *) utf8_hop((U8 *) s, -1);
12581 while (s >= s0) { /* Search backwards until find
12582 non-problematic char */
12583 if (UTF8_IS_INVARIANT(*s)) {
12585 /* There are no ascii characters that participate
12586 * in multi-char folds under /aa. In EBCDIC, the
12587 * non-ascii invariants are all control characters,
12588 * so don't ever participate in any folds. */
12589 if (ASCII_FOLD_RESTRICTED
12590 || ! IS_NON_FINAL_FOLD(*s))
12595 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12596 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12602 else if (! _invlist_contains_cp(
12603 PL_NonL1NonFinalFold,
12604 valid_utf8_to_uvchr((U8 *) s, NULL)))
12609 /* Here, the current character is problematic in that
12610 * it does occur in the non-final position of some
12611 * fold, so try the character before it, but have to
12612 * special case the very first byte in the string, so
12613 * we don't read outside the string */
12614 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12615 } /* End of loop backwards through the string */
12617 /* If there were only problematic characters in the string,
12618 * <s> will point to before s0, in which case the length
12619 * should be 0, otherwise include the length of the
12620 * non-problematic character just found */
12621 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12624 /* Here, have found the final character, if any, that is
12625 * non-problematic as far as ending the node without splitting
12626 * it across a potential multi-char fold. <len> contains the
12627 * number of bytes in the node up-to and including that
12628 * character, or is 0 if there is no such character, meaning
12629 * the whole node contains only problematic characters. In
12630 * this case, give up and just take the node as-is. We can't
12635 /* If the node ends in an 's' we make sure it stays EXACTF,
12636 * as if it turns into an EXACTFU, it could later get
12637 * joined with another 's' that would then wrongly match
12639 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12641 maybe_exactfu = FALSE;
12645 /* Here, the node does contain some characters that aren't
12646 * problematic. If one such is the final character in the
12647 * node, we are done */
12648 if (len == full_len) {
12651 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12653 /* If the final character is problematic, but the
12654 * penultimate is not, back-off that last character to
12655 * later start a new node with it */
12660 /* Here, the final non-problematic character is earlier
12661 * in the input than the penultimate character. What we do
12662 * is reparse from the beginning, going up only as far as
12663 * this final ok one, thus guaranteeing that the node ends
12664 * in an acceptable character. The reason we reparse is
12665 * that we know how far in the character is, but we don't
12666 * know how to correlate its position with the input parse.
12667 * An alternate implementation would be to build that
12668 * correlation as we go along during the original parse,
12669 * but that would entail extra work for every node, whereas
12670 * this code gets executed only when the string is too
12671 * large for the node, and the final two characters are
12672 * problematic, an infrequent occurrence. Yet another
12673 * possible strategy would be to save the tail of the
12674 * string, and the next time regatom is called, initialize
12675 * with that. The problem with this is that unless you
12676 * back off one more character, you won't be guaranteed
12677 * regatom will get called again, unless regbranch,
12678 * regpiece ... are also changed. If you do back off that
12679 * extra character, so that there is input guaranteed to
12680 * force calling regatom, you can't handle the case where
12681 * just the first character in the node is acceptable. I
12682 * (khw) decided to try this method which doesn't have that
12683 * pitfall; if performance issues are found, we can do a
12684 * combination of the current approach plus that one */
12690 } /* End of verifying node ends with an appropriate char */
12692 loopdone: /* Jumped to when encounters something that shouldn't be in
12695 /* I (khw) don't know if you can get here with zero length, but the
12696 * old code handled this situation by creating a zero-length EXACT
12697 * node. Might as well be NOTHING instead */
12703 /* If 'maybe_exact' is still set here, means there are no
12704 * code points in the node that participate in folds;
12705 * similarly for 'maybe_exactfu' and code points that match
12706 * differently depending on UTF8ness of the target string
12707 * (for /u), or depending on locale for /l */
12711 else if (maybe_exactfu) {
12715 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12716 FALSE /* Don't look to see if could
12717 be turned into an EXACT
12718 node, as we have already
12723 RExC_parse = p - 1;
12724 Set_Node_Cur_Length(ret, parse_start);
12725 nextchar(pRExC_state);
12727 /* len is STRLEN which is unsigned, need to copy to signed */
12730 vFAIL("Internal disaster");
12733 } /* End of label 'defchar:' */
12735 } /* End of giant switch on input character */
12741 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12743 /* Returns the next non-pattern-white space, non-comment character (the
12744 * latter only if 'recognize_comment is true) in the string p, which is
12745 * ended by RExC_end. See also reg_skipcomment */
12746 const char *e = RExC_end;
12748 PERL_ARGS_ASSERT_REGPATWS;
12752 if ((len = is_PATWS_safe(p, e, UTF))) {
12755 else if (recognize_comment && *p == '#') {
12756 p = reg_skipcomment(pRExC_state, p);
12765 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12767 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12768 * sets up the bitmap and any flags, removing those code points from the
12769 * inversion list, setting it to NULL should it become completely empty */
12771 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12772 assert(PL_regkind[OP(node)] == ANYOF);
12774 ANYOF_BITMAP_ZERO(node);
12775 if (*invlist_ptr) {
12777 /* This gets set if we actually need to modify things */
12778 bool change_invlist = FALSE;
12782 /* Start looking through *invlist_ptr */
12783 invlist_iterinit(*invlist_ptr);
12784 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12788 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12789 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12791 else if (end >= NUM_ANYOF_CODE_POINTS) {
12792 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12795 /* Quit if are above what we should change */
12796 if (start >= NUM_ANYOF_CODE_POINTS) {
12800 change_invlist = TRUE;
12802 /* Set all the bits in the range, up to the max that we are doing */
12803 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12805 : NUM_ANYOF_CODE_POINTS - 1;
12806 for (i = start; i <= (int) high; i++) {
12807 if (! ANYOF_BITMAP_TEST(node, i)) {
12808 ANYOF_BITMAP_SET(node, i);
12812 invlist_iterfinish(*invlist_ptr);
12814 /* Done with loop; remove any code points that are in the bitmap from
12815 * *invlist_ptr; similarly for code points above the bitmap if we have
12816 * a flag to match all of them anyways */
12817 if (change_invlist) {
12818 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12820 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12821 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12824 /* If have completely emptied it, remove it completely */
12825 if (_invlist_len(*invlist_ptr) == 0) {
12826 SvREFCNT_dec_NN(*invlist_ptr);
12827 *invlist_ptr = NULL;
12832 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12833 Character classes ([:foo:]) can also be negated ([:^foo:]).
12834 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12835 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12836 but trigger failures because they are currently unimplemented. */
12838 #define POSIXCC_DONE(c) ((c) == ':')
12839 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12840 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12842 PERL_STATIC_INLINE I32
12843 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12845 I32 namedclass = OOB_NAMEDCLASS;
12847 PERL_ARGS_ASSERT_REGPPOSIXCC;
12849 if (value == '[' && RExC_parse + 1 < RExC_end &&
12850 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12851 POSIXCC(UCHARAT(RExC_parse)))
12853 const char c = UCHARAT(RExC_parse);
12854 char* const s = RExC_parse++;
12856 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12858 if (RExC_parse == RExC_end) {
12861 /* Try to give a better location for the error (than the end of
12862 * the string) by looking for the matching ']' */
12864 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12867 vFAIL2("Unmatched '%c' in POSIX class", c);
12869 /* Grandfather lone [:, [=, [. */
12873 const char* const t = RExC_parse++; /* skip over the c */
12876 if (UCHARAT(RExC_parse) == ']') {
12877 const char *posixcc = s + 1;
12878 RExC_parse++; /* skip over the ending ] */
12881 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12882 const I32 skip = t - posixcc;
12884 /* Initially switch on the length of the name. */
12887 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12888 this is the Perl \w
12890 namedclass = ANYOF_WORDCHAR;
12893 /* Names all of length 5. */
12894 /* alnum alpha ascii blank cntrl digit graph lower
12895 print punct space upper */
12896 /* Offset 4 gives the best switch position. */
12897 switch (posixcc[4]) {
12899 if (memEQ(posixcc, "alph", 4)) /* alpha */
12900 namedclass = ANYOF_ALPHA;
12903 if (memEQ(posixcc, "spac", 4)) /* space */
12904 namedclass = ANYOF_PSXSPC;
12907 if (memEQ(posixcc, "grap", 4)) /* graph */
12908 namedclass = ANYOF_GRAPH;
12911 if (memEQ(posixcc, "asci", 4)) /* ascii */
12912 namedclass = ANYOF_ASCII;
12915 if (memEQ(posixcc, "blan", 4)) /* blank */
12916 namedclass = ANYOF_BLANK;
12919 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12920 namedclass = ANYOF_CNTRL;
12923 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12924 namedclass = ANYOF_ALPHANUMERIC;
12927 if (memEQ(posixcc, "lowe", 4)) /* lower */
12928 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12929 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12930 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12933 if (memEQ(posixcc, "digi", 4)) /* digit */
12934 namedclass = ANYOF_DIGIT;
12935 else if (memEQ(posixcc, "prin", 4)) /* print */
12936 namedclass = ANYOF_PRINT;
12937 else if (memEQ(posixcc, "punc", 4)) /* punct */
12938 namedclass = ANYOF_PUNCT;
12943 if (memEQ(posixcc, "xdigit", 6))
12944 namedclass = ANYOF_XDIGIT;
12948 if (namedclass == OOB_NAMEDCLASS)
12950 "POSIX class [:%"UTF8f":] unknown",
12951 UTF8fARG(UTF, t - s - 1, s + 1));
12953 /* The #defines are structured so each complement is +1 to
12954 * the normal one */
12958 assert (posixcc[skip] == ':');
12959 assert (posixcc[skip+1] == ']');
12960 } else if (!SIZE_ONLY) {
12961 /* [[=foo=]] and [[.foo.]] are still future. */
12963 /* adjust RExC_parse so the warning shows after
12964 the class closes */
12965 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12967 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12970 /* Maternal grandfather:
12971 * "[:" ending in ":" but not in ":]" */
12973 vFAIL("Unmatched '[' in POSIX class");
12976 /* Grandfather lone [:, [=, [. */
12986 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12988 /* This applies some heuristics at the current parse position (which should
12989 * be at a '[') to see if what follows might be intended to be a [:posix:]
12990 * class. It returns true if it really is a posix class, of course, but it
12991 * also can return true if it thinks that what was intended was a posix
12992 * class that didn't quite make it.
12994 * It will return true for
12996 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12997 * ')' indicating the end of the (?[
12998 * [:any garbage including %^&$ punctuation:]
13000 * This is designed to be called only from S_handle_regex_sets; it could be
13001 * easily adapted to be called from the spot at the beginning of regclass()
13002 * that checks to see in a normal bracketed class if the surrounding []
13003 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13004 * change long-standing behavior, so I (khw) didn't do that */
13005 char* p = RExC_parse + 1;
13006 char first_char = *p;
13008 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13010 assert(*(p - 1) == '[');
13012 if (! POSIXCC(first_char)) {
13017 while (p < RExC_end && isWORDCHAR(*p)) p++;
13019 if (p >= RExC_end) {
13023 if (p - RExC_parse > 2 /* Got at least 1 word character */
13024 && (*p == first_char
13025 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13030 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13033 && p - RExC_parse > 2 /* [:] evaluates to colon;
13034 [::] is a bad posix class. */
13035 && first_char == *(p - 1));
13039 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13040 I32 *flagp, U32 depth,
13041 char * const oregcomp_parse)
13043 /* Handle the (?[...]) construct to do set operations */
13046 UV start, end; /* End points of code point ranges */
13048 char *save_end, *save_parse;
13053 const bool save_fold = FOLD;
13055 GET_RE_DEBUG_FLAGS_DECL;
13057 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13060 vFAIL("(?[...]) not valid in locale");
13062 RExC_uni_semantics = 1;
13064 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13065 * (such as EXACT). Thus we can skip most everything if just sizing. We
13066 * call regclass to handle '[]' so as to not have to reinvent its parsing
13067 * rules here (throwing away the size it computes each time). And, we exit
13068 * upon an unescaped ']' that isn't one ending a regclass. To do both
13069 * these things, we need to realize that something preceded by a backslash
13070 * is escaped, so we have to keep track of backslashes */
13072 Perl_ck_warner_d(aTHX_
13073 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13074 "The regex_sets feature is experimental" REPORT_LOCATION,
13075 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13077 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13078 RExC_precomp + (RExC_parse - RExC_precomp)));
13081 UV depth = 0; /* how many nested (?[...]) constructs */
13083 while (RExC_parse < RExC_end) {
13084 SV* current = NULL;
13085 RExC_parse = regpatws(pRExC_state, RExC_parse,
13086 TRUE); /* means recognize comments */
13087 switch (*RExC_parse) {
13089 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13094 /* Skip the next byte (which could cause us to end up in
13095 * the middle of a UTF-8 character, but since none of those
13096 * are confusable with anything we currently handle in this
13097 * switch (invariants all), it's safe. We'll just hit the
13098 * default: case next time and keep on incrementing until
13099 * we find one of the invariants we do handle. */
13104 /* If this looks like it is a [:posix:] class, leave the
13105 * parse pointer at the '[' to fool regclass() into
13106 * thinking it is part of a '[[:posix:]]'. That function
13107 * will use strict checking to force a syntax error if it
13108 * doesn't work out to a legitimate class */
13109 bool is_posix_class
13110 = could_it_be_a_POSIX_class(pRExC_state);
13111 if (! is_posix_class) {
13115 /* regclass() can only return RESTART_UTF8 if multi-char
13116 folds are allowed. */
13117 if (!regclass(pRExC_state, flagp,depth+1,
13118 is_posix_class, /* parse the whole char
13119 class only if not a
13121 FALSE, /* don't allow multi-char folds */
13122 TRUE, /* silence non-portable warnings. */
13124 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13127 /* function call leaves parse pointing to the ']', except
13128 * if we faked it */
13129 if (is_posix_class) {
13133 SvREFCNT_dec(current); /* In case it returned something */
13138 if (depth--) break;
13140 if (RExC_parse < RExC_end
13141 && *RExC_parse == ')')
13143 node = reganode(pRExC_state, ANYOF, 0);
13144 RExC_size += ANYOF_SKIP;
13145 nextchar(pRExC_state);
13146 Set_Node_Length(node,
13147 RExC_parse - oregcomp_parse + 1); /* MJD */
13156 FAIL("Syntax error in (?[...])");
13159 /* Pass 2 only after this. Everything in this construct is a
13160 * metacharacter. Operands begin with either a '\' (for an escape
13161 * sequence), or a '[' for a bracketed character class. Any other
13162 * character should be an operator, or parenthesis for grouping. Both
13163 * types of operands are handled by calling regclass() to parse them. It
13164 * is called with a parameter to indicate to return the computed inversion
13165 * list. The parsing here is implemented via a stack. Each entry on the
13166 * stack is a single character representing one of the operators, or the
13167 * '('; or else a pointer to an operand inversion list. */
13169 #define IS_OPERAND(a) (! SvIOK(a))
13171 /* The stack starts empty. It is a syntax error if the first thing parsed
13172 * is a binary operator; everything else is pushed on the stack. When an
13173 * operand is parsed, the top of the stack is examined. If it is a binary
13174 * operator, the item before it should be an operand, and both are replaced
13175 * by the result of doing that operation on the new operand and the one on
13176 * the stack. Thus a sequence of binary operands is reduced to a single
13177 * one before the next one is parsed.
13179 * A unary operator may immediately follow a binary in the input, for
13182 * When an operand is parsed and the top of the stack is a unary operator,
13183 * the operation is performed, and then the stack is rechecked to see if
13184 * this new operand is part of a binary operation; if so, it is handled as
13187 * A '(' is simply pushed on the stack; it is valid only if the stack is
13188 * empty, or the top element of the stack is an operator or another '('
13189 * (for which the parenthesized expression will become an operand). By the
13190 * time the corresponding ')' is parsed everything in between should have
13191 * been parsed and evaluated to a single operand (or else is a syntax
13192 * error), and is handled as a regular operand */
13194 sv_2mortal((SV *)(stack = newAV()));
13196 while (RExC_parse < RExC_end) {
13197 I32 top_index = av_tindex(stack);
13199 SV* current = NULL;
13201 /* Skip white space */
13202 RExC_parse = regpatws(pRExC_state, RExC_parse,
13203 TRUE /* means recognize comments */ );
13204 if (RExC_parse >= RExC_end) {
13205 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13207 if ((curchar = UCHARAT(RExC_parse)) == ']') {
13214 if (av_tindex(stack) >= 0 /* This makes sure that we can
13215 safely subtract 1 from
13216 RExC_parse in the next clause.
13217 If we have something on the
13218 stack, we have parsed something
13220 && UCHARAT(RExC_parse - 1) == '('
13221 && RExC_parse < RExC_end)
13223 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13224 * This happens when we have some thing like
13226 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13228 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13230 * Here we would be handling the interpolated
13231 * '$thai_or_lao'. We handle this by a recursive call to
13232 * ourselves which returns the inversion list the
13233 * interpolated expression evaluates to. We use the flags
13234 * from the interpolated pattern. */
13235 U32 save_flags = RExC_flags;
13236 const char * const save_parse = ++RExC_parse;
13238 parse_lparen_question_flags(pRExC_state);
13240 if (RExC_parse == save_parse /* Makes sure there was at
13241 least one flag (or this
13242 embedding wasn't compiled)
13244 || RExC_parse >= RExC_end - 4
13245 || UCHARAT(RExC_parse) != ':'
13246 || UCHARAT(++RExC_parse) != '('
13247 || UCHARAT(++RExC_parse) != '?'
13248 || UCHARAT(++RExC_parse) != '[')
13251 /* In combination with the above, this moves the
13252 * pointer to the point just after the first erroneous
13253 * character (or if there are no flags, to where they
13254 * should have been) */
13255 if (RExC_parse >= RExC_end - 4) {
13256 RExC_parse = RExC_end;
13258 else if (RExC_parse != save_parse) {
13259 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13261 vFAIL("Expecting '(?flags:(?[...'");
13264 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13265 depth+1, oregcomp_parse);
13267 /* Here, 'current' contains the embedded expression's
13268 * inversion list, and RExC_parse points to the trailing
13269 * ']'; the next character should be the ')' which will be
13270 * paired with the '(' that has been put on the stack, so
13271 * the whole embedded expression reduces to '(operand)' */
13274 RExC_flags = save_flags;
13275 goto handle_operand;
13280 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13281 vFAIL("Unexpected character");
13284 /* regclass() can only return RESTART_UTF8 if multi-char
13285 folds are allowed. */
13286 if (!regclass(pRExC_state, flagp,depth+1,
13287 TRUE, /* means parse just the next thing */
13288 FALSE, /* don't allow multi-char folds */
13289 FALSE, /* don't silence non-portable warnings. */
13291 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13293 /* regclass() will return with parsing just the \ sequence,
13294 * leaving the parse pointer at the next thing to parse */
13296 goto handle_operand;
13298 case '[': /* Is a bracketed character class */
13300 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13302 if (! is_posix_class) {
13306 /* regclass() can only return RESTART_UTF8 if multi-char
13307 folds are allowed. */
13308 if(!regclass(pRExC_state, flagp,depth+1,
13309 is_posix_class, /* parse the whole char class
13310 only if not a posix class */
13311 FALSE, /* don't allow multi-char folds */
13312 FALSE, /* don't silence non-portable warnings. */
13314 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13316 /* function call leaves parse pointing to the ']', except if we
13318 if (is_posix_class) {
13322 goto handle_operand;
13331 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13332 || ! IS_OPERAND(*top_ptr))
13335 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13337 av_push(stack, newSVuv(curchar));
13341 av_push(stack, newSVuv(curchar));
13345 if (top_index >= 0) {
13346 top_ptr = av_fetch(stack, top_index, FALSE);
13348 if (IS_OPERAND(*top_ptr)) {
13350 vFAIL("Unexpected '(' with no preceding operator");
13353 av_push(stack, newSVuv(curchar));
13360 || ! (current = av_pop(stack))
13361 || ! IS_OPERAND(current)
13362 || ! (lparen = av_pop(stack))
13363 || IS_OPERAND(lparen)
13364 || SvUV(lparen) != '(')
13366 SvREFCNT_dec(current);
13368 vFAIL("Unexpected ')'");
13371 SvREFCNT_dec_NN(lparen);
13378 /* Here, we have an operand to process, in 'current' */
13380 if (top_index < 0) { /* Just push if stack is empty */
13381 av_push(stack, current);
13384 SV* top = av_pop(stack);
13386 char current_operator;
13388 if (IS_OPERAND(top)) {
13389 SvREFCNT_dec_NN(top);
13390 SvREFCNT_dec_NN(current);
13391 vFAIL("Operand with no preceding operator");
13393 current_operator = (char) SvUV(top);
13394 switch (current_operator) {
13395 case '(': /* Push the '(' back on followed by the new
13397 av_push(stack, top);
13398 av_push(stack, current);
13399 SvREFCNT_inc(top); /* Counters the '_dec' done
13400 just after the 'break', so
13401 it doesn't get wrongly freed
13406 _invlist_invert(current);
13408 /* Unlike binary operators, the top of the stack,
13409 * now that this unary one has been popped off, may
13410 * legally be an operator, and we now have operand
13413 SvREFCNT_dec_NN(top);
13414 goto handle_operand;
13417 prev = av_pop(stack);
13418 _invlist_intersection(prev,
13421 av_push(stack, current);
13426 prev = av_pop(stack);
13427 _invlist_union(prev, current, ¤t);
13428 av_push(stack, current);
13432 prev = av_pop(stack);;
13433 _invlist_subtract(prev, current, ¤t);
13434 av_push(stack, current);
13437 case '^': /* The union minus the intersection */
13443 prev = av_pop(stack);
13444 _invlist_union(prev, current, &u);
13445 _invlist_intersection(prev, current, &i);
13446 /* _invlist_subtract will overwrite current
13447 without freeing what it already contains */
13449 _invlist_subtract(u, i, ¤t);
13450 av_push(stack, current);
13451 SvREFCNT_dec_NN(i);
13452 SvREFCNT_dec_NN(u);
13453 SvREFCNT_dec_NN(element);
13458 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13460 SvREFCNT_dec_NN(top);
13461 SvREFCNT_dec(prev);
13465 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13468 if (av_tindex(stack) < 0 /* Was empty */
13469 || ((final = av_pop(stack)) == NULL)
13470 || ! IS_OPERAND(final)
13471 || av_tindex(stack) >= 0) /* More left on stack */
13473 vFAIL("Incomplete expression within '(?[ ])'");
13476 /* Here, 'final' is the resultant inversion list from evaluating the
13477 * expression. Return it if so requested */
13478 if (return_invlist) {
13479 *return_invlist = final;
13483 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13484 * expecting a string of ranges and individual code points */
13485 invlist_iterinit(final);
13486 result_string = newSVpvs("");
13487 while (invlist_iternext(final, &start, &end)) {
13488 if (start == end) {
13489 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13492 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13497 save_parse = RExC_parse;
13498 RExC_parse = SvPV(result_string, len);
13499 save_end = RExC_end;
13500 RExC_end = RExC_parse + len;
13502 /* We turn off folding around the call, as the class we have constructed
13503 * already has all folding taken into consideration, and we don't want
13504 * regclass() to add to that */
13505 RExC_flags &= ~RXf_PMf_FOLD;
13506 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13508 node = regclass(pRExC_state, flagp,depth+1,
13509 FALSE, /* means parse the whole char class */
13510 FALSE, /* don't allow multi-char folds */
13511 TRUE, /* silence non-portable warnings. The above may very
13512 well have generated non-portable code points, but
13513 they're valid on this machine */
13516 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13519 RExC_flags |= RXf_PMf_FOLD;
13521 RExC_parse = save_parse + 1;
13522 RExC_end = save_end;
13523 SvREFCNT_dec_NN(final);
13524 SvREFCNT_dec_NN(result_string);
13526 nextchar(pRExC_state);
13527 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13533 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13535 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13536 * innocent-looking character class, like /[ks]/i won't have to go out to
13537 * disk to find the possible matches.
13539 * This should be called only for a Latin1-range code points, cp, which is
13540 * known to be involved in a simple fold with other code points above
13541 * Latin1. It would give false results if /aa has been specified.
13542 * Multi-char folds are outside the scope of this, and must be handled
13545 * XXX It would be better to generate these via regen, in case a new
13546 * version of the Unicode standard adds new mappings, though that is not
13547 * really likely, and may be caught by the default: case of the switch
13550 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13552 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13558 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13562 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13565 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13566 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13568 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13569 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13570 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13572 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13573 *invlist = add_cp_to_invlist(*invlist,
13574 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13576 case LATIN_SMALL_LETTER_SHARP_S:
13577 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13580 /* Use deprecated warning to increase the chances of this being
13583 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13590 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13592 /* This adds the string scalar <multi_string> to the array
13593 * <multi_char_matches>. <multi_string> is known to have exactly
13594 * <cp_count> code points in it. This is used when constructing a
13595 * bracketed character class and we find something that needs to match more
13596 * than a single character.
13598 * <multi_char_matches> is actually an array of arrays. Each top-level
13599 * element is an array that contains all the strings known so far that are
13600 * the same length. And that length (in number of code points) is the same
13601 * as the index of the top-level array. Hence, the [2] element is an
13602 * array, each element thereof is a string containing TWO code points;
13603 * while element [3] is for strings of THREE characters, and so on. Since
13604 * this is for multi-char strings there can never be a [0] nor [1] element.
13606 * When we rewrite the character class below, we will do so such that the
13607 * longest strings are written first, so that it prefers the longest
13608 * matching strings first. This is done even if it turns out that any
13609 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13610 * Christiansen has agreed that this is ok. This makes the test for the
13611 * ligature 'ffi' come before the test for 'ff', for example */
13614 AV** this_array_ptr;
13616 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13618 if (! multi_char_matches) {
13619 multi_char_matches = newAV();
13622 if (av_exists(multi_char_matches, cp_count)) {
13623 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13624 this_array = *this_array_ptr;
13627 this_array = newAV();
13628 av_store(multi_char_matches, cp_count,
13631 av_push(this_array, multi_string);
13633 return multi_char_matches;
13636 /* The names of properties whose definitions are not known at compile time are
13637 * stored in this SV, after a constant heading. So if the length has been
13638 * changed since initialization, then there is a run-time definition. */
13639 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13640 (SvCUR(listsv) != initial_listsv_len)
13643 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13644 const bool stop_at_1, /* Just parse the next thing, don't
13645 look for a full character class */
13646 bool allow_multi_folds,
13647 const bool silence_non_portable, /* Don't output warnings
13650 SV** ret_invlist) /* Return an inversion list, not a node */
13652 /* parse a bracketed class specification. Most of these will produce an
13653 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13654 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13655 * under /i with multi-character folds: it will be rewritten following the
13656 * paradigm of this example, where the <multi-fold>s are characters which
13657 * fold to multiple character sequences:
13658 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13659 * gets effectively rewritten as:
13660 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13661 * reg() gets called (recursively) on the rewritten version, and this
13662 * function will return what it constructs. (Actually the <multi-fold>s
13663 * aren't physically removed from the [abcdefghi], it's just that they are
13664 * ignored in the recursion by means of a flag:
13665 * <RExC_in_multi_char_class>.)
13667 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13668 * characters, with the corresponding bit set if that character is in the
13669 * list. For characters above this, a range list or swash is used. There
13670 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13671 * determinable at compile time
13673 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13674 * to be restarted. This can only happen if ret_invlist is non-NULL.
13677 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13679 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13682 IV namedclass = OOB_NAMEDCLASS;
13683 char *rangebegin = NULL;
13684 bool need_class = 0;
13686 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13687 than just initialized. */
13688 SV* properties = NULL; /* Code points that match \p{} \P{} */
13689 SV* posixes = NULL; /* Code points that match classes like [:word:],
13690 extended beyond the Latin1 range. These have to
13691 be kept separate from other code points for much
13692 of this function because their handling is
13693 different under /i, and for most classes under
13695 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13696 separate for a while from the non-complemented
13697 versions because of complications with /d
13699 UV element_count = 0; /* Number of distinct elements in the class.
13700 Optimizations may be possible if this is tiny */
13701 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13702 character; used under /i */
13704 char * stop_ptr = RExC_end; /* where to stop parsing */
13705 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13707 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13709 /* Unicode properties are stored in a swash; this holds the current one
13710 * being parsed. If this swash is the only above-latin1 component of the
13711 * character class, an optimization is to pass it directly on to the
13712 * execution engine. Otherwise, it is set to NULL to indicate that there
13713 * are other things in the class that have to be dealt with at execution
13715 SV* swash = NULL; /* Code points that match \p{} \P{} */
13717 /* Set if a component of this character class is user-defined; just passed
13718 * on to the engine */
13719 bool has_user_defined_property = FALSE;
13721 /* inversion list of code points this node matches only when the target
13722 * string is in UTF-8. (Because is under /d) */
13723 SV* depends_list = NULL;
13725 /* Inversion list of code points this node matches regardless of things
13726 * like locale, folding, utf8ness of the target string */
13727 SV* cp_list = NULL;
13729 /* Like cp_list, but code points on this list need to be checked for things
13730 * that fold to/from them under /i */
13731 SV* cp_foldable_list = NULL;
13733 /* Like cp_list, but code points on this list are valid only when the
13734 * runtime locale is UTF-8 */
13735 SV* only_utf8_locale_list = NULL;
13738 /* In a range, counts how many 0-2 of the ends of it came from literals,
13739 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13740 UV literal_endpoint = 0;
13742 bool invert = FALSE; /* Is this class to be complemented */
13744 bool warn_super = ALWAYS_WARN_SUPER;
13746 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13747 case we need to change the emitted regop to an EXACT. */
13748 const char * orig_parse = RExC_parse;
13749 const SSize_t orig_size = RExC_size;
13750 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13751 GET_RE_DEBUG_FLAGS_DECL;
13753 PERL_ARGS_ASSERT_REGCLASS;
13755 PERL_UNUSED_ARG(depth);
13758 DEBUG_PARSE("clas");
13760 /* Assume we are going to generate an ANYOF node. */
13761 ret = reganode(pRExC_state, ANYOF, 0);
13764 RExC_size += ANYOF_SKIP;
13765 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13768 ANYOF_FLAGS(ret) = 0;
13770 RExC_emit += ANYOF_SKIP;
13771 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13772 initial_listsv_len = SvCUR(listsv);
13773 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13777 RExC_parse = regpatws(pRExC_state, RExC_parse,
13778 FALSE /* means don't recognize comments */ );
13781 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13784 allow_multi_folds = FALSE;
13787 RExC_parse = regpatws(pRExC_state, RExC_parse,
13788 FALSE /* means don't recognize comments */ );
13792 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13793 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13794 const char *s = RExC_parse;
13795 const char c = *s++;
13797 while (isWORDCHAR(*s))
13799 if (*s && c == *s && s[1] == ']') {
13800 SAVEFREESV(RExC_rx_sv);
13802 "POSIX syntax [%c %c] belongs inside character classes",
13804 (void)ReREFCNT_inc(RExC_rx_sv);
13808 /* If the caller wants us to just parse a single element, accomplish this
13809 * by faking the loop ending condition */
13810 if (stop_at_1 && RExC_end > RExC_parse) {
13811 stop_ptr = RExC_parse + 1;
13814 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13815 if (UCHARAT(RExC_parse) == ']')
13816 goto charclassloop;
13819 if (RExC_parse >= stop_ptr) {
13824 RExC_parse = regpatws(pRExC_state, RExC_parse,
13825 FALSE /* means don't recognize comments */ );
13828 if (UCHARAT(RExC_parse) == ']') {
13834 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13835 save_value = value;
13836 save_prevvalue = prevvalue;
13839 rangebegin = RExC_parse;
13843 value = utf8n_to_uvchr((U8*)RExC_parse,
13844 RExC_end - RExC_parse,
13845 &numlen, UTF8_ALLOW_DEFAULT);
13846 RExC_parse += numlen;
13849 value = UCHARAT(RExC_parse++);
13852 && RExC_parse < RExC_end
13853 && POSIXCC(UCHARAT(RExC_parse)))
13855 namedclass = regpposixcc(pRExC_state, value, strict);
13857 else if (value != '\\') {
13859 literal_endpoint++;
13863 /* Is a backslash; get the code point of the char after it */
13864 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13865 value = utf8n_to_uvchr((U8*)RExC_parse,
13866 RExC_end - RExC_parse,
13867 &numlen, UTF8_ALLOW_DEFAULT);
13868 RExC_parse += numlen;
13871 value = UCHARAT(RExC_parse++);
13873 /* Some compilers cannot handle switching on 64-bit integer
13874 * values, therefore value cannot be an UV. Yes, this will
13875 * be a problem later if we want switch on Unicode.
13876 * A similar issue a little bit later when switching on
13877 * namedclass. --jhi */
13879 /* If the \ is escaping white space when white space is being
13880 * skipped, it means that that white space is wanted literally, and
13881 * is already in 'value'. Otherwise, need to translate the escape
13882 * into what it signifies. */
13883 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13885 case 'w': namedclass = ANYOF_WORDCHAR; break;
13886 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13887 case 's': namedclass = ANYOF_SPACE; break;
13888 case 'S': namedclass = ANYOF_NSPACE; break;
13889 case 'd': namedclass = ANYOF_DIGIT; break;
13890 case 'D': namedclass = ANYOF_NDIGIT; break;
13891 case 'v': namedclass = ANYOF_VERTWS; break;
13892 case 'V': namedclass = ANYOF_NVERTWS; break;
13893 case 'h': namedclass = ANYOF_HORIZWS; break;
13894 case 'H': namedclass = ANYOF_NHORIZWS; break;
13895 case 'N': /* Handle \N{NAME} in class */
13898 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13899 flagp, depth, &as_text);
13900 if (*flagp & RESTART_UTF8)
13901 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13902 if (cp_count != 1) { /* The typical case drops through */
13903 assert(cp_count != (STRLEN) -1);
13904 if (cp_count == 0) {
13906 RExC_parse++; /* Position after the "}" */
13907 vFAIL("Zero length \\N{}");
13910 ckWARNreg(RExC_parse,
13911 "Ignoring zero length \\N{} in character class");
13914 else { /* cp_count > 1 */
13915 if (! RExC_in_multi_char_class) {
13916 if (invert || range || *RExC_parse == '-') {
13919 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13922 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13927 = add_multi_match(multi_char_matches,
13931 break; /* <value> contains the first code
13932 point. Drop out of the switch to
13935 } /* End of cp_count != 1 */
13937 /* This element should not be processed further in this
13940 value = save_value;
13941 prevvalue = save_prevvalue;
13942 continue; /* Back to top of loop to get next char */
13944 /* Here, is a single code point, and <value> contains it */
13946 /* We consider named characters to be literal characters */
13947 literal_endpoint++;
13956 /* We will handle any undefined properties ourselves */
13957 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13958 /* And we actually would prefer to get
13959 * the straight inversion list of the
13960 * swash, since we will be accessing it
13961 * anyway, to save a little time */
13962 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13964 if (RExC_parse >= RExC_end)
13965 vFAIL2("Empty \\%c{}", (U8)value);
13966 if (*RExC_parse == '{') {
13967 const U8 c = (U8)value;
13968 e = strchr(RExC_parse++, '}');
13970 vFAIL2("Missing right brace on \\%c{}", c);
13971 while (isSPACE(*RExC_parse))
13973 if (e == RExC_parse)
13974 vFAIL2("Empty \\%c{}", c);
13975 n = e - RExC_parse;
13976 while (isSPACE(*(RExC_parse + n - 1)))
13987 if (UCHARAT(RExC_parse) == '^') {
13990 /* toggle. (The rhs xor gets the single bit that
13991 * differs between P and p; the other xor inverts just
13993 value ^= 'P' ^ 'p';
13995 while (isSPACE(*RExC_parse)) {
14000 /* Try to get the definition of the property into
14001 * <invlist>. If /i is in effect, the effective property
14002 * will have its name be <__NAME_i>. The design is
14003 * discussed in commit
14004 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14005 name = savepv(Perl_form(aTHX_
14007 (FOLD) ? "__" : "",
14013 /* Look up the property name, and get its swash and
14014 * inversion list, if the property is found */
14016 SvREFCNT_dec_NN(swash);
14018 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14021 NULL, /* No inversion list */
14024 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14025 HV* curpkg = (IN_PERL_COMPILETIME)
14027 : CopSTASH(PL_curcop);
14029 SvREFCNT_dec_NN(swash);
14033 /* Here didn't find it. It could be a user-defined
14034 * property that will be available at run-time. If we
14035 * accept only compile-time properties, is an error;
14036 * otherwise add it to the list for run-time look up */
14038 RExC_parse = e + 1;
14040 "Property '%"UTF8f"' is unknown",
14041 UTF8fARG(UTF, n, name));
14044 /* If the property name doesn't already have a package
14045 * name, add the current one to it so that it can be
14046 * referred to outside it. [perl #121777] */
14047 if (curpkg && ! instr(name, "::")) {
14048 char* pkgname = HvNAME(curpkg);
14049 if (strNE(pkgname, "main")) {
14050 char* full_name = Perl_form(aTHX_
14054 n = strlen(full_name);
14056 name = savepvn(full_name, n);
14059 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14060 (value == 'p' ? '+' : '!'),
14061 UTF8fARG(UTF, n, name));
14062 has_user_defined_property = TRUE;
14064 /* We don't know yet, so have to assume that the
14065 * property could match something in the Latin1 range,
14066 * hence something that isn't utf8. Note that this
14067 * would cause things in <depends_list> to match
14068 * inappropriately, except that any \p{}, including
14069 * this one forces Unicode semantics, which means there
14070 * is no <depends_list> */
14072 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14076 /* Here, did get the swash and its inversion list. If
14077 * the swash is from a user-defined property, then this
14078 * whole character class should be regarded as such */
14079 if (swash_init_flags
14080 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14082 has_user_defined_property = TRUE;
14085 /* We warn on matching an above-Unicode code point
14086 * if the match would return true, except don't
14087 * warn for \p{All}, which has exactly one element
14089 (_invlist_contains_cp(invlist, 0x110000)
14090 && (! (_invlist_len(invlist) == 1
14091 && *invlist_array(invlist) == 0)))
14097 /* Invert if asking for the complement */
14098 if (value == 'P') {
14099 _invlist_union_complement_2nd(properties,
14103 /* The swash can't be used as-is, because we've
14104 * inverted things; delay removing it to here after
14105 * have copied its invlist above */
14106 SvREFCNT_dec_NN(swash);
14110 _invlist_union(properties, invlist, &properties);
14115 RExC_parse = e + 1;
14116 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14119 /* \p means they want Unicode semantics */
14120 RExC_uni_semantics = 1;
14123 case 'n': value = '\n'; break;
14124 case 'r': value = '\r'; break;
14125 case 't': value = '\t'; break;
14126 case 'f': value = '\f'; break;
14127 case 'b': value = '\b'; break;
14128 case 'e': value = ESC_NATIVE; break;
14129 case 'a': value = '\a'; break;
14131 RExC_parse--; /* function expects to be pointed at the 'o' */
14133 const char* error_msg;
14134 bool valid = grok_bslash_o(&RExC_parse,
14137 PASS2, /* warnings only in
14140 silence_non_portable,
14146 if (PL_encoding && value < 0x100) {
14147 goto recode_encoding;
14151 RExC_parse--; /* function expects to be pointed at the 'x' */
14153 const char* error_msg;
14154 bool valid = grok_bslash_x(&RExC_parse,
14157 PASS2, /* Output warnings */
14159 silence_non_portable,
14165 if (PL_encoding && value < 0x100)
14166 goto recode_encoding;
14169 value = grok_bslash_c(*RExC_parse++, PASS2);
14171 case '0': case '1': case '2': case '3': case '4':
14172 case '5': case '6': case '7':
14174 /* Take 1-3 octal digits */
14175 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14176 numlen = (strict) ? 4 : 3;
14177 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14178 RExC_parse += numlen;
14181 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14182 vFAIL("Need exactly 3 octal digits");
14184 else if (! SIZE_ONLY /* like \08, \178 */
14186 && RExC_parse < RExC_end
14187 && isDIGIT(*RExC_parse)
14188 && ckWARN(WARN_REGEXP))
14190 SAVEFREESV(RExC_rx_sv);
14191 reg_warn_non_literal_string(
14193 form_short_octal_warning(RExC_parse, numlen));
14194 (void)ReREFCNT_inc(RExC_rx_sv);
14197 if (PL_encoding && value < 0x100)
14198 goto recode_encoding;
14202 if (! RExC_override_recoding) {
14203 SV* enc = PL_encoding;
14204 value = reg_recode((const char)(U8)value, &enc);
14207 vFAIL("Invalid escape in the specified encoding");
14210 ckWARNreg(RExC_parse,
14211 "Invalid escape in the specified encoding");
14217 /* Allow \_ to not give an error */
14218 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14220 vFAIL2("Unrecognized escape \\%c in character class",
14224 SAVEFREESV(RExC_rx_sv);
14225 ckWARN2reg(RExC_parse,
14226 "Unrecognized escape \\%c in character class passed through",
14228 (void)ReREFCNT_inc(RExC_rx_sv);
14232 } /* End of switch on char following backslash */
14233 } /* end of handling backslash escape sequences */
14235 /* Here, we have the current token in 'value' */
14237 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14240 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14241 * literal, as is the character that began the false range, i.e.
14242 * the 'a' in the examples */
14245 const int w = (RExC_parse >= rangebegin)
14246 ? RExC_parse - rangebegin
14250 "False [] range \"%"UTF8f"\"",
14251 UTF8fARG(UTF, w, rangebegin));
14254 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14255 ckWARN2reg(RExC_parse,
14256 "False [] range \"%"UTF8f"\"",
14257 UTF8fARG(UTF, w, rangebegin));
14258 (void)ReREFCNT_inc(RExC_rx_sv);
14259 cp_list = add_cp_to_invlist(cp_list, '-');
14260 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14265 range = 0; /* this was not a true range */
14266 element_count += 2; /* So counts for three values */
14269 classnum = namedclass_to_classnum(namedclass);
14271 if (LOC && namedclass < ANYOF_POSIXL_MAX
14272 #ifndef HAS_ISASCII
14273 && classnum != _CC_ASCII
14276 /* What the Posix classes (like \w, [:space:]) match in locale
14277 * isn't knowable under locale until actual match time. Room
14278 * must be reserved (one time per outer bracketed class) to
14279 * store such classes. The space will contain a bit for each
14280 * named class that is to be matched against. This isn't
14281 * needed for \p{} and pseudo-classes, as they are not affected
14282 * by locale, and hence are dealt with separately */
14283 if (! need_class) {
14286 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14289 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14291 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14292 ANYOF_POSIXL_ZERO(ret);
14295 /* Coverity thinks it is possible for this to be negative; both
14296 * jhi and khw think it's not, but be safer */
14297 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14298 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14300 /* See if it already matches the complement of this POSIX
14302 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14303 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14307 posixl_matches_all = TRUE;
14308 break; /* No need to continue. Since it matches both
14309 e.g., \w and \W, it matches everything, and the
14310 bracketed class can be optimized into qr/./s */
14313 /* Add this class to those that should be checked at runtime */
14314 ANYOF_POSIXL_SET(ret, namedclass);
14316 /* The above-Latin1 characters are not subject to locale rules.
14317 * Just add them, in the second pass, to the
14318 * unconditionally-matched list */
14320 SV* scratch_list = NULL;
14322 /* Get the list of the above-Latin1 code points this
14324 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14325 PL_XPosix_ptrs[classnum],
14327 /* Odd numbers are complements, like
14328 * NDIGIT, NASCII, ... */
14329 namedclass % 2 != 0,
14331 /* Checking if 'cp_list' is NULL first saves an extra
14332 * clone. Its reference count will be decremented at the
14333 * next union, etc, or if this is the only instance, at the
14334 * end of the routine */
14336 cp_list = scratch_list;
14339 _invlist_union(cp_list, scratch_list, &cp_list);
14340 SvREFCNT_dec_NN(scratch_list);
14342 continue; /* Go get next character */
14345 else if (! SIZE_ONLY) {
14347 /* Here, not in pass1 (in that pass we skip calculating the
14348 * contents of this class), and is /l, or is a POSIX class for
14349 * which /l doesn't matter (or is a Unicode property, which is
14350 * skipped here). */
14351 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14352 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14354 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14355 * nor /l make a difference in what these match,
14356 * therefore we just add what they match to cp_list. */
14357 if (classnum != _CC_VERTSPACE) {
14358 assert( namedclass == ANYOF_HORIZWS
14359 || namedclass == ANYOF_NHORIZWS);
14361 /* It turns out that \h is just a synonym for
14363 classnum = _CC_BLANK;
14366 _invlist_union_maybe_complement_2nd(
14368 PL_XPosix_ptrs[classnum],
14369 namedclass % 2 != 0, /* Complement if odd
14370 (NHORIZWS, NVERTWS)
14375 else { /* Garden variety class. If is NASCII, NDIGIT, ...
14376 complement and use nposixes */
14377 SV** posixes_ptr = namedclass % 2 == 0
14380 SV** source_ptr = &PL_XPosix_ptrs[classnum];
14381 _invlist_union_maybe_complement_2nd(
14384 namedclass % 2 != 0,
14388 } /* end of namedclass \blah */
14391 RExC_parse = regpatws(pRExC_state, RExC_parse,
14392 FALSE /* means don't recognize comments */ );
14395 /* If 'range' is set, 'value' is the ending of a range--check its
14396 * validity. (If value isn't a single code point in the case of a
14397 * range, we should have figured that out above in the code that
14398 * catches false ranges). Later, we will handle each individual code
14399 * point in the range. If 'range' isn't set, this could be the
14400 * beginning of a range, so check for that by looking ahead to see if
14401 * the next real character to be processed is the range indicator--the
14405 if (prevvalue > value) /* b-a */ {
14406 const int w = RExC_parse - rangebegin;
14408 "Invalid [] range \"%"UTF8f"\"",
14409 UTF8fARG(UTF, w, rangebegin));
14410 range = 0; /* not a valid range */
14414 prevvalue = value; /* save the beginning of the potential range */
14415 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14416 && *RExC_parse == '-')
14418 char* next_char_ptr = RExC_parse + 1;
14419 if (skip_white) { /* Get the next real char after the '-' */
14420 next_char_ptr = regpatws(pRExC_state,
14422 FALSE); /* means don't recognize
14426 /* If the '-' is at the end of the class (just before the ']',
14427 * it is a literal minus; otherwise it is a range */
14428 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14429 RExC_parse = next_char_ptr;
14431 /* a bad range like \w-, [:word:]- ? */
14432 if (namedclass > OOB_NAMEDCLASS) {
14433 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14434 const int w = RExC_parse >= rangebegin
14435 ? RExC_parse - rangebegin
14438 vFAIL4("False [] range \"%*.*s\"",
14443 "False [] range \"%*.*s\"",
14448 cp_list = add_cp_to_invlist(cp_list, '-');
14452 range = 1; /* yeah, it's a range! */
14453 continue; /* but do it the next time */
14458 if (namedclass > OOB_NAMEDCLASS) {
14462 /* Here, we have a single value, and <prevvalue> is the beginning of
14463 * the range, if any; or <value> if not */
14465 /* non-Latin1 code point implies unicode semantics. Must be set in
14466 * pass1 so is there for the whole of pass 2 */
14468 RExC_uni_semantics = 1;
14471 /* Ready to process either the single value, or the completed range.
14472 * For single-valued non-inverted ranges, we consider the possibility
14473 * of multi-char folds. (We made a conscious decision to not do this
14474 * for the other cases because it can often lead to non-intuitive
14475 * results. For example, you have the peculiar case that:
14476 * "s s" =~ /^[^\xDF]+$/i => Y
14477 * "ss" =~ /^[^\xDF]+$/i => N
14479 * See [perl #89750] */
14480 if (FOLD && allow_multi_folds && value == prevvalue) {
14481 if (value == LATIN_SMALL_LETTER_SHARP_S
14482 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14485 /* Here <value> is indeed a multi-char fold. Get what it is */
14487 U8 foldbuf[UTF8_MAXBYTES_CASE];
14490 UV folded = _to_uni_fold_flags(
14494 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14495 ? FOLD_FLAGS_NOMIX_ASCII
14499 /* Here, <folded> should be the first character of the
14500 * multi-char fold of <value>, with <foldbuf> containing the
14501 * whole thing. But, if this fold is not allowed (because of
14502 * the flags), <fold> will be the same as <value>, and should
14503 * be processed like any other character, so skip the special
14505 if (folded != value) {
14507 /* Skip if we are recursed, currently parsing the class
14508 * again. Otherwise add this character to the list of
14509 * multi-char folds. */
14510 if (! RExC_in_multi_char_class) {
14511 STRLEN cp_count = utf8_length(foldbuf,
14512 foldbuf + foldlen);
14513 SV* multi_fold = sv_2mortal(newSVpvs(""));
14515 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14518 = add_multi_match(multi_char_matches,
14524 /* This element should not be processed further in this
14527 value = save_value;
14528 prevvalue = save_prevvalue;
14534 /* Deal with this element of the class */
14537 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14540 SV* this_range = _new_invlist(1);
14541 _append_range_to_invlist(this_range, prevvalue, value);
14543 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14544 * If this range was specified using something like 'i-j', we want
14545 * to include only the 'i' and the 'j', and not anything in
14546 * between, so exclude non-ASCII, non-alphabetics from it.
14547 * However, if the range was specified with something like
14548 * [\x89-\x91] or [\x89-j], all code points within it should be
14549 * included. literal_endpoint==2 means both ends of the range used
14550 * a literal character, not \x{foo} */
14551 if (literal_endpoint == 2
14552 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14553 || (isUPPER_A(prevvalue) && isUPPER_A(value))))
14555 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14558 /* Since 'this_range' now only contains ascii, the intersection
14559 * of it with anything will still yield only ascii */
14560 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14563 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14564 literal_endpoint = 0;
14565 SvREFCNT_dec_NN(this_range);
14569 range = 0; /* this range (if it was one) is done now */
14570 } /* End of loop through all the text within the brackets */
14572 /* If anything in the class expands to more than one character, we have to
14573 * deal with them by building up a substitute parse string, and recursively
14574 * calling reg() on it, instead of proceeding */
14575 if (multi_char_matches) {
14576 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14579 char *save_end = RExC_end;
14580 char *save_parse = RExC_parse;
14581 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14586 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14587 because too confusing */
14589 sv_catpv(substitute_parse, "(?:");
14593 /* Look at the longest folds first */
14594 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14596 if (av_exists(multi_char_matches, cp_count)) {
14597 AV** this_array_ptr;
14600 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14602 while ((this_sequence = av_pop(*this_array_ptr)) !=
14605 if (! first_time) {
14606 sv_catpv(substitute_parse, "|");
14608 first_time = FALSE;
14610 sv_catpv(substitute_parse, SvPVX(this_sequence));
14615 /* If the character class contains anything else besides these
14616 * multi-character folds, have to include it in recursive parsing */
14617 if (element_count) {
14618 sv_catpv(substitute_parse, "|[");
14619 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14620 sv_catpv(substitute_parse, "]");
14623 sv_catpv(substitute_parse, ")");
14626 /* This is a way to get the parse to skip forward a whole named
14627 * sequence instead of matching the 2nd character when it fails the
14629 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14633 RExC_parse = SvPV(substitute_parse, len);
14634 RExC_end = RExC_parse + len;
14635 RExC_in_multi_char_class = 1;
14636 RExC_override_recoding = 1;
14637 RExC_emit = (regnode *)orig_emit;
14639 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14641 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14643 RExC_parse = save_parse;
14644 RExC_end = save_end;
14645 RExC_in_multi_char_class = 0;
14646 RExC_override_recoding = 0;
14647 SvREFCNT_dec_NN(multi_char_matches);
14651 /* Here, we've gone through the entire class and dealt with multi-char
14652 * folds. We are now in a position that we can do some checks to see if we
14653 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14654 * Currently we only do two checks:
14655 * 1) is in the unlikely event that the user has specified both, eg. \w and
14656 * \W under /l, then the class matches everything. (This optimization
14657 * is done only to make the optimizer code run later work.)
14658 * 2) if the character class contains only a single element (including a
14659 * single range), we see if there is an equivalent node for it.
14660 * Other checks are possible */
14661 if (! ret_invlist /* Can't optimize if returning the constructed
14663 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14668 if (UNLIKELY(posixl_matches_all)) {
14671 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14672 \w or [:digit:] or \p{foo}
14675 /* All named classes are mapped into POSIXish nodes, with its FLAG
14676 * argument giving which class it is */
14677 switch ((I32)namedclass) {
14678 case ANYOF_UNIPROP:
14681 /* These don't depend on the charset modifiers. They always
14682 * match under /u rules */
14683 case ANYOF_NHORIZWS:
14684 case ANYOF_HORIZWS:
14685 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14688 case ANYOF_NVERTWS:
14693 /* The actual POSIXish node for all the rest depends on the
14694 * charset modifier. The ones in the first set depend only on
14695 * ASCII or, if available on this platform, locale */
14699 op = (LOC) ? POSIXL : POSIXA;
14710 /* under /a could be alpha */
14712 if (ASCII_RESTRICTED) {
14713 namedclass = ANYOF_ALPHA + (namedclass % 2);
14721 /* The rest have more possibilities depending on the charset.
14722 * We take advantage of the enum ordering of the charset
14723 * modifiers to get the exact node type, */
14725 op = POSIXD + get_regex_charset(RExC_flags);
14726 if (op > POSIXA) { /* /aa is same as /a */
14731 /* The odd numbered ones are the complements of the
14732 * next-lower even number one */
14733 if (namedclass % 2 == 1) {
14737 arg = namedclass_to_classnum(namedclass);
14741 else if (value == prevvalue) {
14743 /* Here, the class consists of just a single code point */
14746 if (! LOC && value == '\n') {
14747 op = REG_ANY; /* Optimize [^\n] */
14748 *flagp |= HASWIDTH|SIMPLE;
14752 else if (value < 256 || UTF) {
14754 /* Optimize a single value into an EXACTish node, but not if it
14755 * would require converting the pattern to UTF-8. */
14756 op = compute_EXACTish(pRExC_state);
14758 } /* Otherwise is a range */
14759 else if (! LOC) { /* locale could vary these */
14760 if (prevvalue == '0') {
14761 if (value == '9') {
14766 else if (prevvalue == 'A') {
14769 && literal_endpoint == 2
14772 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14776 else if (prevvalue == 'a') {
14779 && literal_endpoint == 2
14782 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14788 /* Here, we have changed <op> away from its initial value iff we found
14789 * an optimization */
14792 /* Throw away this ANYOF regnode, and emit the calculated one,
14793 * which should correspond to the beginning, not current, state of
14795 const char * cur_parse = RExC_parse;
14796 RExC_parse = (char *)orig_parse;
14800 /* To get locale nodes to not use the full ANYOF size would
14801 * require moving the code above that writes the portions
14802 * of it that aren't in other nodes to after this point.
14803 * e.g. ANYOF_POSIXL_SET */
14804 RExC_size = orig_size;
14808 RExC_emit = (regnode *)orig_emit;
14809 if (PL_regkind[op] == POSIXD) {
14810 if (op == POSIXL) {
14811 RExC_contains_locale = 1;
14814 op += NPOSIXD - POSIXD;
14819 ret = reg_node(pRExC_state, op);
14821 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14825 *flagp |= HASWIDTH|SIMPLE;
14827 else if (PL_regkind[op] == EXACT) {
14828 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14829 TRUE /* downgradable to EXACT */
14833 RExC_parse = (char *) cur_parse;
14835 SvREFCNT_dec(posixes);
14836 SvREFCNT_dec(nposixes);
14837 SvREFCNT_dec(cp_list);
14838 SvREFCNT_dec(cp_foldable_list);
14845 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14847 /* If folding, we calculate all characters that could fold to or from the
14848 * ones already on the list */
14849 if (cp_foldable_list) {
14851 UV start, end; /* End points of code point ranges */
14853 SV* fold_intersection = NULL;
14856 /* Our calculated list will be for Unicode rules. For locale
14857 * matching, we have to keep a separate list that is consulted at
14858 * runtime only when the locale indicates Unicode rules. For
14859 * non-locale, we just use to the general list */
14861 use_list = &only_utf8_locale_list;
14864 use_list = &cp_list;
14867 /* Only the characters in this class that participate in folds need
14868 * be checked. Get the intersection of this class and all the
14869 * possible characters that are foldable. This can quickly narrow
14870 * down a large class */
14871 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14872 &fold_intersection);
14874 /* The folds for all the Latin1 characters are hard-coded into this
14875 * program, but we have to go out to disk to get the others. */
14876 if (invlist_highest(cp_foldable_list) >= 256) {
14878 /* This is a hash that for a particular fold gives all
14879 * characters that are involved in it */
14880 if (! PL_utf8_foldclosures) {
14881 _load_PL_utf8_foldclosures();
14885 /* Now look at the foldable characters in this class individually */
14886 invlist_iterinit(fold_intersection);
14887 while (invlist_iternext(fold_intersection, &start, &end)) {
14890 /* Look at every character in the range */
14891 for (j = start; j <= end; j++) {
14892 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14898 if (IS_IN_SOME_FOLD_L1(j)) {
14900 /* ASCII is always matched; non-ASCII is matched
14901 * only under Unicode rules (which could happen
14902 * under /l if the locale is a UTF-8 one */
14903 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14904 *use_list = add_cp_to_invlist(*use_list,
14905 PL_fold_latin1[j]);
14909 add_cp_to_invlist(depends_list,
14910 PL_fold_latin1[j]);
14914 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14915 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14917 add_above_Latin1_folds(pRExC_state,
14924 /* Here is an above Latin1 character. We don't have the
14925 * rules hard-coded for it. First, get its fold. This is
14926 * the simple fold, as the multi-character folds have been
14927 * handled earlier and separated out */
14928 _to_uni_fold_flags(j, foldbuf, &foldlen,
14929 (ASCII_FOLD_RESTRICTED)
14930 ? FOLD_FLAGS_NOMIX_ASCII
14933 /* Single character fold of above Latin1. Add everything in
14934 * its fold closure to the list that this node should match.
14935 * The fold closures data structure is a hash with the keys
14936 * being the UTF-8 of every character that is folded to, like
14937 * 'k', and the values each an array of all code points that
14938 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14939 * Multi-character folds are not included */
14940 if ((listp = hv_fetch(PL_utf8_foldclosures,
14941 (char *) foldbuf, foldlen, FALSE)))
14943 AV* list = (AV*) *listp;
14945 for (k = 0; k <= av_tindex(list); k++) {
14946 SV** c_p = av_fetch(list, k, FALSE);
14952 /* /aa doesn't allow folds between ASCII and non- */
14953 if ((ASCII_FOLD_RESTRICTED
14954 && (isASCII(c) != isASCII(j))))
14959 /* Folds under /l which cross the 255/256 boundary
14960 * are added to a separate list. (These are valid
14961 * only when the locale is UTF-8.) */
14962 if (c < 256 && LOC) {
14963 *use_list = add_cp_to_invlist(*use_list, c);
14967 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14969 cp_list = add_cp_to_invlist(cp_list, c);
14972 /* Similarly folds involving non-ascii Latin1
14973 * characters under /d are added to their list */
14974 depends_list = add_cp_to_invlist(depends_list,
14981 SvREFCNT_dec_NN(fold_intersection);
14984 /* Now that we have finished adding all the folds, there is no reason
14985 * to keep the foldable list separate */
14986 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14987 SvREFCNT_dec_NN(cp_foldable_list);
14990 /* And combine the result (if any) with any inversion list from posix
14991 * classes. The lists are kept separate up to now because we don't want to
14992 * fold the classes (folding of those is automatically handled by the swash
14993 * fetching code) */
14994 if (posixes || nposixes) {
14995 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14996 /* Under /a and /aa, nothing above ASCII matches these */
14997 _invlist_intersection(posixes,
14998 PL_XPosix_ptrs[_CC_ASCII],
15002 if (DEPENDS_SEMANTICS) {
15003 /* Under /d, everything in the upper half of the Latin1 range
15004 * matches these complements */
15005 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15007 else if (AT_LEAST_ASCII_RESTRICTED) {
15008 /* Under /a and /aa, everything above ASCII matches these
15010 _invlist_union_complement_2nd(nposixes,
15011 PL_XPosix_ptrs[_CC_ASCII],
15015 _invlist_union(posixes, nposixes, &posixes);
15016 SvREFCNT_dec_NN(nposixes);
15019 posixes = nposixes;
15022 if (! DEPENDS_SEMANTICS) {
15024 _invlist_union(cp_list, posixes, &cp_list);
15025 SvREFCNT_dec_NN(posixes);
15032 /* Under /d, we put into a separate list the Latin1 things that
15033 * match only when the target string is utf8 */
15034 SV* nonascii_but_latin1_properties = NULL;
15035 _invlist_intersection(posixes, PL_UpperLatin1,
15036 &nonascii_but_latin1_properties);
15037 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15040 _invlist_union(cp_list, posixes, &cp_list);
15041 SvREFCNT_dec_NN(posixes);
15047 if (depends_list) {
15048 _invlist_union(depends_list, nonascii_but_latin1_properties,
15050 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15053 depends_list = nonascii_but_latin1_properties;
15058 /* And combine the result (if any) with any inversion list from properties.
15059 * The lists are kept separate up to now so that we can distinguish the two
15060 * in regards to matching above-Unicode. A run-time warning is generated
15061 * if a Unicode property is matched against a non-Unicode code point. But,
15062 * we allow user-defined properties to match anything, without any warning,
15063 * and we also suppress the warning if there is a portion of the character
15064 * class that isn't a Unicode property, and which matches above Unicode, \W
15065 * or [\x{110000}] for example.
15066 * (Note that in this case, unlike the Posix one above, there is no
15067 * <depends_list>, because having a Unicode property forces Unicode
15072 /* If it matters to the final outcome, see if a non-property
15073 * component of the class matches above Unicode. If so, the
15074 * warning gets suppressed. This is true even if just a single
15075 * such code point is specified, as though not strictly correct if
15076 * another such code point is matched against, the fact that they
15077 * are using above-Unicode code points indicates they should know
15078 * the issues involved */
15080 warn_super = ! (invert
15081 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15084 _invlist_union(properties, cp_list, &cp_list);
15085 SvREFCNT_dec_NN(properties);
15088 cp_list = properties;
15092 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15096 /* Here, we have calculated what code points should be in the character
15099 * Now we can see about various optimizations. Fold calculation (which we
15100 * did above) needs to take place before inversion. Otherwise /[^k]/i
15101 * would invert to include K, which under /i would match k, which it
15102 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15103 * folded until runtime */
15105 /* If we didn't do folding, it's because some information isn't available
15106 * until runtime; set the run-time fold flag for these. (We don't have to
15107 * worry about properties folding, as that is taken care of by the swash
15108 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15109 * locales, or the class matches at least one 0-255 range code point */
15111 if (only_utf8_locale_list) {
15112 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15114 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15117 invlist_iterinit(cp_list);
15118 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15119 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15121 invlist_iterfinish(cp_list);
15125 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15126 * at compile time. Besides not inverting folded locale now, we can't
15127 * invert if there are things such as \w, which aren't known until runtime
15131 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15133 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15135 _invlist_invert(cp_list);
15137 /* Any swash can't be used as-is, because we've inverted things */
15139 SvREFCNT_dec_NN(swash);
15143 /* Clear the invert flag since have just done it here */
15148 *ret_invlist = cp_list;
15149 SvREFCNT_dec(swash);
15151 /* Discard the generated node */
15153 RExC_size = orig_size;
15156 RExC_emit = orig_emit;
15161 /* Some character classes are equivalent to other nodes. Such nodes take
15162 * up less room and generally fewer operations to execute than ANYOF nodes.
15163 * Above, we checked for and optimized into some such equivalents for
15164 * certain common classes that are easy to test. Getting to this point in
15165 * the code means that the class didn't get optimized there. Since this
15166 * code is only executed in Pass 2, it is too late to save space--it has
15167 * been allocated in Pass 1, and currently isn't given back. But turning
15168 * things into an EXACTish node can allow the optimizer to join it to any
15169 * adjacent such nodes. And if the class is equivalent to things like /./,
15170 * expensive run-time swashes can be avoided. Now that we have more
15171 * complete information, we can find things necessarily missed by the
15172 * earlier code. I (khw) am not sure how much to look for here. It would
15173 * be easy, but perhaps too slow, to check any candidates against all the
15174 * node types they could possibly match using _invlistEQ(). */
15179 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15180 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15182 /* We don't optimize if we are supposed to make sure all non-Unicode
15183 * code points raise a warning, as only ANYOF nodes have this check.
15185 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15188 U8 op = END; /* The optimzation node-type */
15189 const char * cur_parse= RExC_parse;
15191 invlist_iterinit(cp_list);
15192 if (! invlist_iternext(cp_list, &start, &end)) {
15194 /* Here, the list is empty. This happens, for example, when a
15195 * Unicode property is the only thing in the character class, and
15196 * it doesn't match anything. (perluniprops.pod notes such
15199 *flagp |= HASWIDTH|SIMPLE;
15201 else if (start == end) { /* The range is a single code point */
15202 if (! invlist_iternext(cp_list, &start, &end)
15204 /* Don't do this optimization if it would require changing
15205 * the pattern to UTF-8 */
15206 && (start < 256 || UTF))
15208 /* Here, the list contains a single code point. Can optimize
15209 * into an EXACTish node */
15218 /* A locale node under folding with one code point can be
15219 * an EXACTFL, as its fold won't be calculated until
15225 /* Here, we are generally folding, but there is only one
15226 * code point to match. If we have to, we use an EXACT
15227 * node, but it would be better for joining with adjacent
15228 * nodes in the optimization pass if we used the same
15229 * EXACTFish node that any such are likely to be. We can
15230 * do this iff the code point doesn't participate in any
15231 * folds. For example, an EXACTF of a colon is the same as
15232 * an EXACT one, since nothing folds to or from a colon. */
15234 if (IS_IN_SOME_FOLD_L1(value)) {
15239 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15244 /* If we haven't found the node type, above, it means we
15245 * can use the prevailing one */
15247 op = compute_EXACTish(pRExC_state);
15252 else if (start == 0) {
15253 if (end == UV_MAX) {
15255 *flagp |= HASWIDTH|SIMPLE;
15258 else if (end == '\n' - 1
15259 && invlist_iternext(cp_list, &start, &end)
15260 && start == '\n' + 1 && end == UV_MAX)
15263 *flagp |= HASWIDTH|SIMPLE;
15267 invlist_iterfinish(cp_list);
15270 RExC_parse = (char *)orig_parse;
15271 RExC_emit = (regnode *)orig_emit;
15273 ret = reg_node(pRExC_state, op);
15275 RExC_parse = (char *)cur_parse;
15277 if (PL_regkind[op] == EXACT) {
15278 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15279 TRUE /* downgradable to EXACT */
15283 SvREFCNT_dec_NN(cp_list);
15288 /* Here, <cp_list> contains all the code points we can determine at
15289 * compile time that match under all conditions. Go through it, and
15290 * for things that belong in the bitmap, put them there, and delete from
15291 * <cp_list>. While we are at it, see if everything above 255 is in the
15292 * list, and if so, set a flag to speed up execution */
15294 populate_ANYOF_from_invlist(ret, &cp_list);
15297 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15300 /* Here, the bitmap has been populated with all the Latin1 code points that
15301 * always match. Can now add to the overall list those that match only
15302 * when the target string is UTF-8 (<depends_list>). */
15303 if (depends_list) {
15305 _invlist_union(cp_list, depends_list, &cp_list);
15306 SvREFCNT_dec_NN(depends_list);
15309 cp_list = depends_list;
15311 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15314 /* If there is a swash and more than one element, we can't use the swash in
15315 * the optimization below. */
15316 if (swash && element_count > 1) {
15317 SvREFCNT_dec_NN(swash);
15321 /* Note that the optimization of using 'swash' if it is the only thing in
15322 * the class doesn't have us change swash at all, so it can include things
15323 * that are also in the bitmap; otherwise we have purposely deleted that
15324 * duplicate information */
15325 set_ANYOF_arg(pRExC_state, ret, cp_list,
15326 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15328 only_utf8_locale_list,
15329 swash, has_user_defined_property);
15331 *flagp |= HASWIDTH|SIMPLE;
15333 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15334 RExC_contains_locale = 1;
15340 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15343 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15344 regnode* const node,
15346 SV* const runtime_defns,
15347 SV* const only_utf8_locale_list,
15349 const bool has_user_defined_property)
15351 /* Sets the arg field of an ANYOF-type node 'node', using information about
15352 * the node passed-in. If there is nothing outside the node's bitmap, the
15353 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15354 * the count returned by add_data(), having allocated and stored an array,
15355 * av, that that count references, as follows:
15356 * av[0] stores the character class description in its textual form.
15357 * This is used later (regexec.c:Perl_regclass_swash()) to
15358 * initialize the appropriate swash, and is also useful for dumping
15359 * the regnode. This is set to &PL_sv_undef if the textual
15360 * description is not needed at run-time (as happens if the other
15361 * elements completely define the class)
15362 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15363 * computed from av[0]. But if no further computation need be done,
15364 * the swash is stored here now (and av[0] is &PL_sv_undef).
15365 * av[2] stores the inversion list of code points that match only if the
15366 * current locale is UTF-8
15367 * av[3] stores the cp_list inversion list for use in addition or instead
15368 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15369 * (Otherwise everything needed is already in av[0] and av[1])
15370 * av[4] is set if any component of the class is from a user-defined
15371 * property; used only if av[3] exists */
15375 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15377 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15378 assert(! (ANYOF_FLAGS(node)
15379 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15380 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15381 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15384 AV * const av = newAV();
15387 assert(ANYOF_FLAGS(node)
15388 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15389 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15391 av_store(av, 0, (runtime_defns)
15392 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15395 av_store(av, 1, swash);
15396 SvREFCNT_dec_NN(cp_list);
15399 av_store(av, 1, &PL_sv_undef);
15401 av_store(av, 3, cp_list);
15402 av_store(av, 4, newSVuv(has_user_defined_property));
15406 if (only_utf8_locale_list) {
15407 av_store(av, 2, only_utf8_locale_list);
15410 av_store(av, 2, &PL_sv_undef);
15413 rv = newRV_noinc(MUTABLE_SV(av));
15414 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15415 RExC_rxi->data->data[n] = (void*)rv;
15420 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15422 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15423 const regnode* node,
15426 SV** only_utf8_locale_ptr,
15430 /* For internal core use only.
15431 * Returns the swash for the input 'node' in the regex 'prog'.
15432 * If <doinit> is 'true', will attempt to create the swash if not already
15434 * If <listsvp> is non-null, will return the printable contents of the
15435 * swash. This can be used to get debugging information even before the
15436 * swash exists, by calling this function with 'doinit' set to false, in
15437 * which case the components that will be used to eventually create the
15438 * swash are returned (in a printable form).
15439 * If <exclude_list> is not NULL, it is an inversion list of things to
15440 * exclude from what's returned in <listsvp>.
15441 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15442 * that, in spite of this function's name, the swash it returns may include
15443 * the bitmap data as well */
15446 SV *si = NULL; /* Input swash initialization string */
15447 SV* invlist = NULL;
15449 RXi_GET_DECL(prog,progi);
15450 const struct reg_data * const data = prog ? progi->data : NULL;
15452 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15454 assert(ANYOF_FLAGS(node)
15455 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15456 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15458 if (data && data->count) {
15459 const U32 n = ARG(node);
15461 if (data->what[n] == 's') {
15462 SV * const rv = MUTABLE_SV(data->data[n]);
15463 AV * const av = MUTABLE_AV(SvRV(rv));
15464 SV **const ary = AvARRAY(av);
15465 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15467 si = *ary; /* ary[0] = the string to initialize the swash with */
15469 /* Elements 3 and 4 are either both present or both absent. [3] is
15470 * any inversion list generated at compile time; [4] indicates if
15471 * that inversion list has any user-defined properties in it. */
15472 if (av_tindex(av) >= 2) {
15473 if (only_utf8_locale_ptr
15475 && ary[2] != &PL_sv_undef)
15477 *only_utf8_locale_ptr = ary[2];
15480 assert(only_utf8_locale_ptr);
15481 *only_utf8_locale_ptr = NULL;
15484 if (av_tindex(av) >= 3) {
15486 if (SvUV(ary[4])) {
15487 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15495 /* Element [1] is reserved for the set-up swash. If already there,
15496 * return it; if not, create it and store it there */
15497 if (ary[1] && SvROK(ary[1])) {
15500 else if (doinit && ((si && si != &PL_sv_undef)
15501 || (invlist && invlist != &PL_sv_undef))) {
15503 sw = _core_swash_init("utf8", /* the utf8 package */
15507 0, /* not from tr/// */
15509 &swash_init_flags);
15510 (void)av_store(av, 1, sw);
15515 /* If requested, return a printable version of what this swash matches */
15517 SV* matches_string = newSVpvs("");
15519 /* The swash should be used, if possible, to get the data, as it
15520 * contains the resolved data. But this function can be called at
15521 * compile-time, before everything gets resolved, in which case we
15522 * return the currently best available information, which is the string
15523 * that will eventually be used to do that resolving, 'si' */
15524 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15525 && (si && si != &PL_sv_undef))
15527 sv_catsv(matches_string, si);
15530 /* Add the inversion list to whatever we have. This may have come from
15531 * the swash, or from an input parameter */
15533 if (exclude_list) {
15534 SV* clone = invlist_clone(invlist);
15535 _invlist_subtract(clone, exclude_list, &clone);
15536 sv_catsv(matches_string, _invlist_contents(clone));
15537 SvREFCNT_dec_NN(clone);
15540 sv_catsv(matches_string, _invlist_contents(invlist));
15543 *listsvp = matches_string;
15548 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15550 /* reg_skipcomment()
15552 Absorbs an /x style # comment from the input stream,
15553 returning a pointer to the first character beyond the comment, or if the
15554 comment terminates the pattern without anything following it, this returns
15555 one past the final character of the pattern (in other words, RExC_end) and
15556 sets the REG_RUN_ON_COMMENT_SEEN flag.
15558 Note it's the callers responsibility to ensure that we are
15559 actually in /x mode
15563 PERL_STATIC_INLINE char*
15564 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15566 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15570 while (p < RExC_end) {
15571 if (*(++p) == '\n') {
15576 /* we ran off the end of the pattern without ending the comment, so we have
15577 * to add an \n when wrapping */
15578 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15584 Advances the parse position, and optionally absorbs
15585 "whitespace" from the inputstream.
15587 Without /x "whitespace" means (?#...) style comments only,
15588 with /x this means (?#...) and # comments and whitespace proper.
15590 Returns the RExC_parse point from BEFORE the scan occurs.
15592 This is the /x friendly way of saying RExC_parse++.
15596 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15598 char* const retval = RExC_parse++;
15600 PERL_ARGS_ASSERT_NEXTCHAR;
15603 if (RExC_end - RExC_parse >= 3
15604 && *RExC_parse == '('
15605 && RExC_parse[1] == '?'
15606 && RExC_parse[2] == '#')
15608 while (*RExC_parse != ')') {
15609 if (RExC_parse == RExC_end)
15610 FAIL("Sequence (?#... not terminated");
15616 if (RExC_flags & RXf_PMf_EXTENDED) {
15617 char * p = regpatws(pRExC_state, RExC_parse,
15618 TRUE); /* means recognize comments */
15619 if (p != RExC_parse) {
15629 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15631 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15632 * space. In pass1, it aligns and increments RExC_size; in pass2,
15635 regnode * const ret = RExC_emit;
15636 GET_RE_DEBUG_FLAGS_DECL;
15638 PERL_ARGS_ASSERT_REGNODE_GUTS;
15640 assert(extra_size >= regarglen[op]);
15643 SIZE_ALIGN(RExC_size);
15644 RExC_size += 1 + extra_size;
15647 if (RExC_emit >= RExC_emit_bound)
15648 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15649 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15651 NODE_ALIGN_FILL(ret);
15652 #ifndef RE_TRACK_PATTERN_OFFSETS
15653 PERL_UNUSED_ARG(name);
15655 if (RExC_offsets) { /* MJD */
15657 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15660 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15661 ? "Overwriting end of array!\n" : "OK",
15662 (UV)(RExC_emit - RExC_emit_start),
15663 (UV)(RExC_parse - RExC_start),
15664 (UV)RExC_offsets[0]));
15665 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15672 - reg_node - emit a node
15674 STATIC regnode * /* Location. */
15675 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15677 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15679 PERL_ARGS_ASSERT_REG_NODE;
15681 assert(regarglen[op] == 0);
15684 regnode *ptr = ret;
15685 FILL_ADVANCE_NODE(ptr, op);
15692 - reganode - emit a node with an argument
15694 STATIC regnode * /* Location. */
15695 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15697 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15699 PERL_ARGS_ASSERT_REGANODE;
15701 assert(regarglen[op] == 1);
15704 regnode *ptr = ret;
15705 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15712 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15714 /* emit a node with U32 and I32 arguments */
15716 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15718 PERL_ARGS_ASSERT_REG2LANODE;
15720 assert(regarglen[op] == 2);
15723 regnode *ptr = ret;
15724 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15731 - reguni - emit (if appropriate) a Unicode character
15733 PERL_STATIC_INLINE STRLEN
15734 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15736 PERL_ARGS_ASSERT_REGUNI;
15738 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15742 - reginsert - insert an operator in front of already-emitted operand
15744 * Means relocating the operand.
15747 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15752 const int offset = regarglen[(U8)op];
15753 const int size = NODE_STEP_REGNODE + offset;
15754 GET_RE_DEBUG_FLAGS_DECL;
15756 PERL_ARGS_ASSERT_REGINSERT;
15757 PERL_UNUSED_CONTEXT;
15758 PERL_UNUSED_ARG(depth);
15759 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15760 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15769 if (RExC_open_parens) {
15771 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15772 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15773 if ( RExC_open_parens[paren] >= opnd ) {
15774 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15775 RExC_open_parens[paren] += size;
15777 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15779 if ( RExC_close_parens[paren] >= opnd ) {
15780 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15781 RExC_close_parens[paren] += size;
15783 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15788 while (src > opnd) {
15789 StructCopy(--src, --dst, regnode);
15790 #ifdef RE_TRACK_PATTERN_OFFSETS
15791 if (RExC_offsets) { /* MJD 20010112 */
15793 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15797 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15798 ? "Overwriting end of array!\n" : "OK",
15799 (UV)(src - RExC_emit_start),
15800 (UV)(dst - RExC_emit_start),
15801 (UV)RExC_offsets[0]));
15802 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15803 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15809 place = opnd; /* Op node, where operand used to be. */
15810 #ifdef RE_TRACK_PATTERN_OFFSETS
15811 if (RExC_offsets) { /* MJD */
15813 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15817 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15818 ? "Overwriting end of array!\n" : "OK",
15819 (UV)(place - RExC_emit_start),
15820 (UV)(RExC_parse - RExC_start),
15821 (UV)RExC_offsets[0]));
15822 Set_Node_Offset(place, RExC_parse);
15823 Set_Node_Length(place, 1);
15826 src = NEXTOPER(place);
15827 FILL_ADVANCE_NODE(place, op);
15828 Zero(src, offset, regnode);
15832 - regtail - set the next-pointer at the end of a node chain of p to val.
15833 - SEE ALSO: regtail_study
15835 /* TODO: All three parms should be const */
15837 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15838 const regnode *val,U32 depth)
15841 GET_RE_DEBUG_FLAGS_DECL;
15843 PERL_ARGS_ASSERT_REGTAIL;
15845 PERL_UNUSED_ARG(depth);
15851 /* Find last node. */
15854 regnode * const temp = regnext(scan);
15856 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15857 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15858 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15859 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15860 (temp == NULL ? "->" : ""),
15861 (temp == NULL ? PL_reg_name[OP(val)] : "")
15869 if (reg_off_by_arg[OP(scan)]) {
15870 ARG_SET(scan, val - scan);
15873 NEXT_OFF(scan) = val - scan;
15879 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15880 - Look for optimizable sequences at the same time.
15881 - currently only looks for EXACT chains.
15883 This is experimental code. The idea is to use this routine to perform
15884 in place optimizations on branches and groups as they are constructed,
15885 with the long term intention of removing optimization from study_chunk so
15886 that it is purely analytical.
15888 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15889 to control which is which.
15892 /* TODO: All four parms should be const */
15895 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15896 const regnode *val,U32 depth)
15900 #ifdef EXPERIMENTAL_INPLACESCAN
15903 GET_RE_DEBUG_FLAGS_DECL;
15905 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15911 /* Find last node. */
15915 regnode * const temp = regnext(scan);
15916 #ifdef EXPERIMENTAL_INPLACESCAN
15917 if (PL_regkind[OP(scan)] == EXACT) {
15918 bool unfolded_multi_char; /* Unexamined in this routine */
15919 if (join_exact(pRExC_state, scan, &min,
15920 &unfolded_multi_char, 1, val, depth+1))
15925 switch (OP(scan)) {
15928 case EXACTFA_NO_TRIE:
15933 if( exact == PSEUDO )
15935 else if ( exact != OP(scan) )
15944 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15945 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15946 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15947 SvPV_nolen_const(RExC_mysv),
15948 REG_NODE_NUM(scan),
15949 PL_reg_name[exact]);
15956 DEBUG_PARSE_MSG("");
15957 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
15958 PerlIO_printf(Perl_debug_log,
15959 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15960 SvPV_nolen_const(RExC_mysv),
15961 (IV)REG_NODE_NUM(val),
15965 if (reg_off_by_arg[OP(scan)]) {
15966 ARG_SET(scan, val - scan);
15969 NEXT_OFF(scan) = val - scan;
15977 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15982 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15987 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15989 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15990 if (flags & (1<<bit)) {
15991 if (!set++ && lead)
15992 PerlIO_printf(Perl_debug_log, "%s",lead);
15993 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15998 PerlIO_printf(Perl_debug_log, "\n");
16000 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16005 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16011 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16013 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16014 if (flags & (1<<bit)) {
16015 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16018 if (!set++ && lead)
16019 PerlIO_printf(Perl_debug_log, "%s",lead);
16020 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16023 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16024 if (!set++ && lead) {
16025 PerlIO_printf(Perl_debug_log, "%s",lead);
16028 case REGEX_UNICODE_CHARSET:
16029 PerlIO_printf(Perl_debug_log, "UNICODE");
16031 case REGEX_LOCALE_CHARSET:
16032 PerlIO_printf(Perl_debug_log, "LOCALE");
16034 case REGEX_ASCII_RESTRICTED_CHARSET:
16035 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16037 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16038 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16041 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16047 PerlIO_printf(Perl_debug_log, "\n");
16049 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16055 Perl_regdump(pTHX_ const regexp *r)
16058 SV * const sv = sv_newmortal();
16059 SV *dsv= sv_newmortal();
16060 RXi_GET_DECL(r,ri);
16061 GET_RE_DEBUG_FLAGS_DECL;
16063 PERL_ARGS_ASSERT_REGDUMP;
16065 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16067 /* Header fields of interest. */
16068 if (r->anchored_substr) {
16069 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16070 RE_SV_DUMPLEN(r->anchored_substr), 30);
16071 PerlIO_printf(Perl_debug_log,
16072 "anchored %s%s at %"IVdf" ",
16073 s, RE_SV_TAIL(r->anchored_substr),
16074 (IV)r->anchored_offset);
16075 } else if (r->anchored_utf8) {
16076 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16077 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16078 PerlIO_printf(Perl_debug_log,
16079 "anchored utf8 %s%s at %"IVdf" ",
16080 s, RE_SV_TAIL(r->anchored_utf8),
16081 (IV)r->anchored_offset);
16083 if (r->float_substr) {
16084 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16085 RE_SV_DUMPLEN(r->float_substr), 30);
16086 PerlIO_printf(Perl_debug_log,
16087 "floating %s%s at %"IVdf"..%"UVuf" ",
16088 s, RE_SV_TAIL(r->float_substr),
16089 (IV)r->float_min_offset, (UV)r->float_max_offset);
16090 } else if (r->float_utf8) {
16091 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16092 RE_SV_DUMPLEN(r->float_utf8), 30);
16093 PerlIO_printf(Perl_debug_log,
16094 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16095 s, RE_SV_TAIL(r->float_utf8),
16096 (IV)r->float_min_offset, (UV)r->float_max_offset);
16098 if (r->check_substr || r->check_utf8)
16099 PerlIO_printf(Perl_debug_log,
16101 (r->check_substr == r->float_substr
16102 && r->check_utf8 == r->float_utf8
16103 ? "(checking floating" : "(checking anchored"));
16104 if (r->intflags & PREGf_NOSCAN)
16105 PerlIO_printf(Perl_debug_log, " noscan");
16106 if (r->extflags & RXf_CHECK_ALL)
16107 PerlIO_printf(Perl_debug_log, " isall");
16108 if (r->check_substr || r->check_utf8)
16109 PerlIO_printf(Perl_debug_log, ") ");
16111 if (ri->regstclass) {
16112 regprop(r, sv, ri->regstclass, NULL, NULL);
16113 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16115 if (r->intflags & PREGf_ANCH) {
16116 PerlIO_printf(Perl_debug_log, "anchored");
16117 if (r->intflags & PREGf_ANCH_MBOL)
16118 PerlIO_printf(Perl_debug_log, "(MBOL)");
16119 if (r->intflags & PREGf_ANCH_SBOL)
16120 PerlIO_printf(Perl_debug_log, "(SBOL)");
16121 if (r->intflags & PREGf_ANCH_GPOS)
16122 PerlIO_printf(Perl_debug_log, "(GPOS)");
16123 PerlIO_putc(Perl_debug_log, ' ');
16125 if (r->intflags & PREGf_GPOS_SEEN)
16126 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16127 if (r->intflags & PREGf_SKIP)
16128 PerlIO_printf(Perl_debug_log, "plus ");
16129 if (r->intflags & PREGf_IMPLICIT)
16130 PerlIO_printf(Perl_debug_log, "implicit ");
16131 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16132 if (r->extflags & RXf_EVAL_SEEN)
16133 PerlIO_printf(Perl_debug_log, "with eval ");
16134 PerlIO_printf(Perl_debug_log, "\n");
16136 regdump_extflags("r->extflags: ",r->extflags);
16137 regdump_intflags("r->intflags: ",r->intflags);
16140 PERL_ARGS_ASSERT_REGDUMP;
16141 PERL_UNUSED_CONTEXT;
16142 PERL_UNUSED_ARG(r);
16143 #endif /* DEBUGGING */
16147 - regprop - printable representation of opcode, with run time support
16151 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16156 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16157 static const char * const anyofs[] = {
16158 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16159 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16160 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16161 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16162 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
16163 || _CC_VERTSPACE != 16
16164 #error Need to adjust order of anyofs[]
16201 RXi_GET_DECL(prog,progi);
16202 GET_RE_DEBUG_FLAGS_DECL;
16204 PERL_ARGS_ASSERT_REGPROP;
16206 sv_setpvn(sv, "", 0);
16208 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16209 /* It would be nice to FAIL() here, but this may be called from
16210 regexec.c, and it would be hard to supply pRExC_state. */
16211 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16212 (int)OP(o), (int)REGNODE_MAX);
16213 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16215 k = PL_regkind[OP(o)];
16218 sv_catpvs(sv, " ");
16219 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16220 * is a crude hack but it may be the best for now since
16221 * we have no flag "this EXACTish node was UTF-8"
16223 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16224 PERL_PV_ESCAPE_UNI_DETECT |
16225 PERL_PV_ESCAPE_NONASCII |
16226 PERL_PV_PRETTY_ELLIPSES |
16227 PERL_PV_PRETTY_LTGT |
16228 PERL_PV_PRETTY_NOCLEAR
16230 } else if (k == TRIE) {
16231 /* print the details of the trie in dumpuntil instead, as
16232 * progi->data isn't available here */
16233 const char op = OP(o);
16234 const U32 n = ARG(o);
16235 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16236 (reg_ac_data *)progi->data->data[n] :
16238 const reg_trie_data * const trie
16239 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16241 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16242 DEBUG_TRIE_COMPILE_r(
16243 Perl_sv_catpvf(aTHX_ sv,
16244 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16245 (UV)trie->startstate,
16246 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16247 (UV)trie->wordcount,
16250 (UV)TRIE_CHARCOUNT(trie),
16251 (UV)trie->uniquecharcount
16254 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16255 sv_catpvs(sv, "[");
16256 (void) put_charclass_bitmap_innards(sv,
16257 (IS_ANYOF_TRIE(op))
16259 : TRIE_BITMAP(trie),
16261 sv_catpvs(sv, "]");
16264 } else if (k == CURLY) {
16265 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16266 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16267 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16269 else if (k == WHILEM && o->flags) /* Ordinal/of */
16270 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16271 else if (k == REF || k == OPEN || k == CLOSE
16272 || k == GROUPP || OP(o)==ACCEPT)
16274 AV *name_list= NULL;
16275 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16276 if ( RXp_PAREN_NAMES(prog) ) {
16277 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16278 } else if ( pRExC_state ) {
16279 name_list= RExC_paren_name_list;
16282 if ( k != REF || (OP(o) < NREF)) {
16283 SV **name= av_fetch(name_list, ARG(o), 0 );
16285 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16288 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16289 I32 *nums=(I32*)SvPVX(sv_dat);
16290 SV **name= av_fetch(name_list, nums[0], 0 );
16293 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16294 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16295 (n ? "," : ""), (IV)nums[n]);
16297 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16301 if ( k == REF && reginfo) {
16302 U32 n = ARG(o); /* which paren pair */
16303 I32 ln = prog->offs[n].start;
16304 if (prog->lastparen < n || ln == -1)
16305 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16306 else if (ln == prog->offs[n].end)
16307 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16309 const char *s = reginfo->strbeg + ln;
16310 Perl_sv_catpvf(aTHX_ sv, ": ");
16311 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16312 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16315 } else if (k == GOSUB) {
16316 AV *name_list= NULL;
16317 if ( RXp_PAREN_NAMES(prog) ) {
16318 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16319 } else if ( pRExC_state ) {
16320 name_list= RExC_paren_name_list;
16323 /* Paren and offset */
16324 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16326 SV **name= av_fetch(name_list, ARG(o), 0 );
16328 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16331 else if (k == VERB) {
16333 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16334 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16335 } else if (k == LOGICAL)
16336 /* 2: embedded, otherwise 1 */
16337 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16338 else if (k == ANYOF) {
16339 const U8 flags = ANYOF_FLAGS(o);
16341 SV* bitmap_invlist; /* Will hold what the bit map contains */
16344 if (flags & ANYOF_LOCALE_FLAGS)
16345 sv_catpvs(sv, "{loc}");
16346 if (flags & ANYOF_LOC_FOLD)
16347 sv_catpvs(sv, "{i}");
16348 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16349 if (flags & ANYOF_INVERT)
16350 sv_catpvs(sv, "^");
16352 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16354 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16357 /* output any special charclass tests (used entirely under use
16359 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16361 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16362 if (ANYOF_POSIXL_TEST(o,i)) {
16363 sv_catpv(sv, anyofs[i]);
16369 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16370 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16371 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16375 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16376 if (flags & ANYOF_INVERT)
16377 /*make sure the invert info is in each */
16378 sv_catpvs(sv, "^");
16381 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16382 sv_catpvs(sv, "{non-utf8-latin1-all}");
16385 /* output information about the unicode matching */
16386 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16387 sv_catpvs(sv, "{above_bitmap_all}");
16388 else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16389 SV *lv; /* Set if there is something outside the bit map. */
16390 bool byte_output = FALSE; /* If something in the bitmap has
16392 SV *only_utf8_locale;
16394 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16395 * is used to guarantee that nothing in the bitmap gets
16397 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16398 &lv, &only_utf8_locale,
16400 if (lv && lv != &PL_sv_undef) {
16401 char *s = savesvpv(lv);
16402 char * const origs = s;
16404 while (*s && *s != '\n')
16408 const char * const t = ++s;
16410 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16411 sv_catpvs(sv, "{outside bitmap}");
16414 sv_catpvs(sv, "{utf8}");
16418 sv_catpvs(sv, " ");
16424 /* Truncate very long output */
16425 if (s - origs > 256) {
16426 Perl_sv_catpvf(aTHX_ sv,
16428 (int) (s - origs - 1),
16434 else if (*s == '\t') {
16448 SvREFCNT_dec_NN(lv);
16451 if ((flags & ANYOF_LOC_FOLD)
16452 && only_utf8_locale
16453 && only_utf8_locale != &PL_sv_undef)
16456 int max_entries = 256;
16458 sv_catpvs(sv, "{utf8 locale}");
16459 invlist_iterinit(only_utf8_locale);
16460 while (invlist_iternext(only_utf8_locale,
16462 put_range(sv, start, end, FALSE);
16464 if (max_entries < 0) {
16465 sv_catpvs(sv, "...");
16469 invlist_iterfinish(only_utf8_locale);
16473 SvREFCNT_dec(bitmap_invlist);
16476 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16478 else if (k == POSIXD || k == NPOSIXD) {
16479 U8 index = FLAGS(o) * 2;
16480 if (index < C_ARRAY_LENGTH(anyofs)) {
16481 if (*anyofs[index] != '[') {
16484 sv_catpv(sv, anyofs[index]);
16485 if (*anyofs[index] != '[') {
16490 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16493 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16494 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16495 else if (OP(o) == SBOL)
16496 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16498 PERL_UNUSED_CONTEXT;
16499 PERL_UNUSED_ARG(sv);
16500 PERL_UNUSED_ARG(o);
16501 PERL_UNUSED_ARG(prog);
16502 PERL_UNUSED_ARG(reginfo);
16503 #endif /* DEBUGGING */
16509 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16510 { /* Assume that RE_INTUIT is set */
16511 struct regexp *const prog = ReANY(r);
16512 GET_RE_DEBUG_FLAGS_DECL;
16514 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16515 PERL_UNUSED_CONTEXT;
16519 const char * const s = SvPV_nolen_const(prog->check_substr
16520 ? prog->check_substr : prog->check_utf8);
16522 if (!PL_colorset) reginitcolors();
16523 PerlIO_printf(Perl_debug_log,
16524 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16526 prog->check_substr ? "" : "utf8 ",
16527 PL_colors[5],PL_colors[0],
16530 (strlen(s) > 60 ? "..." : ""));
16533 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16539 handles refcounting and freeing the perl core regexp structure. When
16540 it is necessary to actually free the structure the first thing it
16541 does is call the 'free' method of the regexp_engine associated to
16542 the regexp, allowing the handling of the void *pprivate; member
16543 first. (This routine is not overridable by extensions, which is why
16544 the extensions free is called first.)
16546 See regdupe and regdupe_internal if you change anything here.
16548 #ifndef PERL_IN_XSUB_RE
16550 Perl_pregfree(pTHX_ REGEXP *r)
16556 Perl_pregfree2(pTHX_ REGEXP *rx)
16558 struct regexp *const r = ReANY(rx);
16559 GET_RE_DEBUG_FLAGS_DECL;
16561 PERL_ARGS_ASSERT_PREGFREE2;
16563 if (r->mother_re) {
16564 ReREFCNT_dec(r->mother_re);
16566 CALLREGFREE_PVT(rx); /* free the private data */
16567 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16568 Safefree(r->xpv_len_u.xpvlenu_pv);
16571 SvREFCNT_dec(r->anchored_substr);
16572 SvREFCNT_dec(r->anchored_utf8);
16573 SvREFCNT_dec(r->float_substr);
16574 SvREFCNT_dec(r->float_utf8);
16575 Safefree(r->substrs);
16577 RX_MATCH_COPY_FREE(rx);
16578 #ifdef PERL_ANY_COW
16579 SvREFCNT_dec(r->saved_copy);
16582 SvREFCNT_dec(r->qr_anoncv);
16583 rx->sv_u.svu_rx = 0;
16588 This is a hacky workaround to the structural issue of match results
16589 being stored in the regexp structure which is in turn stored in
16590 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16591 could be PL_curpm in multiple contexts, and could require multiple
16592 result sets being associated with the pattern simultaneously, such
16593 as when doing a recursive match with (??{$qr})
16595 The solution is to make a lightweight copy of the regexp structure
16596 when a qr// is returned from the code executed by (??{$qr}) this
16597 lightweight copy doesn't actually own any of its data except for
16598 the starp/end and the actual regexp structure itself.
16604 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16606 struct regexp *ret;
16607 struct regexp *const r = ReANY(rx);
16608 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16610 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16613 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16615 SvOK_off((SV *)ret_x);
16617 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16618 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16619 made both spots point to the same regexp body.) */
16620 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16621 assert(!SvPVX(ret_x));
16622 ret_x->sv_u.svu_rx = temp->sv_any;
16623 temp->sv_any = NULL;
16624 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16625 SvREFCNT_dec_NN(temp);
16626 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16627 ing below will not set it. */
16628 SvCUR_set(ret_x, SvCUR(rx));
16631 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16632 sv_force_normal(sv) is called. */
16634 ret = ReANY(ret_x);
16636 SvFLAGS(ret_x) |= SvUTF8(rx);
16637 /* We share the same string buffer as the original regexp, on which we
16638 hold a reference count, incremented when mother_re is set below.
16639 The string pointer is copied here, being part of the regexp struct.
16641 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16642 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16644 const I32 npar = r->nparens+1;
16645 Newx(ret->offs, npar, regexp_paren_pair);
16646 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16649 Newx(ret->substrs, 1, struct reg_substr_data);
16650 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16652 SvREFCNT_inc_void(ret->anchored_substr);
16653 SvREFCNT_inc_void(ret->anchored_utf8);
16654 SvREFCNT_inc_void(ret->float_substr);
16655 SvREFCNT_inc_void(ret->float_utf8);
16657 /* check_substr and check_utf8, if non-NULL, point to either their
16658 anchored or float namesakes, and don't hold a second reference. */
16660 RX_MATCH_COPIED_off(ret_x);
16661 #ifdef PERL_ANY_COW
16662 ret->saved_copy = NULL;
16664 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16665 SvREFCNT_inc_void(ret->qr_anoncv);
16671 /* regfree_internal()
16673 Free the private data in a regexp. This is overloadable by
16674 extensions. Perl takes care of the regexp structure in pregfree(),
16675 this covers the *pprivate pointer which technically perl doesn't
16676 know about, however of course we have to handle the
16677 regexp_internal structure when no extension is in use.
16679 Note this is called before freeing anything in the regexp
16684 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16686 struct regexp *const r = ReANY(rx);
16687 RXi_GET_DECL(r,ri);
16688 GET_RE_DEBUG_FLAGS_DECL;
16690 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16696 SV *dsv= sv_newmortal();
16697 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16698 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16699 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16700 PL_colors[4],PL_colors[5],s);
16703 #ifdef RE_TRACK_PATTERN_OFFSETS
16705 Safefree(ri->u.offsets); /* 20010421 MJD */
16707 if (ri->code_blocks) {
16709 for (n = 0; n < ri->num_code_blocks; n++)
16710 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16711 Safefree(ri->code_blocks);
16715 int n = ri->data->count;
16718 /* If you add a ->what type here, update the comment in regcomp.h */
16719 switch (ri->data->what[n]) {
16725 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16728 Safefree(ri->data->data[n]);
16734 { /* Aho Corasick add-on structure for a trie node.
16735 Used in stclass optimization only */
16737 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16738 #ifdef USE_ITHREADS
16742 refcount = --aho->refcount;
16745 PerlMemShared_free(aho->states);
16746 PerlMemShared_free(aho->fail);
16747 /* do this last!!!! */
16748 PerlMemShared_free(ri->data->data[n]);
16749 /* we should only ever get called once, so
16750 * assert as much, and also guard the free
16751 * which /might/ happen twice. At the least
16752 * it will make code anlyzers happy and it
16753 * doesn't cost much. - Yves */
16754 assert(ri->regstclass);
16755 if (ri->regstclass) {
16756 PerlMemShared_free(ri->regstclass);
16757 ri->regstclass = 0;
16764 /* trie structure. */
16766 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16767 #ifdef USE_ITHREADS
16771 refcount = --trie->refcount;
16774 PerlMemShared_free(trie->charmap);
16775 PerlMemShared_free(trie->states);
16776 PerlMemShared_free(trie->trans);
16778 PerlMemShared_free(trie->bitmap);
16780 PerlMemShared_free(trie->jump);
16781 PerlMemShared_free(trie->wordinfo);
16782 /* do this last!!!! */
16783 PerlMemShared_free(ri->data->data[n]);
16788 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16789 ri->data->what[n]);
16792 Safefree(ri->data->what);
16793 Safefree(ri->data);
16799 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16800 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16801 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16804 re_dup - duplicate a regexp.
16806 This routine is expected to clone a given regexp structure. It is only
16807 compiled under USE_ITHREADS.
16809 After all of the core data stored in struct regexp is duplicated
16810 the regexp_engine.dupe method is used to copy any private data
16811 stored in the *pprivate pointer. This allows extensions to handle
16812 any duplication it needs to do.
16814 See pregfree() and regfree_internal() if you change anything here.
16816 #if defined(USE_ITHREADS)
16817 #ifndef PERL_IN_XSUB_RE
16819 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16823 const struct regexp *r = ReANY(sstr);
16824 struct regexp *ret = ReANY(dstr);
16826 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16828 npar = r->nparens+1;
16829 Newx(ret->offs, npar, regexp_paren_pair);
16830 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16832 if (ret->substrs) {
16833 /* Do it this way to avoid reading from *r after the StructCopy().
16834 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16835 cache, it doesn't matter. */
16836 const bool anchored = r->check_substr
16837 ? r->check_substr == r->anchored_substr
16838 : r->check_utf8 == r->anchored_utf8;
16839 Newx(ret->substrs, 1, struct reg_substr_data);
16840 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16842 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16843 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16844 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16845 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16847 /* check_substr and check_utf8, if non-NULL, point to either their
16848 anchored or float namesakes, and don't hold a second reference. */
16850 if (ret->check_substr) {
16852 assert(r->check_utf8 == r->anchored_utf8);
16853 ret->check_substr = ret->anchored_substr;
16854 ret->check_utf8 = ret->anchored_utf8;
16856 assert(r->check_substr == r->float_substr);
16857 assert(r->check_utf8 == r->float_utf8);
16858 ret->check_substr = ret->float_substr;
16859 ret->check_utf8 = ret->float_utf8;
16861 } else if (ret->check_utf8) {
16863 ret->check_utf8 = ret->anchored_utf8;
16865 ret->check_utf8 = ret->float_utf8;
16870 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16871 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16874 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16876 if (RX_MATCH_COPIED(dstr))
16877 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16879 ret->subbeg = NULL;
16880 #ifdef PERL_ANY_COW
16881 ret->saved_copy = NULL;
16884 /* Whether mother_re be set or no, we need to copy the string. We
16885 cannot refrain from copying it when the storage points directly to
16886 our mother regexp, because that's
16887 1: a buffer in a different thread
16888 2: something we no longer hold a reference on
16889 so we need to copy it locally. */
16890 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16891 ret->mother_re = NULL;
16893 #endif /* PERL_IN_XSUB_RE */
16898 This is the internal complement to regdupe() which is used to copy
16899 the structure pointed to by the *pprivate pointer in the regexp.
16900 This is the core version of the extension overridable cloning hook.
16901 The regexp structure being duplicated will be copied by perl prior
16902 to this and will be provided as the regexp *r argument, however
16903 with the /old/ structures pprivate pointer value. Thus this routine
16904 may override any copying normally done by perl.
16906 It returns a pointer to the new regexp_internal structure.
16910 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16913 struct regexp *const r = ReANY(rx);
16914 regexp_internal *reti;
16916 RXi_GET_DECL(r,ri);
16918 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16922 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16923 char, regexp_internal);
16924 Copy(ri->program, reti->program, len+1, regnode);
16926 reti->num_code_blocks = ri->num_code_blocks;
16927 if (ri->code_blocks) {
16929 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16930 struct reg_code_block);
16931 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16932 struct reg_code_block);
16933 for (n = 0; n < ri->num_code_blocks; n++)
16934 reti->code_blocks[n].src_regex = (REGEXP*)
16935 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16938 reti->code_blocks = NULL;
16940 reti->regstclass = NULL;
16943 struct reg_data *d;
16944 const int count = ri->data->count;
16947 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16948 char, struct reg_data);
16949 Newx(d->what, count, U8);
16952 for (i = 0; i < count; i++) {
16953 d->what[i] = ri->data->what[i];
16954 switch (d->what[i]) {
16955 /* see also regcomp.h and regfree_internal() */
16956 case 'a': /* actually an AV, but the dup function is identical. */
16960 case 'u': /* actually an HV, but the dup function is identical. */
16961 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16964 /* This is cheating. */
16965 Newx(d->data[i], 1, regnode_ssc);
16966 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16967 reti->regstclass = (regnode*)d->data[i];
16970 /* Trie stclasses are readonly and can thus be shared
16971 * without duplication. We free the stclass in pregfree
16972 * when the corresponding reg_ac_data struct is freed.
16974 reti->regstclass= ri->regstclass;
16978 ((reg_trie_data*)ri->data->data[i])->refcount++;
16983 d->data[i] = ri->data->data[i];
16986 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16987 ri->data->what[i]);
16996 reti->name_list_idx = ri->name_list_idx;
16998 #ifdef RE_TRACK_PATTERN_OFFSETS
16999 if (ri->u.offsets) {
17000 Newx(reti->u.offsets, 2*len+1, U32);
17001 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17004 SetProgLen(reti,len);
17007 return (void*)reti;
17010 #endif /* USE_ITHREADS */
17012 #ifndef PERL_IN_XSUB_RE
17015 - regnext - dig the "next" pointer out of a node
17018 Perl_regnext(pTHX_ regnode *p)
17025 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17026 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17027 (int)OP(p), (int)REGNODE_MAX);
17030 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17039 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17042 STRLEN l1 = strlen(pat1);
17043 STRLEN l2 = strlen(pat2);
17046 const char *message;
17048 PERL_ARGS_ASSERT_RE_CROAK2;
17054 Copy(pat1, buf, l1 , char);
17055 Copy(pat2, buf + l1, l2 , char);
17056 buf[l1 + l2] = '\n';
17057 buf[l1 + l2 + 1] = '\0';
17058 va_start(args, pat2);
17059 msv = vmess(buf, &args);
17061 message = SvPV_const(msv,l1);
17064 Copy(message, buf, l1 , char);
17065 /* l1-1 to avoid \n */
17066 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17070 /* Certain characters are output as a sequence with the first being a
17072 #define isBACKSLASHED_PUNCT(c) \
17073 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17076 S_put_code_point(pTHX_ SV *sv, UV c)
17078 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17081 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17083 else if (isPRINT(c)) {
17084 const char string = (char) c;
17085 if (isBACKSLASHED_PUNCT(c))
17086 sv_catpvs(sv, "\\");
17087 sv_catpvn(sv, &string, 1);
17090 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17092 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17095 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17100 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17103 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17105 /* Appends to 'sv' a displayable version of the range of code points from
17106 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17107 * as-is (though some of these will be escaped by put_code_point()). */
17109 const unsigned int min_range_count = 3;
17111 assert(start <= end);
17113 PERL_ARGS_ASSERT_PUT_RANGE;
17115 while (start <= end) {
17117 const char * format;
17119 if (end - start < min_range_count) {
17121 /* Individual chars in short ranges */
17122 for (; start <= end; start++) {
17123 put_code_point(sv, start);
17128 /* If permitted by the input options, and there is a possibility that
17129 * this range contains a printable literal, look to see if there is
17131 if (allow_literals && start <= MAX_PRINT_A) {
17133 /* If the range begin isn't an ASCII printable, effectively split
17134 * the range into two parts:
17135 * 1) the portion before the first such printable,
17137 * and output them separately. */
17138 if (! isPRINT_A(start)) {
17139 UV temp_end = start + 1;
17141 /* There is no point looking beyond the final possible
17142 * printable, in MAX_PRINT_A */
17143 UV max = MIN(end, MAX_PRINT_A);
17145 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17149 /* Here, temp_end points to one beyond the first printable if
17150 * found, or to one beyond 'max' if not. If none found, make
17151 * sure that we use the entire range */
17152 if (temp_end > MAX_PRINT_A) {
17153 temp_end = end + 1;
17156 /* Output the first part of the split range, the part that
17157 * doesn't have printables, with no looking for literals
17158 * (otherwise we would infinitely recurse) */
17159 put_range(sv, start, temp_end - 1, FALSE);
17161 /* The 2nd part of the range (if any) starts here. */
17164 /* We continue instead of dropping down because even if the 2nd
17165 * part is non-empty, it could be so short that we want to
17166 * output it specially, as tested for at the top of this loop.
17171 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17172 * output a sub-range of just the digits or letters, then process
17173 * the remaining portion as usual. */
17174 if (isALPHANUMERIC_A(start)) {
17175 UV mask = (isDIGIT_A(start))
17180 UV temp_end = start + 1;
17182 /* Find the end of the sub-range that includes just the
17183 * characters in the same class as the first character in it */
17184 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17189 /* For short ranges, don't duplicate the code above to output
17190 * them; just call recursively */
17191 if (temp_end - start < min_range_count) {
17192 put_range(sv, start, temp_end, FALSE);
17194 else { /* Output as a range */
17195 put_code_point(sv, start);
17196 sv_catpvs(sv, "-");
17197 put_code_point(sv, temp_end);
17199 start = temp_end + 1;
17203 /* We output any other printables as individual characters */
17204 if (isPUNCT_A(start) || isSPACE_A(start)) {
17205 while (start <= end && (isPUNCT_A(start)
17206 || isSPACE_A(start)))
17208 put_code_point(sv, start);
17213 } /* End of looking for literals */
17215 /* Here is not to output as a literal. Some control characters have
17216 * mnemonic names. Split off any of those at the beginning and end of
17217 * the range to print mnemonically. It isn't possible for many of
17218 * these to be in a row, so this won't overwhelm with output */
17219 while (isMNEMONIC_CNTRL(start) && start <= end) {
17220 put_code_point(sv, start);
17223 if (start < end && isMNEMONIC_CNTRL(end)) {
17225 /* Here, the final character in the range has a mnemonic name.
17226 * Work backwards from the end to find the final non-mnemonic */
17227 UV temp_end = end - 1;
17228 while (isMNEMONIC_CNTRL(temp_end)) {
17232 /* And separately output the range that doesn't have mnemonics */
17233 put_range(sv, start, temp_end, FALSE);
17235 /* Then output the mnemonic trailing controls */
17236 start = temp_end + 1;
17237 while (start <= end) {
17238 put_code_point(sv, start);
17244 /* As a final resort, output the range or subrange as hex. */
17246 this_end = (end < NUM_ANYOF_CODE_POINTS)
17248 : NUM_ANYOF_CODE_POINTS - 1;
17249 format = (this_end < 256)
17250 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17251 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17252 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17258 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17260 /* Appends to 'sv' a displayable version of the innards of the bracketed
17261 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17262 * output anything, and bitmap_invlist, if not NULL, will point to an
17263 * inversion list of what is in the bit map */
17267 unsigned int punct_count = 0;
17268 SV* invlist = NULL;
17269 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17270 bool allow_literals = TRUE;
17272 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17274 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17276 /* Worst case is exactly every-other code point is in the list */
17277 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17279 /* Convert the bit map to an inversion list, keeping track of how many
17280 * ASCII puncts are set, including an extra amount for the backslashed
17282 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17283 if (BITMAP_TEST(bitmap, i)) {
17284 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17285 if (isPUNCT_A(i)) {
17287 if isBACKSLASHED_PUNCT(i) {
17294 /* Nothing to output */
17295 if (_invlist_len(*invlist_ptr) == 0) {
17296 SvREFCNT_dec(invlist);
17300 /* Generally, it is more readable if printable characters are output as
17301 * literals, but if a range (nearly) spans all of them, it's best to output
17302 * it as a single range. This code will use a single range if all but 2
17303 * printables are in it */
17304 invlist_iterinit(*invlist_ptr);
17305 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17307 /* If range starts beyond final printable, it doesn't have any in it */
17308 if (start > MAX_PRINT_A) {
17312 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17313 * all but two, the range must start and end no later than 2 from
17315 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17316 if (end > MAX_PRINT_A) {
17322 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17323 allow_literals = FALSE;
17328 invlist_iterfinish(*invlist_ptr);
17330 /* The legibility of the output depends mostly on how many punctuation
17331 * characters are output. There are 32 possible ASCII ones, and some have
17332 * an additional backslash, bringing it to currently 36, so if any more
17333 * than 18 are to be output, we can instead output it as its complement,
17334 * yielding fewer puncts, and making it more legible. But give some weight
17335 * to the fact that outputting it as a complement is less legible than a
17336 * straight output, so don't complement unless we are somewhat over the 18
17338 if (allow_literals && punct_count > 22) {
17339 sv_catpvs(sv, "^");
17341 /* Add everything remaining to the list, so when we invert it just
17342 * below, it will be excluded */
17343 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17344 _invlist_invert(*invlist_ptr);
17347 /* Here we have figured things out. Output each range */
17348 invlist_iterinit(*invlist_ptr);
17349 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17350 if (start >= NUM_ANYOF_CODE_POINTS) {
17353 put_range(sv, start, end, allow_literals);
17355 invlist_iterfinish(*invlist_ptr);
17360 #define CLEAR_OPTSTART \
17361 if (optstart) STMT_START { \
17362 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17363 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17367 #define DUMPUNTIL(b,e) \
17369 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17371 STATIC const regnode *
17372 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17373 const regnode *last, const regnode *plast,
17374 SV* sv, I32 indent, U32 depth)
17376 U8 op = PSEUDO; /* Arbitrary non-END op. */
17377 const regnode *next;
17378 const regnode *optstart= NULL;
17380 RXi_GET_DECL(r,ri);
17381 GET_RE_DEBUG_FLAGS_DECL;
17383 PERL_ARGS_ASSERT_DUMPUNTIL;
17385 #ifdef DEBUG_DUMPUNTIL
17386 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17387 last ? last-start : 0,plast ? plast-start : 0);
17390 if (plast && plast < last)
17393 while (PL_regkind[op] != END && (!last || node < last)) {
17395 /* While that wasn't END last time... */
17398 if (op == CLOSE || op == WHILEM)
17400 next = regnext((regnode *)node);
17403 if (OP(node) == OPTIMIZED) {
17404 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17411 regprop(r, sv, node, NULL, NULL);
17412 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17413 (int)(2*indent + 1), "", SvPVX_const(sv));
17415 if (OP(node) != OPTIMIZED) {
17416 if (next == NULL) /* Next ptr. */
17417 PerlIO_printf(Perl_debug_log, " (0)");
17418 else if (PL_regkind[(U8)op] == BRANCH
17419 && PL_regkind[OP(next)] != BRANCH )
17420 PerlIO_printf(Perl_debug_log, " (FAIL)");
17422 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17423 (void)PerlIO_putc(Perl_debug_log, '\n');
17427 if (PL_regkind[(U8)op] == BRANCHJ) {
17430 const regnode *nnode = (OP(next) == LONGJMP
17431 ? regnext((regnode *)next)
17433 if (last && nnode > last)
17435 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17438 else if (PL_regkind[(U8)op] == BRANCH) {
17440 DUMPUNTIL(NEXTOPER(node), next);
17442 else if ( PL_regkind[(U8)op] == TRIE ) {
17443 const regnode *this_trie = node;
17444 const char op = OP(node);
17445 const U32 n = ARG(node);
17446 const reg_ac_data * const ac = op>=AHOCORASICK ?
17447 (reg_ac_data *)ri->data->data[n] :
17449 const reg_trie_data * const trie =
17450 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17452 AV *const trie_words
17453 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17455 const regnode *nextbranch= NULL;
17458 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17459 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17461 PerlIO_printf(Perl_debug_log, "%*s%s ",
17462 (int)(2*(indent+3)), "",
17464 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17465 SvCUR(*elem_ptr), 60,
17466 PL_colors[0], PL_colors[1],
17468 ? PERL_PV_ESCAPE_UNI
17470 | PERL_PV_PRETTY_ELLIPSES
17471 | PERL_PV_PRETTY_LTGT
17476 U16 dist= trie->jump[word_idx+1];
17477 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17478 (UV)((dist ? this_trie + dist : next) - start));
17481 nextbranch= this_trie + trie->jump[0];
17482 DUMPUNTIL(this_trie + dist, nextbranch);
17484 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17485 nextbranch= regnext((regnode *)nextbranch);
17487 PerlIO_printf(Perl_debug_log, "\n");
17490 if (last && next > last)
17495 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17496 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17497 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17499 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17501 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17503 else if ( op == PLUS || op == STAR) {
17504 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17506 else if (PL_regkind[(U8)op] == ANYOF) {
17507 /* arglen 1 + class block */
17508 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17509 ? ANYOF_POSIXL_SKIP
17511 node = NEXTOPER(node);
17513 else if (PL_regkind[(U8)op] == EXACT) {
17514 /* Literal string, where present. */
17515 node += NODE_SZ_STR(node) - 1;
17516 node = NEXTOPER(node);
17519 node = NEXTOPER(node);
17520 node += regarglen[(U8)op];
17522 if (op == CURLYX || op == OPEN)
17526 #ifdef DEBUG_DUMPUNTIL
17527 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17532 #endif /* DEBUGGING */
17536 * c-indentation-style: bsd
17537 * c-basic-offset: 4
17538 * indent-tabs-mode: nil
17541 * ex: set ts=8 sts=4 sw=4 et: