5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 /* this is a chain of data about sub patterns we are processing that
110 need to be handled separately/specially in study_chunk. Its so
111 we can simulate recursion without losing state. */
113 typedef struct scan_frame {
114 regnode *last_regnode; /* last node to process in this frame */
115 regnode *next_regnode; /* next node to process when last is reached */
116 U32 prev_recursed_depth;
117 I32 stopparen; /* what stopparen do we use */
118 U32 is_top_frame; /* what flags do we use? */
120 struct scan_frame *this_prev_frame; /* this previous frame */
121 struct scan_frame *prev_frame; /* previous frame */
122 struct scan_frame *next_frame; /* next frame */
125 struct RExC_state_t {
126 U32 flags; /* RXf_* are we folding, multilining? */
127 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
128 char *precomp; /* uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 SSize_t whilem_seen; /* number of WHILEM in this expr */
137 regnode *emit_start; /* Start of emitted-code area */
138 regnode *emit_bound; /* First regnode outside of the
140 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
141 implies compiling, so don't emit */
142 regnode_ssc emit_dummy; /* placeholder for emit to point to;
143 large enough for the largest
144 non-EXACTish node, so can use it as
146 I32 naughty; /* How bad is this pattern? */
147 I32 sawback; /* Did we see \1, ...? */
149 SSize_t size; /* Code size. */
150 I32 npar; /* Capture buffer count, (OPEN) plus
151 one. ("par" 0 is the whole
153 I32 nestroot; /* root parens we are in - used by
157 regnode **open_parens; /* pointers to open parens */
158 regnode **close_parens; /* pointers to close parens */
159 regnode *opend; /* END node in program */
160 I32 utf8; /* whether the pattern is utf8 or not */
161 I32 orig_utf8; /* whether the pattern was originally in utf8 */
162 /* XXX use this for future optimisation of case
163 * where pattern must be upgraded to utf8. */
164 I32 uni_semantics; /* If a d charset modifier should use unicode
165 rules, even if the pattern is not in
167 HV *paren_names; /* Paren names */
169 regnode **recurse; /* Recurse regops */
170 I32 recurse_count; /* Number of recurse regops */
171 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
173 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
177 I32 override_recoding;
178 I32 in_multi_char_class;
179 struct reg_code_block *code_blocks; /* positions of literal (?{})
181 int num_code_blocks; /* size of code_blocks[] */
182 int code_index; /* next code_blocks[] slot */
183 SSize_t maxlen; /* mininum possible number of chars in string to match */
184 scan_frame *frame_head;
185 scan_frame *frame_last;
187 #ifdef ADD_TO_REGEXEC
188 char *starttry; /* -Dr: where regtry was called. */
189 #define RExC_starttry (pRExC_state->starttry)
191 SV *runtime_code_qr; /* qr with the runtime code blocks */
193 const char *lastparse;
195 AV *paren_name_list; /* idx -> name */
196 U32 study_chunk_recursed_count;
199 #define RExC_lastparse (pRExC_state->lastparse)
200 #define RExC_lastnum (pRExC_state->lastnum)
201 #define RExC_paren_name_list (pRExC_state->paren_name_list)
202 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
203 #define RExC_mysv (pRExC_state->mysv1)
204 #define RExC_mysv1 (pRExC_state->mysv1)
205 #define RExC_mysv2 (pRExC_state->mysv2)
210 #define RExC_flags (pRExC_state->flags)
211 #define RExC_pm_flags (pRExC_state->pm_flags)
212 #define RExC_precomp (pRExC_state->precomp)
213 #define RExC_rx_sv (pRExC_state->rx_sv)
214 #define RExC_rx (pRExC_state->rx)
215 #define RExC_rxi (pRExC_state->rxi)
216 #define RExC_start (pRExC_state->start)
217 #define RExC_end (pRExC_state->end)
218 #define RExC_parse (pRExC_state->parse)
219 #define RExC_whilem_seen (pRExC_state->whilem_seen)
220 #ifdef RE_TRACK_PATTERN_OFFSETS
221 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
224 #define RExC_emit (pRExC_state->emit)
225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
226 #define RExC_emit_start (pRExC_state->emit_start)
227 #define RExC_emit_bound (pRExC_state->emit_bound)
228 #define RExC_naughty (pRExC_state->naughty)
229 #define RExC_sawback (pRExC_state->sawback)
230 #define RExC_seen (pRExC_state->seen)
231 #define RExC_size (pRExC_state->size)
232 #define RExC_maxlen (pRExC_state->maxlen)
233 #define RExC_npar (pRExC_state->npar)
234 #define RExC_nestroot (pRExC_state->nestroot)
235 #define RExC_extralen (pRExC_state->extralen)
236 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
237 #define RExC_utf8 (pRExC_state->utf8)
238 #define RExC_uni_semantics (pRExC_state->uni_semantics)
239 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
240 #define RExC_open_parens (pRExC_state->open_parens)
241 #define RExC_close_parens (pRExC_state->close_parens)
242 #define RExC_opend (pRExC_state->opend)
243 #define RExC_paren_names (pRExC_state->paren_names)
244 #define RExC_recurse (pRExC_state->recurse)
245 #define RExC_recurse_count (pRExC_state->recurse_count)
246 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
247 #define RExC_study_chunk_recursed_bytes \
248 (pRExC_state->study_chunk_recursed_bytes)
249 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
250 #define RExC_contains_locale (pRExC_state->contains_locale)
251 #define RExC_contains_i (pRExC_state->contains_i)
252 #define RExC_override_recoding (pRExC_state->override_recoding)
253 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
254 #define RExC_frame_head (pRExC_state->frame_head)
255 #define RExC_frame_last (pRExC_state->frame_last)
256 #define RExC_frame_count (pRExC_state->frame_count)
259 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
260 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
261 ((*s) == '{' && regcurly(s)))
264 * Flags to be passed up and down.
266 #define WORST 0 /* Worst case. */
267 #define HASWIDTH 0x01 /* Known to match non-null strings. */
269 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
270 * character. (There needs to be a case: in the switch statement in regexec.c
271 * for any node marked SIMPLE.) Note that this is not the same thing as
274 #define SPSTART 0x04 /* Starts with * or + */
275 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
276 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
277 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
279 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
281 /* whether trie related optimizations are enabled */
282 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
283 #define TRIE_STUDY_OPT
284 #define FULL_TRIE_STUDY
290 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
291 #define PBITVAL(paren) (1 << ((paren) & 7))
292 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
293 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
294 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
296 #define REQUIRE_UTF8 STMT_START { \
298 *flagp = RESTART_UTF8; \
303 /* This converts the named class defined in regcomp.h to its equivalent class
304 * number defined in handy.h. */
305 #define namedclass_to_classnum(class) ((int) ((class) / 2))
306 #define classnum_to_namedclass(classnum) ((classnum) * 2)
308 #define _invlist_union_complement_2nd(a, b, output) \
309 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
310 #define _invlist_intersection_complement_2nd(a, b, output) \
311 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
313 /* About scan_data_t.
315 During optimisation we recurse through the regexp program performing
316 various inplace (keyhole style) optimisations. In addition study_chunk
317 and scan_commit populate this data structure with information about
318 what strings MUST appear in the pattern. We look for the longest
319 string that must appear at a fixed location, and we look for the
320 longest string that may appear at a floating location. So for instance
325 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
326 strings (because they follow a .* construct). study_chunk will identify
327 both FOO and BAR as being the longest fixed and floating strings respectively.
329 The strings can be composites, for instance
333 will result in a composite fixed substring 'foo'.
335 For each string some basic information is maintained:
337 - offset or min_offset
338 This is the position the string must appear at, or not before.
339 It also implicitly (when combined with minlenp) tells us how many
340 characters must match before the string we are searching for.
341 Likewise when combined with minlenp and the length of the string it
342 tells us how many characters must appear after the string we have
346 Only used for floating strings. This is the rightmost point that
347 the string can appear at. If set to SSize_t_MAX it indicates that the
348 string can occur infinitely far to the right.
351 A pointer to the minimum number of characters of the pattern that the
352 string was found inside. This is important as in the case of positive
353 lookahead or positive lookbehind we can have multiple patterns
358 The minimum length of the pattern overall is 3, the minimum length
359 of the lookahead part is 3, but the minimum length of the part that
360 will actually match is 1. So 'FOO's minimum length is 3, but the
361 minimum length for the F is 1. This is important as the minimum length
362 is used to determine offsets in front of and behind the string being
363 looked for. Since strings can be composites this is the length of the
364 pattern at the time it was committed with a scan_commit. Note that
365 the length is calculated by study_chunk, so that the minimum lengths
366 are not known until the full pattern has been compiled, thus the
367 pointer to the value.
371 In the case of lookbehind the string being searched for can be
372 offset past the start point of the final matching string.
373 If this value was just blithely removed from the min_offset it would
374 invalidate some of the calculations for how many chars must match
375 before or after (as they are derived from min_offset and minlen and
376 the length of the string being searched for).
377 When the final pattern is compiled and the data is moved from the
378 scan_data_t structure into the regexp structure the information
379 about lookbehind is factored in, with the information that would
380 have been lost precalculated in the end_shift field for the
383 The fields pos_min and pos_delta are used to store the minimum offset
384 and the delta to the maximum offset at the current point in the pattern.
388 typedef struct scan_data_t {
389 /*I32 len_min; unused */
390 /*I32 len_delta; unused */
394 SSize_t last_end; /* min value, <0 unless valid. */
395 SSize_t last_start_min;
396 SSize_t last_start_max;
397 SV **longest; /* Either &l_fixed, or &l_float. */
398 SV *longest_fixed; /* longest fixed string found in pattern */
399 SSize_t offset_fixed; /* offset where it starts */
400 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
401 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
402 SV *longest_float; /* longest floating string found in pattern */
403 SSize_t offset_float_min; /* earliest point in string it can appear */
404 SSize_t offset_float_max; /* latest point in string it can appear */
405 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
406 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
409 SSize_t *last_closep;
410 regnode_ssc *start_class;
414 * Forward declarations for pregcomp()'s friends.
417 static const scan_data_t zero_scan_data =
418 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
420 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
421 #define SF_BEFORE_SEOL 0x0001
422 #define SF_BEFORE_MEOL 0x0002
423 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
424 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
426 #define SF_FIX_SHIFT_EOL (+2)
427 #define SF_FL_SHIFT_EOL (+4)
429 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
430 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
432 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
433 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
434 #define SF_IS_INF 0x0040
435 #define SF_HAS_PAR 0x0080
436 #define SF_IN_PAR 0x0100
437 #define SF_HAS_EVAL 0x0200
438 #define SCF_DO_SUBSTR 0x0400
439 #define SCF_DO_STCLASS_AND 0x0800
440 #define SCF_DO_STCLASS_OR 0x1000
441 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
442 #define SCF_WHILEM_VISITED_POS 0x2000
444 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
445 #define SCF_SEEN_ACCEPT 0x8000
446 #define SCF_TRIE_DOING_RESTUDY 0x10000
447 #define SCF_IN_DEFINE 0x20000
452 #define UTF cBOOL(RExC_utf8)
454 /* The enums for all these are ordered so things work out correctly */
455 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
456 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
457 == REGEX_DEPENDS_CHARSET)
458 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
459 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
460 >= REGEX_UNICODE_CHARSET)
461 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
462 == REGEX_ASCII_RESTRICTED_CHARSET)
463 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
464 >= REGEX_ASCII_RESTRICTED_CHARSET)
465 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
466 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
468 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
470 /* For programs that want to be strictly Unicode compatible by dying if any
471 * attempt is made to match a non-Unicode code point against a Unicode
473 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
475 #define OOB_NAMEDCLASS -1
477 /* There is no code point that is out-of-bounds, so this is problematic. But
478 * its only current use is to initialize a variable that is always set before
480 #define OOB_UNICODE 0xDEADBEEF
482 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
483 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
486 /* length of regex to show in messages that don't mark a position within */
487 #define RegexLengthToShowInErrorMessages 127
490 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
491 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
492 * op/pragma/warn/regcomp.
494 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
495 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
497 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
498 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
500 #define REPORT_LOCATION_ARGS(offset) \
501 UTF8fARG(UTF, offset, RExC_precomp), \
502 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
505 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
506 * arg. Show regex, up to a maximum length. If it's too long, chop and add
509 #define _FAIL(code) STMT_START { \
510 const char *ellipses = ""; \
511 IV len = RExC_end - RExC_precomp; \
514 SAVEFREESV(RExC_rx_sv); \
515 if (len > RegexLengthToShowInErrorMessages) { \
516 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
517 len = RegexLengthToShowInErrorMessages - 10; \
523 #define FAIL(msg) _FAIL( \
524 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
525 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
527 #define FAIL2(msg,arg) _FAIL( \
528 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
529 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
532 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
534 #define Simple_vFAIL(m) STMT_START { \
536 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
537 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
538 m, REPORT_LOCATION_ARGS(offset)); \
542 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
544 #define vFAIL(m) STMT_START { \
546 SAVEFREESV(RExC_rx_sv); \
551 * Like Simple_vFAIL(), but accepts two arguments.
553 #define Simple_vFAIL2(m,a1) STMT_START { \
554 const IV offset = RExC_parse - RExC_precomp; \
555 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
556 REPORT_LOCATION_ARGS(offset)); \
560 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
562 #define vFAIL2(m,a1) STMT_START { \
564 SAVEFREESV(RExC_rx_sv); \
565 Simple_vFAIL2(m, a1); \
570 * Like Simple_vFAIL(), but accepts three arguments.
572 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
573 const IV offset = RExC_parse - RExC_precomp; \
574 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
575 REPORT_LOCATION_ARGS(offset)); \
579 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
581 #define vFAIL3(m,a1,a2) STMT_START { \
583 SAVEFREESV(RExC_rx_sv); \
584 Simple_vFAIL3(m, a1, a2); \
588 * Like Simple_vFAIL(), but accepts four arguments.
590 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
591 const IV offset = RExC_parse - RExC_precomp; \
592 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
593 REPORT_LOCATION_ARGS(offset)); \
596 #define vFAIL4(m,a1,a2,a3) STMT_START { \
598 SAVEFREESV(RExC_rx_sv); \
599 Simple_vFAIL4(m, a1, a2, a3); \
602 /* A specialized version of vFAIL2 that works with UTF8f */
603 #define vFAIL2utf8f(m, a1) STMT_START { \
604 const IV offset = RExC_parse - RExC_precomp; \
606 SAVEFREESV(RExC_rx_sv); \
607 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
608 REPORT_LOCATION_ARGS(offset)); \
611 /* These have asserts in them because of [perl #122671] Many warnings in
612 * regcomp.c can occur twice. If they get output in pass1 and later in that
613 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
614 * would get output again. So they should be output in pass2, and these
615 * asserts make sure new warnings follow that paradigm. */
617 /* m is not necessarily a "literal string", in this macro */
618 #define reg_warn_non_literal_string(loc, m) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
621 m, REPORT_LOCATION_ARGS(offset)); \
624 #define ckWARNreg(loc,m) STMT_START { \
625 const IV offset = loc - RExC_precomp; \
626 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
627 REPORT_LOCATION_ARGS(offset)); \
630 #define vWARN_dep(loc, m) STMT_START { \
631 const IV offset = loc - RExC_precomp; \
632 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
633 REPORT_LOCATION_ARGS(offset)); \
636 #define ckWARNdep(loc,m) STMT_START { \
637 const IV offset = loc - RExC_precomp; \
638 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
640 REPORT_LOCATION_ARGS(offset)); \
643 #define ckWARNregdep(loc,m) STMT_START { \
644 const IV offset = loc - RExC_precomp; \
645 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
647 REPORT_LOCATION_ARGS(offset)); \
650 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
654 a1, REPORT_LOCATION_ARGS(offset)); \
657 #define ckWARN2reg(loc, m, a1) STMT_START { \
658 const IV offset = loc - RExC_precomp; \
659 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
660 a1, REPORT_LOCATION_ARGS(offset)); \
663 #define vWARN3(loc, m, a1, a2) STMT_START { \
664 const IV offset = loc - RExC_precomp; \
665 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
666 a1, a2, REPORT_LOCATION_ARGS(offset)); \
669 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
670 const IV offset = loc - RExC_precomp; \
671 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
672 a1, a2, REPORT_LOCATION_ARGS(offset)); \
675 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
676 const IV offset = loc - RExC_precomp; \
677 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
678 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
681 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
682 const IV offset = loc - RExC_precomp; \
683 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
684 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
687 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
688 const IV offset = loc - RExC_precomp; \
689 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
690 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
694 /* Allow for side effects in s */
695 #define REGC(c,s) STMT_START { \
696 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
699 /* Macros for recording node offsets. 20001227 mjd@plover.com
700 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
701 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
702 * Element 0 holds the number n.
703 * Position is 1 indexed.
705 #ifndef RE_TRACK_PATTERN_OFFSETS
706 #define Set_Node_Offset_To_R(node,byte)
707 #define Set_Node_Offset(node,byte)
708 #define Set_Cur_Node_Offset
709 #define Set_Node_Length_To_R(node,len)
710 #define Set_Node_Length(node,len)
711 #define Set_Node_Cur_Length(node,start)
712 #define Node_Offset(n)
713 #define Node_Length(n)
714 #define Set_Node_Offset_Length(node,offset,len)
715 #define ProgLen(ri) ri->u.proglen
716 #define SetProgLen(ri,x) ri->u.proglen = x
718 #define ProgLen(ri) ri->u.offsets[0]
719 #define SetProgLen(ri,x) ri->u.offsets[0] = x
720 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
722 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
723 __LINE__, (int)(node), (int)(byte))); \
725 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
728 RExC_offsets[2*(node)-1] = (byte); \
733 #define Set_Node_Offset(node,byte) \
734 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
735 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
737 #define Set_Node_Length_To_R(node,len) STMT_START { \
739 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
740 __LINE__, (int)(node), (int)(len))); \
742 Perl_croak(aTHX_ "value of node is %d in Length macro", \
745 RExC_offsets[2*(node)] = (len); \
750 #define Set_Node_Length(node,len) \
751 Set_Node_Length_To_R((node)-RExC_emit_start, len)
752 #define Set_Node_Cur_Length(node, start) \
753 Set_Node_Length(node, RExC_parse - start)
755 /* Get offsets and lengths */
756 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
757 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
759 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
760 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
761 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
765 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
766 #define EXPERIMENTAL_INPLACESCAN
767 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
769 #define DEBUG_RExC_seen() \
770 DEBUG_OPTIMISE_MORE_r({ \
771 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
773 if (RExC_seen & REG_ZERO_LEN_SEEN) \
774 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
776 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
777 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
779 if (RExC_seen & REG_GPOS_SEEN) \
780 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
782 if (RExC_seen & REG_CANY_SEEN) \
783 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
785 if (RExC_seen & REG_RECURSE_SEEN) \
786 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
788 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
789 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
791 if (RExC_seen & REG_VERBARG_SEEN) \
792 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
794 if (RExC_seen & REG_CUTGROUP_SEEN) \
795 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
797 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
798 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
800 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
801 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
803 if (RExC_seen & REG_GOSTART_SEEN) \
804 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
806 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
807 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
809 PerlIO_printf(Perl_debug_log,"\n"); \
812 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
813 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
815 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
817 PerlIO_printf(Perl_debug_log, "%s", open_str); \
818 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
819 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
820 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
821 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
822 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
823 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
824 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
825 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
826 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
827 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
828 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
829 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
830 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
831 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
832 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
833 PerlIO_printf(Perl_debug_log, "%s", close_str); \
837 #define DEBUG_STUDYDATA(str,data,depth) \
838 DEBUG_OPTIMISE_MORE_r(if(data){ \
839 PerlIO_printf(Perl_debug_log, \
840 "%*s" str "Pos:%"IVdf"/%"IVdf \
842 (int)(depth)*2, "", \
843 (IV)((data)->pos_min), \
844 (IV)((data)->pos_delta), \
845 (UV)((data)->flags) \
847 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
848 PerlIO_printf(Perl_debug_log, \
849 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
850 (IV)((data)->whilem_c), \
851 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
852 is_inf ? "INF " : "" \
854 if ((data)->last_found) \
855 PerlIO_printf(Perl_debug_log, \
856 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
857 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
858 SvPVX_const((data)->last_found), \
859 (IV)((data)->last_end), \
860 (IV)((data)->last_start_min), \
861 (IV)((data)->last_start_max), \
862 ((data)->longest && \
863 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
864 SvPVX_const((data)->longest_fixed), \
865 (IV)((data)->offset_fixed), \
866 ((data)->longest && \
867 (data)->longest==&((data)->longest_float)) ? "*" : "", \
868 SvPVX_const((data)->longest_float), \
869 (IV)((data)->offset_float_min), \
870 (IV)((data)->offset_float_max) \
872 PerlIO_printf(Perl_debug_log,"\n"); \
877 /* is c a control character for which we have a mnemonic? */
878 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
881 S_cntrl_to_mnemonic(const U8 c)
883 /* Returns the mnemonic string that represents character 'c', if one
884 * exists; NULL otherwise. The only ones that exist for the purposes of
885 * this routine are a few control characters */
888 case '\a': return "\\a";
889 case '\b': return "\\b";
890 case ESC_NATIVE: return "\\e";
891 case '\f': return "\\f";
892 case '\n': return "\\n";
893 case '\r': return "\\r";
894 case '\t': return "\\t";
902 /* Mark that we cannot extend a found fixed substring at this point.
903 Update the longest found anchored substring and the longest found
904 floating substrings if needed. */
907 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
908 SSize_t *minlenp, int is_inf)
910 const STRLEN l = CHR_SVLEN(data->last_found);
911 const STRLEN old_l = CHR_SVLEN(*data->longest);
912 GET_RE_DEBUG_FLAGS_DECL;
914 PERL_ARGS_ASSERT_SCAN_COMMIT;
916 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
917 SvSetMagicSV(*data->longest, data->last_found);
918 if (*data->longest == data->longest_fixed) {
919 data->offset_fixed = l ? data->last_start_min : data->pos_min;
920 if (data->flags & SF_BEFORE_EOL)
922 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
924 data->flags &= ~SF_FIX_BEFORE_EOL;
925 data->minlen_fixed=minlenp;
926 data->lookbehind_fixed=0;
928 else { /* *data->longest == data->longest_float */
929 data->offset_float_min = l ? data->last_start_min : data->pos_min;
930 data->offset_float_max = (l
931 ? data->last_start_max
932 : (data->pos_delta == SSize_t_MAX
934 : data->pos_min + data->pos_delta));
936 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
937 data->offset_float_max = SSize_t_MAX;
938 if (data->flags & SF_BEFORE_EOL)
940 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
942 data->flags &= ~SF_FL_BEFORE_EOL;
943 data->minlen_float=minlenp;
944 data->lookbehind_float=0;
947 SvCUR_set(data->last_found, 0);
949 SV * const sv = data->last_found;
950 if (SvUTF8(sv) && SvMAGICAL(sv)) {
951 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
957 data->flags &= ~SF_BEFORE_EOL;
958 DEBUG_STUDYDATA("commit: ",data,0);
961 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
962 * list that describes which code points it matches */
965 S_ssc_anything(pTHX_ regnode_ssc *ssc)
967 /* Set the SSC 'ssc' to match an empty string or any code point */
969 PERL_ARGS_ASSERT_SSC_ANYTHING;
971 assert(is_ANYOF_SYNTHETIC(ssc));
973 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
974 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
975 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
979 S_ssc_is_anything(const regnode_ssc *ssc)
981 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
982 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
983 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
984 * in any way, so there's no point in using it */
989 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
991 assert(is_ANYOF_SYNTHETIC(ssc));
993 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
997 /* See if the list consists solely of the range 0 - Infinity */
998 invlist_iterinit(ssc->invlist);
999 ret = invlist_iternext(ssc->invlist, &start, &end)
1003 invlist_iterfinish(ssc->invlist);
1009 /* If e.g., both \w and \W are set, matches everything */
1010 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1012 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1013 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1023 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1025 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1026 * string, any code point, or any posix class under locale */
1028 PERL_ARGS_ASSERT_SSC_INIT;
1030 Zero(ssc, 1, regnode_ssc);
1031 set_ANYOF_SYNTHETIC(ssc);
1032 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1035 /* If any portion of the regex is to operate under locale rules,
1036 * initialization includes it. The reason this isn't done for all regexes
1037 * is that the optimizer was written under the assumption that locale was
1038 * all-or-nothing. Given the complexity and lack of documentation in the
1039 * optimizer, and that there are inadequate test cases for locale, many
1040 * parts of it may not work properly, it is safest to avoid locale unless
1042 if (RExC_contains_locale) {
1043 ANYOF_POSIXL_SETALL(ssc);
1046 ANYOF_POSIXL_ZERO(ssc);
1051 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1052 const regnode_ssc *ssc)
1054 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1055 * to the list of code points matched, and locale posix classes; hence does
1056 * not check its flags) */
1061 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1063 assert(is_ANYOF_SYNTHETIC(ssc));
1065 invlist_iterinit(ssc->invlist);
1066 ret = invlist_iternext(ssc->invlist, &start, &end)
1070 invlist_iterfinish(ssc->invlist);
1076 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1084 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1085 const regnode_charclass* const node)
1087 /* Returns a mortal inversion list defining which code points are matched
1088 * by 'node', which is of type ANYOF. Handles complementing the result if
1089 * appropriate. If some code points aren't knowable at this time, the
1090 * returned list must, and will, contain every code point that is a
1093 SV* invlist = sv_2mortal(_new_invlist(0));
1094 SV* only_utf8_locale_invlist = NULL;
1096 const U32 n = ARG(node);
1097 bool new_node_has_latin1 = FALSE;
1099 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1101 /* Look at the data structure created by S_set_ANYOF_arg() */
1102 if (n != ANYOF_ONLY_HAS_BITMAP) {
1103 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1104 AV * const av = MUTABLE_AV(SvRV(rv));
1105 SV **const ary = AvARRAY(av);
1106 assert(RExC_rxi->data->what[n] == 's');
1108 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1109 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1111 else if (ary[0] && ary[0] != &PL_sv_undef) {
1113 /* Here, no compile-time swash, and there are things that won't be
1114 * known until runtime -- we have to assume it could be anything */
1115 return _add_range_to_invlist(invlist, 0, UV_MAX);
1117 else if (ary[3] && ary[3] != &PL_sv_undef) {
1119 /* Here no compile-time swash, and no run-time only data. Use the
1120 * node's inversion list */
1121 invlist = sv_2mortal(invlist_clone(ary[3]));
1124 /* Get the code points valid only under UTF-8 locales */
1125 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1126 && ary[2] && ary[2] != &PL_sv_undef)
1128 only_utf8_locale_invlist = ary[2];
1132 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1133 * code points, and an inversion list for the others, but if there are code
1134 * points that should match only conditionally on the target string being
1135 * UTF-8, those are placed in the inversion list, and not the bitmap.
1136 * Since there are circumstances under which they could match, they are
1137 * included in the SSC. But if the ANYOF node is to be inverted, we have
1138 * to exclude them here, so that when we invert below, the end result
1139 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1140 * have to do this here before we add the unconditionally matched code
1142 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1143 _invlist_intersection_complement_2nd(invlist,
1148 /* Add in the points from the bit map */
1149 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1150 if (ANYOF_BITMAP_TEST(node, i)) {
1151 invlist = add_cp_to_invlist(invlist, i);
1152 new_node_has_latin1 = TRUE;
1156 /* If this can match all upper Latin1 code points, have to add them
1158 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1159 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1162 /* Similarly for these */
1163 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1164 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1167 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168 _invlist_invert(invlist);
1170 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1172 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1173 * locale. We can skip this if there are no 0-255 at all. */
1174 _invlist_union(invlist, PL_Latin1, &invlist);
1177 /* Similarly add the UTF-8 locale possible matches. These have to be
1178 * deferred until after the non-UTF-8 locale ones are taken care of just
1179 * above, or it leads to wrong results under ANYOF_INVERT */
1180 if (only_utf8_locale_invlist) {
1181 _invlist_union_maybe_complement_2nd(invlist,
1182 only_utf8_locale_invlist,
1183 ANYOF_FLAGS(node) & ANYOF_INVERT,
1190 /* These two functions currently do the exact same thing */
1191 #define ssc_init_zero ssc_init
1193 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1194 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1196 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1197 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1198 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1201 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1202 const regnode_charclass *and_with)
1204 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1205 * another SSC or a regular ANYOF class. Can create false positives. */
1210 PERL_ARGS_ASSERT_SSC_AND;
1212 assert(is_ANYOF_SYNTHETIC(ssc));
1214 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1215 * the code point inversion list and just the relevant flags */
1216 if (is_ANYOF_SYNTHETIC(and_with)) {
1217 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1218 anded_flags = ANYOF_FLAGS(and_with);
1220 /* XXX This is a kludge around what appears to be deficiencies in the
1221 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1222 * there are paths through the optimizer where it doesn't get weeded
1223 * out when it should. And if we don't make some extra provision for
1224 * it like the code just below, it doesn't get added when it should.
1225 * This solution is to add it only when AND'ing, which is here, and
1226 * only when what is being AND'ed is the pristine, original node
1227 * matching anything. Thus it is like adding it to ssc_anything() but
1228 * only when the result is to be AND'ed. Probably the same solution
1229 * could be adopted for the same problem we have with /l matching,
1230 * which is solved differently in S_ssc_init(), and that would lead to
1231 * fewer false positives than that solution has. But if this solution
1232 * creates bugs, the consequences are only that a warning isn't raised
1233 * that should be; while the consequences for having /l bugs is
1234 * incorrect matches */
1235 if (ssc_is_anything((regnode_ssc *)and_with)) {
1236 anded_flags |= ANYOF_WARN_SUPER;
1240 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1241 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1244 ANYOF_FLAGS(ssc) &= anded_flags;
1246 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1247 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1248 * 'and_with' may be inverted. When not inverted, we have the situation of
1250 * (C1 | P1) & (C2 | P2)
1251 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1252 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1253 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1254 * <= ((C1 & C2) | P1 | P2)
1255 * Alternatively, the last few steps could be:
1256 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1257 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1258 * <= (C1 | C2 | (P1 & P2))
1259 * We favor the second approach if either P1 or P2 is non-empty. This is
1260 * because these components are a barrier to doing optimizations, as what
1261 * they match cannot be known until the moment of matching as they are
1262 * dependent on the current locale, 'AND"ing them likely will reduce or
1264 * But we can do better if we know that C1,P1 are in their initial state (a
1265 * frequent occurrence), each matching everything:
1266 * (<everything>) & (C2 | P2) = C2 | P2
1267 * Similarly, if C2,P2 are in their initial state (again a frequent
1268 * occurrence), the result is a no-op
1269 * (C1 | P1) & (<everything>) = C1 | P1
1272 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1273 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1274 * <= (C1 & ~C2) | (P1 & ~P2)
1277 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1278 && ! is_ANYOF_SYNTHETIC(and_with))
1282 ssc_intersection(ssc,
1284 FALSE /* Has already been inverted */
1287 /* If either P1 or P2 is empty, the intersection will be also; can skip
1289 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1290 ANYOF_POSIXL_ZERO(ssc);
1292 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1294 /* Note that the Posix class component P from 'and_with' actually
1296 * P = Pa | Pb | ... | Pn
1297 * where each component is one posix class, such as in [\w\s].
1299 * ~P = ~(Pa | Pb | ... | Pn)
1300 * = ~Pa & ~Pb & ... & ~Pn
1301 * <= ~Pa | ~Pb | ... | ~Pn
1302 * The last is something we can easily calculate, but unfortunately
1303 * is likely to have many false positives. We could do better
1304 * in some (but certainly not all) instances if two classes in
1305 * P have known relationships. For example
1306 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1308 * :lower: & :print: = :lower:
1309 * And similarly for classes that must be disjoint. For example,
1310 * since \s and \w can have no elements in common based on rules in
1311 * the POSIX standard,
1312 * \w & ^\S = nothing
1313 * Unfortunately, some vendor locales do not meet the Posix
1314 * standard, in particular almost everything by Microsoft.
1315 * The loop below just changes e.g., \w into \W and vice versa */
1317 regnode_charclass_posixl temp;
1318 int add = 1; /* To calculate the index of the complement */
1320 ANYOF_POSIXL_ZERO(&temp);
1321 for (i = 0; i < ANYOF_MAX; i++) {
1323 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1324 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1326 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1327 ANYOF_POSIXL_SET(&temp, i + add);
1329 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1331 ANYOF_POSIXL_AND(&temp, ssc);
1333 } /* else ssc already has no posixes */
1334 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1335 in its initial state */
1336 else if (! is_ANYOF_SYNTHETIC(and_with)
1337 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1339 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1340 * copy it over 'ssc' */
1341 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1342 if (is_ANYOF_SYNTHETIC(and_with)) {
1343 StructCopy(and_with, ssc, regnode_ssc);
1346 ssc->invlist = anded_cp_list;
1347 ANYOF_POSIXL_ZERO(ssc);
1348 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1349 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1353 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1354 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1356 /* One or the other of P1, P2 is non-empty. */
1357 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1358 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1360 ssc_union(ssc, anded_cp_list, FALSE);
1362 else { /* P1 = P2 = empty */
1363 ssc_intersection(ssc, anded_cp_list, FALSE);
1369 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1370 const regnode_charclass *or_with)
1372 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1373 * another SSC or a regular ANYOF class. Can create false positives if
1374 * 'or_with' is to be inverted. */
1379 PERL_ARGS_ASSERT_SSC_OR;
1381 assert(is_ANYOF_SYNTHETIC(ssc));
1383 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1384 * the code point inversion list and just the relevant flags */
1385 if (is_ANYOF_SYNTHETIC(or_with)) {
1386 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1387 ored_flags = ANYOF_FLAGS(or_with);
1390 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1391 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1394 ANYOF_FLAGS(ssc) |= ored_flags;
1396 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1397 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1398 * 'or_with' may be inverted. When not inverted, we have the simple
1399 * situation of computing:
1400 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1401 * If P1|P2 yields a situation with both a class and its complement are
1402 * set, like having both \w and \W, this matches all code points, and we
1403 * can delete these from the P component of the ssc going forward. XXX We
1404 * might be able to delete all the P components, but I (khw) am not certain
1405 * about this, and it is better to be safe.
1408 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1409 * <= (C1 | P1) | ~C2
1410 * <= (C1 | ~C2) | P1
1411 * (which results in actually simpler code than the non-inverted case)
1414 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1415 && ! is_ANYOF_SYNTHETIC(or_with))
1417 /* We ignore P2, leaving P1 going forward */
1418 } /* else Not inverted */
1419 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1420 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1421 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1423 for (i = 0; i < ANYOF_MAX; i += 2) {
1424 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1426 ssc_match_all_cp(ssc);
1427 ANYOF_POSIXL_CLEAR(ssc, i);
1428 ANYOF_POSIXL_CLEAR(ssc, i+1);
1436 FALSE /* Already has been inverted */
1440 PERL_STATIC_INLINE void
1441 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1443 PERL_ARGS_ASSERT_SSC_UNION;
1445 assert(is_ANYOF_SYNTHETIC(ssc));
1447 _invlist_union_maybe_complement_2nd(ssc->invlist,
1453 PERL_STATIC_INLINE void
1454 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1456 const bool invert2nd)
1458 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1460 assert(is_ANYOF_SYNTHETIC(ssc));
1462 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1468 PERL_STATIC_INLINE void
1469 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1471 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1473 assert(is_ANYOF_SYNTHETIC(ssc));
1475 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1478 PERL_STATIC_INLINE void
1479 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1481 /* AND just the single code point 'cp' into the SSC 'ssc' */
1483 SV* cp_list = _new_invlist(2);
1485 PERL_ARGS_ASSERT_SSC_CP_AND;
1487 assert(is_ANYOF_SYNTHETIC(ssc));
1489 cp_list = add_cp_to_invlist(cp_list, cp);
1490 ssc_intersection(ssc, cp_list,
1491 FALSE /* Not inverted */
1493 SvREFCNT_dec_NN(cp_list);
1496 PERL_STATIC_INLINE void
1497 S_ssc_clear_locale(regnode_ssc *ssc)
1499 /* Set the SSC 'ssc' to not match any locale things */
1500 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1502 assert(is_ANYOF_SYNTHETIC(ssc));
1504 ANYOF_POSIXL_ZERO(ssc);
1505 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1508 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1511 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1513 /* The synthetic start class is used to hopefully quickly winnow down
1514 * places where a pattern could start a match in the target string. If it
1515 * doesn't really narrow things down that much, there isn't much point to
1516 * having the overhead of using it. This function uses some very crude
1517 * heuristics to decide if to use the ssc or not.
1519 * It returns TRUE if 'ssc' rules out more than half what it considers to
1520 * be the "likely" possible matches, but of course it doesn't know what the
1521 * actual things being matched are going to be; these are only guesses
1523 * For /l matches, it assumes that the only likely matches are going to be
1524 * in the 0-255 range, uniformly distributed, so half of that is 127
1525 * For /a and /d matches, it assumes that the likely matches will be just
1526 * the ASCII range, so half of that is 63
1527 * For /u and there isn't anything matching above the Latin1 range, it
1528 * assumes that that is the only range likely to be matched, and uses
1529 * half that as the cut-off: 127. If anything matches above Latin1,
1530 * it assumes that all of Unicode could match (uniformly), except for
1531 * non-Unicode code points and things in the General Category "Other"
1532 * (unassigned, private use, surrogates, controls and formats). This
1533 * is a much large number. */
1535 const U32 max_match = (LOC)
1539 : (invlist_highest(ssc->invlist) < 256)
1541 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1542 U32 count = 0; /* Running total of number of code points matched by
1544 UV start, end; /* Start and end points of current range in inversion
1547 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1549 invlist_iterinit(ssc->invlist);
1550 while (invlist_iternext(ssc->invlist, &start, &end)) {
1552 /* /u is the only thing that we expect to match above 255; so if not /u
1553 * and even if there are matches above 255, ignore them. This catches
1554 * things like \d under /d which does match the digits above 255, but
1555 * since the pattern is /d, it is not likely to be expecting them */
1556 if (! UNI_SEMANTICS) {
1560 end = MIN(end, 255);
1562 count += end - start + 1;
1563 if (count > max_match) {
1564 invlist_iterfinish(ssc->invlist);
1574 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1576 /* The inversion list in the SSC is marked mortal; now we need a more
1577 * permanent copy, which is stored the same way that is done in a regular
1578 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1581 SV* invlist = invlist_clone(ssc->invlist);
1583 PERL_ARGS_ASSERT_SSC_FINALIZE;
1585 assert(is_ANYOF_SYNTHETIC(ssc));
1587 /* The code in this file assumes that all but these flags aren't relevant
1588 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1589 * by the time we reach here */
1590 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1592 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1594 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1595 NULL, NULL, NULL, FALSE);
1597 /* Make sure is clone-safe */
1598 ssc->invlist = NULL;
1600 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1601 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1604 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1607 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1608 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1609 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1610 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1611 ? (TRIE_LIST_CUR( idx ) - 1) \
1617 dump_trie(trie,widecharmap,revcharmap)
1618 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1619 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1621 These routines dump out a trie in a somewhat readable format.
1622 The _interim_ variants are used for debugging the interim
1623 tables that are used to generate the final compressed
1624 representation which is what dump_trie expects.
1626 Part of the reason for their existence is to provide a form
1627 of documentation as to how the different representations function.
1632 Dumps the final compressed table form of the trie to Perl_debug_log.
1633 Used for debugging make_trie().
1637 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1638 AV *revcharmap, U32 depth)
1641 SV *sv=sv_newmortal();
1642 int colwidth= widecharmap ? 6 : 4;
1644 GET_RE_DEBUG_FLAGS_DECL;
1646 PERL_ARGS_ASSERT_DUMP_TRIE;
1648 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1649 (int)depth * 2 + 2,"",
1650 "Match","Base","Ofs" );
1652 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1653 SV ** const tmp = av_fetch( revcharmap, state, 0);
1655 PerlIO_printf( Perl_debug_log, "%*s",
1657 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1658 PL_colors[0], PL_colors[1],
1659 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1660 PERL_PV_ESCAPE_FIRSTCHAR
1665 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1666 (int)depth * 2 + 2,"");
1668 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1669 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1670 PerlIO_printf( Perl_debug_log, "\n");
1672 for( state = 1 ; state < trie->statecount ; state++ ) {
1673 const U32 base = trie->states[ state ].trans.base;
1675 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1676 (int)depth * 2 + 2,"", (UV)state);
1678 if ( trie->states[ state ].wordnum ) {
1679 PerlIO_printf( Perl_debug_log, " W%4X",
1680 trie->states[ state ].wordnum );
1682 PerlIO_printf( Perl_debug_log, "%6s", "" );
1685 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1690 while( ( base + ofs < trie->uniquecharcount ) ||
1691 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1692 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1696 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1698 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1699 if ( ( base + ofs >= trie->uniquecharcount )
1700 && ( base + ofs - trie->uniquecharcount
1702 && trie->trans[ base + ofs
1703 - trie->uniquecharcount ].check == state )
1705 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1707 (UV)trie->trans[ base + ofs
1708 - trie->uniquecharcount ].next );
1710 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1714 PerlIO_printf( Perl_debug_log, "]");
1717 PerlIO_printf( Perl_debug_log, "\n" );
1719 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1721 for (word=1; word <= trie->wordcount; word++) {
1722 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1723 (int)word, (int)(trie->wordinfo[word].prev),
1724 (int)(trie->wordinfo[word].len));
1726 PerlIO_printf(Perl_debug_log, "\n" );
1729 Dumps a fully constructed but uncompressed trie in list form.
1730 List tries normally only are used for construction when the number of
1731 possible chars (trie->uniquecharcount) is very high.
1732 Used for debugging make_trie().
1735 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1736 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1740 SV *sv=sv_newmortal();
1741 int colwidth= widecharmap ? 6 : 4;
1742 GET_RE_DEBUG_FLAGS_DECL;
1744 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1746 /* print out the table precompression. */
1747 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1748 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1749 "------:-----+-----------------\n" );
1751 for( state=1 ; state < next_alloc ; state ++ ) {
1754 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1755 (int)depth * 2 + 2,"", (UV)state );
1756 if ( ! trie->states[ state ].wordnum ) {
1757 PerlIO_printf( Perl_debug_log, "%5s| ","");
1759 PerlIO_printf( Perl_debug_log, "W%4x| ",
1760 trie->states[ state ].wordnum
1763 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1764 SV ** const tmp = av_fetch( revcharmap,
1765 TRIE_LIST_ITEM(state,charid).forid, 0);
1767 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1769 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1771 PL_colors[0], PL_colors[1],
1772 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1773 | PERL_PV_ESCAPE_FIRSTCHAR
1775 TRIE_LIST_ITEM(state,charid).forid,
1776 (UV)TRIE_LIST_ITEM(state,charid).newstate
1779 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1780 (int)((depth * 2) + 14), "");
1783 PerlIO_printf( Perl_debug_log, "\n");
1788 Dumps a fully constructed but uncompressed trie in table form.
1789 This is the normal DFA style state transition table, with a few
1790 twists to facilitate compression later.
1791 Used for debugging make_trie().
1794 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1795 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1800 SV *sv=sv_newmortal();
1801 int colwidth= widecharmap ? 6 : 4;
1802 GET_RE_DEBUG_FLAGS_DECL;
1804 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1807 print out the table precompression so that we can do a visual check
1808 that they are identical.
1811 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1813 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1814 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1816 PerlIO_printf( Perl_debug_log, "%*s",
1818 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1819 PL_colors[0], PL_colors[1],
1820 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1821 PERL_PV_ESCAPE_FIRSTCHAR
1827 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1829 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1830 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1833 PerlIO_printf( Perl_debug_log, "\n" );
1835 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1837 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1838 (int)depth * 2 + 2,"",
1839 (UV)TRIE_NODENUM( state ) );
1841 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1844 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1846 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1848 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1849 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1850 (UV)trie->trans[ state ].check );
1852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1853 (UV)trie->trans[ state ].check,
1854 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1862 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1863 startbranch: the first branch in the whole branch sequence
1864 first : start branch of sequence of branch-exact nodes.
1865 May be the same as startbranch
1866 last : Thing following the last branch.
1867 May be the same as tail.
1868 tail : item following the branch sequence
1869 count : words in the sequence
1870 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1871 depth : indent depth
1873 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1875 A trie is an N'ary tree where the branches are determined by digital
1876 decomposition of the key. IE, at the root node you look up the 1st character and
1877 follow that branch repeat until you find the end of the branches. Nodes can be
1878 marked as "accepting" meaning they represent a complete word. Eg:
1882 would convert into the following structure. Numbers represent states, letters
1883 following numbers represent valid transitions on the letter from that state, if
1884 the number is in square brackets it represents an accepting state, otherwise it
1885 will be in parenthesis.
1887 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1891 (1) +-i->(6)-+-s->[7]
1893 +-s->(3)-+-h->(4)-+-e->[5]
1895 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1897 This shows that when matching against the string 'hers' we will begin at state 1
1898 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1899 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1900 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1901 single traverse. We store a mapping from accepting to state to which word was
1902 matched, and then when we have multiple possibilities we try to complete the
1903 rest of the regex in the order in which they occured in the alternation.
1905 The only prior NFA like behaviour that would be changed by the TRIE support is
1906 the silent ignoring of duplicate alternations which are of the form:
1908 / (DUPE|DUPE) X? (?{ ... }) Y /x
1910 Thus EVAL blocks following a trie may be called a different number of times with
1911 and without the optimisation. With the optimisations dupes will be silently
1912 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1913 the following demonstrates:
1915 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1917 which prints out 'word' three times, but
1919 'words'=~/(word|word|word)(?{ print $1 })S/
1921 which doesnt print it out at all. This is due to other optimisations kicking in.
1923 Example of what happens on a structural level:
1925 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1927 1: CURLYM[1] {1,32767}(18)
1938 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1939 and should turn into:
1941 1: CURLYM[1] {1,32767}(18)
1943 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1951 Cases where tail != last would be like /(?foo|bar)baz/:
1961 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1962 and would end up looking like:
1965 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1972 d = uvchr_to_utf8_flags(d, uv, 0);
1974 is the recommended Unicode-aware way of saying
1979 #define TRIE_STORE_REVCHAR(val) \
1982 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1983 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1984 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1985 SvCUR_set(zlopp, kapow - flrbbbbb); \
1988 av_push(revcharmap, zlopp); \
1990 char ooooff = (char)val; \
1991 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1995 /* This gets the next character from the input, folding it if not already
1997 #define TRIE_READ_CHAR STMT_START { \
2000 /* if it is UTF then it is either already folded, or does not need \
2002 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2004 else if (folder == PL_fold_latin1) { \
2005 /* This folder implies Unicode rules, which in the range expressible \
2006 * by not UTF is the lower case, with the two exceptions, one of \
2007 * which should have been taken care of before calling this */ \
2008 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2009 uvc = toLOWER_L1(*uc); \
2010 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2013 /* raw data, will be folded later if needed */ \
2021 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2022 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2023 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2024 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2026 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2027 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2028 TRIE_LIST_CUR( state )++; \
2031 #define TRIE_LIST_NEW(state) STMT_START { \
2032 Newxz( trie->states[ state ].trans.list, \
2033 4, reg_trie_trans_le ); \
2034 TRIE_LIST_CUR( state ) = 1; \
2035 TRIE_LIST_LEN( state ) = 4; \
2038 #define TRIE_HANDLE_WORD(state) STMT_START { \
2039 U16 dupe= trie->states[ state ].wordnum; \
2040 regnode * const noper_next = regnext( noper ); \
2043 /* store the word for dumping */ \
2045 if (OP(noper) != NOTHING) \
2046 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2048 tmp = newSVpvn_utf8( "", 0, UTF ); \
2049 av_push( trie_words, tmp ); \
2053 trie->wordinfo[curword].prev = 0; \
2054 trie->wordinfo[curword].len = wordlen; \
2055 trie->wordinfo[curword].accept = state; \
2057 if ( noper_next < tail ) { \
2059 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2061 trie->jump[curword] = (U16)(noper_next - convert); \
2063 jumper = noper_next; \
2065 nextbranch= regnext(cur); \
2069 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2070 /* chain, so that when the bits of chain are later */\
2071 /* linked together, the dups appear in the chain */\
2072 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2073 trie->wordinfo[dupe].prev = curword; \
2075 /* we haven't inserted this word yet. */ \
2076 trie->states[ state ].wordnum = curword; \
2081 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2082 ( ( base + charid >= ucharcount \
2083 && base + charid < ubound \
2084 && state == trie->trans[ base - ucharcount + charid ].check \
2085 && trie->trans[ base - ucharcount + charid ].next ) \
2086 ? trie->trans[ base - ucharcount + charid ].next \
2087 : ( state==1 ? special : 0 ) \
2091 #define MADE_JUMP_TRIE 2
2092 #define MADE_EXACT_TRIE 4
2095 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2096 regnode *first, regnode *last, regnode *tail,
2097 U32 word_count, U32 flags, U32 depth)
2099 /* first pass, loop through and scan words */
2100 reg_trie_data *trie;
2101 HV *widecharmap = NULL;
2102 AV *revcharmap = newAV();
2108 regnode *jumper = NULL;
2109 regnode *nextbranch = NULL;
2110 regnode *convert = NULL;
2111 U32 *prev_states; /* temp array mapping each state to previous one */
2112 /* we just use folder as a flag in utf8 */
2113 const U8 * folder = NULL;
2116 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2117 AV *trie_words = NULL;
2118 /* along with revcharmap, this only used during construction but both are
2119 * useful during debugging so we store them in the struct when debugging.
2122 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2123 STRLEN trie_charcount=0;
2125 SV *re_trie_maxbuff;
2126 GET_RE_DEBUG_FLAGS_DECL;
2128 PERL_ARGS_ASSERT_MAKE_TRIE;
2130 PERL_UNUSED_ARG(depth);
2137 case EXACTFU: folder = PL_fold_latin1; break;
2138 case EXACTF: folder = PL_fold; break;
2139 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2142 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2144 trie->startstate = 1;
2145 trie->wordcount = word_count;
2146 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2147 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2149 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2150 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2151 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2154 trie_words = newAV();
2157 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2158 assert(re_trie_maxbuff);
2159 if (!SvIOK(re_trie_maxbuff)) {
2160 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2162 DEBUG_TRIE_COMPILE_r({
2163 PerlIO_printf( Perl_debug_log,
2164 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2165 (int)depth * 2 + 2, "",
2166 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2167 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2170 /* Find the node we are going to overwrite */
2171 if ( first == startbranch && OP( last ) != BRANCH ) {
2172 /* whole branch chain */
2175 /* branch sub-chain */
2176 convert = NEXTOPER( first );
2179 /* -- First loop and Setup --
2181 We first traverse the branches and scan each word to determine if it
2182 contains widechars, and how many unique chars there are, this is
2183 important as we have to build a table with at least as many columns as we
2186 We use an array of integers to represent the character codes 0..255
2187 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2188 the native representation of the character value as the key and IV's for
2191 *TODO* If we keep track of how many times each character is used we can
2192 remap the columns so that the table compression later on is more
2193 efficient in terms of memory by ensuring the most common value is in the
2194 middle and the least common are on the outside. IMO this would be better
2195 than a most to least common mapping as theres a decent chance the most
2196 common letter will share a node with the least common, meaning the node
2197 will not be compressible. With a middle is most common approach the worst
2198 case is when we have the least common nodes twice.
2202 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2203 regnode *noper = NEXTOPER( cur );
2204 const U8 *uc = (U8*)STRING( noper );
2205 const U8 *e = uc + STR_LEN( noper );
2207 U32 wordlen = 0; /* required init */
2208 STRLEN minchars = 0;
2209 STRLEN maxchars = 0;
2210 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2213 if (OP(noper) == NOTHING) {
2214 regnode *noper_next= regnext(noper);
2215 if (noper_next != tail && OP(noper_next) == flags) {
2217 uc= (U8*)STRING(noper);
2218 e= uc + STR_LEN(noper);
2219 trie->minlen= STR_LEN(noper);
2226 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2227 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2228 regardless of encoding */
2229 if (OP( noper ) == EXACTFU_SS) {
2230 /* false positives are ok, so just set this */
2231 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2234 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2236 TRIE_CHARCOUNT(trie)++;
2239 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2240 * is in effect. Under /i, this character can match itself, or
2241 * anything that folds to it. If not under /i, it can match just
2242 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2243 * all fold to k, and all are single characters. But some folds
2244 * expand to more than one character, so for example LATIN SMALL
2245 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2246 * the string beginning at 'uc' is 'ffi', it could be matched by
2247 * three characters, or just by the one ligature character. (It
2248 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2249 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2250 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2251 * match.) The trie needs to know the minimum and maximum number
2252 * of characters that could match so that it can use size alone to
2253 * quickly reject many match attempts. The max is simple: it is
2254 * the number of folded characters in this branch (since a fold is
2255 * never shorter than what folds to it. */
2259 /* And the min is equal to the max if not under /i (indicated by
2260 * 'folder' being NULL), or there are no multi-character folds. If
2261 * there is a multi-character fold, the min is incremented just
2262 * once, for the character that folds to the sequence. Each
2263 * character in the sequence needs to be added to the list below of
2264 * characters in the trie, but we count only the first towards the
2265 * min number of characters needed. This is done through the
2266 * variable 'foldlen', which is returned by the macros that look
2267 * for these sequences as the number of bytes the sequence
2268 * occupies. Each time through the loop, we decrement 'foldlen' by
2269 * how many bytes the current char occupies. Only when it reaches
2270 * 0 do we increment 'minchars' or look for another multi-character
2272 if (folder == NULL) {
2275 else if (foldlen > 0) {
2276 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2281 /* See if *uc is the beginning of a multi-character fold. If
2282 * so, we decrement the length remaining to look at, to account
2283 * for the current character this iteration. (We can use 'uc'
2284 * instead of the fold returned by TRIE_READ_CHAR because for
2285 * non-UTF, the latin1_safe macro is smart enough to account
2286 * for all the unfolded characters, and because for UTF, the
2287 * string will already have been folded earlier in the
2288 * compilation process */
2290 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2291 foldlen -= UTF8SKIP(uc);
2294 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2299 /* The current character (and any potential folds) should be added
2300 * to the possible matching characters for this position in this
2304 U8 folded= folder[ (U8) uvc ];
2305 if ( !trie->charmap[ folded ] ) {
2306 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2307 TRIE_STORE_REVCHAR( folded );
2310 if ( !trie->charmap[ uvc ] ) {
2311 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2312 TRIE_STORE_REVCHAR( uvc );
2315 /* store the codepoint in the bitmap, and its folded
2317 TRIE_BITMAP_SET(trie, uvc);
2319 /* store the folded codepoint */
2320 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2323 /* store first byte of utf8 representation of
2324 variant codepoints */
2325 if (! UVCHR_IS_INVARIANT(uvc)) {
2326 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2329 set_bit = 0; /* We've done our bit :-) */
2333 /* XXX We could come up with the list of code points that fold
2334 * to this using PL_utf8_foldclosures, except not for
2335 * multi-char folds, as there may be multiple combinations
2336 * there that could work, which needs to wait until runtime to
2337 * resolve (The comment about LIGATURE FFI above is such an
2342 widecharmap = newHV();
2344 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2347 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2349 if ( !SvTRUE( *svpp ) ) {
2350 sv_setiv( *svpp, ++trie->uniquecharcount );
2351 TRIE_STORE_REVCHAR(uvc);
2354 } /* end loop through characters in this branch of the trie */
2356 /* We take the min and max for this branch and combine to find the min
2357 * and max for all branches processed so far */
2358 if( cur == first ) {
2359 trie->minlen = minchars;
2360 trie->maxlen = maxchars;
2361 } else if (minchars < trie->minlen) {
2362 trie->minlen = minchars;
2363 } else if (maxchars > trie->maxlen) {
2364 trie->maxlen = maxchars;
2366 } /* end first pass */
2367 DEBUG_TRIE_COMPILE_r(
2368 PerlIO_printf( Perl_debug_log,
2369 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2370 (int)depth * 2 + 2,"",
2371 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2372 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2373 (int)trie->minlen, (int)trie->maxlen )
2377 We now know what we are dealing with in terms of unique chars and
2378 string sizes so we can calculate how much memory a naive
2379 representation using a flat table will take. If it's over a reasonable
2380 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2381 conservative but potentially much slower representation using an array
2384 At the end we convert both representations into the same compressed
2385 form that will be used in regexec.c for matching with. The latter
2386 is a form that cannot be used to construct with but has memory
2387 properties similar to the list form and access properties similar
2388 to the table form making it both suitable for fast searches and
2389 small enough that its feasable to store for the duration of a program.
2391 See the comment in the code where the compressed table is produced
2392 inplace from the flat tabe representation for an explanation of how
2393 the compression works.
2398 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2401 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2402 > SvIV(re_trie_maxbuff) )
2405 Second Pass -- Array Of Lists Representation
2407 Each state will be represented by a list of charid:state records
2408 (reg_trie_trans_le) the first such element holds the CUR and LEN
2409 points of the allocated array. (See defines above).
2411 We build the initial structure using the lists, and then convert
2412 it into the compressed table form which allows faster lookups
2413 (but cant be modified once converted).
2416 STRLEN transcount = 1;
2418 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2419 "%*sCompiling trie using list compiler\n",
2420 (int)depth * 2 + 2, ""));
2422 trie->states = (reg_trie_state *)
2423 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2424 sizeof(reg_trie_state) );
2428 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2430 regnode *noper = NEXTOPER( cur );
2431 U8 *uc = (U8*)STRING( noper );
2432 const U8 *e = uc + STR_LEN( noper );
2433 U32 state = 1; /* required init */
2434 U16 charid = 0; /* sanity init */
2435 U32 wordlen = 0; /* required init */
2437 if (OP(noper) == NOTHING) {
2438 regnode *noper_next= regnext(noper);
2439 if (noper_next != tail && OP(noper_next) == flags) {
2441 uc= (U8*)STRING(noper);
2442 e= uc + STR_LEN(noper);
2446 if (OP(noper) != NOTHING) {
2447 for ( ; uc < e ; uc += len ) {
2452 charid = trie->charmap[ uvc ];
2454 SV** const svpp = hv_fetch( widecharmap,
2461 charid=(U16)SvIV( *svpp );
2464 /* charid is now 0 if we dont know the char read, or
2465 * nonzero if we do */
2472 if ( !trie->states[ state ].trans.list ) {
2473 TRIE_LIST_NEW( state );
2476 check <= TRIE_LIST_USED( state );
2479 if ( TRIE_LIST_ITEM( state, check ).forid
2482 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2487 newstate = next_alloc++;
2488 prev_states[newstate] = state;
2489 TRIE_LIST_PUSH( state, charid, newstate );
2494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2498 TRIE_HANDLE_WORD(state);
2500 } /* end second pass */
2502 /* next alloc is the NEXT state to be allocated */
2503 trie->statecount = next_alloc;
2504 trie->states = (reg_trie_state *)
2505 PerlMemShared_realloc( trie->states,
2507 * sizeof(reg_trie_state) );
2509 /* and now dump it out before we compress it */
2510 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2511 revcharmap, next_alloc,
2515 trie->trans = (reg_trie_trans *)
2516 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2523 for( state=1 ; state < next_alloc ; state ++ ) {
2527 DEBUG_TRIE_COMPILE_MORE_r(
2528 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2532 if (trie->states[state].trans.list) {
2533 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2537 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2538 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2539 if ( forid < minid ) {
2541 } else if ( forid > maxid ) {
2545 if ( transcount < tp + maxid - minid + 1) {
2547 trie->trans = (reg_trie_trans *)
2548 PerlMemShared_realloc( trie->trans,
2550 * sizeof(reg_trie_trans) );
2551 Zero( trie->trans + (transcount / 2),
2555 base = trie->uniquecharcount + tp - minid;
2556 if ( maxid == minid ) {
2558 for ( ; zp < tp ; zp++ ) {
2559 if ( ! trie->trans[ zp ].next ) {
2560 base = trie->uniquecharcount + zp - minid;
2561 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2563 trie->trans[ zp ].check = state;
2569 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2571 trie->trans[ tp ].check = state;
2576 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2577 const U32 tid = base
2578 - trie->uniquecharcount
2579 + TRIE_LIST_ITEM( state, idx ).forid;
2580 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2582 trie->trans[ tid ].check = state;
2584 tp += ( maxid - minid + 1 );
2586 Safefree(trie->states[ state ].trans.list);
2589 DEBUG_TRIE_COMPILE_MORE_r(
2590 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2593 trie->states[ state ].trans.base=base;
2595 trie->lasttrans = tp + 1;
2599 Second Pass -- Flat Table Representation.
2601 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2602 each. We know that we will need Charcount+1 trans at most to store
2603 the data (one row per char at worst case) So we preallocate both
2604 structures assuming worst case.
2606 We then construct the trie using only the .next slots of the entry
2609 We use the .check field of the first entry of the node temporarily
2610 to make compression both faster and easier by keeping track of how
2611 many non zero fields are in the node.
2613 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2616 There are two terms at use here: state as a TRIE_NODEIDX() which is
2617 a number representing the first entry of the node, and state as a
2618 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2619 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2620 if there are 2 entrys per node. eg:
2628 The table is internally in the right hand, idx form. However as we
2629 also have to deal with the states array which is indexed by nodenum
2630 we have to use TRIE_NODENUM() to convert.
2633 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2634 "%*sCompiling trie using table compiler\n",
2635 (int)depth * 2 + 2, ""));
2637 trie->trans = (reg_trie_trans *)
2638 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2639 * trie->uniquecharcount + 1,
2640 sizeof(reg_trie_trans) );
2641 trie->states = (reg_trie_state *)
2642 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2643 sizeof(reg_trie_state) );
2644 next_alloc = trie->uniquecharcount + 1;
2647 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2649 regnode *noper = NEXTOPER( cur );
2650 const U8 *uc = (U8*)STRING( noper );
2651 const U8 *e = uc + STR_LEN( noper );
2653 U32 state = 1; /* required init */
2655 U16 charid = 0; /* sanity init */
2656 U32 accept_state = 0; /* sanity init */
2658 U32 wordlen = 0; /* required init */
2660 if (OP(noper) == NOTHING) {
2661 regnode *noper_next= regnext(noper);
2662 if (noper_next != tail && OP(noper_next) == flags) {
2664 uc= (U8*)STRING(noper);
2665 e= uc + STR_LEN(noper);
2669 if ( OP(noper) != NOTHING ) {
2670 for ( ; uc < e ; uc += len ) {
2675 charid = trie->charmap[ uvc ];
2677 SV* const * const svpp = hv_fetch( widecharmap,
2681 charid = svpp ? (U16)SvIV(*svpp) : 0;
2685 if ( !trie->trans[ state + charid ].next ) {
2686 trie->trans[ state + charid ].next = next_alloc;
2687 trie->trans[ state ].check++;
2688 prev_states[TRIE_NODENUM(next_alloc)]
2689 = TRIE_NODENUM(state);
2690 next_alloc += trie->uniquecharcount;
2692 state = trie->trans[ state + charid ].next;
2694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2696 /* charid is now 0 if we dont know the char read, or
2697 * nonzero if we do */
2700 accept_state = TRIE_NODENUM( state );
2701 TRIE_HANDLE_WORD(accept_state);
2703 } /* end second pass */
2705 /* and now dump it out before we compress it */
2706 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2708 next_alloc, depth+1));
2712 * Inplace compress the table.*
2714 For sparse data sets the table constructed by the trie algorithm will
2715 be mostly 0/FAIL transitions or to put it another way mostly empty.
2716 (Note that leaf nodes will not contain any transitions.)
2718 This algorithm compresses the tables by eliminating most such
2719 transitions, at the cost of a modest bit of extra work during lookup:
2721 - Each states[] entry contains a .base field which indicates the
2722 index in the state[] array wheres its transition data is stored.
2724 - If .base is 0 there are no valid transitions from that node.
2726 - If .base is nonzero then charid is added to it to find an entry in
2729 -If trans[states[state].base+charid].check!=state then the
2730 transition is taken to be a 0/Fail transition. Thus if there are fail
2731 transitions at the front of the node then the .base offset will point
2732 somewhere inside the previous nodes data (or maybe even into a node
2733 even earlier), but the .check field determines if the transition is
2737 The following process inplace converts the table to the compressed
2738 table: We first do not compress the root node 1,and mark all its
2739 .check pointers as 1 and set its .base pointer as 1 as well. This
2740 allows us to do a DFA construction from the compressed table later,
2741 and ensures that any .base pointers we calculate later are greater
2744 - We set 'pos' to indicate the first entry of the second node.
2746 - We then iterate over the columns of the node, finding the first and
2747 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2748 and set the .check pointers accordingly, and advance pos
2749 appropriately and repreat for the next node. Note that when we copy
2750 the next pointers we have to convert them from the original
2751 NODEIDX form to NODENUM form as the former is not valid post
2754 - If a node has no transitions used we mark its base as 0 and do not
2755 advance the pos pointer.
2757 - If a node only has one transition we use a second pointer into the
2758 structure to fill in allocated fail transitions from other states.
2759 This pointer is independent of the main pointer and scans forward
2760 looking for null transitions that are allocated to a state. When it
2761 finds one it writes the single transition into the "hole". If the
2762 pointer doesnt find one the single transition is appended as normal.
2764 - Once compressed we can Renew/realloc the structures to release the
2767 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2768 specifically Fig 3.47 and the associated pseudocode.
2772 const U32 laststate = TRIE_NODENUM( next_alloc );
2775 trie->statecount = laststate;
2777 for ( state = 1 ; state < laststate ; state++ ) {
2779 const U32 stateidx = TRIE_NODEIDX( state );
2780 const U32 o_used = trie->trans[ stateidx ].check;
2781 U32 used = trie->trans[ stateidx ].check;
2782 trie->trans[ stateidx ].check = 0;
2785 used && charid < trie->uniquecharcount;
2788 if ( flag || trie->trans[ stateidx + charid ].next ) {
2789 if ( trie->trans[ stateidx + charid ].next ) {
2791 for ( ; zp < pos ; zp++ ) {
2792 if ( ! trie->trans[ zp ].next ) {
2796 trie->states[ state ].trans.base
2798 + trie->uniquecharcount
2800 trie->trans[ zp ].next
2801 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2803 trie->trans[ zp ].check = state;
2804 if ( ++zp > pos ) pos = zp;
2811 trie->states[ state ].trans.base
2812 = pos + trie->uniquecharcount - charid ;
2814 trie->trans[ pos ].next
2815 = SAFE_TRIE_NODENUM(
2816 trie->trans[ stateidx + charid ].next );
2817 trie->trans[ pos ].check = state;
2822 trie->lasttrans = pos + 1;
2823 trie->states = (reg_trie_state *)
2824 PerlMemShared_realloc( trie->states, laststate
2825 * sizeof(reg_trie_state) );
2826 DEBUG_TRIE_COMPILE_MORE_r(
2827 PerlIO_printf( Perl_debug_log,
2828 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2829 (int)depth * 2 + 2,"",
2830 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2834 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2837 } /* end table compress */
2839 DEBUG_TRIE_COMPILE_MORE_r(
2840 PerlIO_printf(Perl_debug_log,
2841 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2842 (int)depth * 2 + 2, "",
2843 (UV)trie->statecount,
2844 (UV)trie->lasttrans)
2846 /* resize the trans array to remove unused space */
2847 trie->trans = (reg_trie_trans *)
2848 PerlMemShared_realloc( trie->trans, trie->lasttrans
2849 * sizeof(reg_trie_trans) );
2851 { /* Modify the program and insert the new TRIE node */
2852 U8 nodetype =(U8)(flags & 0xFF);
2856 regnode *optimize = NULL;
2857 #ifdef RE_TRACK_PATTERN_OFFSETS
2860 U32 mjd_nodelen = 0;
2861 #endif /* RE_TRACK_PATTERN_OFFSETS */
2862 #endif /* DEBUGGING */
2864 This means we convert either the first branch or the first Exact,
2865 depending on whether the thing following (in 'last') is a branch
2866 or not and whther first is the startbranch (ie is it a sub part of
2867 the alternation or is it the whole thing.)
2868 Assuming its a sub part we convert the EXACT otherwise we convert
2869 the whole branch sequence, including the first.
2871 /* Find the node we are going to overwrite */
2872 if ( first != startbranch || OP( last ) == BRANCH ) {
2873 /* branch sub-chain */
2874 NEXT_OFF( first ) = (U16)(last - first);
2875 #ifdef RE_TRACK_PATTERN_OFFSETS
2877 mjd_offset= Node_Offset((convert));
2878 mjd_nodelen= Node_Length((convert));
2881 /* whole branch chain */
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2886 const regnode *nop = NEXTOPER( convert );
2887 mjd_offset= Node_Offset((nop));
2888 mjd_nodelen= Node_Length((nop));
2892 PerlIO_printf(Perl_debug_log,
2893 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2894 (int)depth * 2 + 2, "",
2895 (UV)mjd_offset, (UV)mjd_nodelen)
2898 /* But first we check to see if there is a common prefix we can
2899 split out as an EXACT and put in front of the TRIE node. */
2900 trie->startstate= 1;
2901 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2903 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2907 const U32 base = trie->states[ state ].trans.base;
2909 if ( trie->states[state].wordnum )
2912 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2913 if ( ( base + ofs >= trie->uniquecharcount ) &&
2914 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2915 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2917 if ( ++count > 1 ) {
2918 SV **tmp = av_fetch( revcharmap, ofs, 0);
2919 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2920 if ( state == 1 ) break;
2922 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2924 PerlIO_printf(Perl_debug_log,
2925 "%*sNew Start State=%"UVuf" Class: [",
2926 (int)depth * 2 + 2, "",
2929 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2930 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2932 TRIE_BITMAP_SET(trie,*ch);
2934 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2936 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2940 TRIE_BITMAP_SET(trie,*ch);
2942 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2943 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2949 SV **tmp = av_fetch( revcharmap, idx, 0);
2951 char *ch = SvPV( *tmp, len );
2953 SV *sv=sv_newmortal();
2954 PerlIO_printf( Perl_debug_log,
2955 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2956 (int)depth * 2 + 2, "",
2958 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2959 PL_colors[0], PL_colors[1],
2960 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2961 PERL_PV_ESCAPE_FIRSTCHAR
2966 OP( convert ) = nodetype;
2967 str=STRING(convert);
2970 STR_LEN(convert) += len;
2976 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2981 trie->prefixlen = (state-1);
2983 regnode *n = convert+NODE_SZ_STR(convert);
2984 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2985 trie->startstate = state;
2986 trie->minlen -= (state - 1);
2987 trie->maxlen -= (state - 1);
2989 /* At least the UNICOS C compiler choked on this
2990 * being argument to DEBUG_r(), so let's just have
2993 #ifdef PERL_EXT_RE_BUILD
2999 regnode *fix = convert;
3000 U32 word = trie->wordcount;
3002 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3003 while( ++fix < n ) {
3004 Set_Node_Offset_Length(fix, 0, 0);
3007 SV ** const tmp = av_fetch( trie_words, word, 0 );
3009 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3010 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3012 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3020 NEXT_OFF(convert) = (U16)(tail - convert);
3021 DEBUG_r(optimize= n);
3027 if ( trie->maxlen ) {
3028 NEXT_OFF( convert ) = (U16)(tail - convert);
3029 ARG_SET( convert, data_slot );
3030 /* Store the offset to the first unabsorbed branch in
3031 jump[0], which is otherwise unused by the jump logic.
3032 We use this when dumping a trie and during optimisation. */
3034 trie->jump[0] = (U16)(nextbranch - convert);
3036 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3037 * and there is a bitmap
3038 * and the first "jump target" node we found leaves enough room
3039 * then convert the TRIE node into a TRIEC node, with the bitmap
3040 * embedded inline in the opcode - this is hypothetically faster.
3042 if ( !trie->states[trie->startstate].wordnum
3044 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3046 OP( convert ) = TRIEC;
3047 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3048 PerlMemShared_free(trie->bitmap);
3051 OP( convert ) = TRIE;
3053 /* store the type in the flags */
3054 convert->flags = nodetype;
3058 + regarglen[ OP( convert ) ];
3060 /* XXX We really should free up the resource in trie now,
3061 as we won't use them - (which resources?) dmq */
3063 /* needed for dumping*/
3064 DEBUG_r(if (optimize) {
3065 regnode *opt = convert;
3067 while ( ++opt < optimize) {
3068 Set_Node_Offset_Length(opt,0,0);
3071 Try to clean up some of the debris left after the
3074 while( optimize < jumper ) {
3075 mjd_nodelen += Node_Length((optimize));
3076 OP( optimize ) = OPTIMIZED;
3077 Set_Node_Offset_Length(optimize,0,0);
3080 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3082 } /* end node insert */
3084 /* Finish populating the prev field of the wordinfo array. Walk back
3085 * from each accept state until we find another accept state, and if
3086 * so, point the first word's .prev field at the second word. If the
3087 * second already has a .prev field set, stop now. This will be the
3088 * case either if we've already processed that word's accept state,
3089 * or that state had multiple words, and the overspill words were
3090 * already linked up earlier.
3097 for (word=1; word <= trie->wordcount; word++) {
3099 if (trie->wordinfo[word].prev)
3101 state = trie->wordinfo[word].accept;
3103 state = prev_states[state];
3106 prev = trie->states[state].wordnum;
3110 trie->wordinfo[word].prev = prev;
3112 Safefree(prev_states);
3116 /* and now dump out the compressed format */
3117 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3119 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3121 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3122 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3124 SvREFCNT_dec_NN(revcharmap);
3128 : trie->startstate>1
3134 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3136 /* The Trie is constructed and compressed now so we can build a fail array if
3139 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3141 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3145 We find the fail state for each state in the trie, this state is the longest
3146 proper suffix of the current state's 'word' that is also a proper prefix of
3147 another word in our trie. State 1 represents the word '' and is thus the
3148 default fail state. This allows the DFA not to have to restart after its
3149 tried and failed a word at a given point, it simply continues as though it
3150 had been matching the other word in the first place.
3152 'abcdgu'=~/abcdefg|cdgu/
3153 When we get to 'd' we are still matching the first word, we would encounter
3154 'g' which would fail, which would bring us to the state representing 'd' in
3155 the second word where we would try 'g' and succeed, proceeding to match
3158 /* add a fail transition */
3159 const U32 trie_offset = ARG(source);
3160 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3162 const U32 ucharcount = trie->uniquecharcount;
3163 const U32 numstates = trie->statecount;
3164 const U32 ubound = trie->lasttrans + ucharcount;
3168 U32 base = trie->states[ 1 ].trans.base;
3171 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3173 GET_RE_DEBUG_FLAGS_DECL;
3175 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3176 PERL_UNUSED_CONTEXT;
3178 PERL_UNUSED_ARG(depth);
3181 if ( OP(source) == TRIE ) {
3182 struct regnode_1 *op = (struct regnode_1 *)
3183 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3184 StructCopy(source,op,struct regnode_1);
3185 stclass = (regnode *)op;
3187 struct regnode_charclass *op = (struct regnode_charclass *)
3188 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3189 StructCopy(source,op,struct regnode_charclass);
3190 stclass = (regnode *)op;
3192 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3194 ARG_SET( stclass, data_slot );
3195 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3196 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3197 aho->trie=trie_offset;
3198 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3199 Copy( trie->states, aho->states, numstates, reg_trie_state );
3200 Newxz( q, numstates, U32);
3201 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3204 /* initialize fail[0..1] to be 1 so that we always have
3205 a valid final fail state */
3206 fail[ 0 ] = fail[ 1 ] = 1;
3208 for ( charid = 0; charid < ucharcount ; charid++ ) {
3209 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3211 q[ q_write ] = newstate;
3212 /* set to point at the root */
3213 fail[ q[ q_write++ ] ]=1;
3216 while ( q_read < q_write) {
3217 const U32 cur = q[ q_read++ % numstates ];
3218 base = trie->states[ cur ].trans.base;
3220 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3221 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3223 U32 fail_state = cur;
3226 fail_state = fail[ fail_state ];
3227 fail_base = aho->states[ fail_state ].trans.base;
3228 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3230 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3231 fail[ ch_state ] = fail_state;
3232 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3234 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3236 q[ q_write++ % numstates] = ch_state;
3240 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3241 when we fail in state 1, this allows us to use the
3242 charclass scan to find a valid start char. This is based on the principle
3243 that theres a good chance the string being searched contains lots of stuff
3244 that cant be a start char.
3246 fail[ 0 ] = fail[ 1 ] = 0;
3247 DEBUG_TRIE_COMPILE_r({
3248 PerlIO_printf(Perl_debug_log,
3249 "%*sStclass Failtable (%"UVuf" states): 0",
3250 (int)(depth * 2), "", (UV)numstates
3252 for( q_read=1; q_read<numstates; q_read++ ) {
3253 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3255 PerlIO_printf(Perl_debug_log, "\n");
3258 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3263 #define DEBUG_PEEP(str,scan,depth) \
3264 DEBUG_OPTIMISE_r({if (scan){ \
3265 regnode *Next = regnext(scan); \
3266 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3267 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3268 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3269 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3270 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3271 PerlIO_printf(Perl_debug_log, "\n"); \
3274 /* The below joins as many adjacent EXACTish nodes as possible into a single
3275 * one. The regop may be changed if the node(s) contain certain sequences that
3276 * require special handling. The joining is only done if:
3277 * 1) there is room in the current conglomerated node to entirely contain the
3279 * 2) they are the exact same node type
3281 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3282 * these get optimized out
3284 * If a node is to match under /i (folded), the number of characters it matches
3285 * can be different than its character length if it contains a multi-character
3286 * fold. *min_subtract is set to the total delta number of characters of the
3289 * And *unfolded_multi_char is set to indicate whether or not the node contains
3290 * an unfolded multi-char fold. This happens when whether the fold is valid or
3291 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3292 * SMALL LETTER SHARP S, as only if the target string being matched against
3293 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3294 * folding rules depend on the locale in force at runtime. (Multi-char folds
3295 * whose components are all above the Latin1 range are not run-time locale
3296 * dependent, and have already been folded by the time this function is
3299 * This is as good a place as any to discuss the design of handling these
3300 * multi-character fold sequences. It's been wrong in Perl for a very long
3301 * time. There are three code points in Unicode whose multi-character folds
3302 * were long ago discovered to mess things up. The previous designs for
3303 * dealing with these involved assigning a special node for them. This
3304 * approach doesn't always work, as evidenced by this example:
3305 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3306 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3307 * would match just the \xDF, it won't be able to handle the case where a
3308 * successful match would have to cross the node's boundary. The new approach
3309 * that hopefully generally solves the problem generates an EXACTFU_SS node
3310 * that is "sss" in this case.
3312 * It turns out that there are problems with all multi-character folds, and not
3313 * just these three. Now the code is general, for all such cases. The
3314 * approach taken is:
3315 * 1) This routine examines each EXACTFish node that could contain multi-
3316 * character folded sequences. Since a single character can fold into
3317 * such a sequence, the minimum match length for this node is less than
3318 * the number of characters in the node. This routine returns in
3319 * *min_subtract how many characters to subtract from the the actual
3320 * length of the string to get a real minimum match length; it is 0 if
3321 * there are no multi-char foldeds. This delta is used by the caller to
3322 * adjust the min length of the match, and the delta between min and max,
3323 * so that the optimizer doesn't reject these possibilities based on size
3325 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3326 * is used for an EXACTFU node that contains at least one "ss" sequence in
3327 * it. For non-UTF-8 patterns and strings, this is the only case where
3328 * there is a possible fold length change. That means that a regular
3329 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3330 * with length changes, and so can be processed faster. regexec.c takes
3331 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3332 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3333 * known until runtime). This saves effort in regex matching. However,
3334 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3335 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3336 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3337 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3338 * possibilities for the non-UTF8 patterns are quite simple, except for
3339 * the sharp s. All the ones that don't involve a UTF-8 target string are
3340 * members of a fold-pair, and arrays are set up for all of them so that
3341 * the other member of the pair can be found quickly. Code elsewhere in
3342 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3343 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3344 * described in the next item.
3345 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3346 * validity of the fold won't be known until runtime, and so must remain
3347 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3348 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3349 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3350 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3351 * The reason this is a problem is that the optimizer part of regexec.c
3352 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3353 * that a character in the pattern corresponds to at most a single
3354 * character in the target string. (And I do mean character, and not byte
3355 * here, unlike other parts of the documentation that have never been
3356 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3357 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3358 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3359 * nodes, violate the assumption, and they are the only instances where it
3360 * is violated. I'm reluctant to try to change the assumption, as the
3361 * code involved is impenetrable to me (khw), so instead the code here
3362 * punts. This routine examines EXACTFL nodes, and (when the pattern
3363 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3364 * boolean indicating whether or not the node contains such a fold. When
3365 * it is true, the caller sets a flag that later causes the optimizer in
3366 * this file to not set values for the floating and fixed string lengths,
3367 * and thus avoids the optimizer code in regexec.c that makes the invalid
3368 * assumption. Thus, there is no optimization based on string lengths for
3369 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3370 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3371 * assumption is wrong only in these cases is that all other non-UTF-8
3372 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3373 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3374 * EXACTF nodes because we don't know at compile time if it actually
3375 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3376 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3377 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3378 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3379 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3380 * string would require the pattern to be forced into UTF-8, the overhead
3381 * of which we want to avoid. Similarly the unfolded multi-char folds in
3382 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3385 * Similarly, the code that generates tries doesn't currently handle
3386 * not-already-folded multi-char folds, and it looks like a pain to change
3387 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3388 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3389 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3390 * using /iaa matching will be doing so almost entirely with ASCII
3391 * strings, so this should rarely be encountered in practice */
3393 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3394 if (PL_regkind[OP(scan)] == EXACT) \
3395 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3399 UV *min_subtract, bool *unfolded_multi_char,
3400 U32 flags,regnode *val, U32 depth)
3402 /* Merge several consecutive EXACTish nodes into one. */
3403 regnode *n = regnext(scan);
3405 regnode *next = scan + NODE_SZ_STR(scan);
3409 regnode *stop = scan;
3410 GET_RE_DEBUG_FLAGS_DECL;
3412 PERL_UNUSED_ARG(depth);
3415 PERL_ARGS_ASSERT_JOIN_EXACT;
3416 #ifndef EXPERIMENTAL_INPLACESCAN
3417 PERL_UNUSED_ARG(flags);
3418 PERL_UNUSED_ARG(val);
3420 DEBUG_PEEP("join",scan,depth);
3422 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3423 * EXACT ones that are mergeable to the current one. */
3425 && (PL_regkind[OP(n)] == NOTHING
3426 || (stringok && OP(n) == OP(scan)))
3428 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3431 if (OP(n) == TAIL || n > next)
3433 if (PL_regkind[OP(n)] == NOTHING) {
3434 DEBUG_PEEP("skip:",n,depth);
3435 NEXT_OFF(scan) += NEXT_OFF(n);
3436 next = n + NODE_STEP_REGNODE;
3443 else if (stringok) {
3444 const unsigned int oldl = STR_LEN(scan);
3445 regnode * const nnext = regnext(n);
3447 /* XXX I (khw) kind of doubt that this works on platforms (should
3448 * Perl ever run on one) where U8_MAX is above 255 because of lots
3449 * of other assumptions */
3450 /* Don't join if the sum can't fit into a single node */
3451 if (oldl + STR_LEN(n) > U8_MAX)
3454 DEBUG_PEEP("merg",n,depth);
3457 NEXT_OFF(scan) += NEXT_OFF(n);
3458 STR_LEN(scan) += STR_LEN(n);
3459 next = n + NODE_SZ_STR(n);
3460 /* Now we can overwrite *n : */
3461 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3469 #ifdef EXPERIMENTAL_INPLACESCAN
3470 if (flags && !NEXT_OFF(n)) {
3471 DEBUG_PEEP("atch", val, depth);
3472 if (reg_off_by_arg[OP(n)]) {
3473 ARG_SET(n, val - n);
3476 NEXT_OFF(n) = val - n;
3484 *unfolded_multi_char = FALSE;
3486 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3487 * can now analyze for sequences of problematic code points. (Prior to
3488 * this final joining, sequences could have been split over boundaries, and
3489 * hence missed). The sequences only happen in folding, hence for any
3490 * non-EXACT EXACTish node */
3491 if (OP(scan) != EXACT) {
3492 U8* s0 = (U8*) STRING(scan);
3494 U8* s_end = s0 + STR_LEN(scan);
3496 int total_count_delta = 0; /* Total delta number of characters that
3497 multi-char folds expand to */
3499 /* One pass is made over the node's string looking for all the
3500 * possibilities. To avoid some tests in the loop, there are two main
3501 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3506 if (OP(scan) == EXACTFL) {
3509 /* An EXACTFL node would already have been changed to another
3510 * node type unless there is at least one character in it that
3511 * is problematic; likely a character whose fold definition
3512 * won't be known until runtime, and so has yet to be folded.
3513 * For all but the UTF-8 locale, folds are 1-1 in length, but
3514 * to handle the UTF-8 case, we need to create a temporary
3515 * folded copy using UTF-8 locale rules in order to analyze it.
3516 * This is because our macros that look to see if a sequence is
3517 * a multi-char fold assume everything is folded (otherwise the
3518 * tests in those macros would be too complicated and slow).
3519 * Note that here, the non-problematic folds will have already
3520 * been done, so we can just copy such characters. We actually
3521 * don't completely fold the EXACTFL string. We skip the
3522 * unfolded multi-char folds, as that would just create work
3523 * below to figure out the size they already are */
3525 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3528 STRLEN s_len = UTF8SKIP(s);
3529 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3530 Copy(s, d, s_len, U8);
3533 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3534 *unfolded_multi_char = TRUE;
3535 Copy(s, d, s_len, U8);
3538 else if (isASCII(*s)) {
3539 *(d++) = toFOLD(*s);
3543 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3549 /* Point the remainder of the routine to look at our temporary
3553 } /* End of creating folded copy of EXACTFL string */
3555 /* Examine the string for a multi-character fold sequence. UTF-8
3556 * patterns have all characters pre-folded by the time this code is
3558 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3559 length sequence we are looking for is 2 */
3561 int count = 0; /* How many characters in a multi-char fold */
3562 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3563 if (! len) { /* Not a multi-char fold: get next char */
3568 /* Nodes with 'ss' require special handling, except for
3569 * EXACTFA-ish for which there is no multi-char fold to this */
3570 if (len == 2 && *s == 's' && *(s+1) == 's'
3571 && OP(scan) != EXACTFA
3572 && OP(scan) != EXACTFA_NO_TRIE)
3575 if (OP(scan) != EXACTFL) {
3576 OP(scan) = EXACTFU_SS;
3580 else { /* Here is a generic multi-char fold. */
3581 U8* multi_end = s + len;
3583 /* Count how many characters are in it. In the case of
3584 * /aa, no folds which contain ASCII code points are
3585 * allowed, so check for those, and skip if found. */
3586 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3587 count = utf8_length(s, multi_end);
3591 while (s < multi_end) {
3594 goto next_iteration;
3604 /* The delta is how long the sequence is minus 1 (1 is how long
3605 * the character that folds to the sequence is) */
3606 total_count_delta += count - 1;
3610 /* We created a temporary folded copy of the string in EXACTFL
3611 * nodes. Therefore we need to be sure it doesn't go below zero,
3612 * as the real string could be shorter */
3613 if (OP(scan) == EXACTFL) {
3614 int total_chars = utf8_length((U8*) STRING(scan),
3615 (U8*) STRING(scan) + STR_LEN(scan));
3616 if (total_count_delta > total_chars) {
3617 total_count_delta = total_chars;
3621 *min_subtract += total_count_delta;
3624 else if (OP(scan) == EXACTFA) {
3626 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3627 * fold to the ASCII range (and there are no existing ones in the
3628 * upper latin1 range). But, as outlined in the comments preceding
3629 * this function, we need to flag any occurrences of the sharp s.
3630 * This character forbids trie formation (because of added
3633 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3634 OP(scan) = EXACTFA_NO_TRIE;
3635 *unfolded_multi_char = TRUE;
3644 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3645 * folds that are all Latin1. As explained in the comments
3646 * preceding this function, we look also for the sharp s in EXACTF
3647 * and EXACTFL nodes; it can be in the final position. Otherwise
3648 * we can stop looking 1 byte earlier because have to find at least
3649 * two characters for a multi-fold */
3650 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3655 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3656 if (! len) { /* Not a multi-char fold. */
3657 if (*s == LATIN_SMALL_LETTER_SHARP_S
3658 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3660 *unfolded_multi_char = TRUE;
3667 && isALPHA_FOLD_EQ(*s, 's')
3668 && isALPHA_FOLD_EQ(*(s+1), 's'))
3671 /* EXACTF nodes need to know that the minimum length
3672 * changed so that a sharp s in the string can match this
3673 * ss in the pattern, but they remain EXACTF nodes, as they
3674 * won't match this unless the target string is is UTF-8,
3675 * which we don't know until runtime. EXACTFL nodes can't
3676 * transform into EXACTFU nodes */
3677 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3678 OP(scan) = EXACTFU_SS;
3682 *min_subtract += len - 1;
3689 /* Allow dumping but overwriting the collection of skipped
3690 * ops and/or strings with fake optimized ops */
3691 n = scan + NODE_SZ_STR(scan);
3699 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3703 /* REx optimizer. Converts nodes into quicker variants "in place".
3704 Finds fixed substrings. */
3706 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3707 to the position after last scanned or to NULL. */
3709 #define INIT_AND_WITHP \
3710 assert(!and_withp); \
3711 Newx(and_withp,1, regnode_ssc); \
3712 SAVEFREEPV(and_withp)
3716 S_unwind_scan_frames(pTHX_ const void *p)
3718 scan_frame *f= (scan_frame *)p;
3720 scan_frame *n= f->next_frame;
3728 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3729 SSize_t *minlenp, SSize_t *deltap,
3734 regnode_ssc *and_withp,
3735 U32 flags, U32 depth)
3736 /* scanp: Start here (read-write). */
3737 /* deltap: Write maxlen-minlen here. */
3738 /* last: Stop before this one. */
3739 /* data: string data about the pattern */
3740 /* stopparen: treat close N as END */
3741 /* recursed: which subroutines have we recursed into */
3742 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3744 /* There must be at least this number of characters to match */
3747 regnode *scan = *scanp, *next;
3749 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3750 int is_inf_internal = 0; /* The studied chunk is infinite */
3751 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3752 scan_data_t data_fake;
3753 SV *re_trie_maxbuff = NULL;
3754 regnode *first_non_open = scan;
3755 SSize_t stopmin = SSize_t_MAX;
3756 scan_frame *frame = NULL;
3757 GET_RE_DEBUG_FLAGS_DECL;
3759 PERL_ARGS_ASSERT_STUDY_CHUNK;
3763 while (first_non_open && OP(first_non_open) == OPEN)
3764 first_non_open=regnext(first_non_open);
3770 RExC_study_chunk_recursed_count++;
3772 DEBUG_OPTIMISE_MORE_r(
3774 PerlIO_printf(Perl_debug_log,
3775 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3776 (int)(depth*2), "", (long)stopparen,
3777 (unsigned long)RExC_study_chunk_recursed_count,
3778 (unsigned long)depth, (unsigned long)recursed_depth,
3781 if (recursed_depth) {
3784 for ( j = 0 ; j < recursed_depth ; j++ ) {
3785 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3787 PAREN_TEST(RExC_study_chunk_recursed +
3788 ( j * RExC_study_chunk_recursed_bytes), i )
3791 !PAREN_TEST(RExC_study_chunk_recursed +
3792 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3795 PerlIO_printf(Perl_debug_log," %d",(int)i);
3799 if ( j + 1 < recursed_depth ) {
3800 PerlIO_printf(Perl_debug_log, ",");
3804 PerlIO_printf(Perl_debug_log,"\n");
3807 while ( scan && OP(scan) != END && scan < last ){
3808 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3809 node length to get a real minimum (because
3810 the folded version may be shorter) */
3811 bool unfolded_multi_char = FALSE;
3812 /* Peephole optimizer: */
3813 DEBUG_STUDYDATA("Peep:", data, depth);
3814 DEBUG_PEEP("Peep", scan, depth);
3817 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3818 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3819 * by a different invocation of reg() -- Yves
3821 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3823 /* Follow the next-chain of the current node and optimize
3824 away all the NOTHINGs from it. */
3825 if (OP(scan) != CURLYX) {
3826 const int max = (reg_off_by_arg[OP(scan)]
3828 /* I32 may be smaller than U16 on CRAYs! */
3829 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3830 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3834 /* Skip NOTHING and LONGJMP. */
3835 while ((n = regnext(n))
3836 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3837 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3838 && off + noff < max)
3840 if (reg_off_by_arg[OP(scan)])
3843 NEXT_OFF(scan) = off;
3846 /* The principal pseudo-switch. Cannot be a switch, since we
3847 look into several different things. */
3848 if ( OP(scan) == DEFINEP ) {
3850 SSize_t deltanext = 0;
3851 SSize_t fake_last_close = 0;
3852 I32 f = SCF_IN_DEFINE;
3854 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3855 scan = regnext(scan);
3856 assert( OP(scan) == IFTHEN );
3857 DEBUG_PEEP("expect IFTHEN", scan, depth);
3859 data_fake.last_closep= &fake_last_close;
3861 next = regnext(scan);
3862 scan = NEXTOPER(NEXTOPER(scan));
3863 DEBUG_PEEP("scan", scan, depth);
3864 DEBUG_PEEP("next", next, depth);
3866 /* we suppose the run is continuous, last=next...
3867 * NOTE we dont use the return here! */
3868 (void)study_chunk(pRExC_state, &scan, &minlen,
3869 &deltanext, next, &data_fake, stopparen,
3870 recursed_depth, NULL, f, depth+1);
3875 OP(scan) == BRANCH ||
3876 OP(scan) == BRANCHJ ||
3879 next = regnext(scan);
3882 /* The op(next)==code check below is to see if we
3883 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3884 * IFTHEN is special as it might not appear in pairs.
3885 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3886 * we dont handle it cleanly. */
3887 if (OP(next) == code || code == IFTHEN) {
3888 /* NOTE - There is similar code to this block below for
3889 * handling TRIE nodes on a re-study. If you change stuff here
3890 * check there too. */
3891 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3893 regnode * const startbranch=scan;
3895 if (flags & SCF_DO_SUBSTR) {
3896 /* Cannot merge strings after this. */
3897 scan_commit(pRExC_state, data, minlenp, is_inf);
3900 if (flags & SCF_DO_STCLASS)
3901 ssc_init_zero(pRExC_state, &accum);
3903 while (OP(scan) == code) {
3904 SSize_t deltanext, minnext, fake;
3906 regnode_ssc this_class;
3908 DEBUG_PEEP("Branch", scan, depth);
3911 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3913 data_fake.whilem_c = data->whilem_c;
3914 data_fake.last_closep = data->last_closep;
3917 data_fake.last_closep = &fake;
3919 data_fake.pos_delta = delta;
3920 next = regnext(scan);
3922 scan = NEXTOPER(scan); /* everything */
3923 if (code != BRANCH) /* everything but BRANCH */
3924 scan = NEXTOPER(scan);
3926 if (flags & SCF_DO_STCLASS) {
3927 ssc_init(pRExC_state, &this_class);
3928 data_fake.start_class = &this_class;
3929 f = SCF_DO_STCLASS_AND;
3931 if (flags & SCF_WHILEM_VISITED_POS)
3932 f |= SCF_WHILEM_VISITED_POS;
3934 /* we suppose the run is continuous, last=next...*/
3935 minnext = study_chunk(pRExC_state, &scan, minlenp,
3936 &deltanext, next, &data_fake, stopparen,
3937 recursed_depth, NULL, f,depth+1);
3941 if (deltanext == SSize_t_MAX) {
3942 is_inf = is_inf_internal = 1;
3944 } else if (max1 < minnext + deltanext)
3945 max1 = minnext + deltanext;
3947 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3949 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3950 if ( stopmin > minnext)
3951 stopmin = min + min1;
3952 flags &= ~SCF_DO_SUBSTR;
3954 data->flags |= SCF_SEEN_ACCEPT;
3957 if (data_fake.flags & SF_HAS_EVAL)
3958 data->flags |= SF_HAS_EVAL;
3959 data->whilem_c = data_fake.whilem_c;
3961 if (flags & SCF_DO_STCLASS)
3962 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3964 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3966 if (flags & SCF_DO_SUBSTR) {
3967 data->pos_min += min1;
3968 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3969 data->pos_delta = SSize_t_MAX;
3971 data->pos_delta += max1 - min1;
3972 if (max1 != min1 || is_inf)
3973 data->longest = &(data->longest_float);
3976 if (delta == SSize_t_MAX
3977 || SSize_t_MAX - delta - (max1 - min1) < 0)
3978 delta = SSize_t_MAX;
3980 delta += max1 - min1;
3981 if (flags & SCF_DO_STCLASS_OR) {
3982 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3984 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3985 flags &= ~SCF_DO_STCLASS;
3988 else if (flags & SCF_DO_STCLASS_AND) {
3990 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3991 flags &= ~SCF_DO_STCLASS;
3994 /* Switch to OR mode: cache the old value of
3995 * data->start_class */
3997 StructCopy(data->start_class, and_withp, regnode_ssc);
3998 flags &= ~SCF_DO_STCLASS_AND;
3999 StructCopy(&accum, data->start_class, regnode_ssc);
4000 flags |= SCF_DO_STCLASS_OR;
4004 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4005 OP( startbranch ) == BRANCH )
4009 Assuming this was/is a branch we are dealing with: 'scan'
4010 now points at the item that follows the branch sequence,
4011 whatever it is. We now start at the beginning of the
4012 sequence and look for subsequences of
4018 which would be constructed from a pattern like
4021 If we can find such a subsequence we need to turn the first
4022 element into a trie and then add the subsequent branch exact
4023 strings to the trie.
4027 1. patterns where the whole set of branches can be
4030 2. patterns where only a subset can be converted.
4032 In case 1 we can replace the whole set with a single regop
4033 for the trie. In case 2 we need to keep the start and end
4036 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4037 becomes BRANCH TRIE; BRANCH X;
4039 There is an additional case, that being where there is a
4040 common prefix, which gets split out into an EXACT like node
4041 preceding the TRIE node.
4043 If x(1..n)==tail then we can do a simple trie, if not we make
4044 a "jump" trie, such that when we match the appropriate word
4045 we "jump" to the appropriate tail node. Essentially we turn
4046 a nested if into a case structure of sorts.
4051 if (!re_trie_maxbuff) {
4052 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4053 if (!SvIOK(re_trie_maxbuff))
4054 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4056 if ( SvIV(re_trie_maxbuff)>=0 ) {
4058 regnode *first = (regnode *)NULL;
4059 regnode *last = (regnode *)NULL;
4060 regnode *tail = scan;
4064 /* var tail is used because there may be a TAIL
4065 regop in the way. Ie, the exacts will point to the
4066 thing following the TAIL, but the last branch will
4067 point at the TAIL. So we advance tail. If we
4068 have nested (?:) we may have to move through several
4072 while ( OP( tail ) == TAIL ) {
4073 /* this is the TAIL generated by (?:) */
4074 tail = regnext( tail );
4078 DEBUG_TRIE_COMPILE_r({
4079 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4080 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4081 (int)depth * 2 + 2, "",
4082 "Looking for TRIE'able sequences. Tail node is: ",
4083 SvPV_nolen_const( RExC_mysv )
4089 Step through the branches
4090 cur represents each branch,
4091 noper is the first thing to be matched as part
4093 noper_next is the regnext() of that node.
4095 We normally handle a case like this
4096 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4097 support building with NOJUMPTRIE, which restricts
4098 the trie logic to structures like /FOO|BAR/.
4100 If noper is a trieable nodetype then the branch is
4101 a possible optimization target. If we are building
4102 under NOJUMPTRIE then we require that noper_next is
4103 the same as scan (our current position in the regex
4106 Once we have two or more consecutive such branches
4107 we can create a trie of the EXACT's contents and
4108 stitch it in place into the program.
4110 If the sequence represents all of the branches in
4111 the alternation we replace the entire thing with a
4114 Otherwise when it is a subsequence we need to
4115 stitch it in place and replace only the relevant
4116 branches. This means the first branch has to remain
4117 as it is used by the alternation logic, and its
4118 next pointer, and needs to be repointed at the item
4119 on the branch chain following the last branch we
4120 have optimized away.
4122 This could be either a BRANCH, in which case the
4123 subsequence is internal, or it could be the item
4124 following the branch sequence in which case the
4125 subsequence is at the end (which does not
4126 necessarily mean the first node is the start of the
4129 TRIE_TYPE(X) is a define which maps the optype to a
4133 ----------------+-----------
4137 EXACTFU_SS | EXACTFU
4142 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
4143 ( EXACT == (X) ) ? EXACT : \
4144 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
4145 ( EXACTFA == (X) ) ? EXACTFA : \
4148 /* dont use tail as the end marker for this traverse */
4149 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4150 regnode * const noper = NEXTOPER( cur );
4151 U8 noper_type = OP( noper );
4152 U8 noper_trietype = TRIE_TYPE( noper_type );
4153 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4154 regnode * const noper_next = regnext( noper );
4155 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4156 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4159 DEBUG_TRIE_COMPILE_r({
4160 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4161 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4162 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4164 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4165 PerlIO_printf( Perl_debug_log, " -> %s",
4166 SvPV_nolen_const(RExC_mysv));
4169 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4170 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4171 SvPV_nolen_const(RExC_mysv));
4173 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4174 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4175 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4179 /* Is noper a trieable nodetype that can be merged
4180 * with the current trie (if there is one)? */
4184 ( noper_trietype == NOTHING)
4185 || ( trietype == NOTHING )
4186 || ( trietype == noper_trietype )
4189 && noper_next == tail
4193 /* Handle mergable triable node Either we are
4194 * the first node in a new trieable sequence,
4195 * in which case we do some bookkeeping,
4196 * otherwise we update the end pointer. */
4199 if ( noper_trietype == NOTHING ) {
4200 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4201 regnode * const noper_next = regnext( noper );
4202 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4203 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4206 if ( noper_next_trietype ) {
4207 trietype = noper_next_trietype;
4208 } else if (noper_next_type) {
4209 /* a NOTHING regop is 1 regop wide.
4210 * We need at least two for a trie
4211 * so we can't merge this in */
4215 trietype = noper_trietype;
4218 if ( trietype == NOTHING )
4219 trietype = noper_trietype;
4224 } /* end handle mergable triable node */
4226 /* handle unmergable node -
4227 * noper may either be a triable node which can
4228 * not be tried together with the current trie,
4229 * or a non triable node */
4231 /* If last is set and trietype is not
4232 * NOTHING then we have found at least two
4233 * triable branch sequences in a row of a
4234 * similar trietype so we can turn them
4235 * into a trie. If/when we allow NOTHING to
4236 * start a trie sequence this condition
4237 * will be required, and it isn't expensive
4238 * so we leave it in for now. */
4239 if ( trietype && trietype != NOTHING )
4240 make_trie( pRExC_state,
4241 startbranch, first, cur, tail,
4242 count, trietype, depth+1 );
4243 last = NULL; /* note: we clear/update
4244 first, trietype etc below,
4245 so we dont do it here */
4249 && noper_next == tail
4252 /* noper is triable, so we can start a new
4256 trietype = noper_trietype;
4258 /* if we already saw a first but the
4259 * current node is not triable then we have
4260 * to reset the first information. */
4265 } /* end handle unmergable node */
4266 } /* loop over branches */
4267 DEBUG_TRIE_COMPILE_r({
4268 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4269 PerlIO_printf( Perl_debug_log,
4270 "%*s- %s (%d) <SCAN FINISHED>\n",
4272 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4275 if ( last && trietype ) {
4276 if ( trietype != NOTHING ) {
4277 /* the last branch of the sequence was part of
4278 * a trie, so we have to construct it here
4279 * outside of the loop */
4280 made= make_trie( pRExC_state, startbranch,
4281 first, scan, tail, count,
4282 trietype, depth+1 );
4283 #ifdef TRIE_STUDY_OPT
4284 if ( ((made == MADE_EXACT_TRIE &&
4285 startbranch == first)
4286 || ( first_non_open == first )) &&
4288 flags |= SCF_TRIE_RESTUDY;
4289 if ( startbranch == first
4292 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4297 /* at this point we know whatever we have is a
4298 * NOTHING sequence/branch AND if 'startbranch'
4299 * is 'first' then we can turn the whole thing
4302 if ( startbranch == first ) {
4304 /* the entire thing is a NOTHING sequence,
4305 * something like this: (?:|) So we can
4306 * turn it into a plain NOTHING op. */
4307 DEBUG_TRIE_COMPILE_r({
4308 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309 PerlIO_printf( Perl_debug_log,
4310 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4311 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4314 OP(startbranch)= NOTHING;
4315 NEXT_OFF(startbranch)= tail - startbranch;
4316 for ( opt= startbranch + 1; opt < tail ; opt++ )
4320 } /* end if ( last) */
4321 } /* TRIE_MAXBUF is non zero */
4326 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4327 scan = NEXTOPER(NEXTOPER(scan));
4328 } else /* single branch is optimized. */
4329 scan = NEXTOPER(scan);
4331 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4333 regnode *start = NULL;
4334 regnode *end = NULL;
4335 U32 my_recursed_depth= recursed_depth;
4338 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4339 /* Do setup, note this code has side effects beyond
4340 * the rest of this block. Specifically setting
4341 * RExC_recurse[] must happen at least once during
4343 if (OP(scan) == GOSUB) {
4345 RExC_recurse[ARG2L(scan)] = scan;
4346 start = RExC_open_parens[paren-1];
4347 end = RExC_close_parens[paren-1];
4349 start = RExC_rxi->program + 1;
4352 /* NOTE we MUST always execute the above code, even
4353 * if we do nothing with a GOSUB/GOSTART */
4355 ( flags & SCF_IN_DEFINE )
4358 (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4360 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4363 /* no need to do anything here if we are in a define. */
4364 /* or we are after some kind of infinite construct
4365 * so we can skip recursing into this item.
4366 * Since it is infinite we will not change the maxlen
4367 * or delta, and if we miss something that might raise
4368 * the minlen it will merely pessimise a little.
4370 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4371 * might result in a minlen of 1 and not of 4,
4372 * but this doesn't make us mismatch, just try a bit
4373 * harder than we should.
4375 scan= regnext(scan);
4382 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4384 /* it is quite possible that there are more efficient ways
4385 * to do this. We maintain a bitmap per level of recursion
4386 * of which patterns we have entered so we can detect if a
4387 * pattern creates a possible infinite loop. When we
4388 * recurse down a level we copy the previous levels bitmap
4389 * down. When we are at recursion level 0 we zero the top
4390 * level bitmap. It would be nice to implement a different
4391 * more efficient way of doing this. In particular the top
4392 * level bitmap may be unnecessary.
4394 if (!recursed_depth) {
4395 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4397 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4398 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4399 RExC_study_chunk_recursed_bytes, U8);
4401 /* we havent recursed into this paren yet, so recurse into it */
4402 DEBUG_STUDYDATA("set:", data,depth);
4403 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4404 my_recursed_depth= recursed_depth + 1;
4406 DEBUG_STUDYDATA("inf:", data,depth);
4407 /* some form of infinite recursion, assume infinite length
4409 if (flags & SCF_DO_SUBSTR) {
4410 scan_commit(pRExC_state, data, minlenp, is_inf);
4411 data->longest = &(data->longest_float);
4413 is_inf = is_inf_internal = 1;
4414 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4415 ssc_anything(data->start_class);
4416 flags &= ~SCF_DO_STCLASS;
4418 start= NULL; /* reset start so we dont recurse later on. */
4423 end = regnext(scan);
4426 scan_frame *newframe;
4428 if (!RExC_frame_last) {
4429 Newxz(newframe, 1, scan_frame);
4430 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4431 RExC_frame_head= newframe;
4433 } else if (!RExC_frame_last->next_frame) {
4434 Newxz(newframe,1,scan_frame);
4435 RExC_frame_last->next_frame= newframe;
4436 newframe->prev_frame= RExC_frame_last;
4439 newframe= RExC_frame_last->next_frame;
4441 RExC_frame_last= newframe;
4443 newframe->next_regnode = regnext(scan);
4444 newframe->last_regnode = last;
4445 newframe->stopparen = stopparen;
4446 newframe->prev_recursed_depth = recursed_depth;
4447 newframe->this_prev_frame= frame;
4449 DEBUG_STUDYDATA("frame-new:",data,depth);
4450 DEBUG_PEEP("fnew", scan, depth);
4457 recursed_depth= my_recursed_depth;
4462 else if (OP(scan) == EXACT) {
4463 SSize_t l = STR_LEN(scan);
4466 const U8 * const s = (U8*)STRING(scan);
4467 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4468 l = utf8_length(s, s + l);
4470 uc = *((U8*)STRING(scan));
4473 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4474 /* The code below prefers earlier match for fixed
4475 offset, later match for variable offset. */
4476 if (data->last_end == -1) { /* Update the start info. */
4477 data->last_start_min = data->pos_min;
4478 data->last_start_max = is_inf
4479 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4481 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4483 SvUTF8_on(data->last_found);
4485 SV * const sv = data->last_found;
4486 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4487 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4488 if (mg && mg->mg_len >= 0)
4489 mg->mg_len += utf8_length((U8*)STRING(scan),
4490 (U8*)STRING(scan)+STR_LEN(scan));
4492 data->last_end = data->pos_min + l;
4493 data->pos_min += l; /* As in the first entry. */
4494 data->flags &= ~SF_BEFORE_EOL;
4497 /* ANDing the code point leaves at most it, and not in locale, and
4498 * can't match null string */
4499 if (flags & SCF_DO_STCLASS_AND) {
4500 ssc_cp_and(data->start_class, uc);
4501 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4502 ssc_clear_locale(data->start_class);
4504 else if (flags & SCF_DO_STCLASS_OR) {
4505 ssc_add_cp(data->start_class, uc);
4506 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4508 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4509 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4511 flags &= ~SCF_DO_STCLASS;
4513 else if (PL_regkind[OP(scan)] == EXACT) {
4514 /* But OP != EXACT!, so is EXACTFish */
4515 SSize_t l = STR_LEN(scan);
4516 UV uc = *((U8*)STRING(scan));
4517 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4518 separate code points */
4519 const U8 * s = (U8*)STRING(scan);
4521 /* Search for fixed substrings supports EXACT only. */
4522 if (flags & SCF_DO_SUBSTR) {
4524 scan_commit(pRExC_state, data, minlenp, is_inf);
4527 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4528 l = utf8_length(s, s + l);
4530 if (unfolded_multi_char) {
4531 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4533 min += l - min_subtract;
4535 delta += min_subtract;
4536 if (flags & SCF_DO_SUBSTR) {
4537 data->pos_min += l - min_subtract;
4538 if (data->pos_min < 0) {
4541 data->pos_delta += min_subtract;
4543 data->longest = &(data->longest_float);
4547 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4548 ssc_clear_locale(data->start_class);
4553 /* We punt and assume can match anything if the node begins
4554 * with a multi-character fold. Things are complicated. For
4555 * example, /ffi/i could match any of:
4556 * "\N{LATIN SMALL LIGATURE FFI}"
4557 * "\N{LATIN SMALL LIGATURE FF}I"
4558 * "F\N{LATIN SMALL LIGATURE FI}"
4559 * plus several other things; and making sure we have all the
4560 * possibilities is hard. */
4561 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4563 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4567 /* Any Latin1 range character can potentially match any
4568 * other depending on the locale */
4569 if (OP(scan) == EXACTFL) {
4570 _invlist_union(EXACTF_invlist, PL_Latin1,
4574 /* But otherwise, it matches at least itself. We can
4575 * quickly tell if it has a distinct fold, and if so,
4576 * it matches that as well */
4577 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4578 if (IS_IN_SOME_FOLD_L1(uc)) {
4579 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4580 PL_fold_latin1[uc]);
4584 /* Some characters match above-Latin1 ones under /i. This
4585 * is true of EXACTFL ones when the locale is UTF-8 */
4586 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4587 && (! isASCII(uc) || (OP(scan) != EXACTFA
4588 && OP(scan) != EXACTFA_NO_TRIE)))
4590 add_above_Latin1_folds(pRExC_state,
4596 else { /* Pattern is UTF-8 */
4597 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4598 STRLEN foldlen = UTF8SKIP(s);
4599 const U8* e = s + STR_LEN(scan);
4602 /* The only code points that aren't folded in a UTF EXACTFish
4603 * node are are the problematic ones in EXACTFL nodes */
4604 if (OP(scan) == EXACTFL
4605 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4607 /* We need to check for the possibility that this EXACTFL
4608 * node begins with a multi-char fold. Therefore we fold
4609 * the first few characters of it so that we can make that
4614 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4616 *(d++) = (U8) toFOLD(*s);
4621 to_utf8_fold(s, d, &len);
4627 /* And set up so the code below that looks in this folded
4628 * buffer instead of the node's string */
4630 foldlen = UTF8SKIP(folded);
4634 /* When we reach here 's' points to the fold of the first
4635 * character(s) of the node; and 'e' points to far enough along
4636 * the folded string to be just past any possible multi-char
4637 * fold. 'foldlen' is the length in bytes of the first
4640 * Unlike the non-UTF-8 case, the macro for determining if a
4641 * string is a multi-char fold requires all the characters to
4642 * already be folded. This is because of all the complications
4643 * if not. Note that they are folded anyway, except in EXACTFL
4644 * nodes. Like the non-UTF case above, we punt if the node
4645 * begins with a multi-char fold */
4647 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4649 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4651 else { /* Single char fold */
4653 /* It matches all the things that fold to it, which are
4654 * found in PL_utf8_foldclosures (including itself) */
4655 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4656 if (! PL_utf8_foldclosures) {
4657 _load_PL_utf8_foldclosures();
4659 if ((listp = hv_fetch(PL_utf8_foldclosures,
4660 (char *) s, foldlen, FALSE)))
4662 AV* list = (AV*) *listp;
4664 for (k = 0; k <= av_tindex(list); k++) {
4665 SV** c_p = av_fetch(list, k, FALSE);
4671 /* /aa doesn't allow folds between ASCII and non- */
4672 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4673 && isASCII(c) != isASCII(uc))
4678 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4683 if (flags & SCF_DO_STCLASS_AND) {
4684 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685 ANYOF_POSIXL_ZERO(data->start_class);
4686 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4688 else if (flags & SCF_DO_STCLASS_OR) {
4689 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4690 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4692 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4693 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4695 flags &= ~SCF_DO_STCLASS;
4696 SvREFCNT_dec(EXACTF_invlist);
4698 else if (REGNODE_VARIES(OP(scan))) {
4699 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4700 I32 fl = 0, f = flags;
4701 regnode * const oscan = scan;
4702 regnode_ssc this_class;
4703 regnode_ssc *oclass = NULL;
4704 I32 next_is_eval = 0;
4706 switch (PL_regkind[OP(scan)]) {
4707 case WHILEM: /* End of (?:...)* . */
4708 scan = NEXTOPER(scan);
4711 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4712 next = NEXTOPER(scan);
4713 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4715 maxcount = REG_INFTY;
4716 next = regnext(scan);
4717 scan = NEXTOPER(scan);
4721 if (flags & SCF_DO_SUBSTR)
4726 if (flags & SCF_DO_STCLASS) {
4728 maxcount = REG_INFTY;
4729 next = regnext(scan);
4730 scan = NEXTOPER(scan);
4733 if (flags & SCF_DO_SUBSTR) {
4734 scan_commit(pRExC_state, data, minlenp, is_inf);
4735 /* Cannot extend fixed substrings */
4736 data->longest = &(data->longest_float);
4738 is_inf = is_inf_internal = 1;
4739 scan = regnext(scan);
4740 goto optimize_curly_tail;
4742 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4743 && (scan->flags == stopparen))
4748 mincount = ARG1(scan);
4749 maxcount = ARG2(scan);
4751 next = regnext(scan);
4752 if (OP(scan) == CURLYX) {
4753 I32 lp = (data ? *(data->last_closep) : 0);
4754 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4756 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4757 next_is_eval = (OP(scan) == EVAL);
4759 if (flags & SCF_DO_SUBSTR) {
4761 scan_commit(pRExC_state, data, minlenp, is_inf);
4762 /* Cannot extend fixed substrings */
4763 pos_before = data->pos_min;
4767 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4769 data->flags |= SF_IS_INF;
4771 if (flags & SCF_DO_STCLASS) {
4772 ssc_init(pRExC_state, &this_class);
4773 oclass = data->start_class;
4774 data->start_class = &this_class;
4775 f |= SCF_DO_STCLASS_AND;
4776 f &= ~SCF_DO_STCLASS_OR;
4778 /* Exclude from super-linear cache processing any {n,m}
4779 regops for which the combination of input pos and regex
4780 pos is not enough information to determine if a match
4783 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4784 regex pos at the \s*, the prospects for a match depend not
4785 only on the input position but also on how many (bar\s*)
4786 repeats into the {4,8} we are. */
4787 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4788 f &= ~SCF_WHILEM_VISITED_POS;
4790 /* This will finish on WHILEM, setting scan, or on NULL: */
4791 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4792 last, data, stopparen, recursed_depth, NULL,
4794 ? (f & ~SCF_DO_SUBSTR)
4798 if (flags & SCF_DO_STCLASS)
4799 data->start_class = oclass;
4800 if (mincount == 0 || minnext == 0) {
4801 if (flags & SCF_DO_STCLASS_OR) {
4802 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4804 else if (flags & SCF_DO_STCLASS_AND) {
4805 /* Switch to OR mode: cache the old value of
4806 * data->start_class */
4808 StructCopy(data->start_class, and_withp, regnode_ssc);
4809 flags &= ~SCF_DO_STCLASS_AND;
4810 StructCopy(&this_class, data->start_class, regnode_ssc);
4811 flags |= SCF_DO_STCLASS_OR;
4812 ANYOF_FLAGS(data->start_class)
4813 |= SSC_MATCHES_EMPTY_STRING;
4815 } else { /* Non-zero len */
4816 if (flags & SCF_DO_STCLASS_OR) {
4817 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4818 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4820 else if (flags & SCF_DO_STCLASS_AND)
4821 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4822 flags &= ~SCF_DO_STCLASS;
4824 if (!scan) /* It was not CURLYX, but CURLY. */
4826 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4827 /* ? quantifier ok, except for (?{ ... }) */
4828 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4829 && (minnext == 0) && (deltanext == 0)
4830 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4831 && maxcount <= REG_INFTY/3) /* Complement check for big
4834 /* Fatal warnings may leak the regexp without this: */
4835 SAVEFREESV(RExC_rx_sv);
4836 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4837 "Quantifier unexpected on zero-length expression "
4838 "in regex m/%"UTF8f"/",
4839 UTF8fARG(UTF, RExC_end - RExC_precomp,
4841 (void)ReREFCNT_inc(RExC_rx_sv);
4844 min += minnext * mincount;
4845 is_inf_internal |= deltanext == SSize_t_MAX
4846 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4847 is_inf |= is_inf_internal;
4849 delta = SSize_t_MAX;
4851 delta += (minnext + deltanext) * maxcount
4852 - minnext * mincount;
4854 /* Try powerful optimization CURLYX => CURLYN. */
4855 if ( OP(oscan) == CURLYX && data
4856 && data->flags & SF_IN_PAR
4857 && !(data->flags & SF_HAS_EVAL)
4858 && !deltanext && minnext == 1 ) {
4859 /* Try to optimize to CURLYN. */
4860 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4861 regnode * const nxt1 = nxt;
4868 if (!REGNODE_SIMPLE(OP(nxt))
4869 && !(PL_regkind[OP(nxt)] == EXACT
4870 && STR_LEN(nxt) == 1))
4876 if (OP(nxt) != CLOSE)
4878 if (RExC_open_parens) {
4879 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4880 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4882 /* Now we know that nxt2 is the only contents: */
4883 oscan->flags = (U8)ARG(nxt);
4885 OP(nxt1) = NOTHING; /* was OPEN. */
4888 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4889 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4890 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4891 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4892 OP(nxt + 1) = OPTIMIZED; /* was count. */
4893 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4898 /* Try optimization CURLYX => CURLYM. */
4899 if ( OP(oscan) == CURLYX && data
4900 && !(data->flags & SF_HAS_PAR)
4901 && !(data->flags & SF_HAS_EVAL)
4902 && !deltanext /* atom is fixed width */
4903 && minnext != 0 /* CURLYM can't handle zero width */
4905 /* Nor characters whose fold at run-time may be
4906 * multi-character */
4907 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4909 /* XXXX How to optimize if data == 0? */
4910 /* Optimize to a simpler form. */
4911 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4915 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4916 && (OP(nxt2) != WHILEM))
4918 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4919 /* Need to optimize away parenths. */
4920 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4921 /* Set the parenth number. */
4922 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4924 oscan->flags = (U8)ARG(nxt);
4925 if (RExC_open_parens) {
4926 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4927 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4929 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4930 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4933 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4934 OP(nxt + 1) = OPTIMIZED; /* was count. */
4935 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4936 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4939 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4940 regnode *nnxt = regnext(nxt1);
4942 if (reg_off_by_arg[OP(nxt1)])
4943 ARG_SET(nxt1, nxt2 - nxt1);
4944 else if (nxt2 - nxt1 < U16_MAX)
4945 NEXT_OFF(nxt1) = nxt2 - nxt1;
4947 OP(nxt) = NOTHING; /* Cannot beautify */
4952 /* Optimize again: */
4953 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4954 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4959 else if ((OP(oscan) == CURLYX)
4960 && (flags & SCF_WHILEM_VISITED_POS)
4961 /* See the comment on a similar expression above.
4962 However, this time it's not a subexpression
4963 we care about, but the expression itself. */
4964 && (maxcount == REG_INFTY)
4965 && data && ++data->whilem_c < 16) {
4966 /* This stays as CURLYX, we can put the count/of pair. */
4967 /* Find WHILEM (as in regexec.c) */
4968 regnode *nxt = oscan + NEXT_OFF(oscan);
4970 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4972 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4973 | (RExC_whilem_seen << 4)); /* On WHILEM */
4975 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4977 if (flags & SCF_DO_SUBSTR) {
4978 SV *last_str = NULL;
4979 STRLEN last_chrs = 0;
4980 int counted = mincount != 0;
4982 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4984 SSize_t b = pos_before >= data->last_start_min
4985 ? pos_before : data->last_start_min;
4987 const char * const s = SvPV_const(data->last_found, l);
4988 SSize_t old = b - data->last_start_min;
4991 old = utf8_hop((U8*)s, old) - (U8*)s;
4993 /* Get the added string: */
4994 last_str = newSVpvn_utf8(s + old, l, UTF);
4995 last_chrs = UTF ? utf8_length((U8*)(s + old),
4996 (U8*)(s + old + l)) : l;
4997 if (deltanext == 0 && pos_before == b) {
4998 /* What was added is a constant string */
5001 SvGROW(last_str, (mincount * l) + 1);
5002 repeatcpy(SvPVX(last_str) + l,
5003 SvPVX_const(last_str), l,
5005 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5006 /* Add additional parts. */
5007 SvCUR_set(data->last_found,
5008 SvCUR(data->last_found) - l);
5009 sv_catsv(data->last_found, last_str);
5011 SV * sv = data->last_found;
5013 SvUTF8(sv) && SvMAGICAL(sv) ?
5014 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5015 if (mg && mg->mg_len >= 0)
5016 mg->mg_len += last_chrs * (mincount-1);
5018 last_chrs *= mincount;
5019 data->last_end += l * (mincount - 1);
5022 /* start offset must point into the last copy */
5023 data->last_start_min += minnext * (mincount - 1);
5024 data->last_start_max += is_inf ? SSize_t_MAX
5025 : (maxcount - 1) * (minnext + data->pos_delta);
5028 /* It is counted once already... */
5029 data->pos_min += minnext * (mincount - counted);
5031 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5032 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5033 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5034 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5036 if (deltanext != SSize_t_MAX)
5037 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5038 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5039 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5041 if (deltanext == SSize_t_MAX
5042 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5043 data->pos_delta = SSize_t_MAX;
5045 data->pos_delta += - counted * deltanext +
5046 (minnext + deltanext) * maxcount - minnext * mincount;
5047 if (mincount != maxcount) {
5048 /* Cannot extend fixed substrings found inside
5050 scan_commit(pRExC_state, data, minlenp, is_inf);
5051 if (mincount && last_str) {
5052 SV * const sv = data->last_found;
5053 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5054 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5058 sv_setsv(sv, last_str);
5059 data->last_end = data->pos_min;
5060 data->last_start_min = data->pos_min - last_chrs;
5061 data->last_start_max = is_inf
5063 : data->pos_min + data->pos_delta - last_chrs;
5065 data->longest = &(data->longest_float);
5067 SvREFCNT_dec(last_str);
5069 if (data && (fl & SF_HAS_EVAL))
5070 data->flags |= SF_HAS_EVAL;
5071 optimize_curly_tail:
5072 if (OP(oscan) != CURLYX) {
5073 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5075 NEXT_OFF(oscan) += NEXT_OFF(next);
5081 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5086 if (flags & SCF_DO_SUBSTR) {
5087 /* Cannot expect anything... */
5088 scan_commit(pRExC_state, data, minlenp, is_inf);
5089 data->longest = &(data->longest_float);
5091 is_inf = is_inf_internal = 1;
5092 if (flags & SCF_DO_STCLASS_OR) {
5093 if (OP(scan) == CLUMP) {
5094 /* Actually is any start char, but very few code points
5095 * aren't start characters */
5096 ssc_match_all_cp(data->start_class);
5099 ssc_anything(data->start_class);
5102 flags &= ~SCF_DO_STCLASS;
5106 else if (OP(scan) == LNBREAK) {
5107 if (flags & SCF_DO_STCLASS) {
5108 if (flags & SCF_DO_STCLASS_AND) {
5109 ssc_intersection(data->start_class,
5110 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5111 ssc_clear_locale(data->start_class);
5112 ANYOF_FLAGS(data->start_class)
5113 &= ~SSC_MATCHES_EMPTY_STRING;
5115 else if (flags & SCF_DO_STCLASS_OR) {
5116 ssc_union(data->start_class,
5117 PL_XPosix_ptrs[_CC_VERTSPACE],
5119 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5121 /* See commit msg for
5122 * 749e076fceedeb708a624933726e7989f2302f6a */
5123 ANYOF_FLAGS(data->start_class)
5124 &= ~SSC_MATCHES_EMPTY_STRING;
5126 flags &= ~SCF_DO_STCLASS;
5129 delta++; /* Because of the 2 char string cr-lf */
5130 if (flags & SCF_DO_SUBSTR) {
5131 /* Cannot expect anything... */
5132 scan_commit(pRExC_state, data, minlenp, is_inf);
5134 data->pos_delta += 1;
5135 data->longest = &(data->longest_float);
5138 else if (REGNODE_SIMPLE(OP(scan))) {
5140 if (flags & SCF_DO_SUBSTR) {
5141 scan_commit(pRExC_state, data, minlenp, is_inf);
5145 if (flags & SCF_DO_STCLASS) {
5147 SV* my_invlist = NULL;
5150 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5151 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5153 /* Some of the logic below assumes that switching
5154 locale on will only add false positives. */
5159 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5164 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5165 ssc_match_all_cp(data->start_class);
5170 SV* REG_ANY_invlist = _new_invlist(2);
5171 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5173 if (flags & SCF_DO_STCLASS_OR) {
5174 ssc_union(data->start_class,
5176 TRUE /* TRUE => invert, hence all but \n
5180 else if (flags & SCF_DO_STCLASS_AND) {
5181 ssc_intersection(data->start_class,
5183 TRUE /* TRUE => invert */
5185 ssc_clear_locale(data->start_class);
5187 SvREFCNT_dec_NN(REG_ANY_invlist);
5192 if (flags & SCF_DO_STCLASS_AND)
5193 ssc_and(pRExC_state, data->start_class,
5194 (regnode_charclass *) scan);
5196 ssc_or(pRExC_state, data->start_class,
5197 (regnode_charclass *) scan);
5205 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5206 if (flags & SCF_DO_STCLASS_AND) {
5207 bool was_there = cBOOL(
5208 ANYOF_POSIXL_TEST(data->start_class,
5210 ANYOF_POSIXL_ZERO(data->start_class);
5211 if (was_there) { /* Do an AND */
5212 ANYOF_POSIXL_SET(data->start_class, namedclass);
5214 /* No individual code points can now match */
5215 data->start_class->invlist
5216 = sv_2mortal(_new_invlist(0));
5219 int complement = namedclass + ((invert) ? -1 : 1);
5221 assert(flags & SCF_DO_STCLASS_OR);
5223 /* If the complement of this class was already there,
5224 * the result is that they match all code points,
5225 * (\d + \D == everything). Remove the classes from
5226 * future consideration. Locale is not relevant in
5228 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5229 ssc_match_all_cp(data->start_class);
5230 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5231 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5233 else { /* The usual case; just add this class to the
5235 ANYOF_POSIXL_SET(data->start_class, namedclass);
5240 case NPOSIXA: /* For these, we always know the exact set of
5245 if (FLAGS(scan) == _CC_ASCII) {
5246 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5249 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5250 PL_XPosix_ptrs[_CC_ASCII],
5261 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5263 /* NPOSIXD matches all upper Latin1 code points unless the
5264 * target string being matched is UTF-8, which is
5265 * unknowable until match time. Since we are going to
5266 * invert, we want to get rid of all of them so that the
5267 * inversion will match all */
5268 if (OP(scan) == NPOSIXD) {
5269 _invlist_subtract(my_invlist, PL_UpperLatin1,
5275 if (flags & SCF_DO_STCLASS_AND) {
5276 ssc_intersection(data->start_class, my_invlist, invert);
5277 ssc_clear_locale(data->start_class);
5280 assert(flags & SCF_DO_STCLASS_OR);
5281 ssc_union(data->start_class, my_invlist, invert);
5283 SvREFCNT_dec(my_invlist);
5285 if (flags & SCF_DO_STCLASS_OR)
5286 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5287 flags &= ~SCF_DO_STCLASS;
5290 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5291 data->flags |= (OP(scan) == MEOL
5294 scan_commit(pRExC_state, data, minlenp, is_inf);
5297 else if ( PL_regkind[OP(scan)] == BRANCHJ
5298 /* Lookbehind, or need to calculate parens/evals/stclass: */
5299 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5300 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5302 if ( OP(scan) == UNLESSM &&
5304 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5305 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5308 regnode *upto= regnext(scan);
5310 DEBUG_STUDYDATA("OPFAIL",data,depth);
5312 /*DEBUG_PARSE_MSG("opfail");*/
5313 regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state);
5314 PerlIO_printf(Perl_debug_log,
5315 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5316 SvPV_nolen_const(RExC_mysv),
5317 (IV)REG_NODE_NUM(upto),
5322 NEXT_OFF(scan) = upto - scan;
5323 for (opt= scan + 1; opt < upto ; opt++)
5324 OP(opt) = OPTIMIZED;
5328 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5329 || OP(scan) == UNLESSM )
5331 /* Negative Lookahead/lookbehind
5332 In this case we can't do fixed string optimisation.
5335 SSize_t deltanext, minnext, fake = 0;
5340 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5342 data_fake.whilem_c = data->whilem_c;
5343 data_fake.last_closep = data->last_closep;
5346 data_fake.last_closep = &fake;
5347 data_fake.pos_delta = delta;
5348 if ( flags & SCF_DO_STCLASS && !scan->flags
5349 && OP(scan) == IFMATCH ) { /* Lookahead */
5350 ssc_init(pRExC_state, &intrnl);
5351 data_fake.start_class = &intrnl;
5352 f |= SCF_DO_STCLASS_AND;
5354 if (flags & SCF_WHILEM_VISITED_POS)
5355 f |= SCF_WHILEM_VISITED_POS;
5356 next = regnext(scan);
5357 nscan = NEXTOPER(NEXTOPER(scan));
5358 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5359 last, &data_fake, stopparen,
5360 recursed_depth, NULL, f, depth+1);
5363 FAIL("Variable length lookbehind not implemented");
5365 else if (minnext > (I32)U8_MAX) {
5366 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5369 scan->flags = (U8)minnext;
5372 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5374 if (data_fake.flags & SF_HAS_EVAL)
5375 data->flags |= SF_HAS_EVAL;
5376 data->whilem_c = data_fake.whilem_c;
5378 if (f & SCF_DO_STCLASS_AND) {
5379 if (flags & SCF_DO_STCLASS_OR) {
5380 /* OR before, AND after: ideally we would recurse with
5381 * data_fake to get the AND applied by study of the
5382 * remainder of the pattern, and then derecurse;
5383 * *** HACK *** for now just treat as "no information".
5384 * See [perl #56690].
5386 ssc_init(pRExC_state, data->start_class);
5388 /* AND before and after: combine and continue. These
5389 * assertions are zero-length, so can match an EMPTY
5391 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5392 ANYOF_FLAGS(data->start_class)
5393 |= SSC_MATCHES_EMPTY_STRING;
5397 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5399 /* Positive Lookahead/lookbehind
5400 In this case we can do fixed string optimisation,
5401 but we must be careful about it. Note in the case of
5402 lookbehind the positions will be offset by the minimum
5403 length of the pattern, something we won't know about
5404 until after the recurse.
5406 SSize_t deltanext, fake = 0;
5410 /* We use SAVEFREEPV so that when the full compile
5411 is finished perl will clean up the allocated
5412 minlens when it's all done. This way we don't
5413 have to worry about freeing them when we know
5414 they wont be used, which would be a pain.
5417 Newx( minnextp, 1, SSize_t );
5418 SAVEFREEPV(minnextp);
5421 StructCopy(data, &data_fake, scan_data_t);
5422 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5425 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5426 data_fake.last_found=newSVsv(data->last_found);
5430 data_fake.last_closep = &fake;
5431 data_fake.flags = 0;
5432 data_fake.pos_delta = delta;
5434 data_fake.flags |= SF_IS_INF;
5435 if ( flags & SCF_DO_STCLASS && !scan->flags
5436 && OP(scan) == IFMATCH ) { /* Lookahead */
5437 ssc_init(pRExC_state, &intrnl);
5438 data_fake.start_class = &intrnl;
5439 f |= SCF_DO_STCLASS_AND;
5441 if (flags & SCF_WHILEM_VISITED_POS)
5442 f |= SCF_WHILEM_VISITED_POS;
5443 next = regnext(scan);
5444 nscan = NEXTOPER(NEXTOPER(scan));
5446 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5447 &deltanext, last, &data_fake,
5448 stopparen, recursed_depth, NULL,
5452 FAIL("Variable length lookbehind not implemented");
5454 else if (*minnextp > (I32)U8_MAX) {
5455 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5458 scan->flags = (U8)*minnextp;
5463 if (f & SCF_DO_STCLASS_AND) {
5464 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5465 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5468 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5470 if (data_fake.flags & SF_HAS_EVAL)
5471 data->flags |= SF_HAS_EVAL;
5472 data->whilem_c = data_fake.whilem_c;
5473 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5474 if (RExC_rx->minlen<*minnextp)
5475 RExC_rx->minlen=*minnextp;
5476 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5477 SvREFCNT_dec_NN(data_fake.last_found);
5479 if ( data_fake.minlen_fixed != minlenp )
5481 data->offset_fixed= data_fake.offset_fixed;
5482 data->minlen_fixed= data_fake.minlen_fixed;
5483 data->lookbehind_fixed+= scan->flags;
5485 if ( data_fake.minlen_float != minlenp )
5487 data->minlen_float= data_fake.minlen_float;
5488 data->offset_float_min=data_fake.offset_float_min;
5489 data->offset_float_max=data_fake.offset_float_max;
5490 data->lookbehind_float+= scan->flags;
5497 else if (OP(scan) == OPEN) {
5498 if (stopparen != (I32)ARG(scan))
5501 else if (OP(scan) == CLOSE) {
5502 if (stopparen == (I32)ARG(scan)) {
5505 if ((I32)ARG(scan) == is_par) {
5506 next = regnext(scan);
5508 if ( next && (OP(next) != WHILEM) && next < last)
5509 is_par = 0; /* Disable optimization */
5512 *(data->last_closep) = ARG(scan);
5514 else if (OP(scan) == EVAL) {
5516 data->flags |= SF_HAS_EVAL;
5518 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5519 if (flags & SCF_DO_SUBSTR) {
5520 scan_commit(pRExC_state, data, minlenp, is_inf);
5521 flags &= ~SCF_DO_SUBSTR;
5523 if (data && OP(scan)==ACCEPT) {
5524 data->flags |= SCF_SEEN_ACCEPT;
5529 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5531 if (flags & SCF_DO_SUBSTR) {
5532 scan_commit(pRExC_state, data, minlenp, is_inf);
5533 data->longest = &(data->longest_float);
5535 is_inf = is_inf_internal = 1;
5536 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5537 ssc_anything(data->start_class);
5538 flags &= ~SCF_DO_STCLASS;
5540 else if (OP(scan) == GPOS) {
5541 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5542 !(delta || is_inf || (data && data->pos_delta)))
5544 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5545 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5546 if (RExC_rx->gofs < (STRLEN)min)
5547 RExC_rx->gofs = min;
5549 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5553 #ifdef TRIE_STUDY_OPT
5554 #ifdef FULL_TRIE_STUDY
5555 else if (PL_regkind[OP(scan)] == TRIE) {
5556 /* NOTE - There is similar code to this block above for handling
5557 BRANCH nodes on the initial study. If you change stuff here
5559 regnode *trie_node= scan;
5560 regnode *tail= regnext(scan);
5561 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5562 SSize_t max1 = 0, min1 = SSize_t_MAX;
5565 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5566 /* Cannot merge strings after this. */
5567 scan_commit(pRExC_state, data, minlenp, is_inf);
5569 if (flags & SCF_DO_STCLASS)
5570 ssc_init_zero(pRExC_state, &accum);
5576 const regnode *nextbranch= NULL;
5579 for ( word=1 ; word <= trie->wordcount ; word++)
5581 SSize_t deltanext=0, minnext=0, f = 0, fake;
5582 regnode_ssc this_class;
5584 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5586 data_fake.whilem_c = data->whilem_c;
5587 data_fake.last_closep = data->last_closep;
5590 data_fake.last_closep = &fake;
5591 data_fake.pos_delta = delta;
5592 if (flags & SCF_DO_STCLASS) {
5593 ssc_init(pRExC_state, &this_class);
5594 data_fake.start_class = &this_class;
5595 f = SCF_DO_STCLASS_AND;
5597 if (flags & SCF_WHILEM_VISITED_POS)
5598 f |= SCF_WHILEM_VISITED_POS;
5600 if (trie->jump[word]) {
5602 nextbranch = trie_node + trie->jump[0];
5603 scan= trie_node + trie->jump[word];
5604 /* We go from the jump point to the branch that follows
5605 it. Note this means we need the vestigal unused
5606 branches even though they arent otherwise used. */
5607 minnext = study_chunk(pRExC_state, &scan, minlenp,
5608 &deltanext, (regnode *)nextbranch, &data_fake,
5609 stopparen, recursed_depth, NULL, f,depth+1);
5611 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5612 nextbranch= regnext((regnode*)nextbranch);
5614 if (min1 > (SSize_t)(minnext + trie->minlen))
5615 min1 = minnext + trie->minlen;
5616 if (deltanext == SSize_t_MAX) {
5617 is_inf = is_inf_internal = 1;
5619 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5620 max1 = minnext + deltanext + trie->maxlen;
5622 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5624 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5625 if ( stopmin > min + min1)
5626 stopmin = min + min1;
5627 flags &= ~SCF_DO_SUBSTR;
5629 data->flags |= SCF_SEEN_ACCEPT;
5632 if (data_fake.flags & SF_HAS_EVAL)
5633 data->flags |= SF_HAS_EVAL;
5634 data->whilem_c = data_fake.whilem_c;
5636 if (flags & SCF_DO_STCLASS)
5637 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5640 if (flags & SCF_DO_SUBSTR) {
5641 data->pos_min += min1;
5642 data->pos_delta += max1 - min1;
5643 if (max1 != min1 || is_inf)
5644 data->longest = &(data->longest_float);
5647 delta += max1 - min1;
5648 if (flags & SCF_DO_STCLASS_OR) {
5649 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5651 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5652 flags &= ~SCF_DO_STCLASS;
5655 else if (flags & SCF_DO_STCLASS_AND) {
5657 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5658 flags &= ~SCF_DO_STCLASS;
5661 /* Switch to OR mode: cache the old value of
5662 * data->start_class */
5664 StructCopy(data->start_class, and_withp, regnode_ssc);
5665 flags &= ~SCF_DO_STCLASS_AND;
5666 StructCopy(&accum, data->start_class, regnode_ssc);
5667 flags |= SCF_DO_STCLASS_OR;
5674 else if (PL_regkind[OP(scan)] == TRIE) {
5675 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5678 min += trie->minlen;
5679 delta += (trie->maxlen - trie->minlen);
5680 flags &= ~SCF_DO_STCLASS; /* xxx */
5681 if (flags & SCF_DO_SUBSTR) {
5682 /* Cannot expect anything... */
5683 scan_commit(pRExC_state, data, minlenp, is_inf);
5684 data->pos_min += trie->minlen;
5685 data->pos_delta += (trie->maxlen - trie->minlen);
5686 if (trie->maxlen != trie->minlen)
5687 data->longest = &(data->longest_float);
5689 if (trie->jump) /* no more substrings -- for now /grr*/
5690 flags &= ~SCF_DO_SUBSTR;
5692 #endif /* old or new */
5693 #endif /* TRIE_STUDY_OPT */
5695 /* Else: zero-length, ignore. */
5696 scan = regnext(scan);
5698 /* If we are exiting a recursion we can unset its recursed bit
5699 * and allow ourselves to enter it again - no danger of an
5700 * infinite loop there.
5701 if (stopparen > -1 && recursed) {
5702 DEBUG_STUDYDATA("unset:", data,depth);
5703 PAREN_UNSET( recursed, stopparen);
5709 DEBUG_STUDYDATA("frame-end:",data,depth);
5710 DEBUG_PEEP("fend", scan, depth);
5712 /* restore previous context */
5713 last = frame->last_regnode;
5714 scan = frame->next_regnode;
5715 stopparen = frame->stopparen;
5716 recursed_depth = frame->prev_recursed_depth;
5718 RExC_frame_last = frame->prev_frame;
5719 frame = frame->this_prev_frame;
5720 goto fake_study_recurse;
5725 DEBUG_STUDYDATA("pre-fin:",data,depth);
5728 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5730 if (flags & SCF_DO_SUBSTR && is_inf)
5731 data->pos_delta = SSize_t_MAX - data->pos_min;
5732 if (is_par > (I32)U8_MAX)
5734 if (is_par && pars==1 && data) {
5735 data->flags |= SF_IN_PAR;
5736 data->flags &= ~SF_HAS_PAR;
5738 else if (pars && data) {
5739 data->flags |= SF_HAS_PAR;
5740 data->flags &= ~SF_IN_PAR;
5742 if (flags & SCF_DO_STCLASS_OR)
5743 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5744 if (flags & SCF_TRIE_RESTUDY)
5745 data->flags |= SCF_TRIE_RESTUDY;
5747 DEBUG_STUDYDATA("post-fin:",data,depth);
5750 SSize_t final_minlen= min < stopmin ? min : stopmin;
5752 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5753 RExC_maxlen = final_minlen + delta;
5755 return final_minlen;
5761 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5763 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5765 PERL_ARGS_ASSERT_ADD_DATA;
5767 Renewc(RExC_rxi->data,
5768 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5769 char, struct reg_data);
5771 Renew(RExC_rxi->data->what, count + n, U8);
5773 Newx(RExC_rxi->data->what, n, U8);
5774 RExC_rxi->data->count = count + n;
5775 Copy(s, RExC_rxi->data->what + count, n, U8);
5779 /*XXX: todo make this not included in a non debugging perl, but appears to be
5780 * used anyway there, in 'use re' */
5781 #ifndef PERL_IN_XSUB_RE
5783 Perl_reginitcolors(pTHX)
5785 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5787 char *t = savepv(s);
5791 t = strchr(t, '\t');
5797 PL_colors[i] = t = (char *)"";
5802 PL_colors[i++] = (char *)"";
5809 #ifdef TRIE_STUDY_OPT
5810 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5813 (data.flags & SCF_TRIE_RESTUDY) \
5821 #define CHECK_RESTUDY_GOTO_butfirst
5825 * pregcomp - compile a regular expression into internal code
5827 * Decides which engine's compiler to call based on the hint currently in
5831 #ifndef PERL_IN_XSUB_RE
5833 /* return the currently in-scope regex engine (or the default if none) */
5835 regexp_engine const *
5836 Perl_current_re_engine(pTHX)
5838 if (IN_PERL_COMPILETIME) {
5839 HV * const table = GvHV(PL_hintgv);
5842 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5843 return &PL_core_reg_engine;
5844 ptr = hv_fetchs(table, "regcomp", FALSE);
5845 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5846 return &PL_core_reg_engine;
5847 return INT2PTR(regexp_engine*,SvIV(*ptr));
5851 if (!PL_curcop->cop_hints_hash)
5852 return &PL_core_reg_engine;
5853 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5854 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5855 return &PL_core_reg_engine;
5856 return INT2PTR(regexp_engine*,SvIV(ptr));
5862 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5864 regexp_engine const *eng = current_re_engine();
5865 GET_RE_DEBUG_FLAGS_DECL;
5867 PERL_ARGS_ASSERT_PREGCOMP;
5869 /* Dispatch a request to compile a regexp to correct regexp engine. */
5871 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5874 return CALLREGCOMP_ENG(eng, pattern, flags);
5878 /* public(ish) entry point for the perl core's own regex compiling code.
5879 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5880 * pattern rather than a list of OPs, and uses the internal engine rather
5881 * than the current one */
5884 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5886 SV *pat = pattern; /* defeat constness! */
5887 PERL_ARGS_ASSERT_RE_COMPILE;
5888 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5889 #ifdef PERL_IN_XSUB_RE
5892 &PL_core_reg_engine,
5894 NULL, NULL, rx_flags, 0);
5898 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5899 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5900 * point to the realloced string and length.
5902 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5906 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5907 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5909 U8 *const src = (U8*)*pat_p;
5914 GET_RE_DEBUG_FLAGS_DECL;
5916 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5917 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5919 Newx(dst, *plen_p * 2 + 1, U8);
5922 while (s < *plen_p) {
5923 append_utf8_from_native_byte(src[s], &d);
5924 if (n < num_code_blocks) {
5925 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5926 pRExC_state->code_blocks[n].start = d - dst - 1;
5927 assert(*(d - 1) == '(');
5930 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5931 pRExC_state->code_blocks[n].end = d - dst - 1;
5932 assert(*(d - 1) == ')');
5941 *pat_p = (char*) dst;
5943 RExC_orig_utf8 = RExC_utf8 = 1;
5948 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5949 * while recording any code block indices, and handling overloading,
5950 * nested qr// objects etc. If pat is null, it will allocate a new
5951 * string, or just return the first arg, if there's only one.
5953 * Returns the malloced/updated pat.
5954 * patternp and pat_count is the array of SVs to be concatted;
5955 * oplist is the optional list of ops that generated the SVs;
5956 * recompile_p is a pointer to a boolean that will be set if
5957 * the regex will need to be recompiled.
5958 * delim, if non-null is an SV that will be inserted between each element
5962 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5963 SV *pat, SV ** const patternp, int pat_count,
5964 OP *oplist, bool *recompile_p, SV *delim)
5968 bool use_delim = FALSE;
5969 bool alloced = FALSE;
5971 /* if we know we have at least two args, create an empty string,
5972 * then concatenate args to that. For no args, return an empty string */
5973 if (!pat && pat_count != 1) {
5979 for (svp = patternp; svp < patternp + pat_count; svp++) {
5982 STRLEN orig_patlen = 0;
5984 SV *msv = use_delim ? delim : *svp;
5985 if (!msv) msv = &PL_sv_undef;
5987 /* if we've got a delimiter, we go round the loop twice for each
5988 * svp slot (except the last), using the delimiter the second
5997 if (SvTYPE(msv) == SVt_PVAV) {
5998 /* we've encountered an interpolated array within
5999 * the pattern, e.g. /...@a..../. Expand the list of elements,
6000 * then recursively append elements.
6001 * The code in this block is based on S_pushav() */
6003 AV *const av = (AV*)msv;
6004 const SSize_t maxarg = AvFILL(av) + 1;
6008 assert(oplist->op_type == OP_PADAV
6009 || oplist->op_type == OP_RV2AV);
6010 oplist = OP_SIBLING(oplist);
6013 if (SvRMAGICAL(av)) {
6016 Newx(array, maxarg, SV*);
6018 for (i=0; i < maxarg; i++) {
6019 SV ** const svp = av_fetch(av, i, FALSE);
6020 array[i] = svp ? *svp : &PL_sv_undef;
6024 array = AvARRAY(av);
6026 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6027 array, maxarg, NULL, recompile_p,
6029 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6035 /* we make the assumption here that each op in the list of
6036 * op_siblings maps to one SV pushed onto the stack,
6037 * except for code blocks, with have both an OP_NULL and
6039 * This allows us to match up the list of SVs against the
6040 * list of OPs to find the next code block.
6042 * Note that PUSHMARK PADSV PADSV ..
6044 * PADRANGE PADSV PADSV ..
6045 * so the alignment still works. */
6048 if (oplist->op_type == OP_NULL
6049 && (oplist->op_flags & OPf_SPECIAL))
6051 assert(n < pRExC_state->num_code_blocks);
6052 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6053 pRExC_state->code_blocks[n].block = oplist;
6054 pRExC_state->code_blocks[n].src_regex = NULL;
6057 oplist = OP_SIBLING(oplist); /* skip CONST */
6060 oplist = OP_SIBLING(oplist);;
6063 /* apply magic and QR overloading to arg */
6066 if (SvROK(msv) && SvAMAGIC(msv)) {
6067 SV *sv = AMG_CALLunary(msv, regexp_amg);
6071 if (SvTYPE(sv) != SVt_REGEXP)
6072 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6077 /* try concatenation overload ... */
6078 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6079 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6082 /* overloading involved: all bets are off over literal
6083 * code. Pretend we haven't seen it */
6084 pRExC_state->num_code_blocks -= n;
6088 /* ... or failing that, try "" overload */
6089 while (SvAMAGIC(msv)
6090 && (sv = AMG_CALLunary(msv, string_amg))
6094 && SvRV(msv) == SvRV(sv))
6099 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6103 /* this is a partially unrolled
6104 * sv_catsv_nomg(pat, msv);
6105 * that allows us to adjust code block indices if
6108 char *dst = SvPV_force_nomg(pat, dlen);
6110 if (SvUTF8(msv) && !SvUTF8(pat)) {
6111 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6112 sv_setpvn(pat, dst, dlen);
6115 sv_catsv_nomg(pat, msv);
6122 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6125 /* extract any code blocks within any embedded qr//'s */
6126 if (rx && SvTYPE(rx) == SVt_REGEXP
6127 && RX_ENGINE((REGEXP*)rx)->op_comp)
6130 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6131 if (ri->num_code_blocks) {
6133 /* the presence of an embedded qr// with code means
6134 * we should always recompile: the text of the
6135 * qr// may not have changed, but it may be a
6136 * different closure than last time */
6138 Renew(pRExC_state->code_blocks,
6139 pRExC_state->num_code_blocks + ri->num_code_blocks,
6140 struct reg_code_block);
6141 pRExC_state->num_code_blocks += ri->num_code_blocks;
6143 for (i=0; i < ri->num_code_blocks; i++) {
6144 struct reg_code_block *src, *dst;
6145 STRLEN offset = orig_patlen
6146 + ReANY((REGEXP *)rx)->pre_prefix;
6147 assert(n < pRExC_state->num_code_blocks);
6148 src = &ri->code_blocks[i];
6149 dst = &pRExC_state->code_blocks[n];
6150 dst->start = src->start + offset;
6151 dst->end = src->end + offset;
6152 dst->block = src->block;
6153 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6162 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6171 /* see if there are any run-time code blocks in the pattern.
6172 * False positives are allowed */
6175 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6176 char *pat, STRLEN plen)
6181 PERL_UNUSED_CONTEXT;
6183 for (s = 0; s < plen; s++) {
6184 if (n < pRExC_state->num_code_blocks
6185 && s == pRExC_state->code_blocks[n].start)
6187 s = pRExC_state->code_blocks[n].end;
6191 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6193 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6195 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6202 /* Handle run-time code blocks. We will already have compiled any direct
6203 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6204 * copy of it, but with any literal code blocks blanked out and
6205 * appropriate chars escaped; then feed it into
6207 * eval "qr'modified_pattern'"
6211 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6215 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6217 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6218 * and merge them with any code blocks of the original regexp.
6220 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6221 * instead, just save the qr and return FALSE; this tells our caller that
6222 * the original pattern needs upgrading to utf8.
6226 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6227 char *pat, STRLEN plen)
6231 GET_RE_DEBUG_FLAGS_DECL;
6233 if (pRExC_state->runtime_code_qr) {
6234 /* this is the second time we've been called; this should
6235 * only happen if the main pattern got upgraded to utf8
6236 * during compilation; re-use the qr we compiled first time
6237 * round (which should be utf8 too)
6239 qr = pRExC_state->runtime_code_qr;
6240 pRExC_state->runtime_code_qr = NULL;
6241 assert(RExC_utf8 && SvUTF8(qr));
6247 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6251 /* determine how many extra chars we need for ' and \ escaping */
6252 for (s = 0; s < plen; s++) {
6253 if (pat[s] == '\'' || pat[s] == '\\')
6257 Newx(newpat, newlen, char);
6259 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6261 for (s = 0; s < plen; s++) {
6262 if (n < pRExC_state->num_code_blocks
6263 && s == pRExC_state->code_blocks[n].start)
6265 /* blank out literal code block */
6266 assert(pat[s] == '(');
6267 while (s <= pRExC_state->code_blocks[n].end) {
6275 if (pat[s] == '\'' || pat[s] == '\\')
6280 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6284 PerlIO_printf(Perl_debug_log,
6285 "%sre-parsing pattern for runtime code:%s %s\n",
6286 PL_colors[4],PL_colors[5],newpat);
6289 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6294 PUSHSTACKi(PERLSI_REQUIRE);
6295 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6296 * parsing qr''; normally only q'' does this. It also alters
6298 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6299 SvREFCNT_dec_NN(sv);
6304 SV * const errsv = ERRSV;
6305 if (SvTRUE_NN(errsv))
6307 Safefree(pRExC_state->code_blocks);
6308 /* use croak_sv ? */
6309 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6312 assert(SvROK(qr_ref));
6314 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6315 /* the leaving below frees the tmp qr_ref.
6316 * Give qr a life of its own */
6324 if (!RExC_utf8 && SvUTF8(qr)) {
6325 /* first time through; the pattern got upgraded; save the
6326 * qr for the next time through */
6327 assert(!pRExC_state->runtime_code_qr);
6328 pRExC_state->runtime_code_qr = qr;
6333 /* extract any code blocks within the returned qr// */
6336 /* merge the main (r1) and run-time (r2) code blocks into one */
6338 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6339 struct reg_code_block *new_block, *dst;
6340 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6343 if (!r2->num_code_blocks) /* we guessed wrong */
6345 SvREFCNT_dec_NN(qr);
6350 r1->num_code_blocks + r2->num_code_blocks,
6351 struct reg_code_block);
6354 while ( i1 < r1->num_code_blocks
6355 || i2 < r2->num_code_blocks)
6357 struct reg_code_block *src;
6360 if (i1 == r1->num_code_blocks) {
6361 src = &r2->code_blocks[i2++];
6364 else if (i2 == r2->num_code_blocks)
6365 src = &r1->code_blocks[i1++];
6366 else if ( r1->code_blocks[i1].start
6367 < r2->code_blocks[i2].start)
6369 src = &r1->code_blocks[i1++];
6370 assert(src->end < r2->code_blocks[i2].start);
6373 assert( r1->code_blocks[i1].start
6374 > r2->code_blocks[i2].start);
6375 src = &r2->code_blocks[i2++];
6377 assert(src->end < r1->code_blocks[i1].start);
6380 assert(pat[src->start] == '(');
6381 assert(pat[src->end] == ')');
6382 dst->start = src->start;
6383 dst->end = src->end;
6384 dst->block = src->block;
6385 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6389 r1->num_code_blocks += r2->num_code_blocks;
6390 Safefree(r1->code_blocks);
6391 r1->code_blocks = new_block;
6394 SvREFCNT_dec_NN(qr);
6400 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6401 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6402 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6403 STRLEN longest_length, bool eol, bool meol)
6405 /* This is the common code for setting up the floating and fixed length
6406 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6407 * as to whether succeeded or not */
6412 if (! (longest_length
6413 || (eol /* Can't have SEOL and MULTI */
6414 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6416 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6417 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6422 /* copy the information about the longest from the reg_scan_data
6423 over to the program. */
6424 if (SvUTF8(sv_longest)) {
6425 *rx_utf8 = sv_longest;
6428 *rx_substr = sv_longest;
6431 /* end_shift is how many chars that must be matched that
6432 follow this item. We calculate it ahead of time as once the
6433 lookbehind offset is added in we lose the ability to correctly
6435 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6436 *rx_end_shift = ml - offset
6437 - longest_length + (SvTAIL(sv_longest) != 0)
6440 t = (eol/* Can't have SEOL and MULTI */
6441 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6442 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6448 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6449 * regular expression into internal code.
6450 * The pattern may be passed either as:
6451 * a list of SVs (patternp plus pat_count)
6452 * a list of OPs (expr)
6453 * If both are passed, the SV list is used, but the OP list indicates
6454 * which SVs are actually pre-compiled code blocks
6456 * The SVs in the list have magic and qr overloading applied to them (and
6457 * the list may be modified in-place with replacement SVs in the latter
6460 * If the pattern hasn't changed from old_re, then old_re will be
6463 * eng is the current engine. If that engine has an op_comp method, then
6464 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6465 * do the initial concatenation of arguments and pass on to the external
6468 * If is_bare_re is not null, set it to a boolean indicating whether the
6469 * arg list reduced (after overloading) to a single bare regex which has
6470 * been returned (i.e. /$qr/).
6472 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6474 * pm_flags contains the PMf_* flags, typically based on those from the
6475 * pm_flags field of the related PMOP. Currently we're only interested in
6476 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6478 * We can't allocate space until we know how big the compiled form will be,
6479 * but we can't compile it (and thus know how big it is) until we've got a
6480 * place to put the code. So we cheat: we compile it twice, once with code
6481 * generation turned off and size counting turned on, and once "for real".
6482 * This also means that we don't allocate space until we are sure that the
6483 * thing really will compile successfully, and we never have to move the
6484 * code and thus invalidate pointers into it. (Note that it has to be in
6485 * one piece because free() must be able to free it all.) [NB: not true in perl]
6487 * Beware that the optimization-preparation code in here knows about some
6488 * of the structure of the compiled regexp. [I'll say.]
6492 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6493 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6494 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6498 regexp_internal *ri;
6506 SV *code_blocksv = NULL;
6507 SV** new_patternp = patternp;
6509 /* these are all flags - maybe they should be turned
6510 * into a single int with different bit masks */
6511 I32 sawlookahead = 0;
6516 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6518 bool runtime_code = 0;
6520 RExC_state_t RExC_state;
6521 RExC_state_t * const pRExC_state = &RExC_state;
6522 #ifdef TRIE_STUDY_OPT
6524 RExC_state_t copyRExC_state;
6526 GET_RE_DEBUG_FLAGS_DECL;
6528 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6530 DEBUG_r(if (!PL_colorset) reginitcolors());
6532 #ifndef PERL_IN_XSUB_RE
6533 /* Initialize these here instead of as-needed, as is quick and avoids
6534 * having to test them each time otherwise */
6535 if (! PL_AboveLatin1) {
6536 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6537 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6538 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6539 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6540 PL_HasMultiCharFold =
6541 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6543 /* This is calculated here, because the Perl program that generates the
6544 * static global ones doesn't currently have access to
6545 * NUM_ANYOF_CODE_POINTS */
6546 PL_InBitmap = _new_invlist(2);
6547 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6548 NUM_ANYOF_CODE_POINTS - 1);
6552 pRExC_state->code_blocks = NULL;
6553 pRExC_state->num_code_blocks = 0;
6556 *is_bare_re = FALSE;
6558 if (expr && (expr->op_type == OP_LIST ||
6559 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6560 /* allocate code_blocks if needed */
6564 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6565 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6566 ncode++; /* count of DO blocks */
6568 pRExC_state->num_code_blocks = ncode;
6569 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6574 /* compile-time pattern with just OP_CONSTs and DO blocks */
6579 /* find how many CONSTs there are */
6582 if (expr->op_type == OP_CONST)
6585 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6586 if (o->op_type == OP_CONST)
6590 /* fake up an SV array */
6592 assert(!new_patternp);
6593 Newx(new_patternp, n, SV*);
6594 SAVEFREEPV(new_patternp);
6598 if (expr->op_type == OP_CONST)
6599 new_patternp[n] = cSVOPx_sv(expr);
6601 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6602 if (o->op_type == OP_CONST)
6603 new_patternp[n++] = cSVOPo_sv;
6608 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6609 "Assembling pattern from %d elements%s\n", pat_count,
6610 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6612 /* set expr to the first arg op */
6614 if (pRExC_state->num_code_blocks
6615 && expr->op_type != OP_CONST)
6617 expr = cLISTOPx(expr)->op_first;
6618 assert( expr->op_type == OP_PUSHMARK
6619 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6620 || expr->op_type == OP_PADRANGE);
6621 expr = OP_SIBLING(expr);
6624 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6625 expr, &recompile, NULL);
6627 /* handle bare (possibly after overloading) regex: foo =~ $re */
6632 if (SvTYPE(re) == SVt_REGEXP) {
6636 Safefree(pRExC_state->code_blocks);
6637 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6638 "Precompiled pattern%s\n",
6639 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6645 exp = SvPV_nomg(pat, plen);
6647 if (!eng->op_comp) {
6648 if ((SvUTF8(pat) && IN_BYTES)
6649 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6651 /* make a temporary copy; either to convert to bytes,
6652 * or to avoid repeating get-magic / overloaded stringify */
6653 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6654 (IN_BYTES ? 0 : SvUTF8(pat)));
6656 Safefree(pRExC_state->code_blocks);
6657 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6660 /* ignore the utf8ness if the pattern is 0 length */
6661 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6662 RExC_uni_semantics = 0;
6663 RExC_contains_locale = 0;
6664 RExC_contains_i = 0;
6665 pRExC_state->runtime_code_qr = NULL;
6666 RExC_frame_head= NULL;
6667 RExC_frame_last= NULL;
6668 RExC_frame_count= 0;
6671 RExC_mysv1= sv_newmortal();
6672 RExC_mysv2= sv_newmortal();
6675 SV *dsv= sv_newmortal();
6676 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6677 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6678 PL_colors[4],PL_colors[5],s);
6682 /* we jump here if we upgrade the pattern to utf8 and have to
6685 if ((pm_flags & PMf_USE_RE_EVAL)
6686 /* this second condition covers the non-regex literal case,
6687 * i.e. $foo =~ '(?{})'. */
6688 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6690 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6692 /* return old regex if pattern hasn't changed */
6693 /* XXX: note in the below we have to check the flags as well as the
6696 * Things get a touch tricky as we have to compare the utf8 flag
6697 * independently from the compile flags. */
6701 && !!RX_UTF8(old_re) == !!RExC_utf8
6702 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6703 && RX_PRECOMP(old_re)
6704 && RX_PRELEN(old_re) == plen
6705 && memEQ(RX_PRECOMP(old_re), exp, plen)
6706 && !runtime_code /* with runtime code, always recompile */ )
6708 Safefree(pRExC_state->code_blocks);
6712 rx_flags = orig_rx_flags;
6714 if (rx_flags & PMf_FOLD) {
6715 RExC_contains_i = 1;
6717 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6719 /* Set to use unicode semantics if the pattern is in utf8 and has the
6720 * 'depends' charset specified, as it means unicode when utf8 */
6721 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6725 RExC_flags = rx_flags;
6726 RExC_pm_flags = pm_flags;
6729 if (TAINTING_get && TAINT_get)
6730 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6732 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6733 /* whoops, we have a non-utf8 pattern, whilst run-time code
6734 * got compiled as utf8. Try again with a utf8 pattern */
6735 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6736 pRExC_state->num_code_blocks);
6737 goto redo_first_pass;
6740 assert(!pRExC_state->runtime_code_qr);
6746 RExC_in_lookbehind = 0;
6747 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6749 RExC_override_recoding = 0;
6750 RExC_in_multi_char_class = 0;
6752 /* First pass: determine size, legality. */
6755 RExC_end = exp + plen;
6760 RExC_emit = (regnode *) &RExC_emit_dummy;
6761 RExC_whilem_seen = 0;
6762 RExC_open_parens = NULL;
6763 RExC_close_parens = NULL;
6765 RExC_paren_names = NULL;
6767 RExC_paren_name_list = NULL;
6769 RExC_recurse = NULL;
6770 RExC_study_chunk_recursed = NULL;
6771 RExC_study_chunk_recursed_bytes= 0;
6772 RExC_recurse_count = 0;
6773 pRExC_state->code_index = 0;
6775 #if 0 /* REGC() is (currently) a NOP at the first pass.
6776 * Clever compilers notice this and complain. --jhi */
6777 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6780 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6782 RExC_lastparse=NULL;
6784 /* reg may croak on us, not giving us a chance to free
6785 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6786 need it to survive as long as the regexp (qr/(?{})/).
6787 We must check that code_blocksv is not already set, because we may
6788 have jumped back to restart the sizing pass. */
6789 if (pRExC_state->code_blocks && !code_blocksv) {
6790 code_blocksv = newSV_type(SVt_PV);
6791 SAVEFREESV(code_blocksv);
6792 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6793 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6795 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6796 /* It's possible to write a regexp in ascii that represents Unicode
6797 codepoints outside of the byte range, such as via \x{100}. If we
6798 detect such a sequence we have to convert the entire pattern to utf8
6799 and then recompile, as our sizing calculation will have been based
6800 on 1 byte == 1 character, but we will need to use utf8 to encode
6801 at least some part of the pattern, and therefore must convert the whole
6804 if (flags & RESTART_UTF8) {
6805 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6806 pRExC_state->num_code_blocks);
6807 goto redo_first_pass;
6809 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6812 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6815 PerlIO_printf(Perl_debug_log,
6816 "Required size %"IVdf" nodes\n"
6817 "Starting second pass (creation)\n",
6820 RExC_lastparse=NULL;
6823 /* The first pass could have found things that force Unicode semantics */
6824 if ((RExC_utf8 || RExC_uni_semantics)
6825 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6827 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6830 /* Small enough for pointer-storage convention?
6831 If extralen==0, this means that we will not need long jumps. */
6832 if (RExC_size >= 0x10000L && RExC_extralen)
6833 RExC_size += RExC_extralen;
6836 if (RExC_whilem_seen > 15)
6837 RExC_whilem_seen = 15;
6839 /* Allocate space and zero-initialize. Note, the two step process
6840 of zeroing when in debug mode, thus anything assigned has to
6841 happen after that */
6842 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6844 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6845 char, regexp_internal);
6846 if ( r == NULL || ri == NULL )
6847 FAIL("Regexp out of space");
6849 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6850 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6853 /* bulk initialize base fields with 0. */
6854 Zero(ri, sizeof(regexp_internal), char);
6857 /* non-zero initialization begins here */
6860 r->extflags = rx_flags;
6861 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6863 if (pm_flags & PMf_IS_QR) {
6864 ri->code_blocks = pRExC_state->code_blocks;
6865 ri->num_code_blocks = pRExC_state->num_code_blocks;
6870 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6871 if (pRExC_state->code_blocks[n].src_regex)
6872 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6873 SAVEFREEPV(pRExC_state->code_blocks);
6877 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6878 bool has_charset = (get_regex_charset(r->extflags)
6879 != REGEX_DEPENDS_CHARSET);
6881 /* The caret is output if there are any defaults: if not all the STD
6882 * flags are set, or if no character set specifier is needed */
6884 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6886 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6887 == REG_RUN_ON_COMMENT_SEEN);
6888 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6889 >> RXf_PMf_STD_PMMOD_SHIFT);
6890 const char *fptr = STD_PAT_MODS; /*"msix"*/
6892 /* Allocate for the worst case, which is all the std flags are turned
6893 * on. If more precision is desired, we could do a population count of
6894 * the flags set. This could be done with a small lookup table, or by
6895 * shifting, masking and adding, or even, when available, assembly
6896 * language for a machine-language population count.
6897 * We never output a minus, as all those are defaults, so are
6898 * covered by the caret */
6899 const STRLEN wraplen = plen + has_p + has_runon
6900 + has_default /* If needs a caret */
6902 /* If needs a character set specifier */
6903 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6904 + (sizeof(STD_PAT_MODS) - 1)
6905 + (sizeof("(?:)") - 1);
6907 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6908 r->xpv_len_u.xpvlenu_pv = p;
6910 SvFLAGS(rx) |= SVf_UTF8;
6913 /* If a default, cover it using the caret */
6915 *p++= DEFAULT_PAT_MOD;
6919 const char* const name = get_regex_charset_name(r->extflags, &len);
6920 Copy(name, p, len, char);
6924 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6927 while((ch = *fptr++)) {
6935 Copy(RExC_precomp, p, plen, char);
6936 assert ((RX_WRAPPED(rx) - p) < 16);
6937 r->pre_prefix = p - RX_WRAPPED(rx);
6943 SvCUR_set(rx, p - RX_WRAPPED(rx));
6947 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6949 /* setup various meta data about recursion, this all requires
6950 * RExC_npar to be correctly set, and a bit later on we clear it */
6951 if (RExC_seen & REG_RECURSE_SEEN) {
6952 Newxz(RExC_open_parens, RExC_npar,regnode *);
6953 SAVEFREEPV(RExC_open_parens);
6954 Newxz(RExC_close_parens,RExC_npar,regnode *);
6955 SAVEFREEPV(RExC_close_parens);
6957 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6958 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6959 * So its 1 if there are no parens. */
6960 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6961 ((RExC_npar & 0x07) != 0);
6962 Newx(RExC_study_chunk_recursed,
6963 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6964 SAVEFREEPV(RExC_study_chunk_recursed);
6967 /* Useful during FAIL. */
6968 #ifdef RE_TRACK_PATTERN_OFFSETS
6969 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6970 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6971 "%s %"UVuf" bytes for offset annotations.\n",
6972 ri->u.offsets ? "Got" : "Couldn't get",
6973 (UV)((2*RExC_size+1) * sizeof(U32))));
6975 SetProgLen(ri,RExC_size);
6980 /* Second pass: emit code. */
6981 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6982 RExC_pm_flags = pm_flags;
6984 RExC_end = exp + plen;
6987 RExC_emit_start = ri->program;
6988 RExC_emit = ri->program;
6989 RExC_emit_bound = ri->program + RExC_size + 1;
6990 pRExC_state->code_index = 0;
6992 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6993 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6995 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6997 /* XXXX To minimize changes to RE engine we always allocate
6998 3-units-long substrs field. */
6999 Newx(r->substrs, 1, struct reg_substr_data);
7000 if (RExC_recurse_count) {
7001 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7002 SAVEFREEPV(RExC_recurse);
7006 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7008 RExC_study_chunk_recursed_count= 0;
7010 Zero(r->substrs, 1, struct reg_substr_data);
7011 if (RExC_study_chunk_recursed) {
7012 Zero(RExC_study_chunk_recursed,
7013 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7017 #ifdef TRIE_STUDY_OPT
7019 StructCopy(&zero_scan_data, &data, scan_data_t);
7020 copyRExC_state = RExC_state;
7023 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7025 RExC_state = copyRExC_state;
7026 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7027 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7029 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7030 StructCopy(&zero_scan_data, &data, scan_data_t);
7033 StructCopy(&zero_scan_data, &data, scan_data_t);
7036 /* Dig out information for optimizations. */
7037 r->extflags = RExC_flags; /* was pm_op */
7038 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7041 SvUTF8_on(rx); /* Unicode in it? */
7042 ri->regstclass = NULL;
7043 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
7044 r->intflags |= PREGf_NAUGHTY;
7045 scan = ri->program + 1; /* First BRANCH. */
7047 /* testing for BRANCH here tells us whether there is "must appear"
7048 data in the pattern. If there is then we can use it for optimisations */
7049 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7052 STRLEN longest_float_length, longest_fixed_length;
7053 regnode_ssc ch_class; /* pointed to by data */
7055 SSize_t last_close = 0; /* pointed to by data */
7056 regnode *first= scan;
7057 regnode *first_next= regnext(first);
7059 * Skip introductions and multiplicators >= 1
7060 * so that we can extract the 'meat' of the pattern that must
7061 * match in the large if() sequence following.
7062 * NOTE that EXACT is NOT covered here, as it is normally
7063 * picked up by the optimiser separately.
7065 * This is unfortunate as the optimiser isnt handling lookahead
7066 * properly currently.
7069 while ((OP(first) == OPEN && (sawopen = 1)) ||
7070 /* An OR of *one* alternative - should not happen now. */
7071 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7072 /* for now we can't handle lookbehind IFMATCH*/
7073 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7074 (OP(first) == PLUS) ||
7075 (OP(first) == MINMOD) ||
7076 /* An {n,m} with n>0 */
7077 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7078 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7081 * the only op that could be a regnode is PLUS, all the rest
7082 * will be regnode_1 or regnode_2.
7084 * (yves doesn't think this is true)
7086 if (OP(first) == PLUS)
7089 if (OP(first) == MINMOD)
7091 first += regarglen[OP(first)];
7093 first = NEXTOPER(first);
7094 first_next= regnext(first);
7097 /* Starting-point info. */
7099 DEBUG_PEEP("first:",first,0);
7100 /* Ignore EXACT as we deal with it later. */
7101 if (PL_regkind[OP(first)] == EXACT) {
7102 if (OP(first) == EXACT)
7103 NOOP; /* Empty, get anchored substr later. */
7105 ri->regstclass = first;
7108 else if (PL_regkind[OP(first)] == TRIE &&
7109 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7111 /* this can happen only on restudy */
7112 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7115 else if (REGNODE_SIMPLE(OP(first)))
7116 ri->regstclass = first;
7117 else if (PL_regkind[OP(first)] == BOUND ||
7118 PL_regkind[OP(first)] == NBOUND)
7119 ri->regstclass = first;
7120 else if (PL_regkind[OP(first)] == BOL) {
7121 r->intflags |= (OP(first) == MBOL
7124 first = NEXTOPER(first);
7127 else if (OP(first) == GPOS) {
7128 r->intflags |= PREGf_ANCH_GPOS;
7129 first = NEXTOPER(first);
7132 else if ((!sawopen || !RExC_sawback) &&
7134 (OP(first) == STAR &&
7135 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7136 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7138 /* turn .* into ^.* with an implied $*=1 */
7140 (OP(NEXTOPER(first)) == REG_ANY)
7143 r->intflags |= (type | PREGf_IMPLICIT);
7144 first = NEXTOPER(first);
7147 if (sawplus && !sawminmod && !sawlookahead
7148 && (!sawopen || !RExC_sawback)
7149 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7150 /* x+ must match at the 1st pos of run of x's */
7151 r->intflags |= PREGf_SKIP;
7153 /* Scan is after the zeroth branch, first is atomic matcher. */
7154 #ifdef TRIE_STUDY_OPT
7157 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7158 (IV)(first - scan + 1))
7162 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7163 (IV)(first - scan + 1))
7169 * If there's something expensive in the r.e., find the
7170 * longest literal string that must appear and make it the
7171 * regmust. Resolve ties in favor of later strings, since
7172 * the regstart check works with the beginning of the r.e.
7173 * and avoiding duplication strengthens checking. Not a
7174 * strong reason, but sufficient in the absence of others.
7175 * [Now we resolve ties in favor of the earlier string if
7176 * it happens that c_offset_min has been invalidated, since the
7177 * earlier string may buy us something the later one won't.]
7180 data.longest_fixed = newSVpvs("");
7181 data.longest_float = newSVpvs("");
7182 data.last_found = newSVpvs("");
7183 data.longest = &(data.longest_fixed);
7184 ENTER_with_name("study_chunk");
7185 SAVEFREESV(data.longest_fixed);
7186 SAVEFREESV(data.longest_float);
7187 SAVEFREESV(data.last_found);
7189 if (!ri->regstclass) {
7190 ssc_init(pRExC_state, &ch_class);
7191 data.start_class = &ch_class;
7192 stclass_flag = SCF_DO_STCLASS_AND;
7193 } else /* XXXX Check for BOUND? */
7195 data.last_closep = &last_close;
7198 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7199 scan + RExC_size, /* Up to end */
7201 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7202 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7206 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7209 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7210 && data.last_start_min == 0 && data.last_end > 0
7211 && !RExC_seen_zerolen
7212 && !(RExC_seen & REG_VERBARG_SEEN)
7213 && !(RExC_seen & REG_GPOS_SEEN)
7215 r->extflags |= RXf_CHECK_ALL;
7217 scan_commit(pRExC_state, &data,&minlen,0);
7219 longest_float_length = CHR_SVLEN(data.longest_float);
7221 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7222 && data.offset_fixed == data.offset_float_min
7223 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7224 && S_setup_longest (aTHX_ pRExC_state,
7228 &(r->float_end_shift),
7229 data.lookbehind_float,
7230 data.offset_float_min,
7232 longest_float_length,
7233 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7234 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7236 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7237 r->float_max_offset = data.offset_float_max;
7238 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7239 r->float_max_offset -= data.lookbehind_float;
7240 SvREFCNT_inc_simple_void_NN(data.longest_float);
7243 r->float_substr = r->float_utf8 = NULL;
7244 longest_float_length = 0;
7247 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7249 if (S_setup_longest (aTHX_ pRExC_state,
7251 &(r->anchored_utf8),
7252 &(r->anchored_substr),
7253 &(r->anchored_end_shift),
7254 data.lookbehind_fixed,
7257 longest_fixed_length,
7258 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7259 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7261 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7262 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7265 r->anchored_substr = r->anchored_utf8 = NULL;
7266 longest_fixed_length = 0;
7268 LEAVE_with_name("study_chunk");
7271 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7272 ri->regstclass = NULL;
7274 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7276 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7277 && is_ssc_worth_it(pRExC_state, data.start_class))
7279 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7281 ssc_finalize(pRExC_state, data.start_class);
7283 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7284 StructCopy(data.start_class,
7285 (regnode_ssc*)RExC_rxi->data->data[n],
7287 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7288 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7289 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7290 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7291 PerlIO_printf(Perl_debug_log,
7292 "synthetic stclass \"%s\".\n",
7293 SvPVX_const(sv));});
7294 data.start_class = NULL;
7297 /* A temporary algorithm prefers floated substr to fixed one to dig
7299 if (longest_fixed_length > longest_float_length) {
7300 r->substrs->check_ix = 0;
7301 r->check_end_shift = r->anchored_end_shift;
7302 r->check_substr = r->anchored_substr;
7303 r->check_utf8 = r->anchored_utf8;
7304 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7305 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7306 r->intflags |= PREGf_NOSCAN;
7309 r->substrs->check_ix = 1;
7310 r->check_end_shift = r->float_end_shift;
7311 r->check_substr = r->float_substr;
7312 r->check_utf8 = r->float_utf8;
7313 r->check_offset_min = r->float_min_offset;
7314 r->check_offset_max = r->float_max_offset;
7316 if ((r->check_substr || r->check_utf8) ) {
7317 r->extflags |= RXf_USE_INTUIT;
7318 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7319 r->extflags |= RXf_INTUIT_TAIL;
7321 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7323 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7324 if ( (STRLEN)minlen < longest_float_length )
7325 minlen= longest_float_length;
7326 if ( (STRLEN)minlen < longest_fixed_length )
7327 minlen= longest_fixed_length;
7331 /* Several toplevels. Best we can is to set minlen. */
7333 regnode_ssc ch_class;
7334 SSize_t last_close = 0;
7336 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7338 scan = ri->program + 1;
7339 ssc_init(pRExC_state, &ch_class);
7340 data.start_class = &ch_class;
7341 data.last_closep = &last_close;
7344 minlen = study_chunk(pRExC_state,
7345 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7346 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7347 ? SCF_TRIE_DOING_RESTUDY
7351 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7353 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7354 = r->float_substr = r->float_utf8 = NULL;
7356 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7357 && is_ssc_worth_it(pRExC_state, data.start_class))
7359 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7361 ssc_finalize(pRExC_state, data.start_class);
7363 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7364 StructCopy(data.start_class,
7365 (regnode_ssc*)RExC_rxi->data->data[n],
7367 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7368 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7369 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7370 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7371 PerlIO_printf(Perl_debug_log,
7372 "synthetic stclass \"%s\".\n",
7373 SvPVX_const(sv));});
7374 data.start_class = NULL;
7378 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7379 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7380 r->maxlen = REG_INFTY;
7383 r->maxlen = RExC_maxlen;
7386 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7387 the "real" pattern. */
7389 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7390 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7392 r->minlenret = minlen;
7393 if (r->minlen < minlen)
7396 if (RExC_seen & REG_GPOS_SEEN)
7397 r->intflags |= PREGf_GPOS_SEEN;
7398 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7399 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7401 if (pRExC_state->num_code_blocks)
7402 r->extflags |= RXf_EVAL_SEEN;
7403 if (RExC_seen & REG_CANY_SEEN)
7404 r->intflags |= PREGf_CANY_SEEN;
7405 if (RExC_seen & REG_VERBARG_SEEN)
7407 r->intflags |= PREGf_VERBARG_SEEN;
7408 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7410 if (RExC_seen & REG_CUTGROUP_SEEN)
7411 r->intflags |= PREGf_CUTGROUP_SEEN;
7412 if (pm_flags & PMf_USE_RE_EVAL)
7413 r->intflags |= PREGf_USE_RE_EVAL;
7414 if (RExC_paren_names)
7415 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7417 RXp_PAREN_NAMES(r) = NULL;
7419 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7420 * so it can be used in pp.c */
7421 if (r->intflags & PREGf_ANCH)
7422 r->extflags |= RXf_IS_ANCHORED;
7426 /* this is used to identify "special" patterns that might result
7427 * in Perl NOT calling the regex engine and instead doing the match "itself",
7428 * particularly special cases in split//. By having the regex compiler
7429 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7430 * we avoid weird issues with equivalent patterns resulting in different behavior,
7431 * AND we allow non Perl engines to get the same optimizations by the setting the
7432 * flags appropriately - Yves */
7433 regnode *first = ri->program + 1;
7435 regnode *next = NEXTOPER(first);
7438 if (PL_regkind[fop] == NOTHING && nop == END)
7439 r->extflags |= RXf_NULL;
7440 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7441 /* when fop is SBOL first->flags will be true only when it was
7442 * produced by parsing /\A/, and not when parsing /^/. This is
7443 * very important for the split code as there we want to
7444 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7445 * See rt #122761 for more details. -- Yves */
7446 r->extflags |= RXf_START_ONLY;
7447 else if (fop == PLUS
7448 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7449 && OP(regnext(first)) == END)
7450 r->extflags |= RXf_WHITE;
7451 else if ( r->extflags & RXf_SPLIT
7453 && STR_LEN(first) == 1
7454 && *(STRING(first)) == ' '
7455 && OP(regnext(first)) == END )
7456 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7460 if (RExC_contains_locale) {
7461 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7465 if (RExC_paren_names) {
7466 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7467 ri->data->data[ri->name_list_idx]
7468 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7471 ri->name_list_idx = 0;
7473 if (RExC_recurse_count) {
7474 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7475 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7476 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7479 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7480 /* assume we don't need to swap parens around before we match */
7482 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7483 (unsigned long)RExC_study_chunk_recursed_count);
7487 PerlIO_printf(Perl_debug_log,"Final program:\n");
7490 #ifdef RE_TRACK_PATTERN_OFFSETS
7491 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7492 const STRLEN len = ri->u.offsets[0];
7494 GET_RE_DEBUG_FLAGS_DECL;
7495 PerlIO_printf(Perl_debug_log,
7496 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7497 for (i = 1; i <= len; i++) {
7498 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7499 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7500 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7502 PerlIO_printf(Perl_debug_log, "\n");
7507 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7508 * by setting the regexp SV to readonly-only instead. If the
7509 * pattern's been recompiled, the USEDness should remain. */
7510 if (old_re && SvREADONLY(old_re))
7518 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7521 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7523 PERL_UNUSED_ARG(value);
7525 if (flags & RXapif_FETCH) {
7526 return reg_named_buff_fetch(rx, key, flags);
7527 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7528 Perl_croak_no_modify();
7530 } else if (flags & RXapif_EXISTS) {
7531 return reg_named_buff_exists(rx, key, flags)
7534 } else if (flags & RXapif_REGNAMES) {
7535 return reg_named_buff_all(rx, flags);
7536 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7537 return reg_named_buff_scalar(rx, flags);
7539 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7545 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7548 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7549 PERL_UNUSED_ARG(lastkey);
7551 if (flags & RXapif_FIRSTKEY)
7552 return reg_named_buff_firstkey(rx, flags);
7553 else if (flags & RXapif_NEXTKEY)
7554 return reg_named_buff_nextkey(rx, flags);
7556 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7563 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7566 AV *retarray = NULL;
7568 struct regexp *const rx = ReANY(r);
7570 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7572 if (flags & RXapif_ALL)
7575 if (rx && RXp_PAREN_NAMES(rx)) {
7576 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7579 SV* sv_dat=HeVAL(he_str);
7580 I32 *nums=(I32*)SvPVX(sv_dat);
7581 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7582 if ((I32)(rx->nparens) >= nums[i]
7583 && rx->offs[nums[i]].start != -1
7584 && rx->offs[nums[i]].end != -1)
7587 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7592 ret = newSVsv(&PL_sv_undef);
7595 av_push(retarray, ret);
7598 return newRV_noinc(MUTABLE_SV(retarray));
7605 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7608 struct regexp *const rx = ReANY(r);
7610 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7612 if (rx && RXp_PAREN_NAMES(rx)) {
7613 if (flags & RXapif_ALL) {
7614 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7616 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7618 SvREFCNT_dec_NN(sv);
7630 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7632 struct regexp *const rx = ReANY(r);
7634 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7636 if ( rx && RXp_PAREN_NAMES(rx) ) {
7637 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7639 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7646 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7648 struct regexp *const rx = ReANY(r);
7649 GET_RE_DEBUG_FLAGS_DECL;
7651 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7653 if (rx && RXp_PAREN_NAMES(rx)) {
7654 HV *hv = RXp_PAREN_NAMES(rx);
7656 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7659 SV* sv_dat = HeVAL(temphe);
7660 I32 *nums = (I32*)SvPVX(sv_dat);
7661 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7662 if ((I32)(rx->lastparen) >= nums[i] &&
7663 rx->offs[nums[i]].start != -1 &&
7664 rx->offs[nums[i]].end != -1)
7670 if (parno || flags & RXapif_ALL) {
7671 return newSVhek(HeKEY_hek(temphe));
7679 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7684 struct regexp *const rx = ReANY(r);
7686 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7688 if (rx && RXp_PAREN_NAMES(rx)) {
7689 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7690 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7691 } else if (flags & RXapif_ONE) {
7692 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7693 av = MUTABLE_AV(SvRV(ret));
7694 length = av_tindex(av);
7695 SvREFCNT_dec_NN(ret);
7696 return newSViv(length + 1);
7698 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7703 return &PL_sv_undef;
7707 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7709 struct regexp *const rx = ReANY(r);
7712 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7714 if (rx && RXp_PAREN_NAMES(rx)) {
7715 HV *hv= RXp_PAREN_NAMES(rx);
7717 (void)hv_iterinit(hv);
7718 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7721 SV* sv_dat = HeVAL(temphe);
7722 I32 *nums = (I32*)SvPVX(sv_dat);
7723 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7724 if ((I32)(rx->lastparen) >= nums[i] &&
7725 rx->offs[nums[i]].start != -1 &&
7726 rx->offs[nums[i]].end != -1)
7732 if (parno || flags & RXapif_ALL) {
7733 av_push(av, newSVhek(HeKEY_hek(temphe)));
7738 return newRV_noinc(MUTABLE_SV(av));
7742 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7745 struct regexp *const rx = ReANY(r);
7751 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7753 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7754 || n == RX_BUFF_IDX_CARET_FULLMATCH
7755 || n == RX_BUFF_IDX_CARET_POSTMATCH
7758 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7760 /* on something like
7763 * the KEEPCOPY is set on the PMOP rather than the regex */
7764 if (PL_curpm && r == PM_GETRE(PL_curpm))
7765 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7774 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7775 /* no need to distinguish between them any more */
7776 n = RX_BUFF_IDX_FULLMATCH;
7778 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7779 && rx->offs[0].start != -1)
7781 /* $`, ${^PREMATCH} */
7782 i = rx->offs[0].start;
7786 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7787 && rx->offs[0].end != -1)
7789 /* $', ${^POSTMATCH} */
7790 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7791 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7794 if ( 0 <= n && n <= (I32)rx->nparens &&
7795 (s1 = rx->offs[n].start) != -1 &&
7796 (t1 = rx->offs[n].end) != -1)
7798 /* $&, ${^MATCH}, $1 ... */
7800 s = rx->subbeg + s1 - rx->suboffset;
7805 assert(s >= rx->subbeg);
7806 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7808 #ifdef NO_TAINT_SUPPORT
7809 sv_setpvn(sv, s, i);
7811 const int oldtainted = TAINT_get;
7813 sv_setpvn(sv, s, i);
7814 TAINT_set(oldtainted);
7816 if ( (rx->intflags & PREGf_CANY_SEEN)
7817 ? (RXp_MATCH_UTF8(rx)
7818 && (!i || is_utf8_string((U8*)s, i)))
7819 : (RXp_MATCH_UTF8(rx)) )
7826 if (RXp_MATCH_TAINTED(rx)) {
7827 if (SvTYPE(sv) >= SVt_PVMG) {
7828 MAGIC* const mg = SvMAGIC(sv);
7831 SvMAGIC_set(sv, mg->mg_moremagic);
7833 if ((mgt = SvMAGIC(sv))) {
7834 mg->mg_moremagic = mgt;
7835 SvMAGIC_set(sv, mg);
7846 sv_setsv(sv,&PL_sv_undef);
7852 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7853 SV const * const value)
7855 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7857 PERL_UNUSED_ARG(rx);
7858 PERL_UNUSED_ARG(paren);
7859 PERL_UNUSED_ARG(value);
7862 Perl_croak_no_modify();
7866 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7869 struct regexp *const rx = ReANY(r);
7873 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7875 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7876 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7877 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7880 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7882 /* on something like
7885 * the KEEPCOPY is set on the PMOP rather than the regex */
7886 if (PL_curpm && r == PM_GETRE(PL_curpm))
7887 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7893 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7895 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7896 case RX_BUFF_IDX_PREMATCH: /* $` */
7897 if (rx->offs[0].start != -1) {
7898 i = rx->offs[0].start;
7907 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7908 case RX_BUFF_IDX_POSTMATCH: /* $' */
7909 if (rx->offs[0].end != -1) {
7910 i = rx->sublen - rx->offs[0].end;
7912 s1 = rx->offs[0].end;
7919 default: /* $& / ${^MATCH}, $1, $2, ... */
7920 if (paren <= (I32)rx->nparens &&
7921 (s1 = rx->offs[paren].start) != -1 &&
7922 (t1 = rx->offs[paren].end) != -1)
7928 if (ckWARN(WARN_UNINITIALIZED))
7929 report_uninit((const SV *)sv);
7934 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7935 const char * const s = rx->subbeg - rx->suboffset + s1;
7940 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7947 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7949 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7950 PERL_UNUSED_ARG(rx);
7954 return newSVpvs("Regexp");
7957 /* Scans the name of a named buffer from the pattern.
7958 * If flags is REG_RSN_RETURN_NULL returns null.
7959 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7960 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7961 * to the parsed name as looked up in the RExC_paren_names hash.
7962 * If there is an error throws a vFAIL().. type exception.
7965 #define REG_RSN_RETURN_NULL 0
7966 #define REG_RSN_RETURN_NAME 1
7967 #define REG_RSN_RETURN_DATA 2
7970 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7972 char *name_start = RExC_parse;
7974 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7976 assert (RExC_parse <= RExC_end);
7977 if (RExC_parse == RExC_end) NOOP;
7978 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7979 /* skip IDFIRST by using do...while */
7982 RExC_parse += UTF8SKIP(RExC_parse);
7983 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7987 } while (isWORDCHAR(*RExC_parse));
7989 RExC_parse++; /* so the <- from the vFAIL is after the offending
7991 vFAIL("Group name must start with a non-digit word character");
7995 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7996 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7997 if ( flags == REG_RSN_RETURN_NAME)
7999 else if (flags==REG_RSN_RETURN_DATA) {
8002 if ( ! sv_name ) /* should not happen*/
8003 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8004 if (RExC_paren_names)
8005 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8007 sv_dat = HeVAL(he_str);
8009 vFAIL("Reference to nonexistent named group");
8013 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8014 (unsigned long) flags);
8016 NOT_REACHED; /* NOT REACHED */
8021 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8023 if (RExC_lastparse!=RExC_parse) { \
8024 PerlIO_printf(Perl_debug_log, "%s", \
8025 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8026 RExC_end - RExC_parse, 16, \
8028 PERL_PV_ESCAPE_UNI_DETECT | \
8029 PERL_PV_PRETTY_ELLIPSES | \
8030 PERL_PV_PRETTY_LTGT | \
8031 PERL_PV_ESCAPE_RE | \
8032 PERL_PV_PRETTY_EXACTSIZE \
8036 PerlIO_printf(Perl_debug_log,"%16s",""); \
8039 num = RExC_size + 1; \
8041 num=REG_NODE_NUM(RExC_emit); \
8042 if (RExC_lastnum!=num) \
8043 PerlIO_printf(Perl_debug_log,"|%4d",num); \
8045 PerlIO_printf(Perl_debug_log,"|%4s",""); \
8046 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
8047 (int)((depth*2)), "", \
8051 RExC_lastparse=RExC_parse; \
8056 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8057 DEBUG_PARSE_MSG((funcname)); \
8058 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
8060 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
8061 DEBUG_PARSE_MSG((funcname)); \
8062 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
8065 /* This section of code defines the inversion list object and its methods. The
8066 * interfaces are highly subject to change, so as much as possible is static to
8067 * this file. An inversion list is here implemented as a malloc'd C UV array
8068 * as an SVt_INVLIST scalar.
8070 * An inversion list for Unicode is an array of code points, sorted by ordinal
8071 * number. The zeroth element is the first code point in the list. The 1th
8072 * element is the first element beyond that not in the list. In other words,
8073 * the first range is
8074 * invlist[0]..(invlist[1]-1)
8075 * The other ranges follow. Thus every element whose index is divisible by two
8076 * marks the beginning of a range that is in the list, and every element not
8077 * divisible by two marks the beginning of a range not in the list. A single
8078 * element inversion list that contains the single code point N generally
8079 * consists of two elements
8082 * (The exception is when N is the highest representable value on the
8083 * machine, in which case the list containing just it would be a single
8084 * element, itself. By extension, if the last range in the list extends to
8085 * infinity, then the first element of that range will be in the inversion list
8086 * at a position that is divisible by two, and is the final element in the
8088 * Taking the complement (inverting) an inversion list is quite simple, if the
8089 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8090 * This implementation reserves an element at the beginning of each inversion
8091 * list to always contain 0; there is an additional flag in the header which
8092 * indicates if the list begins at the 0, or is offset to begin at the next
8095 * More about inversion lists can be found in "Unicode Demystified"
8096 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8097 * More will be coming when functionality is added later.
8099 * The inversion list data structure is currently implemented as an SV pointing
8100 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8101 * array of UV whose memory management is automatically handled by the existing
8102 * facilities for SV's.
8104 * Some of the methods should always be private to the implementation, and some
8105 * should eventually be made public */
8107 /* The header definitions are in F<inline_invlist.c> */
8109 PERL_STATIC_INLINE UV*
8110 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8112 /* Returns a pointer to the first element in the inversion list's array.
8113 * This is called upon initialization of an inversion list. Where the
8114 * array begins depends on whether the list has the code point U+0000 in it
8115 * or not. The other parameter tells it whether the code that follows this
8116 * call is about to put a 0 in the inversion list or not. The first
8117 * element is either the element reserved for 0, if TRUE, or the element
8118 * after it, if FALSE */
8120 bool* offset = get_invlist_offset_addr(invlist);
8121 UV* zero_addr = (UV *) SvPVX(invlist);
8123 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8126 assert(! _invlist_len(invlist));
8130 /* 1^1 = 0; 1^0 = 1 */
8131 *offset = 1 ^ will_have_0;
8132 return zero_addr + *offset;
8135 PERL_STATIC_INLINE UV*
8136 S_invlist_array(SV* const invlist)
8138 /* Returns the pointer to the inversion list's array. Every time the
8139 * length changes, this needs to be called in case malloc or realloc moved
8142 PERL_ARGS_ASSERT_INVLIST_ARRAY;
8144 /* Must not be empty. If these fail, you probably didn't check for <len>
8145 * being non-zero before trying to get the array */
8146 assert(_invlist_len(invlist));
8148 /* The very first element always contains zero, The array begins either
8149 * there, or if the inversion list is offset, at the element after it.
8150 * The offset header field determines which; it contains 0 or 1 to indicate
8151 * how much additionally to add */
8152 assert(0 == *(SvPVX(invlist)));
8153 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8156 PERL_STATIC_INLINE void
8157 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8159 /* Sets the current number of elements stored in the inversion list.
8160 * Updates SvCUR correspondingly */
8161 PERL_UNUSED_CONTEXT;
8162 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8164 assert(SvTYPE(invlist) == SVt_INVLIST);
8169 : TO_INTERNAL_SIZE(len + offset));
8170 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8173 #ifndef PERL_IN_XSUB_RE
8175 PERL_STATIC_INLINE IV*
8176 S_get_invlist_previous_index_addr(SV* invlist)
8178 /* Return the address of the IV that is reserved to hold the cached index
8180 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8182 assert(SvTYPE(invlist) == SVt_INVLIST);
8184 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8187 PERL_STATIC_INLINE IV
8188 S_invlist_previous_index(SV* const invlist)
8190 /* Returns cached index of previous search */
8192 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8194 return *get_invlist_previous_index_addr(invlist);
8197 PERL_STATIC_INLINE void
8198 S_invlist_set_previous_index(SV* const invlist, const IV index)
8200 /* Caches <index> for later retrieval */
8202 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8204 assert(index == 0 || index < (int) _invlist_len(invlist));
8206 *get_invlist_previous_index_addr(invlist) = index;
8209 PERL_STATIC_INLINE void
8210 S_invlist_trim(SV* const invlist)
8212 PERL_ARGS_ASSERT_INVLIST_TRIM;
8214 assert(SvTYPE(invlist) == SVt_INVLIST);
8216 /* Change the length of the inversion list to how many entries it currently
8218 SvPV_shrink_to_cur((SV *) invlist);
8221 PERL_STATIC_INLINE bool
8222 S_invlist_is_iterating(SV* const invlist)
8224 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8226 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8229 #endif /* ifndef PERL_IN_XSUB_RE */
8231 PERL_STATIC_INLINE UV
8232 S_invlist_max(SV* const invlist)
8234 /* Returns the maximum number of elements storable in the inversion list's
8235 * array, without having to realloc() */
8237 PERL_ARGS_ASSERT_INVLIST_MAX;
8239 assert(SvTYPE(invlist) == SVt_INVLIST);
8241 /* Assumes worst case, in which the 0 element is not counted in the
8242 * inversion list, so subtracts 1 for that */
8243 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8244 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8245 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8248 #ifndef PERL_IN_XSUB_RE
8250 Perl__new_invlist(pTHX_ IV initial_size)
8253 /* Return a pointer to a newly constructed inversion list, with enough
8254 * space to store 'initial_size' elements. If that number is negative, a
8255 * system default is used instead */
8259 if (initial_size < 0) {
8263 /* Allocate the initial space */
8264 new_list = newSV_type(SVt_INVLIST);
8266 /* First 1 is in case the zero element isn't in the list; second 1 is for
8268 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8269 invlist_set_len(new_list, 0, 0);
8271 /* Force iterinit() to be used to get iteration to work */
8272 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8274 *get_invlist_previous_index_addr(new_list) = 0;
8280 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8282 /* Return a pointer to a newly constructed inversion list, initialized to
8283 * point to <list>, which has to be in the exact correct inversion list
8284 * form, including internal fields. Thus this is a dangerous routine that
8285 * should not be used in the wrong hands. The passed in 'list' contains
8286 * several header fields at the beginning that are not part of the
8287 * inversion list body proper */
8289 const STRLEN length = (STRLEN) list[0];
8290 const UV version_id = list[1];
8291 const bool offset = cBOOL(list[2]);
8292 #define HEADER_LENGTH 3
8293 /* If any of the above changes in any way, you must change HEADER_LENGTH
8294 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8295 * perl -E 'say int(rand 2**31-1)'
8297 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8298 data structure type, so that one being
8299 passed in can be validated to be an
8300 inversion list of the correct vintage.
8303 SV* invlist = newSV_type(SVt_INVLIST);
8305 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8307 if (version_id != INVLIST_VERSION_ID) {
8308 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8311 /* The generated array passed in includes header elements that aren't part
8312 * of the list proper, so start it just after them */
8313 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8315 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8316 shouldn't touch it */
8318 *(get_invlist_offset_addr(invlist)) = offset;
8320 /* The 'length' passed to us is the physical number of elements in the
8321 * inversion list. But if there is an offset the logical number is one
8323 invlist_set_len(invlist, length - offset, offset);
8325 invlist_set_previous_index(invlist, 0);
8327 /* Initialize the iteration pointer. */
8328 invlist_iterfinish(invlist);
8330 SvREADONLY_on(invlist);
8334 #endif /* ifndef PERL_IN_XSUB_RE */
8337 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8339 /* Grow the maximum size of an inversion list */
8341 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8343 assert(SvTYPE(invlist) == SVt_INVLIST);
8345 /* Add one to account for the zero element at the beginning which may not
8346 * be counted by the calling parameters */
8347 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8351 S__append_range_to_invlist(pTHX_ SV* const invlist,
8352 const UV start, const UV end)
8354 /* Subject to change or removal. Append the range from 'start' to 'end' at
8355 * the end of the inversion list. The range must be above any existing
8359 UV max = invlist_max(invlist);
8360 UV len = _invlist_len(invlist);
8363 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8365 if (len == 0) { /* Empty lists must be initialized */
8366 offset = start != 0;
8367 array = _invlist_array_init(invlist, ! offset);
8370 /* Here, the existing list is non-empty. The current max entry in the
8371 * list is generally the first value not in the set, except when the
8372 * set extends to the end of permissible values, in which case it is
8373 * the first entry in that final set, and so this call is an attempt to
8374 * append out-of-order */
8376 UV final_element = len - 1;
8377 array = invlist_array(invlist);
8378 if (array[final_element] > start
8379 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8381 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8382 array[final_element], start,
8383 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8386 /* Here, it is a legal append. If the new range begins with the first
8387 * value not in the set, it is extending the set, so the new first
8388 * value not in the set is one greater than the newly extended range.
8390 offset = *get_invlist_offset_addr(invlist);
8391 if (array[final_element] == start) {
8392 if (end != UV_MAX) {
8393 array[final_element] = end + 1;
8396 /* But if the end is the maximum representable on the machine,
8397 * just let the range that this would extend to have no end */
8398 invlist_set_len(invlist, len - 1, offset);
8404 /* Here the new range doesn't extend any existing set. Add it */
8406 len += 2; /* Includes an element each for the start and end of range */
8408 /* If wll overflow the existing space, extend, which may cause the array to
8411 invlist_extend(invlist, len);
8413 /* Have to set len here to avoid assert failure in invlist_array() */
8414 invlist_set_len(invlist, len, offset);
8416 array = invlist_array(invlist);
8419 invlist_set_len(invlist, len, offset);
8422 /* The next item on the list starts the range, the one after that is
8423 * one past the new range. */
8424 array[len - 2] = start;
8425 if (end != UV_MAX) {
8426 array[len - 1] = end + 1;
8429 /* But if the end is the maximum representable on the machine, just let
8430 * the range have no end */
8431 invlist_set_len(invlist, len - 1, offset);
8435 #ifndef PERL_IN_XSUB_RE
8438 Perl__invlist_search(SV* const invlist, const UV cp)
8440 /* Searches the inversion list for the entry that contains the input code
8441 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8442 * return value is the index into the list's array of the range that
8447 IV high = _invlist_len(invlist);
8448 const IV highest_element = high - 1;
8451 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8453 /* If list is empty, return failure. */
8458 /* (We can't get the array unless we know the list is non-empty) */
8459 array = invlist_array(invlist);
8461 mid = invlist_previous_index(invlist);
8462 assert(mid >=0 && mid <= highest_element);
8464 /* <mid> contains the cache of the result of the previous call to this
8465 * function (0 the first time). See if this call is for the same result,
8466 * or if it is for mid-1. This is under the theory that calls to this
8467 * function will often be for related code points that are near each other.
8468 * And benchmarks show that caching gives better results. We also test
8469 * here if the code point is within the bounds of the list. These tests
8470 * replace others that would have had to be made anyway to make sure that
8471 * the array bounds were not exceeded, and these give us extra information
8472 * at the same time */
8473 if (cp >= array[mid]) {
8474 if (cp >= array[highest_element]) {
8475 return highest_element;
8478 /* Here, array[mid] <= cp < array[highest_element]. This means that
8479 * the final element is not the answer, so can exclude it; it also
8480 * means that <mid> is not the final element, so can refer to 'mid + 1'
8482 if (cp < array[mid + 1]) {
8488 else { /* cp < aray[mid] */
8489 if (cp < array[0]) { /* Fail if outside the array */
8493 if (cp >= array[mid - 1]) {
8498 /* Binary search. What we are looking for is <i> such that
8499 * array[i] <= cp < array[i+1]
8500 * The loop below converges on the i+1. Note that there may not be an
8501 * (i+1)th element in the array, and things work nonetheless */
8502 while (low < high) {
8503 mid = (low + high) / 2;
8504 assert(mid <= highest_element);
8505 if (array[mid] <= cp) { /* cp >= array[mid] */
8508 /* We could do this extra test to exit the loop early.
8509 if (cp < array[low]) {
8514 else { /* cp < array[mid] */
8521 invlist_set_previous_index(invlist, high);
8526 Perl__invlist_populate_swatch(SV* const invlist,
8527 const UV start, const UV end, U8* swatch)
8529 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8530 * but is used when the swash has an inversion list. This makes this much
8531 * faster, as it uses a binary search instead of a linear one. This is
8532 * intimately tied to that function, and perhaps should be in utf8.c,
8533 * except it is intimately tied to inversion lists as well. It assumes
8534 * that <swatch> is all 0's on input */
8537 const IV len = _invlist_len(invlist);
8541 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8543 if (len == 0) { /* Empty inversion list */
8547 array = invlist_array(invlist);
8549 /* Find which element it is */
8550 i = _invlist_search(invlist, start);
8552 /* We populate from <start> to <end> */
8553 while (current < end) {
8556 /* The inversion list gives the results for every possible code point
8557 * after the first one in the list. Only those ranges whose index is
8558 * even are ones that the inversion list matches. For the odd ones,
8559 * and if the initial code point is not in the list, we have to skip
8560 * forward to the next element */
8561 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8563 if (i >= len) { /* Finished if beyond the end of the array */
8567 if (current >= end) { /* Finished if beyond the end of what we
8569 if (LIKELY(end < UV_MAX)) {
8573 /* We get here when the upper bound is the maximum
8574 * representable on the machine, and we are looking for just
8575 * that code point. Have to special case it */
8577 goto join_end_of_list;
8580 assert(current >= start);
8582 /* The current range ends one below the next one, except don't go past
8585 upper = (i < len && array[i] < end) ? array[i] : end;
8587 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8588 * for each code point in it */
8589 for (; current < upper; current++) {
8590 const STRLEN offset = (STRLEN)(current - start);
8591 swatch[offset >> 3] |= 1 << (offset & 7);
8596 /* Quit if at the end of the list */
8599 /* But first, have to deal with the highest possible code point on
8600 * the platform. The previous code assumes that <end> is one
8601 * beyond where we want to populate, but that is impossible at the
8602 * platform's infinity, so have to handle it specially */
8603 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8605 const STRLEN offset = (STRLEN)(end - start);
8606 swatch[offset >> 3] |= 1 << (offset & 7);
8611 /* Advance to the next range, which will be for code points not in the
8620 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8621 const bool complement_b, SV** output)
8623 /* Take the union of two inversion lists and point <output> to it. *output
8624 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8625 * the reference count to that list will be decremented if not already a
8626 * temporary (mortal); otherwise *output will be made correspondingly
8627 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8628 * second list is returned. If <complement_b> is TRUE, the union is taken
8629 * of the complement (inversion) of <b> instead of b itself.
8631 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8632 * Richard Gillam, published by Addison-Wesley, and explained at some
8633 * length there. The preface says to incorporate its examples into your
8634 * code at your own risk.
8636 * The algorithm is like a merge sort.
8638 * XXX A potential performance improvement is to keep track as we go along
8639 * if only one of the inputs contributes to the result, meaning the other
8640 * is a subset of that one. In that case, we can skip the final copy and
8641 * return the larger of the input lists, but then outside code might need
8642 * to keep track of whether to free the input list or not */
8644 const UV* array_a; /* a's array */
8646 UV len_a; /* length of a's array */
8649 SV* u; /* the resulting union */
8653 UV i_a = 0; /* current index into a's array */
8657 /* running count, as explained in the algorithm source book; items are
8658 * stopped accumulating and are output when the count changes to/from 0.
8659 * The count is incremented when we start a range that's in the set, and
8660 * decremented when we start a range that's not in the set. So its range
8661 * is 0 to 2. Only when the count is zero is something not in the set.
8665 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8668 /* If either one is empty, the union is the other one */
8669 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8670 bool make_temp = FALSE; /* Should we mortalize the result? */
8674 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8680 *output = invlist_clone(b);
8682 _invlist_invert(*output);
8684 } /* else *output already = b; */
8687 sv_2mortal(*output);
8691 else if ((len_b = _invlist_len(b)) == 0) {
8692 bool make_temp = FALSE;
8694 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8699 /* The complement of an empty list is a list that has everything in it,
8700 * so the union with <a> includes everything too */
8703 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8707 *output = _new_invlist(1);
8708 _append_range_to_invlist(*output, 0, UV_MAX);
8710 else if (*output != a) {
8711 *output = invlist_clone(a);
8713 /* else *output already = a; */
8716 sv_2mortal(*output);
8721 /* Here both lists exist and are non-empty */
8722 array_a = invlist_array(a);
8723 array_b = invlist_array(b);
8725 /* If are to take the union of 'a' with the complement of b, set it
8726 * up so are looking at b's complement. */
8729 /* To complement, we invert: if the first element is 0, remove it. To
8730 * do this, we just pretend the array starts one later */
8731 if (array_b[0] == 0) {
8737 /* But if the first element is not zero, we pretend the list starts
8738 * at the 0 that is always stored immediately before the array. */
8744 /* Size the union for the worst case: that the sets are completely
8746 u = _new_invlist(len_a + len_b);
8748 /* Will contain U+0000 if either component does */
8749 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8750 || (len_b > 0 && array_b[0] == 0));
8752 /* Go through each list item by item, stopping when exhausted one of
8754 while (i_a < len_a && i_b < len_b) {
8755 UV cp; /* The element to potentially add to the union's array */
8756 bool cp_in_set; /* is it in the the input list's set or not */
8758 /* We need to take one or the other of the two inputs for the union.
8759 * Since we are merging two sorted lists, we take the smaller of the
8760 * next items. In case of a tie, we take the one that is in its set
8761 * first. If we took one not in the set first, it would decrement the
8762 * count, possibly to 0 which would cause it to be output as ending the
8763 * range, and the next time through we would take the same number, and
8764 * output it again as beginning the next range. By doing it the
8765 * opposite way, there is no possibility that the count will be
8766 * momentarily decremented to 0, and thus the two adjoining ranges will
8767 * be seamlessly merged. (In a tie and both are in the set or both not
8768 * in the set, it doesn't matter which we take first.) */
8769 if (array_a[i_a] < array_b[i_b]
8770 || (array_a[i_a] == array_b[i_b]
8771 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8773 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8777 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8778 cp = array_b[i_b++];
8781 /* Here, have chosen which of the two inputs to look at. Only output
8782 * if the running count changes to/from 0, which marks the
8783 * beginning/end of a range in that's in the set */
8786 array_u[i_u++] = cp;
8793 array_u[i_u++] = cp;
8798 /* Here, we are finished going through at least one of the lists, which
8799 * means there is something remaining in at most one. We check if the list
8800 * that hasn't been exhausted is positioned such that we are in the middle
8801 * of a range in its set or not. (i_a and i_b point to the element beyond
8802 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8803 * is potentially more to output.
8804 * There are four cases:
8805 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8806 * in the union is entirely from the non-exhausted set.
8807 * 2) Both were in their sets, count is 2. Nothing further should
8808 * be output, as everything that remains will be in the exhausted
8809 * list's set, hence in the union; decrementing to 1 but not 0 insures
8811 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8812 * Nothing further should be output because the union includes
8813 * everything from the exhausted set. Not decrementing ensures that.
8814 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8815 * decrementing to 0 insures that we look at the remainder of the
8816 * non-exhausted set */
8817 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8818 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8823 /* The final length is what we've output so far, plus what else is about to
8824 * be output. (If 'count' is non-zero, then the input list we exhausted
8825 * has everything remaining up to the machine's limit in its set, and hence
8826 * in the union, so there will be no further output. */
8829 /* At most one of the subexpressions will be non-zero */
8830 len_u += (len_a - i_a) + (len_b - i_b);
8833 /* Set result to final length, which can change the pointer to array_u, so
8835 if (len_u != _invlist_len(u)) {
8836 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8838 array_u = invlist_array(u);
8841 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8842 * the other) ended with everything above it not in its set. That means
8843 * that the remaining part of the union is precisely the same as the
8844 * non-exhausted list, so can just copy it unchanged. (If both list were
8845 * exhausted at the same time, then the operations below will be both 0.)
8848 IV copy_count; /* At most one will have a non-zero copy count */
8849 if ((copy_count = len_a - i_a) > 0) {
8850 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8852 else if ((copy_count = len_b - i_b) > 0) {
8853 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8857 /* We may be removing a reference to one of the inputs. If so, the output
8858 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8859 * count decremented) */
8860 if (a == *output || b == *output) {
8861 assert(! invlist_is_iterating(*output));
8862 if ((SvTEMP(*output))) {
8866 SvREFCNT_dec_NN(*output);
8876 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8877 const bool complement_b, SV** i)
8879 /* Take the intersection of two inversion lists and point <i> to it. *i
8880 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8881 * the reference count to that list will be decremented if not already a
8882 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8883 * The first list, <a>, may be NULL, in which case an empty list is
8884 * returned. If <complement_b> is TRUE, the result will be the
8885 * intersection of <a> and the complement (or inversion) of <b> instead of
8888 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8889 * Richard Gillam, published by Addison-Wesley, and explained at some
8890 * length there. The preface says to incorporate its examples into your
8891 * code at your own risk. In fact, it had bugs
8893 * The algorithm is like a merge sort, and is essentially the same as the
8897 const UV* array_a; /* a's array */
8899 UV len_a; /* length of a's array */
8902 SV* r; /* the resulting intersection */
8906 UV i_a = 0; /* current index into a's array */
8910 /* running count, as explained in the algorithm source book; items are
8911 * stopped accumulating and are output when the count changes to/from 2.
8912 * The count is incremented when we start a range that's in the set, and
8913 * decremented when we start a range that's not in the set. So its range
8914 * is 0 to 2. Only when the count is 2 is something in the intersection.
8918 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8921 /* Special case if either one is empty */
8922 len_a = (a == NULL) ? 0 : _invlist_len(a);
8923 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8924 bool make_temp = FALSE;
8926 if (len_a != 0 && complement_b) {
8928 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8929 * be empty. Here, also we are using 'b's complement, which hence
8930 * must be every possible code point. Thus the intersection is
8934 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8939 *i = invlist_clone(a);
8941 /* else *i is already 'a' */
8949 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8950 * intersection must be empty */
8952 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8957 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8961 *i = _new_invlist(0);
8969 /* Here both lists exist and are non-empty */
8970 array_a = invlist_array(a);
8971 array_b = invlist_array(b);
8973 /* If are to take the intersection of 'a' with the complement of b, set it
8974 * up so are looking at b's complement. */
8977 /* To complement, we invert: if the first element is 0, remove it. To
8978 * do this, we just pretend the array starts one later */
8979 if (array_b[0] == 0) {
8985 /* But if the first element is not zero, we pretend the list starts
8986 * at the 0 that is always stored immediately before the array. */
8992 /* Size the intersection for the worst case: that the intersection ends up
8993 * fragmenting everything to be completely disjoint */
8994 r= _new_invlist(len_a + len_b);
8996 /* Will contain U+0000 iff both components do */
8997 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8998 && len_b > 0 && array_b[0] == 0);
9000 /* Go through each list item by item, stopping when exhausted one of
9002 while (i_a < len_a && i_b < len_b) {
9003 UV cp; /* The element to potentially add to the intersection's
9005 bool cp_in_set; /* Is it in the input list's set or not */
9007 /* We need to take one or the other of the two inputs for the
9008 * intersection. Since we are merging two sorted lists, we take the
9009 * smaller of the next items. In case of a tie, we take the one that
9010 * is not in its set first (a difference from the union algorithm). If
9011 * we took one in the set first, it would increment the count, possibly
9012 * to 2 which would cause it to be output as starting a range in the
9013 * intersection, and the next time through we would take that same
9014 * number, and output it again as ending the set. By doing it the
9015 * opposite of this, there is no possibility that the count will be
9016 * momentarily incremented to 2. (In a tie and both are in the set or
9017 * both not in the set, it doesn't matter which we take first.) */
9018 if (array_a[i_a] < array_b[i_b]
9019 || (array_a[i_a] == array_b[i_b]
9020 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9022 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9026 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9030 /* Here, have chosen which of the two inputs to look at. Only output
9031 * if the running count changes to/from 2, which marks the
9032 * beginning/end of a range that's in the intersection */
9036 array_r[i_r++] = cp;
9041 array_r[i_r++] = cp;
9047 /* Here, we are finished going through at least one of the lists, which
9048 * means there is something remaining in at most one. We check if the list
9049 * that has been exhausted is positioned such that we are in the middle
9050 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
9051 * the ones we care about.) There are four cases:
9052 * 1) Both weren't in their sets, count is 0, and remains 0. There's
9053 * nothing left in the intersection.
9054 * 2) Both were in their sets, count is 2 and perhaps is incremented to
9055 * above 2. What should be output is exactly that which is in the
9056 * non-exhausted set, as everything it has is also in the intersection
9057 * set, and everything it doesn't have can't be in the intersection
9058 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9059 * gets incremented to 2. Like the previous case, the intersection is
9060 * everything that remains in the non-exhausted set.
9061 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9062 * remains 1. And the intersection has nothing more. */
9063 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9064 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9069 /* The final length is what we've output so far plus what else is in the
9070 * intersection. At most one of the subexpressions below will be non-zero
9074 len_r += (len_a - i_a) + (len_b - i_b);
9077 /* Set result to final length, which can change the pointer to array_r, so
9079 if (len_r != _invlist_len(r)) {
9080 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9082 array_r = invlist_array(r);
9085 /* Finish outputting any remaining */
9086 if (count >= 2) { /* At most one will have a non-zero copy count */
9088 if ((copy_count = len_a - i_a) > 0) {
9089 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9091 else if ((copy_count = len_b - i_b) > 0) {
9092 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9096 /* We may be removing a reference to one of the inputs. If so, the output
9097 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
9098 * count decremented) */
9099 if (a == *i || b == *i) {
9100 assert(! invlist_is_iterating(*i));
9105 SvREFCNT_dec_NN(*i);
9115 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9117 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9118 * set. A pointer to the inversion list is returned. This may actually be
9119 * a new list, in which case the passed in one has been destroyed. The
9120 * passed-in inversion list can be NULL, in which case a new one is created
9121 * with just the one range in it */
9126 if (invlist == NULL) {
9127 invlist = _new_invlist(2);
9131 len = _invlist_len(invlist);
9134 /* If comes after the final entry actually in the list, can just append it
9137 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9138 && start >= invlist_array(invlist)[len - 1]))
9140 _append_range_to_invlist(invlist, start, end);
9144 /* Here, can't just append things, create and return a new inversion list
9145 * which is the union of this range and the existing inversion list */
9146 range_invlist = _new_invlist(2);
9147 _append_range_to_invlist(range_invlist, start, end);
9149 _invlist_union(invlist, range_invlist, &invlist);
9151 /* The temporary can be freed */
9152 SvREFCNT_dec_NN(range_invlist);
9158 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9159 UV** other_elements_ptr)
9161 /* Create and return an inversion list whose contents are to be populated
9162 * by the caller. The caller gives the number of elements (in 'size') and
9163 * the very first element ('element0'). This function will set
9164 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9167 * Obviously there is some trust involved that the caller will properly
9168 * fill in the other elements of the array.
9170 * (The first element needs to be passed in, as the underlying code does
9171 * things differently depending on whether it is zero or non-zero) */
9173 SV* invlist = _new_invlist(size);
9176 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9178 _append_range_to_invlist(invlist, element0, element0);
9179 offset = *get_invlist_offset_addr(invlist);
9181 invlist_set_len(invlist, size, offset);
9182 *other_elements_ptr = invlist_array(invlist) + 1;
9188 PERL_STATIC_INLINE SV*
9189 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9190 return _add_range_to_invlist(invlist, cp, cp);
9193 #ifndef PERL_IN_XSUB_RE
9195 Perl__invlist_invert(pTHX_ SV* const invlist)
9197 /* Complement the input inversion list. This adds a 0 if the list didn't
9198 * have a zero; removes it otherwise. As described above, the data
9199 * structure is set up so that this is very efficient */
9201 PERL_ARGS_ASSERT__INVLIST_INVERT;
9203 assert(! invlist_is_iterating(invlist));
9205 /* The inverse of matching nothing is matching everything */
9206 if (_invlist_len(invlist) == 0) {
9207 _append_range_to_invlist(invlist, 0, UV_MAX);
9211 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9216 PERL_STATIC_INLINE SV*
9217 S_invlist_clone(pTHX_ SV* const invlist)
9220 /* Return a new inversion list that is a copy of the input one, which is
9221 * unchanged. The new list will not be mortal even if the old one was. */
9223 /* Need to allocate extra space to accommodate Perl's addition of a
9224 * trailing NUL to SvPV's, since it thinks they are always strings */
9225 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9226 STRLEN physical_length = SvCUR(invlist);
9227 bool offset = *(get_invlist_offset_addr(invlist));
9229 PERL_ARGS_ASSERT_INVLIST_CLONE;
9231 *(get_invlist_offset_addr(new_invlist)) = offset;
9232 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9233 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9238 PERL_STATIC_INLINE STRLEN*
9239 S_get_invlist_iter_addr(SV* invlist)
9241 /* Return the address of the UV that contains the current iteration
9244 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9246 assert(SvTYPE(invlist) == SVt_INVLIST);
9248 return &(((XINVLIST*) SvANY(invlist))->iterator);
9251 PERL_STATIC_INLINE void
9252 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9254 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9256 *get_invlist_iter_addr(invlist) = 0;
9259 PERL_STATIC_INLINE void
9260 S_invlist_iterfinish(SV* invlist)
9262 /* Terminate iterator for invlist. This is to catch development errors.
9263 * Any iteration that is interrupted before completed should call this
9264 * function. Functions that add code points anywhere else but to the end
9265 * of an inversion list assert that they are not in the middle of an
9266 * iteration. If they were, the addition would make the iteration
9267 * problematical: if the iteration hadn't reached the place where things
9268 * were being added, it would be ok */
9270 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9272 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9276 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9278 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9279 * This call sets in <*start> and <*end>, the next range in <invlist>.
9280 * Returns <TRUE> if successful and the next call will return the next
9281 * range; <FALSE> if was already at the end of the list. If the latter,
9282 * <*start> and <*end> are unchanged, and the next call to this function
9283 * will start over at the beginning of the list */
9285 STRLEN* pos = get_invlist_iter_addr(invlist);
9286 UV len = _invlist_len(invlist);
9289 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9292 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9296 array = invlist_array(invlist);
9298 *start = array[(*pos)++];
9304 *end = array[(*pos)++] - 1;
9310 PERL_STATIC_INLINE UV
9311 S_invlist_highest(SV* const invlist)
9313 /* Returns the highest code point that matches an inversion list. This API
9314 * has an ambiguity, as it returns 0 under either the highest is actually
9315 * 0, or if the list is empty. If this distinction matters to you, check
9316 * for emptiness before calling this function */
9318 UV len = _invlist_len(invlist);
9321 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9327 array = invlist_array(invlist);
9329 /* The last element in the array in the inversion list always starts a
9330 * range that goes to infinity. That range may be for code points that are
9331 * matched in the inversion list, or it may be for ones that aren't
9332 * matched. In the latter case, the highest code point in the set is one
9333 * less than the beginning of this range; otherwise it is the final element
9334 * of this range: infinity */
9335 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9337 : array[len - 1] - 1;
9340 #ifndef PERL_IN_XSUB_RE
9342 Perl__invlist_contents(pTHX_ SV* const invlist)
9344 /* Get the contents of an inversion list into a string SV so that they can
9345 * be printed out. It uses the format traditionally done for debug tracing
9349 SV* output = newSVpvs("\n");
9351 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9353 assert(! invlist_is_iterating(invlist));
9355 invlist_iterinit(invlist);
9356 while (invlist_iternext(invlist, &start, &end)) {
9357 if (end == UV_MAX) {
9358 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9360 else if (end != start) {
9361 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9365 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9373 #ifndef PERL_IN_XSUB_RE
9375 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9376 const char * const indent, SV* const invlist)
9378 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9379 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9380 * the string 'indent'. The output looks like this:
9381 [0] 0x000A .. 0x000D
9383 [4] 0x2028 .. 0x2029
9384 [6] 0x3104 .. INFINITY
9385 * This means that the first range of code points matched by the list are
9386 * 0xA through 0xD; the second range contains only the single code point
9387 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9388 * are used to define each range (except if the final range extends to
9389 * infinity, only a single element is needed). The array index of the
9390 * first element for the corresponding range is given in brackets. */
9395 PERL_ARGS_ASSERT__INVLIST_DUMP;
9397 if (invlist_is_iterating(invlist)) {
9398 Perl_dump_indent(aTHX_ level, file,
9399 "%sCan't dump inversion list because is in middle of iterating\n",
9404 invlist_iterinit(invlist);
9405 while (invlist_iternext(invlist, &start, &end)) {
9406 if (end == UV_MAX) {
9407 Perl_dump_indent(aTHX_ level, file,
9408 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9409 indent, (UV)count, start);
9411 else if (end != start) {
9412 Perl_dump_indent(aTHX_ level, file,
9413 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9414 indent, (UV)count, start, end);
9417 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9418 indent, (UV)count, start);
9425 Perl__load_PL_utf8_foldclosures (pTHX)
9427 assert(! PL_utf8_foldclosures);
9429 /* If the folds haven't been read in, call a fold function
9431 if (! PL_utf8_tofold) {
9432 U8 dummy[UTF8_MAXBYTES_CASE+1];
9434 /* This string is just a short named one above \xff */
9435 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9436 assert(PL_utf8_tofold); /* Verify that worked */
9438 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9442 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9444 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9446 /* Return a boolean as to if the two passed in inversion lists are
9447 * identical. The final argument, if TRUE, says to take the complement of
9448 * the second inversion list before doing the comparison */
9450 const UV* array_a = invlist_array(a);
9451 const UV* array_b = invlist_array(b);
9452 UV len_a = _invlist_len(a);
9453 UV len_b = _invlist_len(b);
9455 UV i = 0; /* current index into the arrays */
9456 bool retval = TRUE; /* Assume are identical until proven otherwise */
9458 PERL_ARGS_ASSERT__INVLISTEQ;
9460 /* If are to compare 'a' with the complement of b, set it
9461 * up so are looking at b's complement. */
9464 /* The complement of nothing is everything, so <a> would have to have
9465 * just one element, starting at zero (ending at infinity) */
9467 return (len_a == 1 && array_a[0] == 0);
9469 else if (array_b[0] == 0) {
9471 /* Otherwise, to complement, we invert. Here, the first element is
9472 * 0, just remove it. To do this, we just pretend the array starts
9480 /* But if the first element is not zero, we pretend the list starts
9481 * at the 0 that is always stored immediately before the array. */
9487 /* Make sure that the lengths are the same, as well as the final element
9488 * before looping through the remainder. (Thus we test the length, final,
9489 * and first elements right off the bat) */
9490 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9493 else for (i = 0; i < len_a - 1; i++) {
9494 if (array_a[i] != array_b[i]) {
9504 #undef HEADER_LENGTH
9505 #undef TO_INTERNAL_SIZE
9506 #undef FROM_INTERNAL_SIZE
9507 #undef INVLIST_VERSION_ID
9509 /* End of inversion list object */
9512 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9514 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9515 * constructs, and updates RExC_flags with them. On input, RExC_parse
9516 * should point to the first flag; it is updated on output to point to the
9517 * final ')' or ':'. There needs to be at least one flag, or this will
9520 /* for (?g), (?gc), and (?o) warnings; warning
9521 about (?c) will warn about (?g) -- japhy */
9523 #define WASTED_O 0x01
9524 #define WASTED_G 0x02
9525 #define WASTED_C 0x04
9526 #define WASTED_GC (WASTED_G|WASTED_C)
9527 I32 wastedflags = 0x00;
9528 U32 posflags = 0, negflags = 0;
9529 U32 *flagsp = &posflags;
9530 char has_charset_modifier = '\0';
9532 bool has_use_defaults = FALSE;
9533 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9534 int x_mod_count = 0;
9536 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9538 /* '^' as an initial flag sets certain defaults */
9539 if (UCHARAT(RExC_parse) == '^') {
9541 has_use_defaults = TRUE;
9542 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9543 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9544 ? REGEX_UNICODE_CHARSET
9545 : REGEX_DEPENDS_CHARSET);
9548 cs = get_regex_charset(RExC_flags);
9549 if (cs == REGEX_DEPENDS_CHARSET
9550 && (RExC_utf8 || RExC_uni_semantics))
9552 cs = REGEX_UNICODE_CHARSET;
9555 while (*RExC_parse) {
9556 /* && strchr("iogcmsx", *RExC_parse) */
9557 /* (?g), (?gc) and (?o) are useless here
9558 and must be globally applied -- japhy */
9559 switch (*RExC_parse) {
9561 /* Code for the imsx flags */
9562 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9564 case LOCALE_PAT_MOD:
9565 if (has_charset_modifier) {
9566 goto excess_modifier;
9568 else if (flagsp == &negflags) {
9571 cs = REGEX_LOCALE_CHARSET;
9572 has_charset_modifier = LOCALE_PAT_MOD;
9574 case UNICODE_PAT_MOD:
9575 if (has_charset_modifier) {
9576 goto excess_modifier;
9578 else if (flagsp == &negflags) {
9581 cs = REGEX_UNICODE_CHARSET;
9582 has_charset_modifier = UNICODE_PAT_MOD;
9584 case ASCII_RESTRICT_PAT_MOD:
9585 if (flagsp == &negflags) {
9588 if (has_charset_modifier) {
9589 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9590 goto excess_modifier;
9592 /* Doubled modifier implies more restricted */
9593 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9596 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9598 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9600 case DEPENDS_PAT_MOD:
9601 if (has_use_defaults) {
9602 goto fail_modifiers;
9604 else if (flagsp == &negflags) {
9607 else if (has_charset_modifier) {
9608 goto excess_modifier;
9611 /* The dual charset means unicode semantics if the
9612 * pattern (or target, not known until runtime) are
9613 * utf8, or something in the pattern indicates unicode
9615 cs = (RExC_utf8 || RExC_uni_semantics)
9616 ? REGEX_UNICODE_CHARSET
9617 : REGEX_DEPENDS_CHARSET;
9618 has_charset_modifier = DEPENDS_PAT_MOD;
9622 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9623 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9625 else if (has_charset_modifier == *(RExC_parse - 1)) {
9626 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9630 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9632 NOT_REACHED; /*NOTREACHED*/
9635 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9637 NOT_REACHED; /*NOTREACHED*/
9638 case ONCE_PAT_MOD: /* 'o' */
9639 case GLOBAL_PAT_MOD: /* 'g' */
9640 if (PASS2 && ckWARN(WARN_REGEXP)) {
9641 const I32 wflagbit = *RExC_parse == 'o'
9644 if (! (wastedflags & wflagbit) ) {
9645 wastedflags |= wflagbit;
9646 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9649 "Useless (%s%c) - %suse /%c modifier",
9650 flagsp == &negflags ? "?-" : "?",
9652 flagsp == &negflags ? "don't " : "",
9659 case CONTINUE_PAT_MOD: /* 'c' */
9660 if (PASS2 && ckWARN(WARN_REGEXP)) {
9661 if (! (wastedflags & WASTED_C) ) {
9662 wastedflags |= WASTED_GC;
9663 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9666 "Useless (%sc) - %suse /gc modifier",
9667 flagsp == &negflags ? "?-" : "?",
9668 flagsp == &negflags ? "don't " : ""
9673 case KEEPCOPY_PAT_MOD: /* 'p' */
9674 if (flagsp == &negflags) {
9676 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9678 *flagsp |= RXf_PMf_KEEPCOPY;
9682 /* A flag is a default iff it is following a minus, so
9683 * if there is a minus, it means will be trying to
9684 * re-specify a default which is an error */
9685 if (has_use_defaults || flagsp == &negflags) {
9686 goto fail_modifiers;
9689 wastedflags = 0; /* reset so (?g-c) warns twice */
9693 RExC_flags |= posflags;
9694 RExC_flags &= ~negflags;
9695 set_regex_charset(&RExC_flags, cs);
9696 if (RExC_flags & RXf_PMf_FOLD) {
9697 RExC_contains_i = 1;
9700 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9706 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9707 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9708 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9709 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9710 NOT_REACHED; /*NOTREACHED*/
9717 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9722 - reg - regular expression, i.e. main body or parenthesized thing
9724 * Caller must absorb opening parenthesis.
9726 * Combining parenthesis handling with the base level of regular expression
9727 * is a trifle forced, but the need to tie the tails of the branches to what
9728 * follows makes it hard to avoid.
9730 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9732 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9734 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9737 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9738 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9739 needs to be restarted.
9740 Otherwise would only return NULL if regbranch() returns NULL, which
9743 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9744 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9745 * 2 is like 1, but indicates that nextchar() has been called to advance
9746 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9747 * this flag alerts us to the need to check for that */
9749 regnode *ret; /* Will be the head of the group. */
9752 regnode *ender = NULL;
9755 U32 oregflags = RExC_flags;
9756 bool have_branch = 0;
9758 I32 freeze_paren = 0;
9759 I32 after_freeze = 0;
9760 I32 num; /* numeric backreferences */
9762 char * parse_start = RExC_parse; /* MJD */
9763 char * const oregcomp_parse = RExC_parse;
9765 GET_RE_DEBUG_FLAGS_DECL;
9767 PERL_ARGS_ASSERT_REG;
9768 DEBUG_PARSE("reg ");
9770 *flagp = 0; /* Tentatively. */
9773 /* Make an OPEN node, if parenthesized. */
9776 /* Under /x, space and comments can be gobbled up between the '(' and
9777 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9778 * intervening space, as the sequence is a token, and a token should be
9780 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9782 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9783 char *start_verb = RExC_parse;
9784 STRLEN verb_len = 0;
9785 char *start_arg = NULL;
9786 unsigned char op = 0;
9788 int internal_argval = 0; /* internal_argval is only useful if
9791 if (has_intervening_patws) {
9793 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9795 while ( *RExC_parse && *RExC_parse != ')' ) {
9796 if ( *RExC_parse == ':' ) {
9797 start_arg = RExC_parse + 1;
9803 verb_len = RExC_parse - start_verb;
9806 while ( *RExC_parse && *RExC_parse != ')' )
9808 if ( *RExC_parse != ')' )
9809 vFAIL("Unterminated verb pattern argument");
9810 if ( RExC_parse == start_arg )
9813 if ( *RExC_parse != ')' )
9814 vFAIL("Unterminated verb pattern");
9817 switch ( *start_verb ) {
9818 case 'A': /* (*ACCEPT) */
9819 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9821 internal_argval = RExC_nestroot;
9824 case 'C': /* (*COMMIT) */
9825 if ( memEQs(start_verb,verb_len,"COMMIT") )
9828 case 'F': /* (*FAIL) */
9829 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9834 case ':': /* (*:NAME) */
9835 case 'M': /* (*MARK:NAME) */
9836 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9841 case 'P': /* (*PRUNE) */
9842 if ( memEQs(start_verb,verb_len,"PRUNE") )
9845 case 'S': /* (*SKIP) */
9846 if ( memEQs(start_verb,verb_len,"SKIP") )
9849 case 'T': /* (*THEN) */
9850 /* [19:06] <TimToady> :: is then */
9851 if ( memEQs(start_verb,verb_len,"THEN") ) {
9853 RExC_seen |= REG_CUTGROUP_SEEN;
9858 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9860 "Unknown verb pattern '%"UTF8f"'",
9861 UTF8fARG(UTF, verb_len, start_verb));
9864 if ( start_arg && internal_argval ) {
9865 vFAIL3("Verb pattern '%.*s' may not have an argument",
9866 verb_len, start_verb);
9867 } else if ( argok < 0 && !start_arg ) {
9868 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9869 verb_len, start_verb);
9871 ret = reganode(pRExC_state, op, internal_argval);
9872 if ( ! internal_argval && ! SIZE_ONLY ) {
9874 SV *sv = newSVpvn( start_arg,
9875 RExC_parse - start_arg);
9876 ARG(ret) = add_data( pRExC_state,
9878 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9885 if (!internal_argval)
9886 RExC_seen |= REG_VERBARG_SEEN;
9887 } else if ( start_arg ) {
9888 vFAIL3("Verb pattern '%.*s' may not have an argument",
9889 verb_len, start_verb);
9891 ret = reg_node(pRExC_state, op);
9893 nextchar(pRExC_state);
9896 else if (*RExC_parse == '?') { /* (?...) */
9897 bool is_logical = 0;
9898 const char * const seqstart = RExC_parse;
9899 const char * endptr;
9900 if (has_intervening_patws) {
9902 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9906 paren = *RExC_parse++;
9907 ret = NULL; /* For look-ahead/behind. */
9910 case 'P': /* (?P...) variants for those used to PCRE/Python */
9911 paren = *RExC_parse++;
9912 if ( paren == '<') /* (?P<...>) named capture */
9914 else if (paren == '>') { /* (?P>name) named recursion */
9915 goto named_recursion;
9917 else if (paren == '=') { /* (?P=...) named backref */
9918 /* this pretty much dupes the code for \k<NAME> in
9919 * regatom(), if you change this make sure you change that
9921 char* name_start = RExC_parse;
9923 SV *sv_dat = reg_scan_name(pRExC_state,
9924 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9925 if (RExC_parse == name_start || *RExC_parse != ')')
9926 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9927 vFAIL2("Sequence %.3s... not terminated",parse_start);
9930 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9931 RExC_rxi->data->data[num]=(void*)sv_dat;
9932 SvREFCNT_inc_simple_void(sv_dat);
9935 ret = reganode(pRExC_state,
9938 : (ASCII_FOLD_RESTRICTED)
9940 : (AT_LEAST_UNI_SEMANTICS)
9948 Set_Node_Offset(ret, parse_start+1);
9949 Set_Node_Cur_Length(ret, parse_start);
9951 nextchar(pRExC_state);
9955 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9956 vFAIL3("Sequence (%.*s...) not recognized",
9957 RExC_parse-seqstart, seqstart);
9958 NOT_REACHED; /*NOTREACHED*/
9959 case '<': /* (?<...) */
9960 if (*RExC_parse == '!')
9962 else if (*RExC_parse != '=')
9968 case '\'': /* (?'...') */
9969 name_start= RExC_parse;
9970 svname = reg_scan_name(pRExC_state,
9971 SIZE_ONLY /* reverse test from the others */
9972 ? REG_RSN_RETURN_NAME
9973 : REG_RSN_RETURN_NULL);
9974 if (RExC_parse == name_start || *RExC_parse != paren)
9975 vFAIL2("Sequence (?%c... not terminated",
9976 paren=='>' ? '<' : paren);
9980 if (!svname) /* shouldn't happen */
9982 "panic: reg_scan_name returned NULL");
9983 if (!RExC_paren_names) {
9984 RExC_paren_names= newHV();
9985 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9987 RExC_paren_name_list= newAV();
9988 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9991 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9993 sv_dat = HeVAL(he_str);
9995 /* croak baby croak */
9997 "panic: paren_name hash element allocation failed");
9998 } else if ( SvPOK(sv_dat) ) {
9999 /* (?|...) can mean we have dupes so scan to check
10000 its already been stored. Maybe a flag indicating
10001 we are inside such a construct would be useful,
10002 but the arrays are likely to be quite small, so
10003 for now we punt -- dmq */
10004 IV count = SvIV(sv_dat);
10005 I32 *pv = (I32*)SvPVX(sv_dat);
10007 for ( i = 0 ; i < count ; i++ ) {
10008 if ( pv[i] == RExC_npar ) {
10014 pv = (I32*)SvGROW(sv_dat,
10015 SvCUR(sv_dat) + sizeof(I32)+1);
10016 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10017 pv[count] = RExC_npar;
10018 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10021 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10022 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10025 SvIV_set(sv_dat, 1);
10028 /* Yes this does cause a memory leak in debugging Perls
10030 if (!av_store(RExC_paren_name_list,
10031 RExC_npar, SvREFCNT_inc(svname)))
10032 SvREFCNT_dec_NN(svname);
10035 /*sv_dump(sv_dat);*/
10037 nextchar(pRExC_state);
10039 goto capturing_parens;
10041 RExC_seen |= REG_LOOKBEHIND_SEEN;
10042 RExC_in_lookbehind++;
10045 case '=': /* (?=...) */
10046 RExC_seen_zerolen++;
10048 case '!': /* (?!...) */
10049 RExC_seen_zerolen++;
10050 if (*RExC_parse == ')') {
10051 ret=reg_node(pRExC_state, OPFAIL);
10052 nextchar(pRExC_state);
10056 case '|': /* (?|...) */
10057 /* branch reset, behave like a (?:...) except that
10058 buffers in alternations share the same numbers */
10060 after_freeze = freeze_paren = RExC_npar;
10062 case ':': /* (?:...) */
10063 case '>': /* (?>...) */
10065 case '$': /* (?$...) */
10066 case '@': /* (?@...) */
10067 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10069 case '0' : /* (?0) */
10070 case 'R' : /* (?R) */
10071 if (*RExC_parse != ')')
10072 FAIL("Sequence (?R) not terminated");
10073 ret = reg_node(pRExC_state, GOSTART);
10074 RExC_seen |= REG_GOSTART_SEEN;
10075 *flagp |= POSTPONED;
10076 nextchar(pRExC_state);
10079 /* named and numeric backreferences */
10080 case '&': /* (?&NAME) */
10081 parse_start = RExC_parse - 1;
10084 SV *sv_dat = reg_scan_name(pRExC_state,
10085 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10086 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10088 if (RExC_parse == RExC_end || *RExC_parse != ')')
10089 vFAIL("Sequence (?&... not terminated");
10090 goto gen_recurse_regop;
10093 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10095 vFAIL("Illegal pattern");
10097 goto parse_recursion;
10099 case '-': /* (?-1) */
10100 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10101 RExC_parse--; /* rewind to let it be handled later */
10105 case '1': case '2': case '3': case '4': /* (?1) */
10106 case '5': case '6': case '7': case '8': case '9':
10110 bool is_neg = FALSE;
10111 parse_start = RExC_parse - 1; /* MJD */
10112 if (*RExC_parse == '-') {
10116 num = grok_atou(RExC_parse, &endptr);
10118 RExC_parse = (char*)endptr;
10120 /* Some limit for num? */
10124 if (*RExC_parse!=')')
10125 vFAIL("Expecting close bracket");
10128 if ( paren == '-' ) {
10130 Diagram of capture buffer numbering.
10131 Top line is the normal capture buffer numbers
10132 Bottom line is the negative indexing as from
10136 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10140 num = RExC_npar + num;
10143 vFAIL("Reference to nonexistent group");
10145 } else if ( paren == '+' ) {
10146 num = RExC_npar + num - 1;
10149 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10151 if (num > (I32)RExC_rx->nparens) {
10153 vFAIL("Reference to nonexistent group");
10155 RExC_recurse_count++;
10156 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10157 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10158 22, "| |", (int)(depth * 2 + 1), "",
10159 (UV)ARG(ret), (IV)ARG2L(ret)));
10161 RExC_seen |= REG_RECURSE_SEEN;
10162 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10163 Set_Node_Offset(ret, parse_start); /* MJD */
10165 *flagp |= POSTPONED;
10166 nextchar(pRExC_state);
10171 case '?': /* (??...) */
10173 if (*RExC_parse != '{') {
10175 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10177 "Sequence (%"UTF8f"...) not recognized",
10178 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10179 NOT_REACHED; /*NOTREACHED*/
10181 *flagp |= POSTPONED;
10182 paren = *RExC_parse++;
10184 case '{': /* (?{...}) */
10187 struct reg_code_block *cb;
10189 RExC_seen_zerolen++;
10191 if ( !pRExC_state->num_code_blocks
10192 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10193 || pRExC_state->code_blocks[pRExC_state->code_index].start
10194 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10197 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10198 FAIL("panic: Sequence (?{...}): no code block found\n");
10199 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10201 /* this is a pre-compiled code block (?{...}) */
10202 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10203 RExC_parse = RExC_start + cb->end;
10206 if (cb->src_regex) {
10207 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10208 RExC_rxi->data->data[n] =
10209 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10210 RExC_rxi->data->data[n+1] = (void*)o;
10213 n = add_data(pRExC_state,
10214 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10215 RExC_rxi->data->data[n] = (void*)o;
10218 pRExC_state->code_index++;
10219 nextchar(pRExC_state);
10223 ret = reg_node(pRExC_state, LOGICAL);
10225 eval = reg2Lanode(pRExC_state, EVAL,
10228 /* for later propagation into (??{})
10230 RExC_flags & RXf_PMf_COMPILETIME
10235 REGTAIL(pRExC_state, ret, eval);
10236 /* deal with the length of this later - MJD */
10239 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10240 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10241 Set_Node_Offset(ret, parse_start);
10244 case '(': /* (?(?{...})...) and (?(?=...)...) */
10247 const int DEFINE_len = sizeof("DEFINE") - 1;
10248 if (RExC_parse[0] == '?') { /* (?(?...)) */
10249 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10250 || RExC_parse[1] == '<'
10251 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10255 ret = reg_node(pRExC_state, LOGICAL);
10259 tail = reg(pRExC_state, 1, &flag, depth+1);
10260 if (flag & RESTART_UTF8) {
10261 *flagp = RESTART_UTF8;
10264 REGTAIL(pRExC_state, ret, tail);
10267 /* Fall through to ‘Unknown switch condition’ at the
10268 end of the if/else chain. */
10270 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10271 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10273 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10274 char *name_start= RExC_parse++;
10276 SV *sv_dat=reg_scan_name(pRExC_state,
10277 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10278 if (RExC_parse == name_start || *RExC_parse != ch)
10279 vFAIL2("Sequence (?(%c... not terminated",
10280 (ch == '>' ? '<' : ch));
10283 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10284 RExC_rxi->data->data[num]=(void*)sv_dat;
10285 SvREFCNT_inc_simple_void(sv_dat);
10287 ret = reganode(pRExC_state,NGROUPP,num);
10288 goto insert_if_check_paren;
10290 else if (strnEQ(RExC_parse, "DEFINE",
10291 MIN(DEFINE_len, RExC_end - RExC_parse)))
10293 ret = reganode(pRExC_state,DEFINEP,0);
10294 RExC_parse += DEFINE_len;
10296 goto insert_if_check_paren;
10298 else if (RExC_parse[0] == 'R') {
10301 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10302 parno = grok_atou(RExC_parse, &endptr);
10304 RExC_parse = (char*)endptr;
10305 } else if (RExC_parse[0] == '&') {
10308 sv_dat = reg_scan_name(pRExC_state,
10310 ? REG_RSN_RETURN_NULL
10311 : REG_RSN_RETURN_DATA);
10312 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10314 ret = reganode(pRExC_state,INSUBP,parno);
10315 goto insert_if_check_paren;
10317 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10321 parno = grok_atou(RExC_parse, &endptr);
10323 RExC_parse = (char*)endptr;
10324 ret = reganode(pRExC_state, GROUPP, parno);
10326 insert_if_check_paren:
10327 if (*(tmp = nextchar(pRExC_state)) != ')') {
10328 /* nextchar also skips comments, so undo its work
10329 * and skip over the the next character.
10332 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10333 vFAIL("Switch condition not recognized");
10336 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10337 br = regbranch(pRExC_state, &flags, 1,depth+1);
10339 if (flags & RESTART_UTF8) {
10340 *flagp = RESTART_UTF8;
10343 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10346 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10348 c = *nextchar(pRExC_state);
10349 if (flags&HASWIDTH)
10350 *flagp |= HASWIDTH;
10353 vFAIL("(?(DEFINE)....) does not allow branches");
10355 /* Fake one for optimizer. */
10356 lastbr = reganode(pRExC_state, IFTHEN, 0);
10358 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10359 if (flags & RESTART_UTF8) {
10360 *flagp = RESTART_UTF8;
10363 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10366 REGTAIL(pRExC_state, ret, lastbr);
10367 if (flags&HASWIDTH)
10368 *flagp |= HASWIDTH;
10369 c = *nextchar(pRExC_state);
10374 if (RExC_parse>RExC_end)
10375 vFAIL("Switch (?(condition)... not terminated");
10377 vFAIL("Switch (?(condition)... contains too many branches");
10379 ender = reg_node(pRExC_state, TAIL);
10380 REGTAIL(pRExC_state, br, ender);
10382 REGTAIL(pRExC_state, lastbr, ender);
10383 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10386 REGTAIL(pRExC_state, ret, ender);
10387 RExC_size++; /* XXX WHY do we need this?!!
10388 For large programs it seems to be required
10389 but I can't figure out why. -- dmq*/
10392 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10393 vFAIL("Unknown switch condition (?(...))");
10395 case '[': /* (?[ ... ]) */
10396 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10399 RExC_parse--; /* for vFAIL to print correctly */
10400 vFAIL("Sequence (? incomplete");
10402 default: /* e.g., (?i) */
10405 parse_lparen_question_flags(pRExC_state);
10406 if (UCHARAT(RExC_parse) != ':') {
10407 nextchar(pRExC_state);
10412 nextchar(pRExC_state);
10422 ret = reganode(pRExC_state, OPEN, parno);
10424 if (!RExC_nestroot)
10425 RExC_nestroot = parno;
10426 if (RExC_seen & REG_RECURSE_SEEN
10427 && !RExC_open_parens[parno-1])
10429 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10430 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10431 22, "| |", (int)(depth * 2 + 1), "",
10432 (IV)parno, REG_NODE_NUM(ret)));
10433 RExC_open_parens[parno-1]= ret;
10436 Set_Node_Length(ret, 1); /* MJD */
10437 Set_Node_Offset(ret, RExC_parse); /* MJD */
10445 /* Pick up the branches, linking them together. */
10446 parse_start = RExC_parse; /* MJD */
10447 br = regbranch(pRExC_state, &flags, 1,depth+1);
10449 /* branch_len = (paren != 0); */
10452 if (flags & RESTART_UTF8) {
10453 *flagp = RESTART_UTF8;
10456 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10458 if (*RExC_parse == '|') {
10459 if (!SIZE_ONLY && RExC_extralen) {
10460 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10463 reginsert(pRExC_state, BRANCH, br, depth+1);
10464 Set_Node_Length(br, paren != 0);
10465 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10469 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10471 else if (paren == ':') {
10472 *flagp |= flags&SIMPLE;
10474 if (is_open) { /* Starts with OPEN. */
10475 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10477 else if (paren != '?') /* Not Conditional */
10479 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10481 while (*RExC_parse == '|') {
10482 if (!SIZE_ONLY && RExC_extralen) {
10483 ender = reganode(pRExC_state, LONGJMP,0);
10485 /* Append to the previous. */
10486 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10489 RExC_extralen += 2; /* Account for LONGJMP. */
10490 nextchar(pRExC_state);
10491 if (freeze_paren) {
10492 if (RExC_npar > after_freeze)
10493 after_freeze = RExC_npar;
10494 RExC_npar = freeze_paren;
10496 br = regbranch(pRExC_state, &flags, 0, depth+1);
10499 if (flags & RESTART_UTF8) {
10500 *flagp = RESTART_UTF8;
10503 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10505 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10507 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10510 if (have_branch || paren != ':') {
10511 /* Make a closing node, and hook it on the end. */
10514 ender = reg_node(pRExC_state, TAIL);
10517 ender = reganode(pRExC_state, CLOSE, parno);
10518 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10519 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10520 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10521 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10522 RExC_close_parens[parno-1]= ender;
10523 if (RExC_nestroot == parno)
10526 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10527 Set_Node_Length(ender,1); /* MJD */
10533 *flagp &= ~HASWIDTH;
10536 ender = reg_node(pRExC_state, SUCCEED);
10539 ender = reg_node(pRExC_state, END);
10541 assert(!RExC_opend); /* there can only be one! */
10542 RExC_opend = ender;
10546 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10547 DEBUG_PARSE_MSG("lsbr");
10548 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10549 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10550 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10551 SvPV_nolen_const(RExC_mysv1),
10552 (IV)REG_NODE_NUM(lastbr),
10553 SvPV_nolen_const(RExC_mysv2),
10554 (IV)REG_NODE_NUM(ender),
10555 (IV)(ender - lastbr)
10558 REGTAIL(pRExC_state, lastbr, ender);
10560 if (have_branch && !SIZE_ONLY) {
10561 char is_nothing= 1;
10563 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10565 /* Hook the tails of the branches to the closing node. */
10566 for (br = ret; br; br = regnext(br)) {
10567 const U8 op = PL_regkind[OP(br)];
10568 if (op == BRANCH) {
10569 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10570 if ( OP(NEXTOPER(br)) != NOTHING
10571 || regnext(NEXTOPER(br)) != ender)
10574 else if (op == BRANCHJ) {
10575 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10576 /* for now we always disable this optimisation * /
10577 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10578 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10584 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10585 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10586 DEBUG_PARSE_MSG("NADA");
10587 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10588 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10589 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10590 SvPV_nolen_const(RExC_mysv1),
10591 (IV)REG_NODE_NUM(ret),
10592 SvPV_nolen_const(RExC_mysv2),
10593 (IV)REG_NODE_NUM(ender),
10598 if (OP(ender) == TAIL) {
10603 for ( opt= br + 1; opt < ender ; opt++ )
10604 OP(opt)= OPTIMIZED;
10605 NEXT_OFF(br)= ender - br;
10613 static const char parens[] = "=!<,>";
10615 if (paren && (p = strchr(parens, paren))) {
10616 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10617 int flag = (p - parens) > 1;
10620 node = SUSPEND, flag = 0;
10621 reginsert(pRExC_state, node,ret, depth+1);
10622 Set_Node_Cur_Length(ret, parse_start);
10623 Set_Node_Offset(ret, parse_start + 1);
10625 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10629 /* Check for proper termination. */
10631 /* restore original flags, but keep (?p) */
10632 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10633 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10634 RExC_parse = oregcomp_parse;
10635 vFAIL("Unmatched (");
10638 else if (!paren && RExC_parse < RExC_end) {
10639 if (*RExC_parse == ')') {
10641 vFAIL("Unmatched )");
10644 FAIL("Junk on end of regexp"); /* "Can't happen". */
10645 NOT_REACHED; /* NOTREACHED */
10648 if (RExC_in_lookbehind) {
10649 RExC_in_lookbehind--;
10651 if (after_freeze > RExC_npar)
10652 RExC_npar = after_freeze;
10657 - regbranch - one alternative of an | operator
10659 * Implements the concatenation operator.
10661 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10665 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10668 regnode *chain = NULL;
10670 I32 flags = 0, c = 0;
10671 GET_RE_DEBUG_FLAGS_DECL;
10673 PERL_ARGS_ASSERT_REGBRANCH;
10675 DEBUG_PARSE("brnc");
10680 if (!SIZE_ONLY && RExC_extralen)
10681 ret = reganode(pRExC_state, BRANCHJ,0);
10683 ret = reg_node(pRExC_state, BRANCH);
10684 Set_Node_Length(ret, 1);
10688 if (!first && SIZE_ONLY)
10689 RExC_extralen += 1; /* BRANCHJ */
10691 *flagp = WORST; /* Tentatively. */
10694 nextchar(pRExC_state);
10695 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10696 flags &= ~TRYAGAIN;
10697 latest = regpiece(pRExC_state, &flags,depth+1);
10698 if (latest == NULL) {
10699 if (flags & TRYAGAIN)
10701 if (flags & RESTART_UTF8) {
10702 *flagp = RESTART_UTF8;
10705 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10707 else if (ret == NULL)
10709 *flagp |= flags&(HASWIDTH|POSTPONED);
10710 if (chain == NULL) /* First piece. */
10711 *flagp |= flags&SPSTART;
10714 REGTAIL(pRExC_state, chain, latest);
10719 if (chain == NULL) { /* Loop ran zero times. */
10720 chain = reg_node(pRExC_state, NOTHING);
10725 *flagp |= flags&SIMPLE;
10732 - regpiece - something followed by possible [*+?]
10734 * Note that the branching code sequences used for ? and the general cases
10735 * of * and + are somewhat optimized: they use the same NOTHING node as
10736 * both the endmarker for their branch list and the body of the last branch.
10737 * It might seem that this node could be dispensed with entirely, but the
10738 * endmarker role is not redundant.
10740 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10742 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10746 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10752 const char * const origparse = RExC_parse;
10754 I32 max = REG_INFTY;
10755 #ifdef RE_TRACK_PATTERN_OFFSETS
10758 const char *maxpos = NULL;
10760 /* Save the original in case we change the emitted regop to a FAIL. */
10761 regnode * const orig_emit = RExC_emit;
10763 GET_RE_DEBUG_FLAGS_DECL;
10765 PERL_ARGS_ASSERT_REGPIECE;
10767 DEBUG_PARSE("piec");
10769 ret = regatom(pRExC_state, &flags,depth+1);
10771 if (flags & (TRYAGAIN|RESTART_UTF8))
10772 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10774 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10780 if (op == '{' && regcurly(RExC_parse)) {
10782 #ifdef RE_TRACK_PATTERN_OFFSETS
10783 parse_start = RExC_parse; /* MJD */
10785 next = RExC_parse + 1;
10786 while (isDIGIT(*next) || *next == ',') {
10787 if (*next == ',') {
10795 if (*next == '}') { /* got one */
10796 const char* endptr;
10800 min = grok_atou(RExC_parse, &endptr);
10801 if (*maxpos == ',')
10804 maxpos = RExC_parse;
10805 max = grok_atou(maxpos, &endptr);
10806 if (!max && *maxpos != '0')
10807 max = REG_INFTY; /* meaning "infinity" */
10808 else if (max >= REG_INFTY)
10809 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10811 nextchar(pRExC_state);
10812 if (max < min) { /* If can't match, warn and optimize to fail
10816 /* We can't back off the size because we have to reserve
10817 * enough space for all the things we are about to throw
10818 * away, but we can shrink it by the ammount we are about
10819 * to re-use here */
10820 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10823 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10824 RExC_emit = orig_emit;
10826 ret = reg_node(pRExC_state, OPFAIL);
10829 else if (min == max
10830 && RExC_parse < RExC_end
10831 && (*RExC_parse == '?' || *RExC_parse == '+'))
10834 ckWARN2reg(RExC_parse + 1,
10835 "Useless use of greediness modifier '%c'",
10838 /* Absorb the modifier, so later code doesn't see nor use
10840 nextchar(pRExC_state);
10844 if ((flags&SIMPLE)) {
10845 RExC_naughty += 2 + RExC_naughty / 2;
10846 reginsert(pRExC_state, CURLY, ret, depth+1);
10847 Set_Node_Offset(ret, parse_start+1); /* MJD */
10848 Set_Node_Cur_Length(ret, parse_start);
10851 regnode * const w = reg_node(pRExC_state, WHILEM);
10854 REGTAIL(pRExC_state, ret, w);
10855 if (!SIZE_ONLY && RExC_extralen) {
10856 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10857 reginsert(pRExC_state, NOTHING,ret, depth+1);
10858 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10860 reginsert(pRExC_state, CURLYX,ret, depth+1);
10862 Set_Node_Offset(ret, parse_start+1);
10863 Set_Node_Length(ret,
10864 op == '{' ? (RExC_parse - parse_start) : 1);
10866 if (!SIZE_ONLY && RExC_extralen)
10867 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10868 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10870 RExC_whilem_seen++, RExC_extralen += 3;
10871 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10878 *flagp |= HASWIDTH;
10880 ARG1_SET(ret, (U16)min);
10881 ARG2_SET(ret, (U16)max);
10883 if (max == REG_INFTY)
10884 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10890 if (!ISMULT1(op)) {
10895 #if 0 /* Now runtime fix should be reliable. */
10897 /* if this is reinstated, don't forget to put this back into perldiag:
10899 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10901 (F) The part of the regexp subject to either the * or + quantifier
10902 could match an empty string. The {#} shows in the regular
10903 expression about where the problem was discovered.
10907 if (!(flags&HASWIDTH) && op != '?')
10908 vFAIL("Regexp *+ operand could be empty");
10911 #ifdef RE_TRACK_PATTERN_OFFSETS
10912 parse_start = RExC_parse;
10914 nextchar(pRExC_state);
10916 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10918 if (op == '*' && (flags&SIMPLE)) {
10919 reginsert(pRExC_state, STAR, ret, depth+1);
10922 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10924 else if (op == '*') {
10928 else if (op == '+' && (flags&SIMPLE)) {
10929 reginsert(pRExC_state, PLUS, ret, depth+1);
10932 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10934 else if (op == '+') {
10938 else if (op == '?') {
10943 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10944 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10945 ckWARN2reg(RExC_parse,
10946 "%"UTF8f" matches null string many times",
10947 UTF8fARG(UTF, (RExC_parse >= origparse
10948 ? RExC_parse - origparse
10951 (void)ReREFCNT_inc(RExC_rx_sv);
10954 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10955 nextchar(pRExC_state);
10956 reginsert(pRExC_state, MINMOD, ret, depth+1);
10957 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10960 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10962 nextchar(pRExC_state);
10963 ender = reg_node(pRExC_state, SUCCEED);
10964 REGTAIL(pRExC_state, ret, ender);
10965 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10967 ender = reg_node(pRExC_state, TAIL);
10968 REGTAIL(pRExC_state, ret, ender);
10971 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10973 vFAIL("Nested quantifiers");
10980 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10981 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10985 /* This is expected to be called by a parser routine that has recognized '\N'
10986 and needs to handle the rest. RExC_parse is expected to point at the first
10987 char following the N at the time of the call. On successful return,
10988 RExC_parse has been updated to point to just after the sequence identified
10989 by this routine, <*flagp> has been updated, and the non-NULL input pointers
10990 have been set appropriately.
10992 The typical case for this is \N{some character name}. This is usually
10993 called while parsing the input, filling in or ready to fill in an EXACTish
10994 node, and the code point for the character should be returned, so that it
10995 can be added to the node, and parsing continued with the next input
10996 character. But it may be that instead of a single character the \N{}
10997 expands to more than one, a named sequence. In this case any following
10998 quantifier applies to the whole sequence, and it is easier, given the code
10999 structure that calls this, to handle it from a different area of the code.
11000 For this reason, the input parameters can be set so that it returns valid
11001 only on one or the other of these cases.
11003 Another possibility is for the input to be an empty \N{}, which for
11004 backwards compatibility we accept, but generate a NOTHING node which should
11005 later get optimized out. This is handled from the area of code which can
11006 handle a named sequence, so if called with the parameters for the other, it
11009 Still another possibility is for the \N to mean [^\n], and not a single
11010 character or explicit sequence at all. This is determined by context.
11011 Again, this is handled from the area of code which can handle a named
11012 sequence, so if called with the parameters for the other, it also fails.
11014 And the final possibility is for the \N to be called from within a bracketed
11015 character class. In this case the [^\n] meaning makes no sense, and so is
11016 an error. Other anomalous situations are left to the calling code to handle.
11018 For non-single-quoted regexes, the tokenizer has attempted to decide which
11019 of the above applies, and in the case of a named sequence, has converted it
11020 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11021 where c1... are the characters in the sequence. For single-quoted regexes,
11022 the tokenizer passes the \N sequence through unchanged; this code will not
11023 attempt to determine this nor expand those, instead raising a syntax error.
11024 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11025 or there is no '}', it signals that this \N occurrence means to match a
11026 non-newline. (This mostly was done because of [perl #56444].)
11028 The API is somewhat convoluted due to historical and the above reasons.
11030 The function raises an error (via vFAIL), and doesn't return for various
11031 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
11032 it returns a count of how many characters were accounted for by it. (This
11033 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11034 points in the sequence. It sets <node_p>, <valuep>, and/or
11035 <substitute_parse> on success.
11037 If <valuep> is non-null, it means the caller can accept an input sequence
11038 consisting of a just a single code point; <*valuep> is set to the value
11039 of the only or first code point in the input.
11041 If <substitute_parse> is non-null, it means the caller can accept an input
11042 sequence consisting of one or more code points; <*substitute_parse> is a
11043 newly created mortal SV* in this case, containing \x{} escapes representing
11046 Both <valuep> and <substitute_parse> can be non-NULL.
11048 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
11049 that the caller can accept any legal sequence other than a single code
11050 point. To wit, <*node_p> is set as follows:
11051 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11052 2) \N{}: points to a new NOTHING node; return is 0
11053 3) otherwise: points to a new EXACT node containing the resolved
11054 string; return is the number of code points in the
11055 string. This will never be 1.
11056 Note that failure is returned for single code point sequences if <valuep> is
11057 null and <node_p> is not.
11060 char * endbrace; /* '}' following the name */
11062 char *endchar; /* Points to '.' or '}' ending cur char in the input
11064 bool has_multiple_chars; /* true if the input stream contains a sequence of
11065 more than one character */
11066 bool in_char_class = substitute_parse != NULL;
11067 STRLEN count = 0; /* Number of characters in this sequence */
11069 GET_RE_DEBUG_FLAGS_DECL;
11071 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11073 GET_RE_DEBUG_FLAGS;
11075 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
11076 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11078 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11079 * modifier. The other meaning does not, so use a temporary until we find
11080 * out which we are being called with */
11081 p = (RExC_flags & RXf_PMf_EXTENDED)
11082 ? regpatws(pRExC_state, RExC_parse,
11083 TRUE) /* means recognize comments */
11086 /* Disambiguate between \N meaning a named character versus \N meaning
11087 * [^\n]. The former is assumed when it can't be the latter. */
11088 if (*p != '{' || regcurly(p)) {
11091 /* no bare \N allowed in a charclass */
11092 if (in_char_class) {
11093 vFAIL("\\N in a character class must be a named character: \\N{...}");
11095 return (STRLEN) -1;
11097 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11099 nextchar(pRExC_state);
11100 *node_p = reg_node(pRExC_state, REG_ANY);
11101 *flagp |= HASWIDTH|SIMPLE;
11103 Set_Node_Length(*node_p, 1); /* MJD */
11107 /* Here, we have decided it should be a named character or sequence */
11109 /* The test above made sure that the next real character is a '{', but
11110 * under the /x modifier, it could be separated by space (or a comment and
11111 * \n) and this is not allowed (for consistency with \x{...} and the
11112 * tokenizer handling of \N{NAME}). */
11113 if (*RExC_parse != '{') {
11114 vFAIL("Missing braces on \\N{}");
11117 RExC_parse++; /* Skip past the '{' */
11119 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11120 || ! (endbrace == RExC_parse /* nothing between the {} */
11121 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
11123 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
11126 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11127 vFAIL("\\N{NAME} must be resolved by the lexer");
11130 if (endbrace == RExC_parse) { /* empty: \N{} */
11132 *node_p = reg_node(pRExC_state,NOTHING);
11134 else if (! in_char_class) {
11135 return (STRLEN) -1;
11137 nextchar(pRExC_state);
11141 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11142 RExC_parse += 2; /* Skip past the 'U+' */
11144 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11146 /* Code points are separated by dots. If none, there is only one code
11147 * point, and is terminated by the brace */
11148 has_multiple_chars = (endchar < endbrace);
11150 /* We get the first code point if we want it, and either there is only one,
11151 * or we can accept both cases of one and more than one */
11152 if (valuep && (substitute_parse || ! has_multiple_chars)) {
11153 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11154 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11155 | PERL_SCAN_DISALLOW_PREFIX
11157 /* No errors in the first pass (See [perl
11158 * #122671].) We let the code below find the
11159 * errors when there are multiple chars. */
11160 | ((SIZE_ONLY || has_multiple_chars)
11161 ? PERL_SCAN_SILENT_ILLDIGIT
11164 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11166 /* The tokenizer should have guaranteed validity, but it's possible to
11167 * bypass it by using single quoting, so check. Don't do the check
11168 * here when there are multiple chars; we do it below anyway. */
11169 if (! has_multiple_chars) {
11170 if (length_of_hex == 0
11171 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11173 RExC_parse += length_of_hex; /* Includes all the valid */
11174 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11175 ? UTF8SKIP(RExC_parse)
11177 /* Guard against malformed utf8 */
11178 if (RExC_parse >= endchar) {
11179 RExC_parse = endchar;
11181 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11184 RExC_parse = endbrace + 1;
11189 /* Here, we should have already handled the case where a single character
11190 * is expected and found. So it is a failure if we aren't expecting
11191 * multiple chars and got them; or didn't get them but wanted them. We
11192 * fail without advancing the parse, so that the caller can try again with
11193 * different acceptance criteria */
11194 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11196 return (STRLEN) -1;
11201 /* What is done here is to convert this to a sub-pattern of the form
11202 * \x{char1}\x{char2}...
11203 * and then either return it in <*substitute_parse> if non-null; or
11204 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
11205 * way, it retains its atomicness, while not having to worry about
11206 * special handling that some code points may have. toke.c has
11207 * converted the original Unicode values to native, so that we can just
11208 * pass on the hex values unchanged. We do have to set a flag to keep
11209 * recoding from happening in the recursion */
11213 char *orig_end = RExC_end;
11216 if (substitute_parse) {
11217 *substitute_parse = newSVpvs("");
11220 substitute_parse = &dummy;
11221 *substitute_parse = newSVpvs("?:");
11223 *substitute_parse = sv_2mortal(*substitute_parse);
11225 while (RExC_parse < endbrace) {
11227 /* Convert to notation the rest of the code understands */
11228 sv_catpv(*substitute_parse, "\\x{");
11229 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11230 sv_catpv(*substitute_parse, "}");
11232 /* Point to the beginning of the next character in the sequence. */
11233 RExC_parse = endchar + 1;
11234 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11238 if (! in_char_class) {
11239 sv_catpv(*substitute_parse, ")");
11242 RExC_parse = SvPV(*substitute_parse, len);
11244 /* Don't allow empty number */
11245 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11246 RExC_parse = endbrace;
11247 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11249 RExC_end = RExC_parse + len;
11251 /* The values are Unicode, and therefore not subject to recoding */
11252 RExC_override_recoding = 1;
11255 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11256 if (flags & RESTART_UTF8) {
11257 *flagp = RESTART_UTF8;
11258 return (STRLEN) -1;
11260 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11263 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11266 RExC_parse = endbrace;
11267 RExC_end = orig_end;
11268 RExC_override_recoding = 0;
11270 nextchar(pRExC_state);
11280 * It returns the code point in utf8 for the value in *encp.
11281 * value: a code value in the source encoding
11282 * encp: a pointer to an Encode object
11284 * If the result from Encode is not a single character,
11285 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11288 S_reg_recode(pTHX_ const char value, SV **encp)
11291 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11292 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11293 const STRLEN newlen = SvCUR(sv);
11294 UV uv = UNICODE_REPLACEMENT;
11296 PERL_ARGS_ASSERT_REG_RECODE;
11300 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11303 if (!newlen || numlen != newlen) {
11304 uv = UNICODE_REPLACEMENT;
11310 PERL_STATIC_INLINE U8
11311 S_compute_EXACTish(RExC_state_t *pRExC_state)
11315 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11321 op = get_regex_charset(RExC_flags);
11322 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11323 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11324 been, so there is no hole */
11327 return op + EXACTF;
11330 PERL_STATIC_INLINE void
11331 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11332 regnode *node, I32* flagp, STRLEN len, UV code_point,
11335 /* This knows the details about sizing an EXACTish node, setting flags for
11336 * it (by setting <*flagp>, and potentially populating it with a single
11339 * If <len> (the length in bytes) is non-zero, this function assumes that
11340 * the node has already been populated, and just does the sizing. In this
11341 * case <code_point> should be the final code point that has already been
11342 * placed into the node. This value will be ignored except that under some
11343 * circumstances <*flagp> is set based on it.
11345 * If <len> is zero, the function assumes that the node is to contain only
11346 * the single character given by <code_point> and calculates what <len>
11347 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11348 * additionally will populate the node's STRING with <code_point> or its
11351 * In both cases <*flagp> is appropriately set
11353 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11354 * 255, must be folded (the former only when the rules indicate it can
11357 * When it does the populating, it looks at the flag 'downgradable'. If
11358 * true with a node that folds, it checks if the single code point
11359 * participates in a fold, and if not downgrades the node to an EXACT.
11360 * This helps the optimizer */
11362 bool len_passed_in = cBOOL(len != 0);
11363 U8 character[UTF8_MAXBYTES_CASE+1];
11365 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11367 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11368 * sizing difference, and is extra work that is thrown away */
11369 if (downgradable && ! PASS2) {
11370 downgradable = FALSE;
11373 if (! len_passed_in) {
11375 if (UVCHR_IS_INVARIANT(code_point)) {
11376 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11377 *character = (U8) code_point;
11379 else { /* Here is /i and not /l. (toFOLD() is defined on just
11380 ASCII, which isn't the same thing as INVARIANT on
11381 EBCDIC, but it works there, as the extra invariants
11382 fold to themselves) */
11383 *character = toFOLD((U8) code_point);
11385 /* We can downgrade to an EXACT node if this character
11386 * isn't a folding one. Note that this assumes that
11387 * nothing above Latin1 folds to some other invariant than
11388 * one of these alphabetics; otherwise we would also have
11390 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11391 * || ASCII_FOLD_RESTRICTED))
11393 if (downgradable && PL_fold[code_point] == code_point) {
11399 else if (FOLD && (! LOC
11400 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11401 { /* Folding, and ok to do so now */
11402 UV folded = _to_uni_fold_flags(
11406 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11407 ? FOLD_FLAGS_NOMIX_ASCII
11410 && folded == code_point /* This quickly rules out many
11411 cases, avoiding the
11412 _invlist_contains_cp() overhead
11414 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11419 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11421 /* Not folding this cp, and can output it directly */
11422 *character = UTF8_TWO_BYTE_HI(code_point);
11423 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11427 uvchr_to_utf8( character, code_point);
11428 len = UTF8SKIP(character);
11430 } /* Else pattern isn't UTF8. */
11432 *character = (U8) code_point;
11434 } /* Else is folded non-UTF8 */
11435 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11437 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11438 * comments at join_exact()); */
11439 *character = (U8) code_point;
11442 /* Can turn into an EXACT node if we know the fold at compile time,
11443 * and it folds to itself and doesn't particpate in other folds */
11446 && PL_fold_latin1[code_point] == code_point
11447 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11448 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11452 } /* else is Sharp s. May need to fold it */
11453 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11455 *(character + 1) = 's';
11459 *character = LATIN_SMALL_LETTER_SHARP_S;
11465 RExC_size += STR_SZ(len);
11468 RExC_emit += STR_SZ(len);
11469 STR_LEN(node) = len;
11470 if (! len_passed_in) {
11471 Copy((char *) character, STRING(node), len, char);
11475 *flagp |= HASWIDTH;
11477 /* A single character node is SIMPLE, except for the special-cased SHARP S
11479 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11480 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11481 || ! FOLD || ! DEPENDS_SEMANTICS))
11486 /* The OP may not be well defined in PASS1 */
11487 if (PASS2 && OP(node) == EXACTFL) {
11488 RExC_contains_locale = 1;
11493 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11494 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11497 S_backref_value(char *p)
11499 const char* endptr;
11500 UV val = grok_atou(p, &endptr);
11501 if (endptr == p || endptr == NULL || val > I32_MAX)
11508 - regatom - the lowest level
11510 Try to identify anything special at the start of the pattern. If there
11511 is, then handle it as required. This may involve generating a single regop,
11512 such as for an assertion; or it may involve recursing, such as to
11513 handle a () structure.
11515 If the string doesn't start with something special then we gobble up
11516 as much literal text as we can.
11518 Once we have been able to handle whatever type of thing started the
11519 sequence, we return.
11521 Note: we have to be careful with escapes, as they can be both literal
11522 and special, and in the case of \10 and friends, context determines which.
11524 A summary of the code structure is:
11526 switch (first_byte) {
11527 cases for each special:
11528 handle this special;
11531 switch (2nd byte) {
11532 cases for each unambiguous special:
11533 handle this special;
11535 cases for each ambigous special/literal:
11537 if (special) handle here
11539 default: // unambiguously literal:
11542 default: // is a literal char
11545 create EXACTish node for literal;
11546 while (more input and node isn't full) {
11547 switch (input_byte) {
11548 cases for each special;
11549 make sure parse pointer is set so that the next call to
11550 regatom will see this special first
11551 goto loopdone; // EXACTish node terminated by prev. char
11553 append char to EXACTISH node;
11555 get next input byte;
11559 return the generated node;
11561 Specifically there are two separate switches for handling
11562 escape sequences, with the one for handling literal escapes requiring
11563 a dummy entry for all of the special escapes that are actually handled
11566 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11568 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11570 Otherwise does not return NULL.
11574 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11576 regnode *ret = NULL;
11578 char *parse_start = RExC_parse;
11583 GET_RE_DEBUG_FLAGS_DECL;
11585 *flagp = WORST; /* Tentatively. */
11587 DEBUG_PARSE("atom");
11589 PERL_ARGS_ASSERT_REGATOM;
11592 switch ((U8)*RExC_parse) {
11594 RExC_seen_zerolen++;
11595 nextchar(pRExC_state);
11596 if (RExC_flags & RXf_PMf_MULTILINE)
11597 ret = reg_node(pRExC_state, MBOL);
11599 ret = reg_node(pRExC_state, SBOL);
11600 Set_Node_Length(ret, 1); /* MJD */
11603 nextchar(pRExC_state);
11605 RExC_seen_zerolen++;
11606 if (RExC_flags & RXf_PMf_MULTILINE)
11607 ret = reg_node(pRExC_state, MEOL);
11609 ret = reg_node(pRExC_state, SEOL);
11610 Set_Node_Length(ret, 1); /* MJD */
11613 nextchar(pRExC_state);
11614 if (RExC_flags & RXf_PMf_SINGLELINE)
11615 ret = reg_node(pRExC_state, SANY);
11617 ret = reg_node(pRExC_state, REG_ANY);
11618 *flagp |= HASWIDTH|SIMPLE;
11620 Set_Node_Length(ret, 1); /* MJD */
11624 char * const oregcomp_parse = ++RExC_parse;
11625 ret = regclass(pRExC_state, flagp,depth+1,
11626 FALSE, /* means parse the whole char class */
11627 TRUE, /* allow multi-char folds */
11628 FALSE, /* don't silence non-portable warnings. */
11630 if (*RExC_parse != ']') {
11631 RExC_parse = oregcomp_parse;
11632 vFAIL("Unmatched [");
11635 if (*flagp & RESTART_UTF8)
11637 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11640 nextchar(pRExC_state);
11641 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11645 nextchar(pRExC_state);
11646 ret = reg(pRExC_state, 2, &flags,depth+1);
11648 if (flags & TRYAGAIN) {
11649 if (RExC_parse == RExC_end) {
11650 /* Make parent create an empty node if needed. */
11651 *flagp |= TRYAGAIN;
11656 if (flags & RESTART_UTF8) {
11657 *flagp = RESTART_UTF8;
11660 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11663 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11667 if (flags & TRYAGAIN) {
11668 *flagp |= TRYAGAIN;
11671 vFAIL("Internal urp");
11672 /* Supposed to be caught earlier. */
11678 vFAIL("Quantifier follows nothing");
11683 This switch handles escape sequences that resolve to some kind
11684 of special regop and not to literal text. Escape sequnces that
11685 resolve to literal text are handled below in the switch marked
11688 Every entry in this switch *must* have a corresponding entry
11689 in the literal escape switch. However, the opposite is not
11690 required, as the default for this switch is to jump to the
11691 literal text handling code.
11693 switch ((U8)*++RExC_parse) {
11694 /* Special Escapes */
11696 RExC_seen_zerolen++;
11697 ret = reg_node(pRExC_state, SBOL);
11698 /* SBOL is shared with /^/ so we set the flags so we can tell
11699 * /\A/ from /^/ in split. We check ret because first pass we
11700 * have no regop struct to set the flags on. */
11704 goto finish_meta_pat;
11706 ret = reg_node(pRExC_state, GPOS);
11707 RExC_seen |= REG_GPOS_SEEN;
11709 goto finish_meta_pat;
11711 RExC_seen_zerolen++;
11712 ret = reg_node(pRExC_state, KEEPS);
11714 /* XXX:dmq : disabling in-place substitution seems to
11715 * be necessary here to avoid cases of memory corruption, as
11716 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11718 RExC_seen |= REG_LOOKBEHIND_SEEN;
11719 goto finish_meta_pat;
11721 ret = reg_node(pRExC_state, SEOL);
11723 RExC_seen_zerolen++; /* Do not optimize RE away */
11724 goto finish_meta_pat;
11726 ret = reg_node(pRExC_state, EOS);
11728 RExC_seen_zerolen++; /* Do not optimize RE away */
11729 goto finish_meta_pat;
11731 ret = reg_node(pRExC_state, CANY);
11732 RExC_seen |= REG_CANY_SEEN;
11733 *flagp |= HASWIDTH|SIMPLE;
11735 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11737 goto finish_meta_pat;
11739 ret = reg_node(pRExC_state, CLUMP);
11740 *flagp |= HASWIDTH;
11741 goto finish_meta_pat;
11747 arg = ANYOF_WORDCHAR;
11751 RExC_seen_zerolen++;
11752 RExC_seen |= REG_LOOKBEHIND_SEEN;
11753 op = BOUND + get_regex_charset(RExC_flags);
11754 if (op > BOUNDA) { /* /aa is same as /a */
11757 else if (op == BOUNDL) {
11758 RExC_contains_locale = 1;
11760 ret = reg_node(pRExC_state, op);
11761 FLAGS(ret) = get_regex_charset(RExC_flags);
11763 if ((U8) *(RExC_parse + 1) == '{') {
11764 /* diag_listed_as: Use "%s" instead of "%s" */
11765 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11767 goto finish_meta_pat;
11769 RExC_seen_zerolen++;
11770 RExC_seen |= REG_LOOKBEHIND_SEEN;
11771 op = NBOUND + get_regex_charset(RExC_flags);
11772 if (op > NBOUNDA) { /* /aa is same as /a */
11775 else if (op == NBOUNDL) {
11776 RExC_contains_locale = 1;
11778 ret = reg_node(pRExC_state, op);
11779 FLAGS(ret) = get_regex_charset(RExC_flags);
11781 if ((U8) *(RExC_parse + 1) == '{') {
11782 /* diag_listed_as: Use "%s" instead of "%s" */
11783 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11785 goto finish_meta_pat;
11795 ret = reg_node(pRExC_state, LNBREAK);
11796 *flagp |= HASWIDTH|SIMPLE;
11797 goto finish_meta_pat;
11805 goto join_posix_op_known;
11811 arg = ANYOF_VERTWS;
11813 goto join_posix_op_known;
11823 op = POSIXD + get_regex_charset(RExC_flags);
11824 if (op > POSIXA) { /* /aa is same as /a */
11827 else if (op == POSIXL) {
11828 RExC_contains_locale = 1;
11831 join_posix_op_known:
11834 op += NPOSIXD - POSIXD;
11837 ret = reg_node(pRExC_state, op);
11839 FLAGS(ret) = namedclass_to_classnum(arg);
11842 *flagp |= HASWIDTH|SIMPLE;
11846 nextchar(pRExC_state);
11847 Set_Node_Length(ret, 2); /* MJD */
11853 char* parse_start = RExC_parse - 2;
11858 ret = regclass(pRExC_state, flagp,depth+1,
11859 TRUE, /* means just parse this element */
11860 FALSE, /* don't allow multi-char folds */
11861 FALSE, /* don't silence non-portable warnings.
11862 It would be a bug if these returned
11865 /* regclass() can only return RESTART_UTF8 if multi-char folds
11868 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11873 Set_Node_Offset(ret, parse_start + 2);
11874 Set_Node_Cur_Length(ret, parse_start);
11875 nextchar(pRExC_state);
11879 /* Handle \N and \N{NAME} with multiple code points here and not
11880 * below because it can be multicharacter. join_exact() will join
11881 * them up later on. Also this makes sure that things like
11882 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11883 * The options to the grok function call causes it to fail if the
11884 * sequence is just a single code point. We then go treat it as
11885 * just another character in the current EXACT node, and hence it
11886 * gets uniform treatment with all the other characters. The
11887 * special treatment for quantifiers is not needed for such single
11888 * character sequences */
11890 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11893 if (*flagp & RESTART_UTF8)
11899 case 'k': /* Handle \k<NAME> and \k'NAME' */
11902 char ch= RExC_parse[1];
11903 if (ch != '<' && ch != '\'' && ch != '{') {
11905 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11906 vFAIL2("Sequence %.2s... not terminated",parse_start);
11908 /* this pretty much dupes the code for (?P=...) in reg(), if
11909 you change this make sure you change that */
11910 char* name_start = (RExC_parse += 2);
11912 SV *sv_dat = reg_scan_name(pRExC_state,
11913 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11914 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11915 if (RExC_parse == name_start || *RExC_parse != ch)
11916 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11917 vFAIL2("Sequence %.3s... not terminated",parse_start);
11920 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11921 RExC_rxi->data->data[num]=(void*)sv_dat;
11922 SvREFCNT_inc_simple_void(sv_dat);
11926 ret = reganode(pRExC_state,
11929 : (ASCII_FOLD_RESTRICTED)
11931 : (AT_LEAST_UNI_SEMANTICS)
11937 *flagp |= HASWIDTH;
11939 /* override incorrect value set in reganode MJD */
11940 Set_Node_Offset(ret, parse_start+1);
11941 Set_Node_Cur_Length(ret, parse_start);
11942 nextchar(pRExC_state);
11948 case '1': case '2': case '3': case '4':
11949 case '5': case '6': case '7': case '8': case '9':
11954 if (*RExC_parse == 'g') {
11958 if (*RExC_parse == '{') {
11962 if (*RExC_parse == '-') {
11966 if (hasbrace && !isDIGIT(*RExC_parse)) {
11967 if (isrel) RExC_parse--;
11969 goto parse_named_seq;
11972 num = S_backref_value(RExC_parse);
11974 vFAIL("Reference to invalid group 0");
11975 else if (num == I32_MAX) {
11976 if (isDIGIT(*RExC_parse))
11977 vFAIL("Reference to nonexistent group");
11979 vFAIL("Unterminated \\g... pattern");
11983 num = RExC_npar - num;
11985 vFAIL("Reference to nonexistent or unclosed group");
11989 num = S_backref_value(RExC_parse);
11990 /* bare \NNN might be backref or octal - if it is larger than or equal
11991 * RExC_npar then it is assumed to be and octal escape.
11992 * Note RExC_npar is +1 from the actual number of parens*/
11993 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11994 && *RExC_parse != '8' && *RExC_parse != '9'))
11996 /* Probably a character specified in octal, e.g. \35 */
12001 /* at this point RExC_parse definitely points to a backref
12004 #ifdef RE_TRACK_PATTERN_OFFSETS
12005 char * const parse_start = RExC_parse - 1; /* MJD */
12007 while (isDIGIT(*RExC_parse))
12010 if (*RExC_parse != '}')
12011 vFAIL("Unterminated \\g{...} pattern");
12015 if (num > (I32)RExC_rx->nparens)
12016 vFAIL("Reference to nonexistent group");
12019 ret = reganode(pRExC_state,
12022 : (ASCII_FOLD_RESTRICTED)
12024 : (AT_LEAST_UNI_SEMANTICS)
12030 *flagp |= HASWIDTH;
12032 /* override incorrect value set in reganode MJD */
12033 Set_Node_Offset(ret, parse_start+1);
12034 Set_Node_Cur_Length(ret, parse_start);
12036 nextchar(pRExC_state);
12041 if (RExC_parse >= RExC_end)
12042 FAIL("Trailing \\");
12045 /* Do not generate "unrecognized" warnings here, we fall
12046 back into the quick-grab loop below */
12053 if (RExC_flags & RXf_PMf_EXTENDED) {
12054 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12055 if (RExC_parse < RExC_end)
12062 parse_start = RExC_parse - 1;
12071 #define MAX_NODE_STRING_SIZE 127
12072 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12074 U8 upper_parse = MAX_NODE_STRING_SIZE;
12075 U8 node_type = compute_EXACTish(pRExC_state);
12076 bool next_is_quantifier;
12077 char * oldp = NULL;
12079 /* We can convert EXACTF nodes to EXACTFU if they contain only
12080 * characters that match identically regardless of the target
12081 * string's UTF8ness. The reason to do this is that EXACTF is not
12082 * trie-able, EXACTFU is.
12084 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12085 * contain only above-Latin1 characters (hence must be in UTF8),
12086 * which don't participate in folds with Latin1-range characters,
12087 * as the latter's folds aren't known until runtime. (We don't
12088 * need to figure this out until pass 2) */
12089 bool maybe_exactfu = PASS2
12090 && (node_type == EXACTF || node_type == EXACTFL);
12092 /* If a folding node contains only code points that don't
12093 * participate in folds, it can be changed into an EXACT node,
12094 * which allows the optimizer more things to look for */
12097 ret = reg_node(pRExC_state, node_type);
12099 /* In pass1, folded, we use a temporary buffer instead of the
12100 * actual node, as the node doesn't exist yet */
12101 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12107 /* We do the EXACTFish to EXACT node only if folding. (And we
12108 * don't need to figure this out until pass 2) */
12109 maybe_exact = FOLD && PASS2;
12111 /* XXX The node can hold up to 255 bytes, yet this only goes to
12112 * 127. I (khw) do not know why. Keeping it somewhat less than
12113 * 255 allows us to not have to worry about overflow due to
12114 * converting to utf8 and fold expansion, but that value is
12115 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12116 * split up by this limit into a single one using the real max of
12117 * 255. Even at 127, this breaks under rare circumstances. If
12118 * folding, we do not want to split a node at a character that is a
12119 * non-final in a multi-char fold, as an input string could just
12120 * happen to want to match across the node boundary. The join
12121 * would solve that problem if the join actually happens. But a
12122 * series of more than two nodes in a row each of 127 would cause
12123 * the first join to succeed to get to 254, but then there wouldn't
12124 * be room for the next one, which could at be one of those split
12125 * multi-char folds. I don't know of any fool-proof solution. One
12126 * could back off to end with only a code point that isn't such a
12127 * non-final, but it is possible for there not to be any in the
12129 for (p = RExC_parse - 1;
12130 len < upper_parse && p < RExC_end;
12135 if (RExC_flags & RXf_PMf_EXTENDED)
12136 p = regpatws(pRExC_state, p,
12137 TRUE); /* means recognize comments */
12148 /* Literal Escapes Switch
12150 This switch is meant to handle escape sequences that
12151 resolve to a literal character.
12153 Every escape sequence that represents something
12154 else, like an assertion or a char class, is handled
12155 in the switch marked 'Special Escapes' above in this
12156 routine, but also has an entry here as anything that
12157 isn't explicitly mentioned here will be treated as
12158 an unescaped equivalent literal.
12161 switch ((U8)*++p) {
12162 /* These are all the special escapes. */
12163 case 'A': /* Start assertion */
12164 case 'b': case 'B': /* Word-boundary assertion*/
12165 case 'C': /* Single char !DANGEROUS! */
12166 case 'd': case 'D': /* digit class */
12167 case 'g': case 'G': /* generic-backref, pos assertion */
12168 case 'h': case 'H': /* HORIZWS */
12169 case 'k': case 'K': /* named backref, keep marker */
12170 case 'p': case 'P': /* Unicode property */
12171 case 'R': /* LNBREAK */
12172 case 's': case 'S': /* space class */
12173 case 'v': case 'V': /* VERTWS */
12174 case 'w': case 'W': /* word class */
12175 case 'X': /* eXtended Unicode "combining
12176 character sequence" */
12177 case 'z': case 'Z': /* End of line/string assertion */
12181 /* Anything after here is an escape that resolves to a
12182 literal. (Except digits, which may or may not)
12188 case 'N': /* Handle a single-code point named character. */
12189 /* The options cause it to fail if a multiple code
12190 * point sequence. Handle those in the switch() above
12192 RExC_parse = p + 1;
12193 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12199 if (*flagp & RESTART_UTF8)
12200 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12201 RExC_parse = p = oldp;
12205 if (ender > 0xff) {
12222 ender = ESC_NATIVE;
12232 const char* error_msg;
12234 bool valid = grok_bslash_o(&p,
12237 PASS2, /* out warnings */
12238 FALSE, /* not strict */
12239 TRUE, /* Output warnings
12244 RExC_parse = p; /* going to die anyway; point
12245 to exact spot of failure */
12249 if (IN_ENCODING && ender < 0x100) {
12250 goto recode_encoding;
12252 if (ender > 0xff) {
12259 UV result = UV_MAX; /* initialize to erroneous
12261 const char* error_msg;
12263 bool valid = grok_bslash_x(&p,
12266 PASS2, /* out warnings */
12267 FALSE, /* not strict */
12268 TRUE, /* Output warnings
12273 RExC_parse = p; /* going to die anyway; point
12274 to exact spot of failure */
12279 if (IN_ENCODING && ender < 0x100) {
12280 goto recode_encoding;
12282 if (ender > 0xff) {
12289 ender = grok_bslash_c(*p++, PASS2);
12291 case '8': case '9': /* must be a backreference */
12294 case '1': case '2': case '3':case '4':
12295 case '5': case '6': case '7':
12296 /* When we parse backslash escapes there is ambiguity
12297 * between backreferences and octal escapes. Any escape
12298 * from \1 - \9 is a backreference, any multi-digit
12299 * escape which does not start with 0 and which when
12300 * evaluated as decimal could refer to an already
12301 * parsed capture buffer is a backslash. Anything else
12304 * Note this implies that \118 could be interpreted as
12305 * 118 OR as "\11" . "8" depending on whether there
12306 * were 118 capture buffers defined already in the
12309 /* NOTE, RExC_npar is 1 more than the actual number of
12310 * parens we have seen so far, hence the < RExC_npar below. */
12312 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12313 { /* Not to be treated as an octal constant, go
12321 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12323 ender = grok_oct(p, &numlen, &flags, NULL);
12324 if (ender > 0xff) {
12328 if (PASS2 /* like \08, \178 */
12331 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12333 reg_warn_non_literal_string(
12335 form_short_octal_warning(p, numlen));
12338 if (IN_ENCODING && ender < 0x100)
12339 goto recode_encoding;
12342 if (! RExC_override_recoding) {
12343 SV* enc = _get_encoding();
12344 ender = reg_recode((const char)(U8)ender, &enc);
12346 ckWARNreg(p, "Invalid escape in the specified encoding");
12352 FAIL("Trailing \\");
12355 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12356 /* Include any { following the alpha to emphasize
12357 * that it could be part of an escape at some point
12359 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12360 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12362 goto normal_default;
12363 } /* End of switch on '\' */
12366 /* Currently we don't warn when the lbrace is at the start
12367 * of a construct. This catches it in the middle of a
12368 * literal string, or when its the first thing after
12369 * something like "\b" */
12371 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12373 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12376 default: /* A literal character */
12378 if (UTF8_IS_START(*p) && UTF) {
12380 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12381 &numlen, UTF8_ALLOW_DEFAULT);
12387 } /* End of switch on the literal */
12389 /* Here, have looked at the literal character and <ender>
12390 * contains its ordinal, <p> points to the character after it
12393 if ( RExC_flags & RXf_PMf_EXTENDED)
12394 p = regpatws(pRExC_state, p,
12395 TRUE); /* means recognize comments */
12397 /* If the next thing is a quantifier, it applies to this
12398 * character only, which means that this character has to be in
12399 * its own node and can't just be appended to the string in an
12400 * existing node, so if there are already other characters in
12401 * the node, close the node with just them, and set up to do
12402 * this character again next time through, when it will be the
12403 * only thing in its new node */
12404 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12410 if (! FOLD /* The simple case, just append the literal */
12411 || (LOC /* Also don't fold for tricky chars under /l */
12412 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12415 const STRLEN unilen = reguni(pRExC_state, ender, s);
12421 /* The loop increments <len> each time, as all but this
12422 * path (and one other) through it add a single byte to
12423 * the EXACTish node. But this one has changed len to
12424 * be the correct final value, so subtract one to
12425 * cancel out the increment that follows */
12429 REGC((char)ender, s++);
12432 /* Can get here if folding only if is one of the /l
12433 * characters whose fold depends on the locale. The
12434 * occurrence of any of these indicate that we can't
12435 * simplify things */
12437 maybe_exact = FALSE;
12438 maybe_exactfu = FALSE;
12443 /* See comments for join_exact() as to why we fold this
12444 * non-UTF at compile time */
12445 || (node_type == EXACTFU
12446 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12448 /* Here, are folding and are not UTF-8 encoded; therefore
12449 * the character must be in the range 0-255, and is not /l
12450 * (Not /l because we already handled these under /l in
12451 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12452 if (IS_IN_SOME_FOLD_L1(ender)) {
12453 maybe_exact = FALSE;
12455 /* See if the character's fold differs between /d and
12456 * /u. This includes the multi-char fold SHARP S to
12459 && (PL_fold[ender] != PL_fold_latin1[ender]
12460 || ender == LATIN_SMALL_LETTER_SHARP_S
12462 && isALPHA_FOLD_EQ(ender, 's')
12463 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12465 maybe_exactfu = FALSE;
12469 /* Even when folding, we store just the input character, as
12470 * we have an array that finds its fold quickly */
12471 *(s++) = (char) ender;
12473 else { /* FOLD and UTF */
12474 /* Unlike the non-fold case, we do actually have to
12475 * calculate the results here in pass 1. This is for two
12476 * reasons, the folded length may be longer than the
12477 * unfolded, and we have to calculate how many EXACTish
12478 * nodes it will take; and we may run out of room in a node
12479 * in the middle of a potential multi-char fold, and have
12480 * to back off accordingly. (Hence we can't use REGC for
12481 * the simple case just below.) */
12484 if (isASCII_uni(ender)) {
12485 folded = toFOLD(ender);
12486 *(s)++ = (U8) folded;
12491 folded = _to_uni_fold_flags(
12495 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12496 ? FOLD_FLAGS_NOMIX_ASCII
12500 /* The loop increments <len> each time, as all but this
12501 * path (and one other) through it add a single byte to
12502 * the EXACTish node. But this one has changed len to
12503 * be the correct final value, so subtract one to
12504 * cancel out the increment that follows */
12505 len += foldlen - 1;
12507 /* If this node only contains non-folding code points so
12508 * far, see if this new one is also non-folding */
12510 if (folded != ender) {
12511 maybe_exact = FALSE;
12514 /* Here the fold is the original; we have to check
12515 * further to see if anything folds to it */
12516 if (_invlist_contains_cp(PL_utf8_foldable,
12519 maybe_exact = FALSE;
12526 if (next_is_quantifier) {
12528 /* Here, the next input is a quantifier, and to get here,
12529 * the current character is the only one in the node.
12530 * Also, here <len> doesn't include the final byte for this
12536 } /* End of loop through literal characters */
12538 /* Here we have either exhausted the input or ran out of room in
12539 * the node. (If we encountered a character that can't be in the
12540 * node, transfer is made directly to <loopdone>, and so we
12541 * wouldn't have fallen off the end of the loop.) In the latter
12542 * case, we artificially have to split the node into two, because
12543 * we just don't have enough space to hold everything. This
12544 * creates a problem if the final character participates in a
12545 * multi-character fold in the non-final position, as a match that
12546 * should have occurred won't, due to the way nodes are matched,
12547 * and our artificial boundary. So back off until we find a non-
12548 * problematic character -- one that isn't at the beginning or
12549 * middle of such a fold. (Either it doesn't participate in any
12550 * folds, or appears only in the final position of all the folds it
12551 * does participate in.) A better solution with far fewer false
12552 * positives, and that would fill the nodes more completely, would
12553 * be to actually have available all the multi-character folds to
12554 * test against, and to back-off only far enough to be sure that
12555 * this node isn't ending with a partial one. <upper_parse> is set
12556 * further below (if we need to reparse the node) to include just
12557 * up through that final non-problematic character that this code
12558 * identifies, so when it is set to less than the full node, we can
12559 * skip the rest of this */
12560 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12562 const STRLEN full_len = len;
12564 assert(len >= MAX_NODE_STRING_SIZE);
12566 /* Here, <s> points to the final byte of the final character.
12567 * Look backwards through the string until find a non-
12568 * problematic character */
12572 /* This has no multi-char folds to non-UTF characters */
12573 if (ASCII_FOLD_RESTRICTED) {
12577 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12581 if (! PL_NonL1NonFinalFold) {
12582 PL_NonL1NonFinalFold = _new_invlist_C_array(
12583 NonL1_Perl_Non_Final_Folds_invlist);
12586 /* Point to the first byte of the final character */
12587 s = (char *) utf8_hop((U8 *) s, -1);
12589 while (s >= s0) { /* Search backwards until find
12590 non-problematic char */
12591 if (UTF8_IS_INVARIANT(*s)) {
12593 /* There are no ascii characters that participate
12594 * in multi-char folds under /aa. In EBCDIC, the
12595 * non-ascii invariants are all control characters,
12596 * so don't ever participate in any folds. */
12597 if (ASCII_FOLD_RESTRICTED
12598 || ! IS_NON_FINAL_FOLD(*s))
12603 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12604 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12610 else if (! _invlist_contains_cp(
12611 PL_NonL1NonFinalFold,
12612 valid_utf8_to_uvchr((U8 *) s, NULL)))
12617 /* Here, the current character is problematic in that
12618 * it does occur in the non-final position of some
12619 * fold, so try the character before it, but have to
12620 * special case the very first byte in the string, so
12621 * we don't read outside the string */
12622 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12623 } /* End of loop backwards through the string */
12625 /* If there were only problematic characters in the string,
12626 * <s> will point to before s0, in which case the length
12627 * should be 0, otherwise include the length of the
12628 * non-problematic character just found */
12629 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12632 /* Here, have found the final character, if any, that is
12633 * non-problematic as far as ending the node without splitting
12634 * it across a potential multi-char fold. <len> contains the
12635 * number of bytes in the node up-to and including that
12636 * character, or is 0 if there is no such character, meaning
12637 * the whole node contains only problematic characters. In
12638 * this case, give up and just take the node as-is. We can't
12643 /* If the node ends in an 's' we make sure it stays EXACTF,
12644 * as if it turns into an EXACTFU, it could later get
12645 * joined with another 's' that would then wrongly match
12647 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12649 maybe_exactfu = FALSE;
12653 /* Here, the node does contain some characters that aren't
12654 * problematic. If one such is the final character in the
12655 * node, we are done */
12656 if (len == full_len) {
12659 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12661 /* If the final character is problematic, but the
12662 * penultimate is not, back-off that last character to
12663 * later start a new node with it */
12668 /* Here, the final non-problematic character is earlier
12669 * in the input than the penultimate character. What we do
12670 * is reparse from the beginning, going up only as far as
12671 * this final ok one, thus guaranteeing that the node ends
12672 * in an acceptable character. The reason we reparse is
12673 * that we know how far in the character is, but we don't
12674 * know how to correlate its position with the input parse.
12675 * An alternate implementation would be to build that
12676 * correlation as we go along during the original parse,
12677 * but that would entail extra work for every node, whereas
12678 * this code gets executed only when the string is too
12679 * large for the node, and the final two characters are
12680 * problematic, an infrequent occurrence. Yet another
12681 * possible strategy would be to save the tail of the
12682 * string, and the next time regatom is called, initialize
12683 * with that. The problem with this is that unless you
12684 * back off one more character, you won't be guaranteed
12685 * regatom will get called again, unless regbranch,
12686 * regpiece ... are also changed. If you do back off that
12687 * extra character, so that there is input guaranteed to
12688 * force calling regatom, you can't handle the case where
12689 * just the first character in the node is acceptable. I
12690 * (khw) decided to try this method which doesn't have that
12691 * pitfall; if performance issues are found, we can do a
12692 * combination of the current approach plus that one */
12698 } /* End of verifying node ends with an appropriate char */
12700 loopdone: /* Jumped to when encounters something that shouldn't be in
12703 /* I (khw) don't know if you can get here with zero length, but the
12704 * old code handled this situation by creating a zero-length EXACT
12705 * node. Might as well be NOTHING instead */
12711 /* If 'maybe_exact' is still set here, means there are no
12712 * code points in the node that participate in folds;
12713 * similarly for 'maybe_exactfu' and code points that match
12714 * differently depending on UTF8ness of the target string
12715 * (for /u), or depending on locale for /l */
12719 else if (maybe_exactfu) {
12723 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12724 FALSE /* Don't look to see if could
12725 be turned into an EXACT
12726 node, as we have already
12731 RExC_parse = p - 1;
12732 Set_Node_Cur_Length(ret, parse_start);
12733 nextchar(pRExC_state);
12735 /* len is STRLEN which is unsigned, need to copy to signed */
12738 vFAIL("Internal disaster");
12741 } /* End of label 'defchar:' */
12743 } /* End of giant switch on input character */
12749 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12751 /* Returns the next non-pattern-white space, non-comment character (the
12752 * latter only if 'recognize_comment is true) in the string p, which is
12753 * ended by RExC_end. See also reg_skipcomment */
12754 const char *e = RExC_end;
12756 PERL_ARGS_ASSERT_REGPATWS;
12760 if ((len = is_PATWS_safe(p, e, UTF))) {
12763 else if (recognize_comment && *p == '#') {
12764 p = reg_skipcomment(pRExC_state, p);
12773 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12775 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12776 * sets up the bitmap and any flags, removing those code points from the
12777 * inversion list, setting it to NULL should it become completely empty */
12779 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12780 assert(PL_regkind[OP(node)] == ANYOF);
12782 ANYOF_BITMAP_ZERO(node);
12783 if (*invlist_ptr) {
12785 /* This gets set if we actually need to modify things */
12786 bool change_invlist = FALSE;
12790 /* Start looking through *invlist_ptr */
12791 invlist_iterinit(*invlist_ptr);
12792 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12796 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12797 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12799 else if (end >= NUM_ANYOF_CODE_POINTS) {
12800 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12803 /* Quit if are above what we should change */
12804 if (start >= NUM_ANYOF_CODE_POINTS) {
12808 change_invlist = TRUE;
12810 /* Set all the bits in the range, up to the max that we are doing */
12811 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12813 : NUM_ANYOF_CODE_POINTS - 1;
12814 for (i = start; i <= (int) high; i++) {
12815 if (! ANYOF_BITMAP_TEST(node, i)) {
12816 ANYOF_BITMAP_SET(node, i);
12820 invlist_iterfinish(*invlist_ptr);
12822 /* Done with loop; remove any code points that are in the bitmap from
12823 * *invlist_ptr; similarly for code points above the bitmap if we have
12824 * a flag to match all of them anyways */
12825 if (change_invlist) {
12826 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12828 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12829 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12832 /* If have completely emptied it, remove it completely */
12833 if (_invlist_len(*invlist_ptr) == 0) {
12834 SvREFCNT_dec_NN(*invlist_ptr);
12835 *invlist_ptr = NULL;
12840 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12841 Character classes ([:foo:]) can also be negated ([:^foo:]).
12842 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12843 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12844 but trigger failures because they are currently unimplemented. */
12846 #define POSIXCC_DONE(c) ((c) == ':')
12847 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12848 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12850 PERL_STATIC_INLINE I32
12851 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12853 I32 namedclass = OOB_NAMEDCLASS;
12855 PERL_ARGS_ASSERT_REGPPOSIXCC;
12857 if (value == '[' && RExC_parse + 1 < RExC_end &&
12858 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12859 POSIXCC(UCHARAT(RExC_parse)))
12861 const char c = UCHARAT(RExC_parse);
12862 char* const s = RExC_parse++;
12864 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12866 if (RExC_parse == RExC_end) {
12869 /* Try to give a better location for the error (than the end of
12870 * the string) by looking for the matching ']' */
12872 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12875 vFAIL2("Unmatched '%c' in POSIX class", c);
12877 /* Grandfather lone [:, [=, [. */
12881 const char* const t = RExC_parse++; /* skip over the c */
12884 if (UCHARAT(RExC_parse) == ']') {
12885 const char *posixcc = s + 1;
12886 RExC_parse++; /* skip over the ending ] */
12889 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12890 const I32 skip = t - posixcc;
12892 /* Initially switch on the length of the name. */
12895 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12896 this is the Perl \w
12898 namedclass = ANYOF_WORDCHAR;
12901 /* Names all of length 5. */
12902 /* alnum alpha ascii blank cntrl digit graph lower
12903 print punct space upper */
12904 /* Offset 4 gives the best switch position. */
12905 switch (posixcc[4]) {
12907 if (memEQ(posixcc, "alph", 4)) /* alpha */
12908 namedclass = ANYOF_ALPHA;
12911 if (memEQ(posixcc, "spac", 4)) /* space */
12912 namedclass = ANYOF_PSXSPC;
12915 if (memEQ(posixcc, "grap", 4)) /* graph */
12916 namedclass = ANYOF_GRAPH;
12919 if (memEQ(posixcc, "asci", 4)) /* ascii */
12920 namedclass = ANYOF_ASCII;
12923 if (memEQ(posixcc, "blan", 4)) /* blank */
12924 namedclass = ANYOF_BLANK;
12927 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12928 namedclass = ANYOF_CNTRL;
12931 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12932 namedclass = ANYOF_ALPHANUMERIC;
12935 if (memEQ(posixcc, "lowe", 4)) /* lower */
12936 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12937 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12938 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12941 if (memEQ(posixcc, "digi", 4)) /* digit */
12942 namedclass = ANYOF_DIGIT;
12943 else if (memEQ(posixcc, "prin", 4)) /* print */
12944 namedclass = ANYOF_PRINT;
12945 else if (memEQ(posixcc, "punc", 4)) /* punct */
12946 namedclass = ANYOF_PUNCT;
12951 if (memEQ(posixcc, "xdigit", 6))
12952 namedclass = ANYOF_XDIGIT;
12956 if (namedclass == OOB_NAMEDCLASS)
12958 "POSIX class [:%"UTF8f":] unknown",
12959 UTF8fARG(UTF, t - s - 1, s + 1));
12961 /* The #defines are structured so each complement is +1 to
12962 * the normal one */
12966 assert (posixcc[skip] == ':');
12967 assert (posixcc[skip+1] == ']');
12968 } else if (!SIZE_ONLY) {
12969 /* [[=foo=]] and [[.foo.]] are still future. */
12971 /* adjust RExC_parse so the warning shows after
12972 the class closes */
12973 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12975 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12978 /* Maternal grandfather:
12979 * "[:" ending in ":" but not in ":]" */
12981 vFAIL("Unmatched '[' in POSIX class");
12984 /* Grandfather lone [:, [=, [. */
12994 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12996 /* This applies some heuristics at the current parse position (which should
12997 * be at a '[') to see if what follows might be intended to be a [:posix:]
12998 * class. It returns true if it really is a posix class, of course, but it
12999 * also can return true if it thinks that what was intended was a posix
13000 * class that didn't quite make it.
13002 * It will return true for
13004 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13005 * ')' indicating the end of the (?[
13006 * [:any garbage including %^&$ punctuation:]
13008 * This is designed to be called only from S_handle_regex_sets; it could be
13009 * easily adapted to be called from the spot at the beginning of regclass()
13010 * that checks to see in a normal bracketed class if the surrounding []
13011 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13012 * change long-standing behavior, so I (khw) didn't do that */
13013 char* p = RExC_parse + 1;
13014 char first_char = *p;
13016 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13018 assert(*(p - 1) == '[');
13020 if (! POSIXCC(first_char)) {
13025 while (p < RExC_end && isWORDCHAR(*p)) p++;
13027 if (p >= RExC_end) {
13031 if (p - RExC_parse > 2 /* Got at least 1 word character */
13032 && (*p == first_char
13033 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13038 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13041 && p - RExC_parse > 2 /* [:] evaluates to colon;
13042 [::] is a bad posix class. */
13043 && first_char == *(p - 1));
13047 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13048 I32 *flagp, U32 depth,
13049 char * const oregcomp_parse)
13051 /* Handle the (?[...]) construct to do set operations */
13054 UV start, end; /* End points of code point ranges */
13056 char *save_end, *save_parse;
13061 const bool save_fold = FOLD;
13063 GET_RE_DEBUG_FLAGS_DECL;
13065 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13068 vFAIL("(?[...]) not valid in locale");
13070 RExC_uni_semantics = 1;
13072 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13073 * (such as EXACT). Thus we can skip most everything if just sizing. We
13074 * call regclass to handle '[]' so as to not have to reinvent its parsing
13075 * rules here (throwing away the size it computes each time). And, we exit
13076 * upon an unescaped ']' that isn't one ending a regclass. To do both
13077 * these things, we need to realize that something preceded by a backslash
13078 * is escaped, so we have to keep track of backslashes */
13080 Perl_ck_warner_d(aTHX_
13081 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13082 "The regex_sets feature is experimental" REPORT_LOCATION,
13083 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13085 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13086 RExC_precomp + (RExC_parse - RExC_precomp)));
13089 UV depth = 0; /* how many nested (?[...]) constructs */
13091 while (RExC_parse < RExC_end) {
13092 SV* current = NULL;
13093 RExC_parse = regpatws(pRExC_state, RExC_parse,
13094 TRUE); /* means recognize comments */
13095 switch (*RExC_parse) {
13097 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13102 /* Skip the next byte (which could cause us to end up in
13103 * the middle of a UTF-8 character, but since none of those
13104 * are confusable with anything we currently handle in this
13105 * switch (invariants all), it's safe. We'll just hit the
13106 * default: case next time and keep on incrementing until
13107 * we find one of the invariants we do handle. */
13112 /* If this looks like it is a [:posix:] class, leave the
13113 * parse pointer at the '[' to fool regclass() into
13114 * thinking it is part of a '[[:posix:]]'. That function
13115 * will use strict checking to force a syntax error if it
13116 * doesn't work out to a legitimate class */
13117 bool is_posix_class
13118 = could_it_be_a_POSIX_class(pRExC_state);
13119 if (! is_posix_class) {
13123 /* regclass() can only return RESTART_UTF8 if multi-char
13124 folds are allowed. */
13125 if (!regclass(pRExC_state, flagp,depth+1,
13126 is_posix_class, /* parse the whole char
13127 class only if not a
13129 FALSE, /* don't allow multi-char folds */
13130 TRUE, /* silence non-portable warnings. */
13132 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13135 /* function call leaves parse pointing to the ']', except
13136 * if we faked it */
13137 if (is_posix_class) {
13141 SvREFCNT_dec(current); /* In case it returned something */
13146 if (depth--) break;
13148 if (RExC_parse < RExC_end
13149 && *RExC_parse == ')')
13151 node = reganode(pRExC_state, ANYOF, 0);
13152 RExC_size += ANYOF_SKIP;
13153 nextchar(pRExC_state);
13154 Set_Node_Length(node,
13155 RExC_parse - oregcomp_parse + 1); /* MJD */
13164 FAIL("Syntax error in (?[...])");
13167 /* Pass 2 only after this. Everything in this construct is a
13168 * metacharacter. Operands begin with either a '\' (for an escape
13169 * sequence), or a '[' for a bracketed character class. Any other
13170 * character should be an operator, or parenthesis for grouping. Both
13171 * types of operands are handled by calling regclass() to parse them. It
13172 * is called with a parameter to indicate to return the computed inversion
13173 * list. The parsing here is implemented via a stack. Each entry on the
13174 * stack is a single character representing one of the operators, or the
13175 * '('; or else a pointer to an operand inversion list. */
13177 #define IS_OPERAND(a) (! SvIOK(a))
13179 /* The stack starts empty. It is a syntax error if the first thing parsed
13180 * is a binary operator; everything else is pushed on the stack. When an
13181 * operand is parsed, the top of the stack is examined. If it is a binary
13182 * operator, the item before it should be an operand, and both are replaced
13183 * by the result of doing that operation on the new operand and the one on
13184 * the stack. Thus a sequence of binary operands is reduced to a single
13185 * one before the next one is parsed.
13187 * A unary operator may immediately follow a binary in the input, for
13190 * When an operand is parsed and the top of the stack is a unary operator,
13191 * the operation is performed, and then the stack is rechecked to see if
13192 * this new operand is part of a binary operation; if so, it is handled as
13195 * A '(' is simply pushed on the stack; it is valid only if the stack is
13196 * empty, or the top element of the stack is an operator or another '('
13197 * (for which the parenthesized expression will become an operand). By the
13198 * time the corresponding ')' is parsed everything in between should have
13199 * been parsed and evaluated to a single operand (or else is a syntax
13200 * error), and is handled as a regular operand */
13202 sv_2mortal((SV *)(stack = newAV()));
13204 while (RExC_parse < RExC_end) {
13205 I32 top_index = av_tindex(stack);
13207 SV* current = NULL;
13209 /* Skip white space */
13210 RExC_parse = regpatws(pRExC_state, RExC_parse,
13211 TRUE /* means recognize comments */ );
13212 if (RExC_parse >= RExC_end) {
13213 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13215 if ((curchar = UCHARAT(RExC_parse)) == ']') {
13222 if (av_tindex(stack) >= 0 /* This makes sure that we can
13223 safely subtract 1 from
13224 RExC_parse in the next clause.
13225 If we have something on the
13226 stack, we have parsed something
13228 && UCHARAT(RExC_parse - 1) == '('
13229 && RExC_parse < RExC_end)
13231 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13232 * This happens when we have some thing like
13234 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13236 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13238 * Here we would be handling the interpolated
13239 * '$thai_or_lao'. We handle this by a recursive call to
13240 * ourselves which returns the inversion list the
13241 * interpolated expression evaluates to. We use the flags
13242 * from the interpolated pattern. */
13243 U32 save_flags = RExC_flags;
13244 const char * const save_parse = ++RExC_parse;
13246 parse_lparen_question_flags(pRExC_state);
13248 if (RExC_parse == save_parse /* Makes sure there was at
13249 least one flag (or this
13250 embedding wasn't compiled)
13252 || RExC_parse >= RExC_end - 4
13253 || UCHARAT(RExC_parse) != ':'
13254 || UCHARAT(++RExC_parse) != '('
13255 || UCHARAT(++RExC_parse) != '?'
13256 || UCHARAT(++RExC_parse) != '[')
13259 /* In combination with the above, this moves the
13260 * pointer to the point just after the first erroneous
13261 * character (or if there are no flags, to where they
13262 * should have been) */
13263 if (RExC_parse >= RExC_end - 4) {
13264 RExC_parse = RExC_end;
13266 else if (RExC_parse != save_parse) {
13267 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13269 vFAIL("Expecting '(?flags:(?[...'");
13272 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13273 depth+1, oregcomp_parse);
13275 /* Here, 'current' contains the embedded expression's
13276 * inversion list, and RExC_parse points to the trailing
13277 * ']'; the next character should be the ')' which will be
13278 * paired with the '(' that has been put on the stack, so
13279 * the whole embedded expression reduces to '(operand)' */
13282 RExC_flags = save_flags;
13283 goto handle_operand;
13288 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13289 vFAIL("Unexpected character");
13292 /* regclass() can only return RESTART_UTF8 if multi-char
13293 folds are allowed. */
13294 if (!regclass(pRExC_state, flagp,depth+1,
13295 TRUE, /* means parse just the next thing */
13296 FALSE, /* don't allow multi-char folds */
13297 FALSE, /* don't silence non-portable warnings. */
13299 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13301 /* regclass() will return with parsing just the \ sequence,
13302 * leaving the parse pointer at the next thing to parse */
13304 goto handle_operand;
13306 case '[': /* Is a bracketed character class */
13308 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13310 if (! is_posix_class) {
13314 /* regclass() can only return RESTART_UTF8 if multi-char
13315 folds are allowed. */
13316 if(!regclass(pRExC_state, flagp,depth+1,
13317 is_posix_class, /* parse the whole char class
13318 only if not a posix class */
13319 FALSE, /* don't allow multi-char folds */
13320 FALSE, /* don't silence non-portable warnings. */
13322 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13324 /* function call leaves parse pointing to the ']', except if we
13326 if (is_posix_class) {
13330 goto handle_operand;
13339 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13340 || ! IS_OPERAND(*top_ptr))
13343 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13345 av_push(stack, newSVuv(curchar));
13349 av_push(stack, newSVuv(curchar));
13353 if (top_index >= 0) {
13354 top_ptr = av_fetch(stack, top_index, FALSE);
13356 if (IS_OPERAND(*top_ptr)) {
13358 vFAIL("Unexpected '(' with no preceding operator");
13361 av_push(stack, newSVuv(curchar));
13368 || ! (current = av_pop(stack))
13369 || ! IS_OPERAND(current)
13370 || ! (lparen = av_pop(stack))
13371 || IS_OPERAND(lparen)
13372 || SvUV(lparen) != '(')
13374 SvREFCNT_dec(current);
13376 vFAIL("Unexpected ')'");
13379 SvREFCNT_dec_NN(lparen);
13386 /* Here, we have an operand to process, in 'current' */
13388 if (top_index < 0) { /* Just push if stack is empty */
13389 av_push(stack, current);
13392 SV* top = av_pop(stack);
13394 char current_operator;
13396 if (IS_OPERAND(top)) {
13397 SvREFCNT_dec_NN(top);
13398 SvREFCNT_dec_NN(current);
13399 vFAIL("Operand with no preceding operator");
13401 current_operator = (char) SvUV(top);
13402 switch (current_operator) {
13403 case '(': /* Push the '(' back on followed by the new
13405 av_push(stack, top);
13406 av_push(stack, current);
13407 SvREFCNT_inc(top); /* Counters the '_dec' done
13408 just after the 'break', so
13409 it doesn't get wrongly freed
13414 _invlist_invert(current);
13416 /* Unlike binary operators, the top of the stack,
13417 * now that this unary one has been popped off, may
13418 * legally be an operator, and we now have operand
13421 SvREFCNT_dec_NN(top);
13422 goto handle_operand;
13425 prev = av_pop(stack);
13426 _invlist_intersection(prev,
13429 av_push(stack, current);
13434 prev = av_pop(stack);
13435 _invlist_union(prev, current, ¤t);
13436 av_push(stack, current);
13440 prev = av_pop(stack);;
13441 _invlist_subtract(prev, current, ¤t);
13442 av_push(stack, current);
13445 case '^': /* The union minus the intersection */
13451 prev = av_pop(stack);
13452 _invlist_union(prev, current, &u);
13453 _invlist_intersection(prev, current, &i);
13454 /* _invlist_subtract will overwrite current
13455 without freeing what it already contains */
13457 _invlist_subtract(u, i, ¤t);
13458 av_push(stack, current);
13459 SvREFCNT_dec_NN(i);
13460 SvREFCNT_dec_NN(u);
13461 SvREFCNT_dec_NN(element);
13466 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13468 SvREFCNT_dec_NN(top);
13469 SvREFCNT_dec(prev);
13473 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13476 if (av_tindex(stack) < 0 /* Was empty */
13477 || ((final = av_pop(stack)) == NULL)
13478 || ! IS_OPERAND(final)
13479 || av_tindex(stack) >= 0) /* More left on stack */
13481 vFAIL("Incomplete expression within '(?[ ])'");
13484 /* Here, 'final' is the resultant inversion list from evaluating the
13485 * expression. Return it if so requested */
13486 if (return_invlist) {
13487 *return_invlist = final;
13491 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13492 * expecting a string of ranges and individual code points */
13493 invlist_iterinit(final);
13494 result_string = newSVpvs("");
13495 while (invlist_iternext(final, &start, &end)) {
13496 if (start == end) {
13497 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13500 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13505 save_parse = RExC_parse;
13506 RExC_parse = SvPV(result_string, len);
13507 save_end = RExC_end;
13508 RExC_end = RExC_parse + len;
13510 /* We turn off folding around the call, as the class we have constructed
13511 * already has all folding taken into consideration, and we don't want
13512 * regclass() to add to that */
13513 RExC_flags &= ~RXf_PMf_FOLD;
13514 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13516 node = regclass(pRExC_state, flagp,depth+1,
13517 FALSE, /* means parse the whole char class */
13518 FALSE, /* don't allow multi-char folds */
13519 TRUE, /* silence non-portable warnings. The above may very
13520 well have generated non-portable code points, but
13521 they're valid on this machine */
13524 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13527 RExC_flags |= RXf_PMf_FOLD;
13529 RExC_parse = save_parse + 1;
13530 RExC_end = save_end;
13531 SvREFCNT_dec_NN(final);
13532 SvREFCNT_dec_NN(result_string);
13534 nextchar(pRExC_state);
13535 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13541 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13543 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13544 * innocent-looking character class, like /[ks]/i won't have to go out to
13545 * disk to find the possible matches.
13547 * This should be called only for a Latin1-range code points, cp, which is
13548 * known to be involved in a simple fold with other code points above
13549 * Latin1. It would give false results if /aa has been specified.
13550 * Multi-char folds are outside the scope of this, and must be handled
13553 * XXX It would be better to generate these via regen, in case a new
13554 * version of the Unicode standard adds new mappings, though that is not
13555 * really likely, and may be caught by the default: case of the switch
13558 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13560 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13566 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13570 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13573 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13574 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13576 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13577 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13578 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13580 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13581 *invlist = add_cp_to_invlist(*invlist,
13582 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13584 case LATIN_SMALL_LETTER_SHARP_S:
13585 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13588 /* Use deprecated warning to increase the chances of this being
13591 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13598 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13600 /* This adds the string scalar <multi_string> to the array
13601 * <multi_char_matches>. <multi_string> is known to have exactly
13602 * <cp_count> code points in it. This is used when constructing a
13603 * bracketed character class and we find something that needs to match more
13604 * than a single character.
13606 * <multi_char_matches> is actually an array of arrays. Each top-level
13607 * element is an array that contains all the strings known so far that are
13608 * the same length. And that length (in number of code points) is the same
13609 * as the index of the top-level array. Hence, the [2] element is an
13610 * array, each element thereof is a string containing TWO code points;
13611 * while element [3] is for strings of THREE characters, and so on. Since
13612 * this is for multi-char strings there can never be a [0] nor [1] element.
13614 * When we rewrite the character class below, we will do so such that the
13615 * longest strings are written first, so that it prefers the longest
13616 * matching strings first. This is done even if it turns out that any
13617 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13618 * Christiansen has agreed that this is ok. This makes the test for the
13619 * ligature 'ffi' come before the test for 'ff', for example */
13622 AV** this_array_ptr;
13624 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13626 if (! multi_char_matches) {
13627 multi_char_matches = newAV();
13630 if (av_exists(multi_char_matches, cp_count)) {
13631 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13632 this_array = *this_array_ptr;
13635 this_array = newAV();
13636 av_store(multi_char_matches, cp_count,
13639 av_push(this_array, multi_string);
13641 return multi_char_matches;
13644 /* The names of properties whose definitions are not known at compile time are
13645 * stored in this SV, after a constant heading. So if the length has been
13646 * changed since initialization, then there is a run-time definition. */
13647 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13648 (SvCUR(listsv) != initial_listsv_len)
13651 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13652 const bool stop_at_1, /* Just parse the next thing, don't
13653 look for a full character class */
13654 bool allow_multi_folds,
13655 const bool silence_non_portable, /* Don't output warnings
13658 SV** ret_invlist) /* Return an inversion list, not a node */
13660 /* parse a bracketed class specification. Most of these will produce an
13661 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13662 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13663 * under /i with multi-character folds: it will be rewritten following the
13664 * paradigm of this example, where the <multi-fold>s are characters which
13665 * fold to multiple character sequences:
13666 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13667 * gets effectively rewritten as:
13668 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13669 * reg() gets called (recursively) on the rewritten version, and this
13670 * function will return what it constructs. (Actually the <multi-fold>s
13671 * aren't physically removed from the [abcdefghi], it's just that they are
13672 * ignored in the recursion by means of a flag:
13673 * <RExC_in_multi_char_class>.)
13675 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13676 * characters, with the corresponding bit set if that character is in the
13677 * list. For characters above this, a range list or swash is used. There
13678 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13679 * determinable at compile time
13681 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13682 * to be restarted. This can only happen if ret_invlist is non-NULL.
13685 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13687 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13690 IV namedclass = OOB_NAMEDCLASS;
13691 char *rangebegin = NULL;
13692 bool need_class = 0;
13694 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13695 than just initialized. */
13696 SV* properties = NULL; /* Code points that match \p{} \P{} */
13697 SV* posixes = NULL; /* Code points that match classes like [:word:],
13698 extended beyond the Latin1 range. These have to
13699 be kept separate from other code points for much
13700 of this function because their handling is
13701 different under /i, and for most classes under
13703 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13704 separate for a while from the non-complemented
13705 versions because of complications with /d
13707 UV element_count = 0; /* Number of distinct elements in the class.
13708 Optimizations may be possible if this is tiny */
13709 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13710 character; used under /i */
13712 char * stop_ptr = RExC_end; /* where to stop parsing */
13713 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13715 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13717 /* Unicode properties are stored in a swash; this holds the current one
13718 * being parsed. If this swash is the only above-latin1 component of the
13719 * character class, an optimization is to pass it directly on to the
13720 * execution engine. Otherwise, it is set to NULL to indicate that there
13721 * are other things in the class that have to be dealt with at execution
13723 SV* swash = NULL; /* Code points that match \p{} \P{} */
13725 /* Set if a component of this character class is user-defined; just passed
13726 * on to the engine */
13727 bool has_user_defined_property = FALSE;
13729 /* inversion list of code points this node matches only when the target
13730 * string is in UTF-8. (Because is under /d) */
13731 SV* depends_list = NULL;
13733 /* Inversion list of code points this node matches regardless of things
13734 * like locale, folding, utf8ness of the target string */
13735 SV* cp_list = NULL;
13737 /* Like cp_list, but code points on this list need to be checked for things
13738 * that fold to/from them under /i */
13739 SV* cp_foldable_list = NULL;
13741 /* Like cp_list, but code points on this list are valid only when the
13742 * runtime locale is UTF-8 */
13743 SV* only_utf8_locale_list = NULL;
13746 /* In a range, counts how many 0-2 of the ends of it came from literals,
13747 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13748 UV literal_endpoint = 0;
13750 /* Is the range unicode? which means on a platform that isn't 1-1 native
13751 * to Unicode (i.e. non-ASCII), each code point in it should be considered
13752 * to be a Unicode value. */
13753 bool unicode_range = FALSE;
13755 bool invert = FALSE; /* Is this class to be complemented */
13757 bool warn_super = ALWAYS_WARN_SUPER;
13759 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13760 case we need to change the emitted regop to an EXACT. */
13761 const char * orig_parse = RExC_parse;
13762 const SSize_t orig_size = RExC_size;
13763 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13764 GET_RE_DEBUG_FLAGS_DECL;
13766 PERL_ARGS_ASSERT_REGCLASS;
13768 PERL_UNUSED_ARG(depth);
13771 DEBUG_PARSE("clas");
13773 /* Assume we are going to generate an ANYOF node. */
13774 ret = reganode(pRExC_state, ANYOF, 0);
13777 RExC_size += ANYOF_SKIP;
13778 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13781 ANYOF_FLAGS(ret) = 0;
13783 RExC_emit += ANYOF_SKIP;
13784 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13785 initial_listsv_len = SvCUR(listsv);
13786 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13790 RExC_parse = regpatws(pRExC_state, RExC_parse,
13791 FALSE /* means don't recognize comments */ );
13794 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13797 allow_multi_folds = FALSE;
13800 RExC_parse = regpatws(pRExC_state, RExC_parse,
13801 FALSE /* means don't recognize comments */ );
13805 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13806 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13807 const char *s = RExC_parse;
13808 const char c = *s++;
13810 while (isWORDCHAR(*s))
13812 if (*s && c == *s && s[1] == ']') {
13813 SAVEFREESV(RExC_rx_sv);
13815 "POSIX syntax [%c %c] belongs inside character classes",
13817 (void)ReREFCNT_inc(RExC_rx_sv);
13821 /* If the caller wants us to just parse a single element, accomplish this
13822 * by faking the loop ending condition */
13823 if (stop_at_1 && RExC_end > RExC_parse) {
13824 stop_ptr = RExC_parse + 1;
13827 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13828 if (UCHARAT(RExC_parse) == ']')
13829 goto charclassloop;
13832 if (RExC_parse >= stop_ptr) {
13837 RExC_parse = regpatws(pRExC_state, RExC_parse,
13838 FALSE /* means don't recognize comments */ );
13841 if (UCHARAT(RExC_parse) == ']') {
13847 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13848 save_value = value;
13849 save_prevvalue = prevvalue;
13852 rangebegin = RExC_parse;
13856 value = utf8n_to_uvchr((U8*)RExC_parse,
13857 RExC_end - RExC_parse,
13858 &numlen, UTF8_ALLOW_DEFAULT);
13859 RExC_parse += numlen;
13862 value = UCHARAT(RExC_parse++);
13865 && RExC_parse < RExC_end
13866 && POSIXCC(UCHARAT(RExC_parse)))
13868 namedclass = regpposixcc(pRExC_state, value, strict);
13870 else if (value != '\\') {
13872 literal_endpoint++;
13876 /* Is a backslash; get the code point of the char after it */
13877 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13878 value = utf8n_to_uvchr((U8*)RExC_parse,
13879 RExC_end - RExC_parse,
13880 &numlen, UTF8_ALLOW_DEFAULT);
13881 RExC_parse += numlen;
13884 value = UCHARAT(RExC_parse++);
13886 /* Some compilers cannot handle switching on 64-bit integer
13887 * values, therefore value cannot be an UV. Yes, this will
13888 * be a problem later if we want switch on Unicode.
13889 * A similar issue a little bit later when switching on
13890 * namedclass. --jhi */
13892 /* If the \ is escaping white space when white space is being
13893 * skipped, it means that that white space is wanted literally, and
13894 * is already in 'value'. Otherwise, need to translate the escape
13895 * into what it signifies. */
13896 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13898 case 'w': namedclass = ANYOF_WORDCHAR; break;
13899 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13900 case 's': namedclass = ANYOF_SPACE; break;
13901 case 'S': namedclass = ANYOF_NSPACE; break;
13902 case 'd': namedclass = ANYOF_DIGIT; break;
13903 case 'D': namedclass = ANYOF_NDIGIT; break;
13904 case 'v': namedclass = ANYOF_VERTWS; break;
13905 case 'V': namedclass = ANYOF_NVERTWS; break;
13906 case 'h': namedclass = ANYOF_HORIZWS; break;
13907 case 'H': namedclass = ANYOF_NHORIZWS; break;
13908 case 'N': /* Handle \N{NAME} in class */
13911 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13912 flagp, depth, &as_text);
13913 if (*flagp & RESTART_UTF8)
13914 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13915 if (cp_count != 1) { /* The typical case drops through */
13916 assert(cp_count != (STRLEN) -1);
13917 if (cp_count == 0) {
13919 RExC_parse++; /* Position after the "}" */
13920 vFAIL("Zero length \\N{}");
13923 ckWARNreg(RExC_parse,
13924 "Ignoring zero length \\N{} in character class");
13927 else { /* cp_count > 1 */
13928 if (! RExC_in_multi_char_class) {
13929 if (invert || range || *RExC_parse == '-') {
13932 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13935 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13940 = add_multi_match(multi_char_matches,
13944 break; /* <value> contains the first code
13945 point. Drop out of the switch to
13948 } /* End of cp_count != 1 */
13950 /* This element should not be processed further in this
13953 value = save_value;
13954 prevvalue = save_prevvalue;
13955 continue; /* Back to top of loop to get next char */
13957 /* Here, is a single code point, and <value> contains it */
13959 /* We consider named characters to be literal characters,
13960 * and they are Unicode */
13961 literal_endpoint++;
13962 unicode_range = TRUE;
13971 /* We will handle any undefined properties ourselves */
13972 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13973 /* And we actually would prefer to get
13974 * the straight inversion list of the
13975 * swash, since we will be accessing it
13976 * anyway, to save a little time */
13977 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13979 if (RExC_parse >= RExC_end)
13980 vFAIL2("Empty \\%c{}", (U8)value);
13981 if (*RExC_parse == '{') {
13982 const U8 c = (U8)value;
13983 e = strchr(RExC_parse++, '}');
13985 vFAIL2("Missing right brace on \\%c{}", c);
13986 while (isSPACE(*RExC_parse))
13988 if (e == RExC_parse)
13989 vFAIL2("Empty \\%c{}", c);
13990 n = e - RExC_parse;
13991 while (isSPACE(*(RExC_parse + n - 1)))
14002 if (UCHARAT(RExC_parse) == '^') {
14005 /* toggle. (The rhs xor gets the single bit that
14006 * differs between P and p; the other xor inverts just
14008 value ^= 'P' ^ 'p';
14010 while (isSPACE(*RExC_parse)) {
14015 /* Try to get the definition of the property into
14016 * <invlist>. If /i is in effect, the effective property
14017 * will have its name be <__NAME_i>. The design is
14018 * discussed in commit
14019 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14020 name = savepv(Perl_form(aTHX_
14022 (FOLD) ? "__" : "",
14028 /* Look up the property name, and get its swash and
14029 * inversion list, if the property is found */
14031 SvREFCNT_dec_NN(swash);
14033 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14036 NULL, /* No inversion list */
14039 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14040 HV* curpkg = (IN_PERL_COMPILETIME)
14042 : CopSTASH(PL_curcop);
14044 SvREFCNT_dec_NN(swash);
14048 /* Here didn't find it. It could be a user-defined
14049 * property that will be available at run-time. If we
14050 * accept only compile-time properties, is an error;
14051 * otherwise add it to the list for run-time look up */
14053 RExC_parse = e + 1;
14055 "Property '%"UTF8f"' is unknown",
14056 UTF8fARG(UTF, n, name));
14059 /* If the property name doesn't already have a package
14060 * name, add the current one to it so that it can be
14061 * referred to outside it. [perl #121777] */
14062 if (curpkg && ! instr(name, "::")) {
14063 char* pkgname = HvNAME(curpkg);
14064 if (strNE(pkgname, "main")) {
14065 char* full_name = Perl_form(aTHX_
14069 n = strlen(full_name);
14071 name = savepvn(full_name, n);
14074 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14075 (value == 'p' ? '+' : '!'),
14076 UTF8fARG(UTF, n, name));
14077 has_user_defined_property = TRUE;
14079 /* We don't know yet, so have to assume that the
14080 * property could match something in the Latin1 range,
14081 * hence something that isn't utf8. Note that this
14082 * would cause things in <depends_list> to match
14083 * inappropriately, except that any \p{}, including
14084 * this one forces Unicode semantics, which means there
14085 * is no <depends_list> */
14087 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14091 /* Here, did get the swash and its inversion list. If
14092 * the swash is from a user-defined property, then this
14093 * whole character class should be regarded as such */
14094 if (swash_init_flags
14095 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14097 has_user_defined_property = TRUE;
14100 /* We warn on matching an above-Unicode code point
14101 * if the match would return true, except don't
14102 * warn for \p{All}, which has exactly one element
14104 (_invlist_contains_cp(invlist, 0x110000)
14105 && (! (_invlist_len(invlist) == 1
14106 && *invlist_array(invlist) == 0)))
14112 /* Invert if asking for the complement */
14113 if (value == 'P') {
14114 _invlist_union_complement_2nd(properties,
14118 /* The swash can't be used as-is, because we've
14119 * inverted things; delay removing it to here after
14120 * have copied its invlist above */
14121 SvREFCNT_dec_NN(swash);
14125 _invlist_union(properties, invlist, &properties);
14130 RExC_parse = e + 1;
14131 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14134 /* \p means they want Unicode semantics */
14135 RExC_uni_semantics = 1;
14138 case 'n': value = '\n'; break;
14139 case 'r': value = '\r'; break;
14140 case 't': value = '\t'; break;
14141 case 'f': value = '\f'; break;
14142 case 'b': value = '\b'; break;
14143 case 'e': value = ESC_NATIVE; break;
14144 case 'a': value = '\a'; break;
14146 RExC_parse--; /* function expects to be pointed at the 'o' */
14148 const char* error_msg;
14149 bool valid = grok_bslash_o(&RExC_parse,
14152 PASS2, /* warnings only in
14155 silence_non_portable,
14161 if (IN_ENCODING && value < 0x100) {
14162 goto recode_encoding;
14166 RExC_parse--; /* function expects to be pointed at the 'x' */
14168 const char* error_msg;
14169 bool valid = grok_bslash_x(&RExC_parse,
14172 PASS2, /* Output warnings */
14174 silence_non_portable,
14180 if (IN_ENCODING && value < 0x100)
14181 goto recode_encoding;
14184 value = grok_bslash_c(*RExC_parse++, PASS2);
14186 case '0': case '1': case '2': case '3': case '4':
14187 case '5': case '6': case '7':
14189 /* Take 1-3 octal digits */
14190 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14191 numlen = (strict) ? 4 : 3;
14192 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14193 RExC_parse += numlen;
14196 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14197 vFAIL("Need exactly 3 octal digits");
14199 else if (! SIZE_ONLY /* like \08, \178 */
14201 && RExC_parse < RExC_end
14202 && isDIGIT(*RExC_parse)
14203 && ckWARN(WARN_REGEXP))
14205 SAVEFREESV(RExC_rx_sv);
14206 reg_warn_non_literal_string(
14208 form_short_octal_warning(RExC_parse, numlen));
14209 (void)ReREFCNT_inc(RExC_rx_sv);
14212 if (IN_ENCODING && value < 0x100)
14213 goto recode_encoding;
14217 if (! RExC_override_recoding) {
14218 SV* enc = _get_encoding();
14219 value = reg_recode((const char)(U8)value, &enc);
14222 vFAIL("Invalid escape in the specified encoding");
14225 ckWARNreg(RExC_parse,
14226 "Invalid escape in the specified encoding");
14232 /* Allow \_ to not give an error */
14233 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14235 vFAIL2("Unrecognized escape \\%c in character class",
14239 SAVEFREESV(RExC_rx_sv);
14240 ckWARN2reg(RExC_parse,
14241 "Unrecognized escape \\%c in character class passed through",
14243 (void)ReREFCNT_inc(RExC_rx_sv);
14247 } /* End of switch on char following backslash */
14248 } /* end of handling backslash escape sequences */
14250 /* Here, we have the current token in 'value' */
14252 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14255 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14256 * literal, as is the character that began the false range, i.e.
14257 * the 'a' in the examples */
14260 const int w = (RExC_parse >= rangebegin)
14261 ? RExC_parse - rangebegin
14265 "False [] range \"%"UTF8f"\"",
14266 UTF8fARG(UTF, w, rangebegin));
14269 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14270 ckWARN2reg(RExC_parse,
14271 "False [] range \"%"UTF8f"\"",
14272 UTF8fARG(UTF, w, rangebegin));
14273 (void)ReREFCNT_inc(RExC_rx_sv);
14274 cp_list = add_cp_to_invlist(cp_list, '-');
14275 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14280 range = 0; /* this was not a true range */
14281 element_count += 2; /* So counts for three values */
14284 classnum = namedclass_to_classnum(namedclass);
14286 if (LOC && namedclass < ANYOF_POSIXL_MAX
14287 #ifndef HAS_ISASCII
14288 && classnum != _CC_ASCII
14291 /* What the Posix classes (like \w, [:space:]) match in locale
14292 * isn't knowable under locale until actual match time. Room
14293 * must be reserved (one time per outer bracketed class) to
14294 * store such classes. The space will contain a bit for each
14295 * named class that is to be matched against. This isn't
14296 * needed for \p{} and pseudo-classes, as they are not affected
14297 * by locale, and hence are dealt with separately */
14298 if (! need_class) {
14301 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14304 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14306 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14307 ANYOF_POSIXL_ZERO(ret);
14310 /* Coverity thinks it is possible for this to be negative; both
14311 * jhi and khw think it's not, but be safer */
14312 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14313 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14315 /* See if it already matches the complement of this POSIX
14317 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14318 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14322 posixl_matches_all = TRUE;
14323 break; /* No need to continue. Since it matches both
14324 e.g., \w and \W, it matches everything, and the
14325 bracketed class can be optimized into qr/./s */
14328 /* Add this class to those that should be checked at runtime */
14329 ANYOF_POSIXL_SET(ret, namedclass);
14331 /* The above-Latin1 characters are not subject to locale rules.
14332 * Just add them, in the second pass, to the
14333 * unconditionally-matched list */
14335 SV* scratch_list = NULL;
14337 /* Get the list of the above-Latin1 code points this
14339 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14340 PL_XPosix_ptrs[classnum],
14342 /* Odd numbers are complements, like
14343 * NDIGIT, NASCII, ... */
14344 namedclass % 2 != 0,
14346 /* Checking if 'cp_list' is NULL first saves an extra
14347 * clone. Its reference count will be decremented at the
14348 * next union, etc, or if this is the only instance, at the
14349 * end of the routine */
14351 cp_list = scratch_list;
14354 _invlist_union(cp_list, scratch_list, &cp_list);
14355 SvREFCNT_dec_NN(scratch_list);
14357 continue; /* Go get next character */
14360 else if (! SIZE_ONLY) {
14362 /* Here, not in pass1 (in that pass we skip calculating the
14363 * contents of this class), and is /l, or is a POSIX class for
14364 * which /l doesn't matter (or is a Unicode property, which is
14365 * skipped here). */
14366 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14367 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14369 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14370 * nor /l make a difference in what these match,
14371 * therefore we just add what they match to cp_list. */
14372 if (classnum != _CC_VERTSPACE) {
14373 assert( namedclass == ANYOF_HORIZWS
14374 || namedclass == ANYOF_NHORIZWS);
14376 /* It turns out that \h is just a synonym for
14378 classnum = _CC_BLANK;
14381 _invlist_union_maybe_complement_2nd(
14383 PL_XPosix_ptrs[classnum],
14384 namedclass % 2 != 0, /* Complement if odd
14385 (NHORIZWS, NVERTWS)
14390 else { /* Garden variety class. If is NASCII, NDIGIT, ...
14391 complement and use nposixes */
14392 SV** posixes_ptr = namedclass % 2 == 0
14395 SV** source_ptr = &PL_XPosix_ptrs[classnum];
14396 _invlist_union_maybe_complement_2nd(
14399 namedclass % 2 != 0,
14403 } /* end of namedclass \blah */
14406 RExC_parse = regpatws(pRExC_state, RExC_parse,
14407 FALSE /* means don't recognize comments */ );
14410 /* If 'range' is set, 'value' is the ending of a range--check its
14411 * validity. (If value isn't a single code point in the case of a
14412 * range, we should have figured that out above in the code that
14413 * catches false ranges). Later, we will handle each individual code
14414 * point in the range. If 'range' isn't set, this could be the
14415 * beginning of a range, so check for that by looking ahead to see if
14416 * the next real character to be processed is the range indicator--the
14421 /* For unicode ranges, we have to test that the Unicode as opposed
14422 * to the native values are not decreasing. (Above 255, and there
14423 * is no difference between native and Unicode) */
14424 if (unicode_range && prevvalue < 255 && value < 255) {
14425 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14426 goto backwards_range;
14431 if (prevvalue > value) /* b-a */ {
14436 w = RExC_parse - rangebegin;
14438 "Invalid [] range \"%"UTF8f"\"",
14439 UTF8fARG(UTF, w, rangebegin));
14440 range = 0; /* not a valid range */
14444 prevvalue = value; /* save the beginning of the potential range */
14445 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14446 && *RExC_parse == '-')
14448 char* next_char_ptr = RExC_parse + 1;
14449 if (skip_white) { /* Get the next real char after the '-' */
14450 next_char_ptr = regpatws(pRExC_state,
14452 FALSE); /* means don't recognize
14456 /* If the '-' is at the end of the class (just before the ']',
14457 * it is a literal minus; otherwise it is a range */
14458 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14459 RExC_parse = next_char_ptr;
14461 /* a bad range like \w-, [:word:]- ? */
14462 if (namedclass > OOB_NAMEDCLASS) {
14463 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14464 const int w = RExC_parse >= rangebegin
14465 ? RExC_parse - rangebegin
14468 vFAIL4("False [] range \"%*.*s\"",
14473 "False [] range \"%*.*s\"",
14478 cp_list = add_cp_to_invlist(cp_list, '-');
14482 range = 1; /* yeah, it's a range! */
14483 continue; /* but do it the next time */
14488 if (namedclass > OOB_NAMEDCLASS) {
14492 /* Here, we have a single value this time through the loop, and
14493 * <prevvalue> is the beginning of the range, if any; or <value> if
14496 /* non-Latin1 code point implies unicode semantics. Must be set in
14497 * pass1 so is there for the whole of pass 2 */
14499 RExC_uni_semantics = 1;
14502 /* Ready to process either the single value, or the completed range.
14503 * For single-valued non-inverted ranges, we consider the possibility
14504 * of multi-char folds. (We made a conscious decision to not do this
14505 * for the other cases because it can often lead to non-intuitive
14506 * results. For example, you have the peculiar case that:
14507 * "s s" =~ /^[^\xDF]+$/i => Y
14508 * "ss" =~ /^[^\xDF]+$/i => N
14510 * See [perl #89750] */
14511 if (FOLD && allow_multi_folds && value == prevvalue) {
14512 if (value == LATIN_SMALL_LETTER_SHARP_S
14513 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14516 /* Here <value> is indeed a multi-char fold. Get what it is */
14518 U8 foldbuf[UTF8_MAXBYTES_CASE];
14521 UV folded = _to_uni_fold_flags(
14525 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14526 ? FOLD_FLAGS_NOMIX_ASCII
14530 /* Here, <folded> should be the first character of the
14531 * multi-char fold of <value>, with <foldbuf> containing the
14532 * whole thing. But, if this fold is not allowed (because of
14533 * the flags), <fold> will be the same as <value>, and should
14534 * be processed like any other character, so skip the special
14536 if (folded != value) {
14538 /* Skip if we are recursed, currently parsing the class
14539 * again. Otherwise add this character to the list of
14540 * multi-char folds. */
14541 if (! RExC_in_multi_char_class) {
14542 STRLEN cp_count = utf8_length(foldbuf,
14543 foldbuf + foldlen);
14544 SV* multi_fold = sv_2mortal(newSVpvs(""));
14546 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14549 = add_multi_match(multi_char_matches,
14555 /* This element should not be processed further in this
14558 value = save_value;
14559 prevvalue = save_prevvalue;
14565 /* Deal with this element of the class */
14568 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14571 /* On non-ASCII platforms, for ranges that span all of 0..255, and
14572 * ones that don't require special handling, we can just add the
14573 * range like we do for ASCII platforms */
14574 if ((UNLIKELY(prevvalue == 0) && value >= 255)
14575 || ! (prevvalue < 256
14577 || (literal_endpoint == 2
14578 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14579 || (isUPPER_A(prevvalue)
14580 && isUPPER_A(value)))))))
14582 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14586 /* Here, requires special handling. This can be because it is
14587 * a range whose code points are considered to be Unicode, and
14588 * so must be individually translated into native, or because
14589 * its a subrange of 'A-Z' or 'a-z' which each aren't
14590 * contiguous in EBCDIC, but we have defined them to include
14591 * only the "expected" upper or lower case ASCII alphabetics.
14592 * Subranges above 255 are the same in native and Unicode, so
14593 * can be added as a range */
14594 U8 start = NATIVE_TO_LATIN1(prevvalue);
14596 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
14597 for (j = start; j <= end; j++) {
14598 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
14601 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14608 range = 0; /* this range (if it was one) is done now */
14609 } /* End of loop through all the text within the brackets */
14611 /* If anything in the class expands to more than one character, we have to
14612 * deal with them by building up a substitute parse string, and recursively
14613 * calling reg() on it, instead of proceeding */
14614 if (multi_char_matches) {
14615 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14618 char *save_end = RExC_end;
14619 char *save_parse = RExC_parse;
14620 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14625 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14626 because too confusing */
14628 sv_catpv(substitute_parse, "(?:");
14632 /* Look at the longest folds first */
14633 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14635 if (av_exists(multi_char_matches, cp_count)) {
14636 AV** this_array_ptr;
14639 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14641 while ((this_sequence = av_pop(*this_array_ptr)) !=
14644 if (! first_time) {
14645 sv_catpv(substitute_parse, "|");
14647 first_time = FALSE;
14649 sv_catpv(substitute_parse, SvPVX(this_sequence));
14654 /* If the character class contains anything else besides these
14655 * multi-character folds, have to include it in recursive parsing */
14656 if (element_count) {
14657 sv_catpv(substitute_parse, "|[");
14658 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14659 sv_catpv(substitute_parse, "]");
14662 sv_catpv(substitute_parse, ")");
14665 /* This is a way to get the parse to skip forward a whole named
14666 * sequence instead of matching the 2nd character when it fails the
14668 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14672 RExC_parse = SvPV(substitute_parse, len);
14673 RExC_end = RExC_parse + len;
14674 RExC_in_multi_char_class = 1;
14675 RExC_override_recoding = 1;
14676 RExC_emit = (regnode *)orig_emit;
14678 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14680 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14682 RExC_parse = save_parse;
14683 RExC_end = save_end;
14684 RExC_in_multi_char_class = 0;
14685 RExC_override_recoding = 0;
14686 SvREFCNT_dec_NN(multi_char_matches);
14690 /* Here, we've gone through the entire class and dealt with multi-char
14691 * folds. We are now in a position that we can do some checks to see if we
14692 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14693 * Currently we only do two checks:
14694 * 1) is in the unlikely event that the user has specified both, eg. \w and
14695 * \W under /l, then the class matches everything. (This optimization
14696 * is done only to make the optimizer code run later work.)
14697 * 2) if the character class contains only a single element (including a
14698 * single range), we see if there is an equivalent node for it.
14699 * Other checks are possible */
14700 if (! ret_invlist /* Can't optimize if returning the constructed
14702 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14707 if (UNLIKELY(posixl_matches_all)) {
14710 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14711 \w or [:digit:] or \p{foo}
14714 /* All named classes are mapped into POSIXish nodes, with its FLAG
14715 * argument giving which class it is */
14716 switch ((I32)namedclass) {
14717 case ANYOF_UNIPROP:
14720 /* These don't depend on the charset modifiers. They always
14721 * match under /u rules */
14722 case ANYOF_NHORIZWS:
14723 case ANYOF_HORIZWS:
14724 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14727 case ANYOF_NVERTWS:
14732 /* The actual POSIXish node for all the rest depends on the
14733 * charset modifier. The ones in the first set depend only on
14734 * ASCII or, if available on this platform, locale */
14738 op = (LOC) ? POSIXL : POSIXA;
14749 /* under /a could be alpha */
14751 if (ASCII_RESTRICTED) {
14752 namedclass = ANYOF_ALPHA + (namedclass % 2);
14760 /* The rest have more possibilities depending on the charset.
14761 * We take advantage of the enum ordering of the charset
14762 * modifiers to get the exact node type, */
14764 op = POSIXD + get_regex_charset(RExC_flags);
14765 if (op > POSIXA) { /* /aa is same as /a */
14770 /* The odd numbered ones are the complements of the
14771 * next-lower even number one */
14772 if (namedclass % 2 == 1) {
14776 arg = namedclass_to_classnum(namedclass);
14780 else if (value == prevvalue) {
14782 /* Here, the class consists of just a single code point */
14785 if (! LOC && value == '\n') {
14786 op = REG_ANY; /* Optimize [^\n] */
14787 *flagp |= HASWIDTH|SIMPLE;
14791 else if (value < 256 || UTF) {
14793 /* Optimize a single value into an EXACTish node, but not if it
14794 * would require converting the pattern to UTF-8. */
14795 op = compute_EXACTish(pRExC_state);
14797 } /* Otherwise is a range */
14798 else if (! LOC) { /* locale could vary these */
14799 if (prevvalue == '0') {
14800 if (value == '9') {
14805 else if (prevvalue == 'A') {
14808 && literal_endpoint == 2
14811 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14815 else if (prevvalue == 'a') {
14818 && literal_endpoint == 2
14821 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14827 /* Here, we have changed <op> away from its initial value iff we found
14828 * an optimization */
14831 /* Throw away this ANYOF regnode, and emit the calculated one,
14832 * which should correspond to the beginning, not current, state of
14834 const char * cur_parse = RExC_parse;
14835 RExC_parse = (char *)orig_parse;
14839 /* To get locale nodes to not use the full ANYOF size would
14840 * require moving the code above that writes the portions
14841 * of it that aren't in other nodes to after this point.
14842 * e.g. ANYOF_POSIXL_SET */
14843 RExC_size = orig_size;
14847 RExC_emit = (regnode *)orig_emit;
14848 if (PL_regkind[op] == POSIXD) {
14849 if (op == POSIXL) {
14850 RExC_contains_locale = 1;
14853 op += NPOSIXD - POSIXD;
14858 ret = reg_node(pRExC_state, op);
14860 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14864 *flagp |= HASWIDTH|SIMPLE;
14866 else if (PL_regkind[op] == EXACT) {
14867 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14868 TRUE /* downgradable to EXACT */
14872 RExC_parse = (char *) cur_parse;
14874 SvREFCNT_dec(posixes);
14875 SvREFCNT_dec(nposixes);
14876 SvREFCNT_dec(cp_list);
14877 SvREFCNT_dec(cp_foldable_list);
14884 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14886 /* If folding, we calculate all characters that could fold to or from the
14887 * ones already on the list */
14888 if (cp_foldable_list) {
14890 UV start, end; /* End points of code point ranges */
14892 SV* fold_intersection = NULL;
14895 /* Our calculated list will be for Unicode rules. For locale
14896 * matching, we have to keep a separate list that is consulted at
14897 * runtime only when the locale indicates Unicode rules. For
14898 * non-locale, we just use to the general list */
14900 use_list = &only_utf8_locale_list;
14903 use_list = &cp_list;
14906 /* Only the characters in this class that participate in folds need
14907 * be checked. Get the intersection of this class and all the
14908 * possible characters that are foldable. This can quickly narrow
14909 * down a large class */
14910 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14911 &fold_intersection);
14913 /* The folds for all the Latin1 characters are hard-coded into this
14914 * program, but we have to go out to disk to get the others. */
14915 if (invlist_highest(cp_foldable_list) >= 256) {
14917 /* This is a hash that for a particular fold gives all
14918 * characters that are involved in it */
14919 if (! PL_utf8_foldclosures) {
14920 _load_PL_utf8_foldclosures();
14924 /* Now look at the foldable characters in this class individually */
14925 invlist_iterinit(fold_intersection);
14926 while (invlist_iternext(fold_intersection, &start, &end)) {
14929 /* Look at every character in the range */
14930 for (j = start; j <= end; j++) {
14931 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14937 if (IS_IN_SOME_FOLD_L1(j)) {
14939 /* ASCII is always matched; non-ASCII is matched
14940 * only under Unicode rules (which could happen
14941 * under /l if the locale is a UTF-8 one */
14942 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14943 *use_list = add_cp_to_invlist(*use_list,
14944 PL_fold_latin1[j]);
14948 add_cp_to_invlist(depends_list,
14949 PL_fold_latin1[j]);
14953 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14954 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14956 add_above_Latin1_folds(pRExC_state,
14963 /* Here is an above Latin1 character. We don't have the
14964 * rules hard-coded for it. First, get its fold. This is
14965 * the simple fold, as the multi-character folds have been
14966 * handled earlier and separated out */
14967 _to_uni_fold_flags(j, foldbuf, &foldlen,
14968 (ASCII_FOLD_RESTRICTED)
14969 ? FOLD_FLAGS_NOMIX_ASCII
14972 /* Single character fold of above Latin1. Add everything in
14973 * its fold closure to the list that this node should match.
14974 * The fold closures data structure is a hash with the keys
14975 * being the UTF-8 of every character that is folded to, like
14976 * 'k', and the values each an array of all code points that
14977 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14978 * Multi-character folds are not included */
14979 if ((listp = hv_fetch(PL_utf8_foldclosures,
14980 (char *) foldbuf, foldlen, FALSE)))
14982 AV* list = (AV*) *listp;
14984 for (k = 0; k <= av_tindex(list); k++) {
14985 SV** c_p = av_fetch(list, k, FALSE);
14991 /* /aa doesn't allow folds between ASCII and non- */
14992 if ((ASCII_FOLD_RESTRICTED
14993 && (isASCII(c) != isASCII(j))))
14998 /* Folds under /l which cross the 255/256 boundary
14999 * are added to a separate list. (These are valid
15000 * only when the locale is UTF-8.) */
15001 if (c < 256 && LOC) {
15002 *use_list = add_cp_to_invlist(*use_list, c);
15006 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15008 cp_list = add_cp_to_invlist(cp_list, c);
15011 /* Similarly folds involving non-ascii Latin1
15012 * characters under /d are added to their list */
15013 depends_list = add_cp_to_invlist(depends_list,
15020 SvREFCNT_dec_NN(fold_intersection);
15023 /* Now that we have finished adding all the folds, there is no reason
15024 * to keep the foldable list separate */
15025 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15026 SvREFCNT_dec_NN(cp_foldable_list);
15029 /* And combine the result (if any) with any inversion list from posix
15030 * classes. The lists are kept separate up to now because we don't want to
15031 * fold the classes (folding of those is automatically handled by the swash
15032 * fetching code) */
15033 if (posixes || nposixes) {
15034 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15035 /* Under /a and /aa, nothing above ASCII matches these */
15036 _invlist_intersection(posixes,
15037 PL_XPosix_ptrs[_CC_ASCII],
15041 if (DEPENDS_SEMANTICS) {
15042 /* Under /d, everything in the upper half of the Latin1 range
15043 * matches these complements */
15044 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15046 else if (AT_LEAST_ASCII_RESTRICTED) {
15047 /* Under /a and /aa, everything above ASCII matches these
15049 _invlist_union_complement_2nd(nposixes,
15050 PL_XPosix_ptrs[_CC_ASCII],
15054 _invlist_union(posixes, nposixes, &posixes);
15055 SvREFCNT_dec_NN(nposixes);
15058 posixes = nposixes;
15061 if (! DEPENDS_SEMANTICS) {
15063 _invlist_union(cp_list, posixes, &cp_list);
15064 SvREFCNT_dec_NN(posixes);
15071 /* Under /d, we put into a separate list the Latin1 things that
15072 * match only when the target string is utf8 */
15073 SV* nonascii_but_latin1_properties = NULL;
15074 _invlist_intersection(posixes, PL_UpperLatin1,
15075 &nonascii_but_latin1_properties);
15076 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15079 _invlist_union(cp_list, posixes, &cp_list);
15080 SvREFCNT_dec_NN(posixes);
15086 if (depends_list) {
15087 _invlist_union(depends_list, nonascii_but_latin1_properties,
15089 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15092 depends_list = nonascii_but_latin1_properties;
15097 /* And combine the result (if any) with any inversion list from properties.
15098 * The lists are kept separate up to now so that we can distinguish the two
15099 * in regards to matching above-Unicode. A run-time warning is generated
15100 * if a Unicode property is matched against a non-Unicode code point. But,
15101 * we allow user-defined properties to match anything, without any warning,
15102 * and we also suppress the warning if there is a portion of the character
15103 * class that isn't a Unicode property, and which matches above Unicode, \W
15104 * or [\x{110000}] for example.
15105 * (Note that in this case, unlike the Posix one above, there is no
15106 * <depends_list>, because having a Unicode property forces Unicode
15111 /* If it matters to the final outcome, see if a non-property
15112 * component of the class matches above Unicode. If so, the
15113 * warning gets suppressed. This is true even if just a single
15114 * such code point is specified, as though not strictly correct if
15115 * another such code point is matched against, the fact that they
15116 * are using above-Unicode code points indicates they should know
15117 * the issues involved */
15119 warn_super = ! (invert
15120 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15123 _invlist_union(properties, cp_list, &cp_list);
15124 SvREFCNT_dec_NN(properties);
15127 cp_list = properties;
15131 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15135 /* Here, we have calculated what code points should be in the character
15138 * Now we can see about various optimizations. Fold calculation (which we
15139 * did above) needs to take place before inversion. Otherwise /[^k]/i
15140 * would invert to include K, which under /i would match k, which it
15141 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15142 * folded until runtime */
15144 /* If we didn't do folding, it's because some information isn't available
15145 * until runtime; set the run-time fold flag for these. (We don't have to
15146 * worry about properties folding, as that is taken care of by the swash
15147 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15148 * locales, or the class matches at least one 0-255 range code point */
15150 if (only_utf8_locale_list) {
15151 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15153 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15156 invlist_iterinit(cp_list);
15157 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15158 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15160 invlist_iterfinish(cp_list);
15164 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15165 * at compile time. Besides not inverting folded locale now, we can't
15166 * invert if there are things such as \w, which aren't known until runtime
15170 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15172 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15174 _invlist_invert(cp_list);
15176 /* Any swash can't be used as-is, because we've inverted things */
15178 SvREFCNT_dec_NN(swash);
15182 /* Clear the invert flag since have just done it here */
15187 *ret_invlist = cp_list;
15188 SvREFCNT_dec(swash);
15190 /* Discard the generated node */
15192 RExC_size = orig_size;
15195 RExC_emit = orig_emit;
15200 /* Some character classes are equivalent to other nodes. Such nodes take
15201 * up less room and generally fewer operations to execute than ANYOF nodes.
15202 * Above, we checked for and optimized into some such equivalents for
15203 * certain common classes that are easy to test. Getting to this point in
15204 * the code means that the class didn't get optimized there. Since this
15205 * code is only executed in Pass 2, it is too late to save space--it has
15206 * been allocated in Pass 1, and currently isn't given back. But turning
15207 * things into an EXACTish node can allow the optimizer to join it to any
15208 * adjacent such nodes. And if the class is equivalent to things like /./,
15209 * expensive run-time swashes can be avoided. Now that we have more
15210 * complete information, we can find things necessarily missed by the
15211 * earlier code. I (khw) am not sure how much to look for here. It would
15212 * be easy, but perhaps too slow, to check any candidates against all the
15213 * node types they could possibly match using _invlistEQ(). */
15218 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15219 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15221 /* We don't optimize if we are supposed to make sure all non-Unicode
15222 * code points raise a warning, as only ANYOF nodes have this check.
15224 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15227 U8 op = END; /* The optimzation node-type */
15228 const char * cur_parse= RExC_parse;
15230 invlist_iterinit(cp_list);
15231 if (! invlist_iternext(cp_list, &start, &end)) {
15233 /* Here, the list is empty. This happens, for example, when a
15234 * Unicode property is the only thing in the character class, and
15235 * it doesn't match anything. (perluniprops.pod notes such
15238 *flagp |= HASWIDTH|SIMPLE;
15240 else if (start == end) { /* The range is a single code point */
15241 if (! invlist_iternext(cp_list, &start, &end)
15243 /* Don't do this optimization if it would require changing
15244 * the pattern to UTF-8 */
15245 && (start < 256 || UTF))
15247 /* Here, the list contains a single code point. Can optimize
15248 * into an EXACTish node */
15257 /* A locale node under folding with one code point can be
15258 * an EXACTFL, as its fold won't be calculated until
15264 /* Here, we are generally folding, but there is only one
15265 * code point to match. If we have to, we use an EXACT
15266 * node, but it would be better for joining with adjacent
15267 * nodes in the optimization pass if we used the same
15268 * EXACTFish node that any such are likely to be. We can
15269 * do this iff the code point doesn't participate in any
15270 * folds. For example, an EXACTF of a colon is the same as
15271 * an EXACT one, since nothing folds to or from a colon. */
15273 if (IS_IN_SOME_FOLD_L1(value)) {
15278 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15283 /* If we haven't found the node type, above, it means we
15284 * can use the prevailing one */
15286 op = compute_EXACTish(pRExC_state);
15291 else if (start == 0) {
15292 if (end == UV_MAX) {
15294 *flagp |= HASWIDTH|SIMPLE;
15297 else if (end == '\n' - 1
15298 && invlist_iternext(cp_list, &start, &end)
15299 && start == '\n' + 1 && end == UV_MAX)
15302 *flagp |= HASWIDTH|SIMPLE;
15306 invlist_iterfinish(cp_list);
15309 RExC_parse = (char *)orig_parse;
15310 RExC_emit = (regnode *)orig_emit;
15312 ret = reg_node(pRExC_state, op);
15314 RExC_parse = (char *)cur_parse;
15316 if (PL_regkind[op] == EXACT) {
15317 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15318 TRUE /* downgradable to EXACT */
15322 SvREFCNT_dec_NN(cp_list);
15327 /* Here, <cp_list> contains all the code points we can determine at
15328 * compile time that match under all conditions. Go through it, and
15329 * for things that belong in the bitmap, put them there, and delete from
15330 * <cp_list>. While we are at it, see if everything above 255 is in the
15331 * list, and if so, set a flag to speed up execution */
15333 populate_ANYOF_from_invlist(ret, &cp_list);
15336 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15339 /* Here, the bitmap has been populated with all the Latin1 code points that
15340 * always match. Can now add to the overall list those that match only
15341 * when the target string is UTF-8 (<depends_list>). */
15342 if (depends_list) {
15344 _invlist_union(cp_list, depends_list, &cp_list);
15345 SvREFCNT_dec_NN(depends_list);
15348 cp_list = depends_list;
15350 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15353 /* If there is a swash and more than one element, we can't use the swash in
15354 * the optimization below. */
15355 if (swash && element_count > 1) {
15356 SvREFCNT_dec_NN(swash);
15360 /* Note that the optimization of using 'swash' if it is the only thing in
15361 * the class doesn't have us change swash at all, so it can include things
15362 * that are also in the bitmap; otherwise we have purposely deleted that
15363 * duplicate information */
15364 set_ANYOF_arg(pRExC_state, ret, cp_list,
15365 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15367 only_utf8_locale_list,
15368 swash, has_user_defined_property);
15370 *flagp |= HASWIDTH|SIMPLE;
15372 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15373 RExC_contains_locale = 1;
15379 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15382 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15383 regnode* const node,
15385 SV* const runtime_defns,
15386 SV* const only_utf8_locale_list,
15388 const bool has_user_defined_property)
15390 /* Sets the arg field of an ANYOF-type node 'node', using information about
15391 * the node passed-in. If there is nothing outside the node's bitmap, the
15392 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15393 * the count returned by add_data(), having allocated and stored an array,
15394 * av, that that count references, as follows:
15395 * av[0] stores the character class description in its textual form.
15396 * This is used later (regexec.c:Perl_regclass_swash()) to
15397 * initialize the appropriate swash, and is also useful for dumping
15398 * the regnode. This is set to &PL_sv_undef if the textual
15399 * description is not needed at run-time (as happens if the other
15400 * elements completely define the class)
15401 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15402 * computed from av[0]. But if no further computation need be done,
15403 * the swash is stored here now (and av[0] is &PL_sv_undef).
15404 * av[2] stores the inversion list of code points that match only if the
15405 * current locale is UTF-8
15406 * av[3] stores the cp_list inversion list for use in addition or instead
15407 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15408 * (Otherwise everything needed is already in av[0] and av[1])
15409 * av[4] is set if any component of the class is from a user-defined
15410 * property; used only if av[3] exists */
15414 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15416 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15417 assert(! (ANYOF_FLAGS(node)
15418 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15419 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15420 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15423 AV * const av = newAV();
15426 assert(ANYOF_FLAGS(node)
15427 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15428 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15430 av_store(av, 0, (runtime_defns)
15431 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15434 av_store(av, 1, swash);
15435 SvREFCNT_dec_NN(cp_list);
15438 av_store(av, 1, &PL_sv_undef);
15440 av_store(av, 3, cp_list);
15441 av_store(av, 4, newSVuv(has_user_defined_property));
15445 if (only_utf8_locale_list) {
15446 av_store(av, 2, only_utf8_locale_list);
15449 av_store(av, 2, &PL_sv_undef);
15452 rv = newRV_noinc(MUTABLE_SV(av));
15453 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15454 RExC_rxi->data->data[n] = (void*)rv;
15459 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15461 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15462 const regnode* node,
15465 SV** only_utf8_locale_ptr,
15469 /* For internal core use only.
15470 * Returns the swash for the input 'node' in the regex 'prog'.
15471 * If <doinit> is 'true', will attempt to create the swash if not already
15473 * If <listsvp> is non-null, will return the printable contents of the
15474 * swash. This can be used to get debugging information even before the
15475 * swash exists, by calling this function with 'doinit' set to false, in
15476 * which case the components that will be used to eventually create the
15477 * swash are returned (in a printable form).
15478 * If <exclude_list> is not NULL, it is an inversion list of things to
15479 * exclude from what's returned in <listsvp>.
15480 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15481 * that, in spite of this function's name, the swash it returns may include
15482 * the bitmap data as well */
15485 SV *si = NULL; /* Input swash initialization string */
15486 SV* invlist = NULL;
15488 RXi_GET_DECL(prog,progi);
15489 const struct reg_data * const data = prog ? progi->data : NULL;
15491 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15493 assert(ANYOF_FLAGS(node)
15494 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15495 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15497 if (data && data->count) {
15498 const U32 n = ARG(node);
15500 if (data->what[n] == 's') {
15501 SV * const rv = MUTABLE_SV(data->data[n]);
15502 AV * const av = MUTABLE_AV(SvRV(rv));
15503 SV **const ary = AvARRAY(av);
15504 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15506 si = *ary; /* ary[0] = the string to initialize the swash with */
15508 /* Elements 3 and 4 are either both present or both absent. [3] is
15509 * any inversion list generated at compile time; [4] indicates if
15510 * that inversion list has any user-defined properties in it. */
15511 if (av_tindex(av) >= 2) {
15512 if (only_utf8_locale_ptr
15514 && ary[2] != &PL_sv_undef)
15516 *only_utf8_locale_ptr = ary[2];
15519 assert(only_utf8_locale_ptr);
15520 *only_utf8_locale_ptr = NULL;
15523 if (av_tindex(av) >= 3) {
15525 if (SvUV(ary[4])) {
15526 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15534 /* Element [1] is reserved for the set-up swash. If already there,
15535 * return it; if not, create it and store it there */
15536 if (ary[1] && SvROK(ary[1])) {
15539 else if (doinit && ((si && si != &PL_sv_undef)
15540 || (invlist && invlist != &PL_sv_undef))) {
15542 sw = _core_swash_init("utf8", /* the utf8 package */
15546 0, /* not from tr/// */
15548 &swash_init_flags);
15549 (void)av_store(av, 1, sw);
15554 /* If requested, return a printable version of what this swash matches */
15556 SV* matches_string = newSVpvs("");
15558 /* The swash should be used, if possible, to get the data, as it
15559 * contains the resolved data. But this function can be called at
15560 * compile-time, before everything gets resolved, in which case we
15561 * return the currently best available information, which is the string
15562 * that will eventually be used to do that resolving, 'si' */
15563 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15564 && (si && si != &PL_sv_undef))
15566 sv_catsv(matches_string, si);
15569 /* Add the inversion list to whatever we have. This may have come from
15570 * the swash, or from an input parameter */
15572 if (exclude_list) {
15573 SV* clone = invlist_clone(invlist);
15574 _invlist_subtract(clone, exclude_list, &clone);
15575 sv_catsv(matches_string, _invlist_contents(clone));
15576 SvREFCNT_dec_NN(clone);
15579 sv_catsv(matches_string, _invlist_contents(invlist));
15582 *listsvp = matches_string;
15587 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15589 /* reg_skipcomment()
15591 Absorbs an /x style # comment from the input stream,
15592 returning a pointer to the first character beyond the comment, or if the
15593 comment terminates the pattern without anything following it, this returns
15594 one past the final character of the pattern (in other words, RExC_end) and
15595 sets the REG_RUN_ON_COMMENT_SEEN flag.
15597 Note it's the callers responsibility to ensure that we are
15598 actually in /x mode
15602 PERL_STATIC_INLINE char*
15603 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15605 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15609 while (p < RExC_end) {
15610 if (*(++p) == '\n') {
15615 /* we ran off the end of the pattern without ending the comment, so we have
15616 * to add an \n when wrapping */
15617 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15623 Advances the parse position, and optionally absorbs
15624 "whitespace" from the inputstream.
15626 Without /x "whitespace" means (?#...) style comments only,
15627 with /x this means (?#...) and # comments and whitespace proper.
15629 Returns the RExC_parse point from BEFORE the scan occurs.
15631 This is the /x friendly way of saying RExC_parse++.
15635 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15637 char* const retval = RExC_parse++;
15639 PERL_ARGS_ASSERT_NEXTCHAR;
15642 if (RExC_end - RExC_parse >= 3
15643 && *RExC_parse == '('
15644 && RExC_parse[1] == '?'
15645 && RExC_parse[2] == '#')
15647 while (*RExC_parse != ')') {
15648 if (RExC_parse == RExC_end)
15649 FAIL("Sequence (?#... not terminated");
15655 if (RExC_flags & RXf_PMf_EXTENDED) {
15656 char * p = regpatws(pRExC_state, RExC_parse,
15657 TRUE); /* means recognize comments */
15658 if (p != RExC_parse) {
15668 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15670 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15671 * space. In pass1, it aligns and increments RExC_size; in pass2,
15674 regnode * const ret = RExC_emit;
15675 GET_RE_DEBUG_FLAGS_DECL;
15677 PERL_ARGS_ASSERT_REGNODE_GUTS;
15679 assert(extra_size >= regarglen[op]);
15682 SIZE_ALIGN(RExC_size);
15683 RExC_size += 1 + extra_size;
15686 if (RExC_emit >= RExC_emit_bound)
15687 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15688 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15690 NODE_ALIGN_FILL(ret);
15691 #ifndef RE_TRACK_PATTERN_OFFSETS
15692 PERL_UNUSED_ARG(name);
15694 if (RExC_offsets) { /* MJD */
15696 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15699 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15700 ? "Overwriting end of array!\n" : "OK",
15701 (UV)(RExC_emit - RExC_emit_start),
15702 (UV)(RExC_parse - RExC_start),
15703 (UV)RExC_offsets[0]));
15704 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15711 - reg_node - emit a node
15713 STATIC regnode * /* Location. */
15714 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15716 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15718 PERL_ARGS_ASSERT_REG_NODE;
15720 assert(regarglen[op] == 0);
15723 regnode *ptr = ret;
15724 FILL_ADVANCE_NODE(ptr, op);
15731 - reganode - emit a node with an argument
15733 STATIC regnode * /* Location. */
15734 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15736 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15738 PERL_ARGS_ASSERT_REGANODE;
15740 assert(regarglen[op] == 1);
15743 regnode *ptr = ret;
15744 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15751 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15753 /* emit a node with U32 and I32 arguments */
15755 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15757 PERL_ARGS_ASSERT_REG2LANODE;
15759 assert(regarglen[op] == 2);
15762 regnode *ptr = ret;
15763 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15770 - reguni - emit (if appropriate) a Unicode character
15772 PERL_STATIC_INLINE STRLEN
15773 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15775 PERL_ARGS_ASSERT_REGUNI;
15777 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15781 - reginsert - insert an operator in front of already-emitted operand
15783 * Means relocating the operand.
15786 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15791 const int offset = regarglen[(U8)op];
15792 const int size = NODE_STEP_REGNODE + offset;
15793 GET_RE_DEBUG_FLAGS_DECL;
15795 PERL_ARGS_ASSERT_REGINSERT;
15796 PERL_UNUSED_CONTEXT;
15797 PERL_UNUSED_ARG(depth);
15798 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15799 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15808 if (RExC_open_parens) {
15810 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15811 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15812 if ( RExC_open_parens[paren] >= opnd ) {
15813 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15814 RExC_open_parens[paren] += size;
15816 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15818 if ( RExC_close_parens[paren] >= opnd ) {
15819 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15820 RExC_close_parens[paren] += size;
15822 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15827 while (src > opnd) {
15828 StructCopy(--src, --dst, regnode);
15829 #ifdef RE_TRACK_PATTERN_OFFSETS
15830 if (RExC_offsets) { /* MJD 20010112 */
15832 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15836 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15837 ? "Overwriting end of array!\n" : "OK",
15838 (UV)(src - RExC_emit_start),
15839 (UV)(dst - RExC_emit_start),
15840 (UV)RExC_offsets[0]));
15841 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15842 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15848 place = opnd; /* Op node, where operand used to be. */
15849 #ifdef RE_TRACK_PATTERN_OFFSETS
15850 if (RExC_offsets) { /* MJD */
15852 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15856 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15857 ? "Overwriting end of array!\n" : "OK",
15858 (UV)(place - RExC_emit_start),
15859 (UV)(RExC_parse - RExC_start),
15860 (UV)RExC_offsets[0]));
15861 Set_Node_Offset(place, RExC_parse);
15862 Set_Node_Length(place, 1);
15865 src = NEXTOPER(place);
15866 FILL_ADVANCE_NODE(place, op);
15867 Zero(src, offset, regnode);
15871 - regtail - set the next-pointer at the end of a node chain of p to val.
15872 - SEE ALSO: regtail_study
15874 /* TODO: All three parms should be const */
15876 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15877 const regnode *val,U32 depth)
15880 GET_RE_DEBUG_FLAGS_DECL;
15882 PERL_ARGS_ASSERT_REGTAIL;
15884 PERL_UNUSED_ARG(depth);
15890 /* Find last node. */
15893 regnode * const temp = regnext(scan);
15895 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15896 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15897 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15898 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15899 (temp == NULL ? "->" : ""),
15900 (temp == NULL ? PL_reg_name[OP(val)] : "")
15908 if (reg_off_by_arg[OP(scan)]) {
15909 ARG_SET(scan, val - scan);
15912 NEXT_OFF(scan) = val - scan;
15918 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15919 - Look for optimizable sequences at the same time.
15920 - currently only looks for EXACT chains.
15922 This is experimental code. The idea is to use this routine to perform
15923 in place optimizations on branches and groups as they are constructed,
15924 with the long term intention of removing optimization from study_chunk so
15925 that it is purely analytical.
15927 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15928 to control which is which.
15931 /* TODO: All four parms should be const */
15934 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15935 const regnode *val,U32 depth)
15939 #ifdef EXPERIMENTAL_INPLACESCAN
15942 GET_RE_DEBUG_FLAGS_DECL;
15944 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15950 /* Find last node. */
15954 regnode * const temp = regnext(scan);
15955 #ifdef EXPERIMENTAL_INPLACESCAN
15956 if (PL_regkind[OP(scan)] == EXACT) {
15957 bool unfolded_multi_char; /* Unexamined in this routine */
15958 if (join_exact(pRExC_state, scan, &min,
15959 &unfolded_multi_char, 1, val, depth+1))
15964 switch (OP(scan)) {
15967 case EXACTFA_NO_TRIE:
15972 if( exact == PSEUDO )
15974 else if ( exact != OP(scan) )
15983 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15984 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15985 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15986 SvPV_nolen_const(RExC_mysv),
15987 REG_NODE_NUM(scan),
15988 PL_reg_name[exact]);
15995 DEBUG_PARSE_MSG("");
15996 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
15997 PerlIO_printf(Perl_debug_log,
15998 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15999 SvPV_nolen_const(RExC_mysv),
16000 (IV)REG_NODE_NUM(val),
16004 if (reg_off_by_arg[OP(scan)]) {
16005 ARG_SET(scan, val - scan);
16008 NEXT_OFF(scan) = val - scan;
16016 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16021 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16026 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16028 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16029 if (flags & (1<<bit)) {
16030 if (!set++ && lead)
16031 PerlIO_printf(Perl_debug_log, "%s",lead);
16032 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16037 PerlIO_printf(Perl_debug_log, "\n");
16039 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16044 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16050 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16052 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16053 if (flags & (1<<bit)) {
16054 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16057 if (!set++ && lead)
16058 PerlIO_printf(Perl_debug_log, "%s",lead);
16059 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16062 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16063 if (!set++ && lead) {
16064 PerlIO_printf(Perl_debug_log, "%s",lead);
16067 case REGEX_UNICODE_CHARSET:
16068 PerlIO_printf(Perl_debug_log, "UNICODE");
16070 case REGEX_LOCALE_CHARSET:
16071 PerlIO_printf(Perl_debug_log, "LOCALE");
16073 case REGEX_ASCII_RESTRICTED_CHARSET:
16074 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16076 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16077 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16080 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16086 PerlIO_printf(Perl_debug_log, "\n");
16088 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16094 Perl_regdump(pTHX_ const regexp *r)
16097 SV * const sv = sv_newmortal();
16098 SV *dsv= sv_newmortal();
16099 RXi_GET_DECL(r,ri);
16100 GET_RE_DEBUG_FLAGS_DECL;
16102 PERL_ARGS_ASSERT_REGDUMP;
16104 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16106 /* Header fields of interest. */
16107 if (r->anchored_substr) {
16108 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16109 RE_SV_DUMPLEN(r->anchored_substr), 30);
16110 PerlIO_printf(Perl_debug_log,
16111 "anchored %s%s at %"IVdf" ",
16112 s, RE_SV_TAIL(r->anchored_substr),
16113 (IV)r->anchored_offset);
16114 } else if (r->anchored_utf8) {
16115 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16116 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16117 PerlIO_printf(Perl_debug_log,
16118 "anchored utf8 %s%s at %"IVdf" ",
16119 s, RE_SV_TAIL(r->anchored_utf8),
16120 (IV)r->anchored_offset);
16122 if (r->float_substr) {
16123 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16124 RE_SV_DUMPLEN(r->float_substr), 30);
16125 PerlIO_printf(Perl_debug_log,
16126 "floating %s%s at %"IVdf"..%"UVuf" ",
16127 s, RE_SV_TAIL(r->float_substr),
16128 (IV)r->float_min_offset, (UV)r->float_max_offset);
16129 } else if (r->float_utf8) {
16130 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16131 RE_SV_DUMPLEN(r->float_utf8), 30);
16132 PerlIO_printf(Perl_debug_log,
16133 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16134 s, RE_SV_TAIL(r->float_utf8),
16135 (IV)r->float_min_offset, (UV)r->float_max_offset);
16137 if (r->check_substr || r->check_utf8)
16138 PerlIO_printf(Perl_debug_log,
16140 (r->check_substr == r->float_substr
16141 && r->check_utf8 == r->float_utf8
16142 ? "(checking floating" : "(checking anchored"));
16143 if (r->intflags & PREGf_NOSCAN)
16144 PerlIO_printf(Perl_debug_log, " noscan");
16145 if (r->extflags & RXf_CHECK_ALL)
16146 PerlIO_printf(Perl_debug_log, " isall");
16147 if (r->check_substr || r->check_utf8)
16148 PerlIO_printf(Perl_debug_log, ") ");
16150 if (ri->regstclass) {
16151 regprop(r, sv, ri->regstclass, NULL, NULL);
16152 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16154 if (r->intflags & PREGf_ANCH) {
16155 PerlIO_printf(Perl_debug_log, "anchored");
16156 if (r->intflags & PREGf_ANCH_MBOL)
16157 PerlIO_printf(Perl_debug_log, "(MBOL)");
16158 if (r->intflags & PREGf_ANCH_SBOL)
16159 PerlIO_printf(Perl_debug_log, "(SBOL)");
16160 if (r->intflags & PREGf_ANCH_GPOS)
16161 PerlIO_printf(Perl_debug_log, "(GPOS)");
16162 PerlIO_putc(Perl_debug_log, ' ');
16164 if (r->intflags & PREGf_GPOS_SEEN)
16165 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16166 if (r->intflags & PREGf_SKIP)
16167 PerlIO_printf(Perl_debug_log, "plus ");
16168 if (r->intflags & PREGf_IMPLICIT)
16169 PerlIO_printf(Perl_debug_log, "implicit ");
16170 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16171 if (r->extflags & RXf_EVAL_SEEN)
16172 PerlIO_printf(Perl_debug_log, "with eval ");
16173 PerlIO_printf(Perl_debug_log, "\n");
16175 regdump_extflags("r->extflags: ",r->extflags);
16176 regdump_intflags("r->intflags: ",r->intflags);
16179 PERL_ARGS_ASSERT_REGDUMP;
16180 PERL_UNUSED_CONTEXT;
16181 PERL_UNUSED_ARG(r);
16182 #endif /* DEBUGGING */
16186 - regprop - printable representation of opcode, with run time support
16190 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16195 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16196 static const char * const anyofs[] = {
16197 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16198 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16199 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16200 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16201 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
16202 || _CC_VERTSPACE != 16
16203 #error Need to adjust order of anyofs[]
16240 RXi_GET_DECL(prog,progi);
16241 GET_RE_DEBUG_FLAGS_DECL;
16243 PERL_ARGS_ASSERT_REGPROP;
16245 sv_setpvn(sv, "", 0);
16247 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16248 /* It would be nice to FAIL() here, but this may be called from
16249 regexec.c, and it would be hard to supply pRExC_state. */
16250 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16251 (int)OP(o), (int)REGNODE_MAX);
16252 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16254 k = PL_regkind[OP(o)];
16257 sv_catpvs(sv, " ");
16258 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16259 * is a crude hack but it may be the best for now since
16260 * we have no flag "this EXACTish node was UTF-8"
16262 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16263 PERL_PV_ESCAPE_UNI_DETECT |
16264 PERL_PV_ESCAPE_NONASCII |
16265 PERL_PV_PRETTY_ELLIPSES |
16266 PERL_PV_PRETTY_LTGT |
16267 PERL_PV_PRETTY_NOCLEAR
16269 } else if (k == TRIE) {
16270 /* print the details of the trie in dumpuntil instead, as
16271 * progi->data isn't available here */
16272 const char op = OP(o);
16273 const U32 n = ARG(o);
16274 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16275 (reg_ac_data *)progi->data->data[n] :
16277 const reg_trie_data * const trie
16278 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16280 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16281 DEBUG_TRIE_COMPILE_r(
16282 Perl_sv_catpvf(aTHX_ sv,
16283 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16284 (UV)trie->startstate,
16285 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16286 (UV)trie->wordcount,
16289 (UV)TRIE_CHARCOUNT(trie),
16290 (UV)trie->uniquecharcount
16293 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16294 sv_catpvs(sv, "[");
16295 (void) put_charclass_bitmap_innards(sv,
16296 (IS_ANYOF_TRIE(op))
16298 : TRIE_BITMAP(trie),
16300 sv_catpvs(sv, "]");
16303 } else if (k == CURLY) {
16304 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16305 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16306 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16308 else if (k == WHILEM && o->flags) /* Ordinal/of */
16309 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16310 else if (k == REF || k == OPEN || k == CLOSE
16311 || k == GROUPP || OP(o)==ACCEPT)
16313 AV *name_list= NULL;
16314 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16315 if ( RXp_PAREN_NAMES(prog) ) {
16316 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16317 } else if ( pRExC_state ) {
16318 name_list= RExC_paren_name_list;
16321 if ( k != REF || (OP(o) < NREF)) {
16322 SV **name= av_fetch(name_list, ARG(o), 0 );
16324 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16327 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16328 I32 *nums=(I32*)SvPVX(sv_dat);
16329 SV **name= av_fetch(name_list, nums[0], 0 );
16332 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16333 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16334 (n ? "," : ""), (IV)nums[n]);
16336 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16340 if ( k == REF && reginfo) {
16341 U32 n = ARG(o); /* which paren pair */
16342 I32 ln = prog->offs[n].start;
16343 if (prog->lastparen < n || ln == -1)
16344 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16345 else if (ln == prog->offs[n].end)
16346 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16348 const char *s = reginfo->strbeg + ln;
16349 Perl_sv_catpvf(aTHX_ sv, ": ");
16350 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16351 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16354 } else if (k == GOSUB) {
16355 AV *name_list= NULL;
16356 if ( RXp_PAREN_NAMES(prog) ) {
16357 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16358 } else if ( pRExC_state ) {
16359 name_list= RExC_paren_name_list;
16362 /* Paren and offset */
16363 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16365 SV **name= av_fetch(name_list, ARG(o), 0 );
16367 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16370 else if (k == VERB) {
16372 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16373 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16374 } else if (k == LOGICAL)
16375 /* 2: embedded, otherwise 1 */
16376 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16377 else if (k == ANYOF) {
16378 const U8 flags = ANYOF_FLAGS(o);
16380 SV* bitmap_invlist; /* Will hold what the bit map contains */
16383 if (flags & ANYOF_LOCALE_FLAGS)
16384 sv_catpvs(sv, "{loc}");
16385 if (flags & ANYOF_LOC_FOLD)
16386 sv_catpvs(sv, "{i}");
16387 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16388 if (flags & ANYOF_INVERT)
16389 sv_catpvs(sv, "^");
16391 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16393 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16396 /* output any special charclass tests (used entirely under use
16398 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16400 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16401 if (ANYOF_POSIXL_TEST(o,i)) {
16402 sv_catpv(sv, anyofs[i]);
16408 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16409 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16410 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16414 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16415 if (flags & ANYOF_INVERT)
16416 /*make sure the invert info is in each */
16417 sv_catpvs(sv, "^");
16420 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16421 sv_catpvs(sv, "{non-utf8-latin1-all}");
16424 /* output information about the unicode matching */
16425 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16426 sv_catpvs(sv, "{above_bitmap_all}");
16427 else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16428 SV *lv; /* Set if there is something outside the bit map. */
16429 bool byte_output = FALSE; /* If something in the bitmap has
16431 SV *only_utf8_locale;
16433 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16434 * is used to guarantee that nothing in the bitmap gets
16436 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16437 &lv, &only_utf8_locale,
16439 if (lv && lv != &PL_sv_undef) {
16440 char *s = savesvpv(lv);
16441 char * const origs = s;
16443 while (*s && *s != '\n')
16447 const char * const t = ++s;
16449 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16450 sv_catpvs(sv, "{outside bitmap}");
16453 sv_catpvs(sv, "{utf8}");
16457 sv_catpvs(sv, " ");
16463 /* Truncate very long output */
16464 if (s - origs > 256) {
16465 Perl_sv_catpvf(aTHX_ sv,
16467 (int) (s - origs - 1),
16473 else if (*s == '\t') {
16487 SvREFCNT_dec_NN(lv);
16490 if ((flags & ANYOF_LOC_FOLD)
16491 && only_utf8_locale
16492 && only_utf8_locale != &PL_sv_undef)
16495 int max_entries = 256;
16497 sv_catpvs(sv, "{utf8 locale}");
16498 invlist_iterinit(only_utf8_locale);
16499 while (invlist_iternext(only_utf8_locale,
16501 put_range(sv, start, end, FALSE);
16503 if (max_entries < 0) {
16504 sv_catpvs(sv, "...");
16508 invlist_iterfinish(only_utf8_locale);
16512 SvREFCNT_dec(bitmap_invlist);
16515 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16517 else if (k == POSIXD || k == NPOSIXD) {
16518 U8 index = FLAGS(o) * 2;
16519 if (index < C_ARRAY_LENGTH(anyofs)) {
16520 if (*anyofs[index] != '[') {
16523 sv_catpv(sv, anyofs[index]);
16524 if (*anyofs[index] != '[') {
16529 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16532 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16533 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16534 else if (OP(o) == SBOL)
16535 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16537 PERL_UNUSED_CONTEXT;
16538 PERL_UNUSED_ARG(sv);
16539 PERL_UNUSED_ARG(o);
16540 PERL_UNUSED_ARG(prog);
16541 PERL_UNUSED_ARG(reginfo);
16542 PERL_UNUSED_ARG(pRExC_state);
16543 #endif /* DEBUGGING */
16549 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16550 { /* Assume that RE_INTUIT is set */
16551 struct regexp *const prog = ReANY(r);
16552 GET_RE_DEBUG_FLAGS_DECL;
16554 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16555 PERL_UNUSED_CONTEXT;
16559 const char * const s = SvPV_nolen_const(prog->check_substr
16560 ? prog->check_substr : prog->check_utf8);
16562 if (!PL_colorset) reginitcolors();
16563 PerlIO_printf(Perl_debug_log,
16564 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16566 prog->check_substr ? "" : "utf8 ",
16567 PL_colors[5],PL_colors[0],
16570 (strlen(s) > 60 ? "..." : ""));
16573 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16579 handles refcounting and freeing the perl core regexp structure. When
16580 it is necessary to actually free the structure the first thing it
16581 does is call the 'free' method of the regexp_engine associated to
16582 the regexp, allowing the handling of the void *pprivate; member
16583 first. (This routine is not overridable by extensions, which is why
16584 the extensions free is called first.)
16586 See regdupe and regdupe_internal if you change anything here.
16588 #ifndef PERL_IN_XSUB_RE
16590 Perl_pregfree(pTHX_ REGEXP *r)
16596 Perl_pregfree2(pTHX_ REGEXP *rx)
16598 struct regexp *const r = ReANY(rx);
16599 GET_RE_DEBUG_FLAGS_DECL;
16601 PERL_ARGS_ASSERT_PREGFREE2;
16603 if (r->mother_re) {
16604 ReREFCNT_dec(r->mother_re);
16606 CALLREGFREE_PVT(rx); /* free the private data */
16607 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16608 Safefree(r->xpv_len_u.xpvlenu_pv);
16611 SvREFCNT_dec(r->anchored_substr);
16612 SvREFCNT_dec(r->anchored_utf8);
16613 SvREFCNT_dec(r->float_substr);
16614 SvREFCNT_dec(r->float_utf8);
16615 Safefree(r->substrs);
16617 RX_MATCH_COPY_FREE(rx);
16618 #ifdef PERL_ANY_COW
16619 SvREFCNT_dec(r->saved_copy);
16622 SvREFCNT_dec(r->qr_anoncv);
16623 rx->sv_u.svu_rx = 0;
16628 This is a hacky workaround to the structural issue of match results
16629 being stored in the regexp structure which is in turn stored in
16630 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16631 could be PL_curpm in multiple contexts, and could require multiple
16632 result sets being associated with the pattern simultaneously, such
16633 as when doing a recursive match with (??{$qr})
16635 The solution is to make a lightweight copy of the regexp structure
16636 when a qr// is returned from the code executed by (??{$qr}) this
16637 lightweight copy doesn't actually own any of its data except for
16638 the starp/end and the actual regexp structure itself.
16644 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16646 struct regexp *ret;
16647 struct regexp *const r = ReANY(rx);
16648 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16650 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16653 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16655 SvOK_off((SV *)ret_x);
16657 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16658 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16659 made both spots point to the same regexp body.) */
16660 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16661 assert(!SvPVX(ret_x));
16662 ret_x->sv_u.svu_rx = temp->sv_any;
16663 temp->sv_any = NULL;
16664 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16665 SvREFCNT_dec_NN(temp);
16666 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16667 ing below will not set it. */
16668 SvCUR_set(ret_x, SvCUR(rx));
16671 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16672 sv_force_normal(sv) is called. */
16674 ret = ReANY(ret_x);
16676 SvFLAGS(ret_x) |= SvUTF8(rx);
16677 /* We share the same string buffer as the original regexp, on which we
16678 hold a reference count, incremented when mother_re is set below.
16679 The string pointer is copied here, being part of the regexp struct.
16681 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16682 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16684 const I32 npar = r->nparens+1;
16685 Newx(ret->offs, npar, regexp_paren_pair);
16686 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16689 Newx(ret->substrs, 1, struct reg_substr_data);
16690 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16692 SvREFCNT_inc_void(ret->anchored_substr);
16693 SvREFCNT_inc_void(ret->anchored_utf8);
16694 SvREFCNT_inc_void(ret->float_substr);
16695 SvREFCNT_inc_void(ret->float_utf8);
16697 /* check_substr and check_utf8, if non-NULL, point to either their
16698 anchored or float namesakes, and don't hold a second reference. */
16700 RX_MATCH_COPIED_off(ret_x);
16701 #ifdef PERL_ANY_COW
16702 ret->saved_copy = NULL;
16704 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16705 SvREFCNT_inc_void(ret->qr_anoncv);
16711 /* regfree_internal()
16713 Free the private data in a regexp. This is overloadable by
16714 extensions. Perl takes care of the regexp structure in pregfree(),
16715 this covers the *pprivate pointer which technically perl doesn't
16716 know about, however of course we have to handle the
16717 regexp_internal structure when no extension is in use.
16719 Note this is called before freeing anything in the regexp
16724 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16726 struct regexp *const r = ReANY(rx);
16727 RXi_GET_DECL(r,ri);
16728 GET_RE_DEBUG_FLAGS_DECL;
16730 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16736 SV *dsv= sv_newmortal();
16737 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16738 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16739 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16740 PL_colors[4],PL_colors[5],s);
16743 #ifdef RE_TRACK_PATTERN_OFFSETS
16745 Safefree(ri->u.offsets); /* 20010421 MJD */
16747 if (ri->code_blocks) {
16749 for (n = 0; n < ri->num_code_blocks; n++)
16750 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16751 Safefree(ri->code_blocks);
16755 int n = ri->data->count;
16758 /* If you add a ->what type here, update the comment in regcomp.h */
16759 switch (ri->data->what[n]) {
16765 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16768 Safefree(ri->data->data[n]);
16774 { /* Aho Corasick add-on structure for a trie node.
16775 Used in stclass optimization only */
16777 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16778 #ifdef USE_ITHREADS
16782 refcount = --aho->refcount;
16785 PerlMemShared_free(aho->states);
16786 PerlMemShared_free(aho->fail);
16787 /* do this last!!!! */
16788 PerlMemShared_free(ri->data->data[n]);
16789 /* we should only ever get called once, so
16790 * assert as much, and also guard the free
16791 * which /might/ happen twice. At the least
16792 * it will make code anlyzers happy and it
16793 * doesn't cost much. - Yves */
16794 assert(ri->regstclass);
16795 if (ri->regstclass) {
16796 PerlMemShared_free(ri->regstclass);
16797 ri->regstclass = 0;
16804 /* trie structure. */
16806 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16807 #ifdef USE_ITHREADS
16811 refcount = --trie->refcount;
16814 PerlMemShared_free(trie->charmap);
16815 PerlMemShared_free(trie->states);
16816 PerlMemShared_free(trie->trans);
16818 PerlMemShared_free(trie->bitmap);
16820 PerlMemShared_free(trie->jump);
16821 PerlMemShared_free(trie->wordinfo);
16822 /* do this last!!!! */
16823 PerlMemShared_free(ri->data->data[n]);
16828 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16829 ri->data->what[n]);
16832 Safefree(ri->data->what);
16833 Safefree(ri->data);
16839 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16840 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16841 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16844 re_dup - duplicate a regexp.
16846 This routine is expected to clone a given regexp structure. It is only
16847 compiled under USE_ITHREADS.
16849 After all of the core data stored in struct regexp is duplicated
16850 the regexp_engine.dupe method is used to copy any private data
16851 stored in the *pprivate pointer. This allows extensions to handle
16852 any duplication it needs to do.
16854 See pregfree() and regfree_internal() if you change anything here.
16856 #if defined(USE_ITHREADS)
16857 #ifndef PERL_IN_XSUB_RE
16859 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16863 const struct regexp *r = ReANY(sstr);
16864 struct regexp *ret = ReANY(dstr);
16866 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16868 npar = r->nparens+1;
16869 Newx(ret->offs, npar, regexp_paren_pair);
16870 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16872 if (ret->substrs) {
16873 /* Do it this way to avoid reading from *r after the StructCopy().
16874 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16875 cache, it doesn't matter. */
16876 const bool anchored = r->check_substr
16877 ? r->check_substr == r->anchored_substr
16878 : r->check_utf8 == r->anchored_utf8;
16879 Newx(ret->substrs, 1, struct reg_substr_data);
16880 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16882 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16883 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16884 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16885 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16887 /* check_substr and check_utf8, if non-NULL, point to either their
16888 anchored or float namesakes, and don't hold a second reference. */
16890 if (ret->check_substr) {
16892 assert(r->check_utf8 == r->anchored_utf8);
16893 ret->check_substr = ret->anchored_substr;
16894 ret->check_utf8 = ret->anchored_utf8;
16896 assert(r->check_substr == r->float_substr);
16897 assert(r->check_utf8 == r->float_utf8);
16898 ret->check_substr = ret->float_substr;
16899 ret->check_utf8 = ret->float_utf8;
16901 } else if (ret->check_utf8) {
16903 ret->check_utf8 = ret->anchored_utf8;
16905 ret->check_utf8 = ret->float_utf8;
16910 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16911 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16914 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16916 if (RX_MATCH_COPIED(dstr))
16917 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16919 ret->subbeg = NULL;
16920 #ifdef PERL_ANY_COW
16921 ret->saved_copy = NULL;
16924 /* Whether mother_re be set or no, we need to copy the string. We
16925 cannot refrain from copying it when the storage points directly to
16926 our mother regexp, because that's
16927 1: a buffer in a different thread
16928 2: something we no longer hold a reference on
16929 so we need to copy it locally. */
16930 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16931 ret->mother_re = NULL;
16933 #endif /* PERL_IN_XSUB_RE */
16938 This is the internal complement to regdupe() which is used to copy
16939 the structure pointed to by the *pprivate pointer in the regexp.
16940 This is the core version of the extension overridable cloning hook.
16941 The regexp structure being duplicated will be copied by perl prior
16942 to this and will be provided as the regexp *r argument, however
16943 with the /old/ structures pprivate pointer value. Thus this routine
16944 may override any copying normally done by perl.
16946 It returns a pointer to the new regexp_internal structure.
16950 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16953 struct regexp *const r = ReANY(rx);
16954 regexp_internal *reti;
16956 RXi_GET_DECL(r,ri);
16958 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16962 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16963 char, regexp_internal);
16964 Copy(ri->program, reti->program, len+1, regnode);
16966 reti->num_code_blocks = ri->num_code_blocks;
16967 if (ri->code_blocks) {
16969 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16970 struct reg_code_block);
16971 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16972 struct reg_code_block);
16973 for (n = 0; n < ri->num_code_blocks; n++)
16974 reti->code_blocks[n].src_regex = (REGEXP*)
16975 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16978 reti->code_blocks = NULL;
16980 reti->regstclass = NULL;
16983 struct reg_data *d;
16984 const int count = ri->data->count;
16987 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16988 char, struct reg_data);
16989 Newx(d->what, count, U8);
16992 for (i = 0; i < count; i++) {
16993 d->what[i] = ri->data->what[i];
16994 switch (d->what[i]) {
16995 /* see also regcomp.h and regfree_internal() */
16996 case 'a': /* actually an AV, but the dup function is identical. */
17000 case 'u': /* actually an HV, but the dup function is identical. */
17001 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17004 /* This is cheating. */
17005 Newx(d->data[i], 1, regnode_ssc);
17006 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17007 reti->regstclass = (regnode*)d->data[i];
17010 /* Trie stclasses are readonly and can thus be shared
17011 * without duplication. We free the stclass in pregfree
17012 * when the corresponding reg_ac_data struct is freed.
17014 reti->regstclass= ri->regstclass;
17018 ((reg_trie_data*)ri->data->data[i])->refcount++;
17023 d->data[i] = ri->data->data[i];
17026 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17027 ri->data->what[i]);
17036 reti->name_list_idx = ri->name_list_idx;
17038 #ifdef RE_TRACK_PATTERN_OFFSETS
17039 if (ri->u.offsets) {
17040 Newx(reti->u.offsets, 2*len+1, U32);
17041 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17044 SetProgLen(reti,len);
17047 return (void*)reti;
17050 #endif /* USE_ITHREADS */
17052 #ifndef PERL_IN_XSUB_RE
17055 - regnext - dig the "next" pointer out of a node
17058 Perl_regnext(pTHX_ regnode *p)
17065 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17066 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17067 (int)OP(p), (int)REGNODE_MAX);
17070 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17079 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17082 STRLEN l1 = strlen(pat1);
17083 STRLEN l2 = strlen(pat2);
17086 const char *message;
17088 PERL_ARGS_ASSERT_RE_CROAK2;
17094 Copy(pat1, buf, l1 , char);
17095 Copy(pat2, buf + l1, l2 , char);
17096 buf[l1 + l2] = '\n';
17097 buf[l1 + l2 + 1] = '\0';
17098 va_start(args, pat2);
17099 msv = vmess(buf, &args);
17101 message = SvPV_const(msv,l1);
17104 Copy(message, buf, l1 , char);
17105 /* l1-1 to avoid \n */
17106 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17110 /* Certain characters are output as a sequence with the first being a
17112 #define isBACKSLASHED_PUNCT(c) \
17113 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17116 S_put_code_point(pTHX_ SV *sv, UV c)
17118 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17121 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17123 else if (isPRINT(c)) {
17124 const char string = (char) c;
17125 if (isBACKSLASHED_PUNCT(c))
17126 sv_catpvs(sv, "\\");
17127 sv_catpvn(sv, &string, 1);
17130 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17132 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17135 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17140 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17143 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17145 /* Appends to 'sv' a displayable version of the range of code points from
17146 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17147 * as-is (though some of these will be escaped by put_code_point()). */
17149 const unsigned int min_range_count = 3;
17151 assert(start <= end);
17153 PERL_ARGS_ASSERT_PUT_RANGE;
17155 while (start <= end) {
17157 const char * format;
17159 if (end - start < min_range_count) {
17161 /* Individual chars in short ranges */
17162 for (; start <= end; start++) {
17163 put_code_point(sv, start);
17168 /* If permitted by the input options, and there is a possibility that
17169 * this range contains a printable literal, look to see if there is
17171 if (allow_literals && start <= MAX_PRINT_A) {
17173 /* If the range begin isn't an ASCII printable, effectively split
17174 * the range into two parts:
17175 * 1) the portion before the first such printable,
17177 * and output them separately. */
17178 if (! isPRINT_A(start)) {
17179 UV temp_end = start + 1;
17181 /* There is no point looking beyond the final possible
17182 * printable, in MAX_PRINT_A */
17183 UV max = MIN(end, MAX_PRINT_A);
17185 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17189 /* Here, temp_end points to one beyond the first printable if
17190 * found, or to one beyond 'max' if not. If none found, make
17191 * sure that we use the entire range */
17192 if (temp_end > MAX_PRINT_A) {
17193 temp_end = end + 1;
17196 /* Output the first part of the split range, the part that
17197 * doesn't have printables, with no looking for literals
17198 * (otherwise we would infinitely recurse) */
17199 put_range(sv, start, temp_end - 1, FALSE);
17201 /* The 2nd part of the range (if any) starts here. */
17204 /* We continue instead of dropping down because even if the 2nd
17205 * part is non-empty, it could be so short that we want to
17206 * output it specially, as tested for at the top of this loop.
17211 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17212 * output a sub-range of just the digits or letters, then process
17213 * the remaining portion as usual. */
17214 if (isALPHANUMERIC_A(start)) {
17215 UV mask = (isDIGIT_A(start))
17220 UV temp_end = start + 1;
17222 /* Find the end of the sub-range that includes just the
17223 * characters in the same class as the first character in it */
17224 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17229 /* For short ranges, don't duplicate the code above to output
17230 * them; just call recursively */
17231 if (temp_end - start < min_range_count) {
17232 put_range(sv, start, temp_end, FALSE);
17234 else { /* Output as a range */
17235 put_code_point(sv, start);
17236 sv_catpvs(sv, "-");
17237 put_code_point(sv, temp_end);
17239 start = temp_end + 1;
17243 /* We output any other printables as individual characters */
17244 if (isPUNCT_A(start) || isSPACE_A(start)) {
17245 while (start <= end && (isPUNCT_A(start)
17246 || isSPACE_A(start)))
17248 put_code_point(sv, start);
17253 } /* End of looking for literals */
17255 /* Here is not to output as a literal. Some control characters have
17256 * mnemonic names. Split off any of those at the beginning and end of
17257 * the range to print mnemonically. It isn't possible for many of
17258 * these to be in a row, so this won't overwhelm with output */
17259 while (isMNEMONIC_CNTRL(start) && start <= end) {
17260 put_code_point(sv, start);
17263 if (start < end && isMNEMONIC_CNTRL(end)) {
17265 /* Here, the final character in the range has a mnemonic name.
17266 * Work backwards from the end to find the final non-mnemonic */
17267 UV temp_end = end - 1;
17268 while (isMNEMONIC_CNTRL(temp_end)) {
17272 /* And separately output the range that doesn't have mnemonics */
17273 put_range(sv, start, temp_end, FALSE);
17275 /* Then output the mnemonic trailing controls */
17276 start = temp_end + 1;
17277 while (start <= end) {
17278 put_code_point(sv, start);
17284 /* As a final resort, output the range or subrange as hex. */
17286 this_end = (end < NUM_ANYOF_CODE_POINTS)
17288 : NUM_ANYOF_CODE_POINTS - 1;
17289 format = (this_end < 256)
17290 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17291 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17292 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17298 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17300 /* Appends to 'sv' a displayable version of the innards of the bracketed
17301 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17302 * output anything, and bitmap_invlist, if not NULL, will point to an
17303 * inversion list of what is in the bit map */
17307 unsigned int punct_count = 0;
17308 SV* invlist = NULL;
17309 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17310 bool allow_literals = TRUE;
17312 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17314 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17316 /* Worst case is exactly every-other code point is in the list */
17317 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17319 /* Convert the bit map to an inversion list, keeping track of how many
17320 * ASCII puncts are set, including an extra amount for the backslashed
17322 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17323 if (BITMAP_TEST(bitmap, i)) {
17324 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17325 if (isPUNCT_A(i)) {
17327 if isBACKSLASHED_PUNCT(i) {
17334 /* Nothing to output */
17335 if (_invlist_len(*invlist_ptr) == 0) {
17336 SvREFCNT_dec(invlist);
17340 /* Generally, it is more readable if printable characters are output as
17341 * literals, but if a range (nearly) spans all of them, it's best to output
17342 * it as a single range. This code will use a single range if all but 2
17343 * printables are in it */
17344 invlist_iterinit(*invlist_ptr);
17345 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17347 /* If range starts beyond final printable, it doesn't have any in it */
17348 if (start > MAX_PRINT_A) {
17352 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17353 * all but two, the range must start and end no later than 2 from
17355 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17356 if (end > MAX_PRINT_A) {
17362 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17363 allow_literals = FALSE;
17368 invlist_iterfinish(*invlist_ptr);
17370 /* The legibility of the output depends mostly on how many punctuation
17371 * characters are output. There are 32 possible ASCII ones, and some have
17372 * an additional backslash, bringing it to currently 36, so if any more
17373 * than 18 are to be output, we can instead output it as its complement,
17374 * yielding fewer puncts, and making it more legible. But give some weight
17375 * to the fact that outputting it as a complement is less legible than a
17376 * straight output, so don't complement unless we are somewhat over the 18
17378 if (allow_literals && punct_count > 22) {
17379 sv_catpvs(sv, "^");
17381 /* Add everything remaining to the list, so when we invert it just
17382 * below, it will be excluded */
17383 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17384 _invlist_invert(*invlist_ptr);
17387 /* Here we have figured things out. Output each range */
17388 invlist_iterinit(*invlist_ptr);
17389 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17390 if (start >= NUM_ANYOF_CODE_POINTS) {
17393 put_range(sv, start, end, allow_literals);
17395 invlist_iterfinish(*invlist_ptr);
17400 #define CLEAR_OPTSTART \
17401 if (optstart) STMT_START { \
17402 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17403 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17407 #define DUMPUNTIL(b,e) \
17409 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17411 STATIC const regnode *
17412 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17413 const regnode *last, const regnode *plast,
17414 SV* sv, I32 indent, U32 depth)
17416 U8 op = PSEUDO; /* Arbitrary non-END op. */
17417 const regnode *next;
17418 const regnode *optstart= NULL;
17420 RXi_GET_DECL(r,ri);
17421 GET_RE_DEBUG_FLAGS_DECL;
17423 PERL_ARGS_ASSERT_DUMPUNTIL;
17425 #ifdef DEBUG_DUMPUNTIL
17426 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17427 last ? last-start : 0,plast ? plast-start : 0);
17430 if (plast && plast < last)
17433 while (PL_regkind[op] != END && (!last || node < last)) {
17435 /* While that wasn't END last time... */
17438 if (op == CLOSE || op == WHILEM)
17440 next = regnext((regnode *)node);
17443 if (OP(node) == OPTIMIZED) {
17444 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17451 regprop(r, sv, node, NULL, NULL);
17452 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17453 (int)(2*indent + 1), "", SvPVX_const(sv));
17455 if (OP(node) != OPTIMIZED) {
17456 if (next == NULL) /* Next ptr. */
17457 PerlIO_printf(Perl_debug_log, " (0)");
17458 else if (PL_regkind[(U8)op] == BRANCH
17459 && PL_regkind[OP(next)] != BRANCH )
17460 PerlIO_printf(Perl_debug_log, " (FAIL)");
17462 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17463 (void)PerlIO_putc(Perl_debug_log, '\n');
17467 if (PL_regkind[(U8)op] == BRANCHJ) {
17470 const regnode *nnode = (OP(next) == LONGJMP
17471 ? regnext((regnode *)next)
17473 if (last && nnode > last)
17475 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17478 else if (PL_regkind[(U8)op] == BRANCH) {
17480 DUMPUNTIL(NEXTOPER(node), next);
17482 else if ( PL_regkind[(U8)op] == TRIE ) {
17483 const regnode *this_trie = node;
17484 const char op = OP(node);
17485 const U32 n = ARG(node);
17486 const reg_ac_data * const ac = op>=AHOCORASICK ?
17487 (reg_ac_data *)ri->data->data[n] :
17489 const reg_trie_data * const trie =
17490 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17492 AV *const trie_words
17493 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17495 const regnode *nextbranch= NULL;
17498 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17499 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17501 PerlIO_printf(Perl_debug_log, "%*s%s ",
17502 (int)(2*(indent+3)), "",
17504 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17505 SvCUR(*elem_ptr), 60,
17506 PL_colors[0], PL_colors[1],
17508 ? PERL_PV_ESCAPE_UNI
17510 | PERL_PV_PRETTY_ELLIPSES
17511 | PERL_PV_PRETTY_LTGT
17516 U16 dist= trie->jump[word_idx+1];
17517 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17518 (UV)((dist ? this_trie + dist : next) - start));
17521 nextbranch= this_trie + trie->jump[0];
17522 DUMPUNTIL(this_trie + dist, nextbranch);
17524 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17525 nextbranch= regnext((regnode *)nextbranch);
17527 PerlIO_printf(Perl_debug_log, "\n");
17530 if (last && next > last)
17535 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17536 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17537 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17539 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17541 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17543 else if ( op == PLUS || op == STAR) {
17544 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17546 else if (PL_regkind[(U8)op] == ANYOF) {
17547 /* arglen 1 + class block */
17548 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17549 ? ANYOF_POSIXL_SKIP
17551 node = NEXTOPER(node);
17553 else if (PL_regkind[(U8)op] == EXACT) {
17554 /* Literal string, where present. */
17555 node += NODE_SZ_STR(node) - 1;
17556 node = NEXTOPER(node);
17559 node = NEXTOPER(node);
17560 node += regarglen[(U8)op];
17562 if (op == CURLYX || op == OPEN)
17566 #ifdef DEBUG_DUMPUNTIL
17567 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17572 #endif /* DEBUGGING */
17576 * c-indentation-style: bsd
17577 * c-basic-offset: 4
17578 * indent-tabs-mode: nil
17581 * ex: set ts=8 sts=4 sw=4 et: