5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 /* this is a chain of data about sub patterns we are processing that
110 need to be handled separately/specially in study_chunk. Its so
111 we can simulate recursion without losing state. */
113 typedef struct scan_frame {
114 regnode *last_regnode; /* last node to process in this frame */
115 regnode *next_regnode; /* next node to process when last is reached */
116 U32 prev_recursed_depth;
117 I32 stopparen; /* what stopparen do we use */
118 U32 is_top_frame; /* what flags do we use? */
120 struct scan_frame *this_prev_frame; /* this previous frame */
121 struct scan_frame *prev_frame; /* previous frame */
122 struct scan_frame *next_frame; /* next frame */
125 struct RExC_state_t {
126 U32 flags; /* RXf_* are we folding, multilining? */
127 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
128 char *precomp; /* uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 SSize_t whilem_seen; /* number of WHILEM in this expr */
137 regnode *emit_start; /* Start of emitted-code area */
138 regnode *emit_bound; /* First regnode outside of the
140 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
141 implies compiling, so don't emit */
142 regnode_ssc emit_dummy; /* placeholder for emit to point to;
143 large enough for the largest
144 non-EXACTish node, so can use it as
146 I32 naughty; /* How bad is this pattern? */
147 I32 sawback; /* Did we see \1, ...? */
149 SSize_t size; /* Code size. */
150 I32 npar; /* Capture buffer count, (OPEN) plus
151 one. ("par" 0 is the whole
153 I32 nestroot; /* root parens we are in - used by
157 regnode **open_parens; /* pointers to open parens */
158 regnode **close_parens; /* pointers to close parens */
159 regnode *opend; /* END node in program */
160 I32 utf8; /* whether the pattern is utf8 or not */
161 I32 orig_utf8; /* whether the pattern was originally in utf8 */
162 /* XXX use this for future optimisation of case
163 * where pattern must be upgraded to utf8. */
164 I32 uni_semantics; /* If a d charset modifier should use unicode
165 rules, even if the pattern is not in
167 HV *paren_names; /* Paren names */
169 regnode **recurse; /* Recurse regops */
170 I32 recurse_count; /* Number of recurse regops */
171 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
173 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
177 I32 override_recoding;
178 I32 in_multi_char_class;
179 struct reg_code_block *code_blocks; /* positions of literal (?{})
181 int num_code_blocks; /* size of code_blocks[] */
182 int code_index; /* next code_blocks[] slot */
183 SSize_t maxlen; /* mininum possible number of chars in string to match */
184 scan_frame *frame_head;
185 scan_frame *frame_last;
187 #ifdef ADD_TO_REGEXEC
188 char *starttry; /* -Dr: where regtry was called. */
189 #define RExC_starttry (pRExC_state->starttry)
191 SV *runtime_code_qr; /* qr with the runtime code blocks */
193 const char *lastparse;
195 AV *paren_name_list; /* idx -> name */
196 U32 study_chunk_recursed_count;
199 #define RExC_lastparse (pRExC_state->lastparse)
200 #define RExC_lastnum (pRExC_state->lastnum)
201 #define RExC_paren_name_list (pRExC_state->paren_name_list)
202 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
203 #define RExC_mysv (pRExC_state->mysv1)
204 #define RExC_mysv1 (pRExC_state->mysv1)
205 #define RExC_mysv2 (pRExC_state->mysv2)
210 #define RExC_flags (pRExC_state->flags)
211 #define RExC_pm_flags (pRExC_state->pm_flags)
212 #define RExC_precomp (pRExC_state->precomp)
213 #define RExC_rx_sv (pRExC_state->rx_sv)
214 #define RExC_rx (pRExC_state->rx)
215 #define RExC_rxi (pRExC_state->rxi)
216 #define RExC_start (pRExC_state->start)
217 #define RExC_end (pRExC_state->end)
218 #define RExC_parse (pRExC_state->parse)
219 #define RExC_whilem_seen (pRExC_state->whilem_seen)
220 #ifdef RE_TRACK_PATTERN_OFFSETS
221 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
224 #define RExC_emit (pRExC_state->emit)
225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
226 #define RExC_emit_start (pRExC_state->emit_start)
227 #define RExC_emit_bound (pRExC_state->emit_bound)
228 #define RExC_naughty (pRExC_state->naughty)
229 #define RExC_sawback (pRExC_state->sawback)
230 #define RExC_seen (pRExC_state->seen)
231 #define RExC_size (pRExC_state->size)
232 #define RExC_maxlen (pRExC_state->maxlen)
233 #define RExC_npar (pRExC_state->npar)
234 #define RExC_nestroot (pRExC_state->nestroot)
235 #define RExC_extralen (pRExC_state->extralen)
236 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
237 #define RExC_utf8 (pRExC_state->utf8)
238 #define RExC_uni_semantics (pRExC_state->uni_semantics)
239 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
240 #define RExC_open_parens (pRExC_state->open_parens)
241 #define RExC_close_parens (pRExC_state->close_parens)
242 #define RExC_opend (pRExC_state->opend)
243 #define RExC_paren_names (pRExC_state->paren_names)
244 #define RExC_recurse (pRExC_state->recurse)
245 #define RExC_recurse_count (pRExC_state->recurse_count)
246 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
247 #define RExC_study_chunk_recursed_bytes \
248 (pRExC_state->study_chunk_recursed_bytes)
249 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
250 #define RExC_contains_locale (pRExC_state->contains_locale)
251 #define RExC_contains_i (pRExC_state->contains_i)
252 #define RExC_override_recoding (pRExC_state->override_recoding)
253 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
254 #define RExC_frame_head (pRExC_state->frame_head)
255 #define RExC_frame_last (pRExC_state->frame_last)
256 #define RExC_frame_count (pRExC_state->frame_count)
259 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
260 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
261 ((*s) == '{' && regcurly(s)))
264 * Flags to be passed up and down.
266 #define WORST 0 /* Worst case. */
267 #define HASWIDTH 0x01 /* Known to match non-null strings. */
269 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
270 * character. (There needs to be a case: in the switch statement in regexec.c
271 * for any node marked SIMPLE.) Note that this is not the same thing as
274 #define SPSTART 0x04 /* Starts with * or + */
275 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
276 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
277 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
279 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
281 /* whether trie related optimizations are enabled */
282 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
283 #define TRIE_STUDY_OPT
284 #define FULL_TRIE_STUDY
290 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
291 #define PBITVAL(paren) (1 << ((paren) & 7))
292 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
293 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
294 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
296 #define REQUIRE_UTF8 STMT_START { \
298 *flagp = RESTART_UTF8; \
303 /* This converts the named class defined in regcomp.h to its equivalent class
304 * number defined in handy.h. */
305 #define namedclass_to_classnum(class) ((int) ((class) / 2))
306 #define classnum_to_namedclass(classnum) ((classnum) * 2)
308 #define _invlist_union_complement_2nd(a, b, output) \
309 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
310 #define _invlist_intersection_complement_2nd(a, b, output) \
311 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
313 /* About scan_data_t.
315 During optimisation we recurse through the regexp program performing
316 various inplace (keyhole style) optimisations. In addition study_chunk
317 and scan_commit populate this data structure with information about
318 what strings MUST appear in the pattern. We look for the longest
319 string that must appear at a fixed location, and we look for the
320 longest string that may appear at a floating location. So for instance
325 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
326 strings (because they follow a .* construct). study_chunk will identify
327 both FOO and BAR as being the longest fixed and floating strings respectively.
329 The strings can be composites, for instance
333 will result in a composite fixed substring 'foo'.
335 For each string some basic information is maintained:
337 - offset or min_offset
338 This is the position the string must appear at, or not before.
339 It also implicitly (when combined with minlenp) tells us how many
340 characters must match before the string we are searching for.
341 Likewise when combined with minlenp and the length of the string it
342 tells us how many characters must appear after the string we have
346 Only used for floating strings. This is the rightmost point that
347 the string can appear at. If set to SSize_t_MAX it indicates that the
348 string can occur infinitely far to the right.
351 A pointer to the minimum number of characters of the pattern that the
352 string was found inside. This is important as in the case of positive
353 lookahead or positive lookbehind we can have multiple patterns
358 The minimum length of the pattern overall is 3, the minimum length
359 of the lookahead part is 3, but the minimum length of the part that
360 will actually match is 1. So 'FOO's minimum length is 3, but the
361 minimum length for the F is 1. This is important as the minimum length
362 is used to determine offsets in front of and behind the string being
363 looked for. Since strings can be composites this is the length of the
364 pattern at the time it was committed with a scan_commit. Note that
365 the length is calculated by study_chunk, so that the minimum lengths
366 are not known until the full pattern has been compiled, thus the
367 pointer to the value.
371 In the case of lookbehind the string being searched for can be
372 offset past the start point of the final matching string.
373 If this value was just blithely removed from the min_offset it would
374 invalidate some of the calculations for how many chars must match
375 before or after (as they are derived from min_offset and minlen and
376 the length of the string being searched for).
377 When the final pattern is compiled and the data is moved from the
378 scan_data_t structure into the regexp structure the information
379 about lookbehind is factored in, with the information that would
380 have been lost precalculated in the end_shift field for the
383 The fields pos_min and pos_delta are used to store the minimum offset
384 and the delta to the maximum offset at the current point in the pattern.
388 typedef struct scan_data_t {
389 /*I32 len_min; unused */
390 /*I32 len_delta; unused */
394 SSize_t last_end; /* min value, <0 unless valid. */
395 SSize_t last_start_min;
396 SSize_t last_start_max;
397 SV **longest; /* Either &l_fixed, or &l_float. */
398 SV *longest_fixed; /* longest fixed string found in pattern */
399 SSize_t offset_fixed; /* offset where it starts */
400 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
401 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
402 SV *longest_float; /* longest floating string found in pattern */
403 SSize_t offset_float_min; /* earliest point in string it can appear */
404 SSize_t offset_float_max; /* latest point in string it can appear */
405 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
406 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
409 SSize_t *last_closep;
410 regnode_ssc *start_class;
414 * Forward declarations for pregcomp()'s friends.
417 static const scan_data_t zero_scan_data =
418 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
420 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
421 #define SF_BEFORE_SEOL 0x0001
422 #define SF_BEFORE_MEOL 0x0002
423 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
424 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
426 #define SF_FIX_SHIFT_EOL (+2)
427 #define SF_FL_SHIFT_EOL (+4)
429 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
430 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
432 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
433 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
434 #define SF_IS_INF 0x0040
435 #define SF_HAS_PAR 0x0080
436 #define SF_IN_PAR 0x0100
437 #define SF_HAS_EVAL 0x0200
438 #define SCF_DO_SUBSTR 0x0400
439 #define SCF_DO_STCLASS_AND 0x0800
440 #define SCF_DO_STCLASS_OR 0x1000
441 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
442 #define SCF_WHILEM_VISITED_POS 0x2000
444 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
445 #define SCF_SEEN_ACCEPT 0x8000
446 #define SCF_TRIE_DOING_RESTUDY 0x10000
447 #define SCF_IN_DEFINE 0x20000
452 #define UTF cBOOL(RExC_utf8)
454 /* The enums for all these are ordered so things work out correctly */
455 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
456 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
457 == REGEX_DEPENDS_CHARSET)
458 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
459 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
460 >= REGEX_UNICODE_CHARSET)
461 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
462 == REGEX_ASCII_RESTRICTED_CHARSET)
463 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
464 >= REGEX_ASCII_RESTRICTED_CHARSET)
465 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
466 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
468 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
470 /* For programs that want to be strictly Unicode compatible by dying if any
471 * attempt is made to match a non-Unicode code point against a Unicode
473 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
475 #define OOB_NAMEDCLASS -1
477 /* There is no code point that is out-of-bounds, so this is problematic. But
478 * its only current use is to initialize a variable that is always set before
480 #define OOB_UNICODE 0xDEADBEEF
482 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
483 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
486 /* length of regex to show in messages that don't mark a position within */
487 #define RegexLengthToShowInErrorMessages 127
490 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
491 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
492 * op/pragma/warn/regcomp.
494 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
495 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
497 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
498 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
500 #define REPORT_LOCATION_ARGS(offset) \
501 UTF8fARG(UTF, offset, RExC_precomp), \
502 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
505 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
506 * arg. Show regex, up to a maximum length. If it's too long, chop and add
509 #define _FAIL(code) STMT_START { \
510 const char *ellipses = ""; \
511 IV len = RExC_end - RExC_precomp; \
514 SAVEFREESV(RExC_rx_sv); \
515 if (len > RegexLengthToShowInErrorMessages) { \
516 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
517 len = RegexLengthToShowInErrorMessages - 10; \
523 #define FAIL(msg) _FAIL( \
524 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
525 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
527 #define FAIL2(msg,arg) _FAIL( \
528 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
529 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
532 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
534 #define Simple_vFAIL(m) STMT_START { \
536 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
537 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
538 m, REPORT_LOCATION_ARGS(offset)); \
542 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
544 #define vFAIL(m) STMT_START { \
546 SAVEFREESV(RExC_rx_sv); \
551 * Like Simple_vFAIL(), but accepts two arguments.
553 #define Simple_vFAIL2(m,a1) STMT_START { \
554 const IV offset = RExC_parse - RExC_precomp; \
555 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
556 REPORT_LOCATION_ARGS(offset)); \
560 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
562 #define vFAIL2(m,a1) STMT_START { \
564 SAVEFREESV(RExC_rx_sv); \
565 Simple_vFAIL2(m, a1); \
570 * Like Simple_vFAIL(), but accepts three arguments.
572 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
573 const IV offset = RExC_parse - RExC_precomp; \
574 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
575 REPORT_LOCATION_ARGS(offset)); \
579 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
581 #define vFAIL3(m,a1,a2) STMT_START { \
583 SAVEFREESV(RExC_rx_sv); \
584 Simple_vFAIL3(m, a1, a2); \
588 * Like Simple_vFAIL(), but accepts four arguments.
590 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
591 const IV offset = RExC_parse - RExC_precomp; \
592 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
593 REPORT_LOCATION_ARGS(offset)); \
596 #define vFAIL4(m,a1,a2,a3) STMT_START { \
598 SAVEFREESV(RExC_rx_sv); \
599 Simple_vFAIL4(m, a1, a2, a3); \
602 /* A specialized version of vFAIL2 that works with UTF8f */
603 #define vFAIL2utf8f(m, a1) STMT_START { \
604 const IV offset = RExC_parse - RExC_precomp; \
606 SAVEFREESV(RExC_rx_sv); \
607 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
608 REPORT_LOCATION_ARGS(offset)); \
611 /* These have asserts in them because of [perl #122671] Many warnings in
612 * regcomp.c can occur twice. If they get output in pass1 and later in that
613 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
614 * would get output again. So they should be output in pass2, and these
615 * asserts make sure new warnings follow that paradigm. */
617 /* m is not necessarily a "literal string", in this macro */
618 #define reg_warn_non_literal_string(loc, m) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
621 m, REPORT_LOCATION_ARGS(offset)); \
624 #define ckWARNreg(loc,m) STMT_START { \
625 const IV offset = loc - RExC_precomp; \
626 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
627 REPORT_LOCATION_ARGS(offset)); \
630 #define vWARN_dep(loc, m) STMT_START { \
631 const IV offset = loc - RExC_precomp; \
632 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
633 REPORT_LOCATION_ARGS(offset)); \
636 #define ckWARNdep(loc,m) STMT_START { \
637 const IV offset = loc - RExC_precomp; \
638 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
640 REPORT_LOCATION_ARGS(offset)); \
643 #define ckWARNregdep(loc,m) STMT_START { \
644 const IV offset = loc - RExC_precomp; \
645 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
647 REPORT_LOCATION_ARGS(offset)); \
650 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
654 a1, REPORT_LOCATION_ARGS(offset)); \
657 #define ckWARN2reg(loc, m, a1) STMT_START { \
658 const IV offset = loc - RExC_precomp; \
659 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
660 a1, REPORT_LOCATION_ARGS(offset)); \
663 #define vWARN3(loc, m, a1, a2) STMT_START { \
664 const IV offset = loc - RExC_precomp; \
665 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
666 a1, a2, REPORT_LOCATION_ARGS(offset)); \
669 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
670 const IV offset = loc - RExC_precomp; \
671 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
672 a1, a2, REPORT_LOCATION_ARGS(offset)); \
675 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
676 const IV offset = loc - RExC_precomp; \
677 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
678 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
681 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
682 const IV offset = loc - RExC_precomp; \
683 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
684 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
687 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
688 const IV offset = loc - RExC_precomp; \
689 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
690 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
694 /* Allow for side effects in s */
695 #define REGC(c,s) STMT_START { \
696 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
699 /* Macros for recording node offsets. 20001227 mjd@plover.com
700 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
701 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
702 * Element 0 holds the number n.
703 * Position is 1 indexed.
705 #ifndef RE_TRACK_PATTERN_OFFSETS
706 #define Set_Node_Offset_To_R(node,byte)
707 #define Set_Node_Offset(node,byte)
708 #define Set_Cur_Node_Offset
709 #define Set_Node_Length_To_R(node,len)
710 #define Set_Node_Length(node,len)
711 #define Set_Node_Cur_Length(node,start)
712 #define Node_Offset(n)
713 #define Node_Length(n)
714 #define Set_Node_Offset_Length(node,offset,len)
715 #define ProgLen(ri) ri->u.proglen
716 #define SetProgLen(ri,x) ri->u.proglen = x
718 #define ProgLen(ri) ri->u.offsets[0]
719 #define SetProgLen(ri,x) ri->u.offsets[0] = x
720 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
722 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
723 __LINE__, (int)(node), (int)(byte))); \
725 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
728 RExC_offsets[2*(node)-1] = (byte); \
733 #define Set_Node_Offset(node,byte) \
734 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
735 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
737 #define Set_Node_Length_To_R(node,len) STMT_START { \
739 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
740 __LINE__, (int)(node), (int)(len))); \
742 Perl_croak(aTHX_ "value of node is %d in Length macro", \
745 RExC_offsets[2*(node)] = (len); \
750 #define Set_Node_Length(node,len) \
751 Set_Node_Length_To_R((node)-RExC_emit_start, len)
752 #define Set_Node_Cur_Length(node, start) \
753 Set_Node_Length(node, RExC_parse - start)
755 /* Get offsets and lengths */
756 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
757 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
759 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
760 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
761 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
765 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
766 #define EXPERIMENTAL_INPLACESCAN
767 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
769 #define DEBUG_RExC_seen() \
770 DEBUG_OPTIMISE_MORE_r({ \
771 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
773 if (RExC_seen & REG_ZERO_LEN_SEEN) \
774 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
776 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
777 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
779 if (RExC_seen & REG_GPOS_SEEN) \
780 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
782 if (RExC_seen & REG_CANY_SEEN) \
783 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
785 if (RExC_seen & REG_RECURSE_SEEN) \
786 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
788 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
789 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
791 if (RExC_seen & REG_VERBARG_SEEN) \
792 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
794 if (RExC_seen & REG_CUTGROUP_SEEN) \
795 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
797 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
798 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
800 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
801 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
803 if (RExC_seen & REG_GOSTART_SEEN) \
804 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
806 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
807 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
809 PerlIO_printf(Perl_debug_log,"\n"); \
812 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
813 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
815 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
817 PerlIO_printf(Perl_debug_log, "%s", open_str); \
818 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
819 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
820 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
821 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
822 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
823 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
824 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
825 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
826 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
827 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
828 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
829 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
830 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
831 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
832 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
833 PerlIO_printf(Perl_debug_log, "%s", close_str); \
837 #define DEBUG_STUDYDATA(str,data,depth) \
838 DEBUG_OPTIMISE_MORE_r(if(data){ \
839 PerlIO_printf(Perl_debug_log, \
840 "%*s" str "Pos:%"IVdf"/%"IVdf \
842 (int)(depth)*2, "", \
843 (IV)((data)->pos_min), \
844 (IV)((data)->pos_delta), \
845 (UV)((data)->flags) \
847 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
848 PerlIO_printf(Perl_debug_log, \
849 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
850 (IV)((data)->whilem_c), \
851 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
852 is_inf ? "INF " : "" \
854 if ((data)->last_found) \
855 PerlIO_printf(Perl_debug_log, \
856 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
857 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
858 SvPVX_const((data)->last_found), \
859 (IV)((data)->last_end), \
860 (IV)((data)->last_start_min), \
861 (IV)((data)->last_start_max), \
862 ((data)->longest && \
863 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
864 SvPVX_const((data)->longest_fixed), \
865 (IV)((data)->offset_fixed), \
866 ((data)->longest && \
867 (data)->longest==&((data)->longest_float)) ? "*" : "", \
868 SvPVX_const((data)->longest_float), \
869 (IV)((data)->offset_float_min), \
870 (IV)((data)->offset_float_max) \
872 PerlIO_printf(Perl_debug_log,"\n"); \
877 /* is c a control character for which we have a mnemonic? */
878 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
881 S_cntrl_to_mnemonic(const U8 c)
883 /* Returns the mnemonic string that represents character 'c', if one
884 * exists; NULL otherwise. The only ones that exist for the purposes of
885 * this routine are a few control characters */
888 case '\a': return "\\a";
889 case '\b': return "\\b";
890 case ESC_NATIVE: return "\\e";
891 case '\f': return "\\f";
892 case '\n': return "\\n";
893 case '\r': return "\\r";
894 case '\t': return "\\t";
902 /* Mark that we cannot extend a found fixed substring at this point.
903 Update the longest found anchored substring and the longest found
904 floating substrings if needed. */
907 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
908 SSize_t *minlenp, int is_inf)
910 const STRLEN l = CHR_SVLEN(data->last_found);
911 const STRLEN old_l = CHR_SVLEN(*data->longest);
912 GET_RE_DEBUG_FLAGS_DECL;
914 PERL_ARGS_ASSERT_SCAN_COMMIT;
916 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
917 SvSetMagicSV(*data->longest, data->last_found);
918 if (*data->longest == data->longest_fixed) {
919 data->offset_fixed = l ? data->last_start_min : data->pos_min;
920 if (data->flags & SF_BEFORE_EOL)
922 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
924 data->flags &= ~SF_FIX_BEFORE_EOL;
925 data->minlen_fixed=minlenp;
926 data->lookbehind_fixed=0;
928 else { /* *data->longest == data->longest_float */
929 data->offset_float_min = l ? data->last_start_min : data->pos_min;
930 data->offset_float_max = (l
931 ? data->last_start_max
932 : (data->pos_delta == SSize_t_MAX
934 : data->pos_min + data->pos_delta));
936 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
937 data->offset_float_max = SSize_t_MAX;
938 if (data->flags & SF_BEFORE_EOL)
940 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
942 data->flags &= ~SF_FL_BEFORE_EOL;
943 data->minlen_float=minlenp;
944 data->lookbehind_float=0;
947 SvCUR_set(data->last_found, 0);
949 SV * const sv = data->last_found;
950 if (SvUTF8(sv) && SvMAGICAL(sv)) {
951 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
957 data->flags &= ~SF_BEFORE_EOL;
958 DEBUG_STUDYDATA("commit: ",data,0);
961 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
962 * list that describes which code points it matches */
965 S_ssc_anything(pTHX_ regnode_ssc *ssc)
967 /* Set the SSC 'ssc' to match an empty string or any code point */
969 PERL_ARGS_ASSERT_SSC_ANYTHING;
971 assert(is_ANYOF_SYNTHETIC(ssc));
973 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
974 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
975 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
979 S_ssc_is_anything(const regnode_ssc *ssc)
981 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
982 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
983 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
984 * in any way, so there's no point in using it */
989 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
991 assert(is_ANYOF_SYNTHETIC(ssc));
993 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
997 /* See if the list consists solely of the range 0 - Infinity */
998 invlist_iterinit(ssc->invlist);
999 ret = invlist_iternext(ssc->invlist, &start, &end)
1003 invlist_iterfinish(ssc->invlist);
1009 /* If e.g., both \w and \W are set, matches everything */
1010 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1012 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1013 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1023 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1025 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1026 * string, any code point, or any posix class under locale */
1028 PERL_ARGS_ASSERT_SSC_INIT;
1030 Zero(ssc, 1, regnode_ssc);
1031 set_ANYOF_SYNTHETIC(ssc);
1032 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1035 /* If any portion of the regex is to operate under locale rules,
1036 * initialization includes it. The reason this isn't done for all regexes
1037 * is that the optimizer was written under the assumption that locale was
1038 * all-or-nothing. Given the complexity and lack of documentation in the
1039 * optimizer, and that there are inadequate test cases for locale, many
1040 * parts of it may not work properly, it is safest to avoid locale unless
1042 if (RExC_contains_locale) {
1043 ANYOF_POSIXL_SETALL(ssc);
1046 ANYOF_POSIXL_ZERO(ssc);
1051 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1052 const regnode_ssc *ssc)
1054 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1055 * to the list of code points matched, and locale posix classes; hence does
1056 * not check its flags) */
1061 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1063 assert(is_ANYOF_SYNTHETIC(ssc));
1065 invlist_iterinit(ssc->invlist);
1066 ret = invlist_iternext(ssc->invlist, &start, &end)
1070 invlist_iterfinish(ssc->invlist);
1076 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1084 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1085 const regnode_charclass* const node)
1087 /* Returns a mortal inversion list defining which code points are matched
1088 * by 'node', which is of type ANYOF. Handles complementing the result if
1089 * appropriate. If some code points aren't knowable at this time, the
1090 * returned list must, and will, contain every code point that is a
1093 SV* invlist = sv_2mortal(_new_invlist(0));
1094 SV* only_utf8_locale_invlist = NULL;
1096 const U32 n = ARG(node);
1097 bool new_node_has_latin1 = FALSE;
1099 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1101 /* Look at the data structure created by S_set_ANYOF_arg() */
1102 if (n != ANYOF_ONLY_HAS_BITMAP) {
1103 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1104 AV * const av = MUTABLE_AV(SvRV(rv));
1105 SV **const ary = AvARRAY(av);
1106 assert(RExC_rxi->data->what[n] == 's');
1108 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1109 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1111 else if (ary[0] && ary[0] != &PL_sv_undef) {
1113 /* Here, no compile-time swash, and there are things that won't be
1114 * known until runtime -- we have to assume it could be anything */
1115 return _add_range_to_invlist(invlist, 0, UV_MAX);
1117 else if (ary[3] && ary[3] != &PL_sv_undef) {
1119 /* Here no compile-time swash, and no run-time only data. Use the
1120 * node's inversion list */
1121 invlist = sv_2mortal(invlist_clone(ary[3]));
1124 /* Get the code points valid only under UTF-8 locales */
1125 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1126 && ary[2] && ary[2] != &PL_sv_undef)
1128 only_utf8_locale_invlist = ary[2];
1132 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1133 * code points, and an inversion list for the others, but if there are code
1134 * points that should match only conditionally on the target string being
1135 * UTF-8, those are placed in the inversion list, and not the bitmap.
1136 * Since there are circumstances under which they could match, they are
1137 * included in the SSC. But if the ANYOF node is to be inverted, we have
1138 * to exclude them here, so that when we invert below, the end result
1139 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1140 * have to do this here before we add the unconditionally matched code
1142 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1143 _invlist_intersection_complement_2nd(invlist,
1148 /* Add in the points from the bit map */
1149 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1150 if (ANYOF_BITMAP_TEST(node, i)) {
1151 invlist = add_cp_to_invlist(invlist, i);
1152 new_node_has_latin1 = TRUE;
1156 /* If this can match all upper Latin1 code points, have to add them
1158 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1159 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1162 /* Similarly for these */
1163 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1164 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1167 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168 _invlist_invert(invlist);
1170 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1172 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1173 * locale. We can skip this if there are no 0-255 at all. */
1174 _invlist_union(invlist, PL_Latin1, &invlist);
1177 /* Similarly add the UTF-8 locale possible matches. These have to be
1178 * deferred until after the non-UTF-8 locale ones are taken care of just
1179 * above, or it leads to wrong results under ANYOF_INVERT */
1180 if (only_utf8_locale_invlist) {
1181 _invlist_union_maybe_complement_2nd(invlist,
1182 only_utf8_locale_invlist,
1183 ANYOF_FLAGS(node) & ANYOF_INVERT,
1190 /* These two functions currently do the exact same thing */
1191 #define ssc_init_zero ssc_init
1193 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1194 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1196 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1197 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1198 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1201 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1202 const regnode_charclass *and_with)
1204 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1205 * another SSC or a regular ANYOF class. Can create false positives. */
1210 PERL_ARGS_ASSERT_SSC_AND;
1212 assert(is_ANYOF_SYNTHETIC(ssc));
1214 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1215 * the code point inversion list and just the relevant flags */
1216 if (is_ANYOF_SYNTHETIC(and_with)) {
1217 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1218 anded_flags = ANYOF_FLAGS(and_with);
1220 /* XXX This is a kludge around what appears to be deficiencies in the
1221 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1222 * there are paths through the optimizer where it doesn't get weeded
1223 * out when it should. And if we don't make some extra provision for
1224 * it like the code just below, it doesn't get added when it should.
1225 * This solution is to add it only when AND'ing, which is here, and
1226 * only when what is being AND'ed is the pristine, original node
1227 * matching anything. Thus it is like adding it to ssc_anything() but
1228 * only when the result is to be AND'ed. Probably the same solution
1229 * could be adopted for the same problem we have with /l matching,
1230 * which is solved differently in S_ssc_init(), and that would lead to
1231 * fewer false positives than that solution has. But if this solution
1232 * creates bugs, the consequences are only that a warning isn't raised
1233 * that should be; while the consequences for having /l bugs is
1234 * incorrect matches */
1235 if (ssc_is_anything((regnode_ssc *)and_with)) {
1236 anded_flags |= ANYOF_WARN_SUPER;
1240 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1241 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1244 ANYOF_FLAGS(ssc) &= anded_flags;
1246 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1247 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1248 * 'and_with' may be inverted. When not inverted, we have the situation of
1250 * (C1 | P1) & (C2 | P2)
1251 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1252 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1253 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1254 * <= ((C1 & C2) | P1 | P2)
1255 * Alternatively, the last few steps could be:
1256 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1257 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1258 * <= (C1 | C2 | (P1 & P2))
1259 * We favor the second approach if either P1 or P2 is non-empty. This is
1260 * because these components are a barrier to doing optimizations, as what
1261 * they match cannot be known until the moment of matching as they are
1262 * dependent on the current locale, 'AND"ing them likely will reduce or
1264 * But we can do better if we know that C1,P1 are in their initial state (a
1265 * frequent occurrence), each matching everything:
1266 * (<everything>) & (C2 | P2) = C2 | P2
1267 * Similarly, if C2,P2 are in their initial state (again a frequent
1268 * occurrence), the result is a no-op
1269 * (C1 | P1) & (<everything>) = C1 | P1
1272 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1273 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1274 * <= (C1 & ~C2) | (P1 & ~P2)
1277 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1278 && ! is_ANYOF_SYNTHETIC(and_with))
1282 ssc_intersection(ssc,
1284 FALSE /* Has already been inverted */
1287 /* If either P1 or P2 is empty, the intersection will be also; can skip
1289 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1290 ANYOF_POSIXL_ZERO(ssc);
1292 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1294 /* Note that the Posix class component P from 'and_with' actually
1296 * P = Pa | Pb | ... | Pn
1297 * where each component is one posix class, such as in [\w\s].
1299 * ~P = ~(Pa | Pb | ... | Pn)
1300 * = ~Pa & ~Pb & ... & ~Pn
1301 * <= ~Pa | ~Pb | ... | ~Pn
1302 * The last is something we can easily calculate, but unfortunately
1303 * is likely to have many false positives. We could do better
1304 * in some (but certainly not all) instances if two classes in
1305 * P have known relationships. For example
1306 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1308 * :lower: & :print: = :lower:
1309 * And similarly for classes that must be disjoint. For example,
1310 * since \s and \w can have no elements in common based on rules in
1311 * the POSIX standard,
1312 * \w & ^\S = nothing
1313 * Unfortunately, some vendor locales do not meet the Posix
1314 * standard, in particular almost everything by Microsoft.
1315 * The loop below just changes e.g., \w into \W and vice versa */
1317 regnode_charclass_posixl temp;
1318 int add = 1; /* To calculate the index of the complement */
1320 ANYOF_POSIXL_ZERO(&temp);
1321 for (i = 0; i < ANYOF_MAX; i++) {
1323 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1324 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1326 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1327 ANYOF_POSIXL_SET(&temp, i + add);
1329 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1331 ANYOF_POSIXL_AND(&temp, ssc);
1333 } /* else ssc already has no posixes */
1334 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1335 in its initial state */
1336 else if (! is_ANYOF_SYNTHETIC(and_with)
1337 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1339 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1340 * copy it over 'ssc' */
1341 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1342 if (is_ANYOF_SYNTHETIC(and_with)) {
1343 StructCopy(and_with, ssc, regnode_ssc);
1346 ssc->invlist = anded_cp_list;
1347 ANYOF_POSIXL_ZERO(ssc);
1348 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1349 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1353 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1354 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1356 /* One or the other of P1, P2 is non-empty. */
1357 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1358 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1360 ssc_union(ssc, anded_cp_list, FALSE);
1362 else { /* P1 = P2 = empty */
1363 ssc_intersection(ssc, anded_cp_list, FALSE);
1369 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1370 const regnode_charclass *or_with)
1372 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1373 * another SSC or a regular ANYOF class. Can create false positives if
1374 * 'or_with' is to be inverted. */
1379 PERL_ARGS_ASSERT_SSC_OR;
1381 assert(is_ANYOF_SYNTHETIC(ssc));
1383 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1384 * the code point inversion list and just the relevant flags */
1385 if (is_ANYOF_SYNTHETIC(or_with)) {
1386 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1387 ored_flags = ANYOF_FLAGS(or_with);
1390 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1391 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1394 ANYOF_FLAGS(ssc) |= ored_flags;
1396 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1397 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1398 * 'or_with' may be inverted. When not inverted, we have the simple
1399 * situation of computing:
1400 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1401 * If P1|P2 yields a situation with both a class and its complement are
1402 * set, like having both \w and \W, this matches all code points, and we
1403 * can delete these from the P component of the ssc going forward. XXX We
1404 * might be able to delete all the P components, but I (khw) am not certain
1405 * about this, and it is better to be safe.
1408 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1409 * <= (C1 | P1) | ~C2
1410 * <= (C1 | ~C2) | P1
1411 * (which results in actually simpler code than the non-inverted case)
1414 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1415 && ! is_ANYOF_SYNTHETIC(or_with))
1417 /* We ignore P2, leaving P1 going forward */
1418 } /* else Not inverted */
1419 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1420 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1421 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1423 for (i = 0; i < ANYOF_MAX; i += 2) {
1424 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1426 ssc_match_all_cp(ssc);
1427 ANYOF_POSIXL_CLEAR(ssc, i);
1428 ANYOF_POSIXL_CLEAR(ssc, i+1);
1436 FALSE /* Already has been inverted */
1440 PERL_STATIC_INLINE void
1441 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1443 PERL_ARGS_ASSERT_SSC_UNION;
1445 assert(is_ANYOF_SYNTHETIC(ssc));
1447 _invlist_union_maybe_complement_2nd(ssc->invlist,
1453 PERL_STATIC_INLINE void
1454 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1456 const bool invert2nd)
1458 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1460 assert(is_ANYOF_SYNTHETIC(ssc));
1462 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1468 PERL_STATIC_INLINE void
1469 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1471 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1473 assert(is_ANYOF_SYNTHETIC(ssc));
1475 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1478 PERL_STATIC_INLINE void
1479 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1481 /* AND just the single code point 'cp' into the SSC 'ssc' */
1483 SV* cp_list = _new_invlist(2);
1485 PERL_ARGS_ASSERT_SSC_CP_AND;
1487 assert(is_ANYOF_SYNTHETIC(ssc));
1489 cp_list = add_cp_to_invlist(cp_list, cp);
1490 ssc_intersection(ssc, cp_list,
1491 FALSE /* Not inverted */
1493 SvREFCNT_dec_NN(cp_list);
1496 PERL_STATIC_INLINE void
1497 S_ssc_clear_locale(regnode_ssc *ssc)
1499 /* Set the SSC 'ssc' to not match any locale things */
1500 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1502 assert(is_ANYOF_SYNTHETIC(ssc));
1504 ANYOF_POSIXL_ZERO(ssc);
1505 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1508 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1511 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1513 /* The synthetic start class is used to hopefully quickly winnow down
1514 * places where a pattern could start a match in the target string. If it
1515 * doesn't really narrow things down that much, there isn't much point to
1516 * having the overhead of using it. This function uses some very crude
1517 * heuristics to decide if to use the ssc or not.
1519 * It returns TRUE if 'ssc' rules out more than half what it considers to
1520 * be the "likely" possible matches, but of course it doesn't know what the
1521 * actual things being matched are going to be; these are only guesses
1523 * For /l matches, it assumes that the only likely matches are going to be
1524 * in the 0-255 range, uniformly distributed, so half of that is 127
1525 * For /a and /d matches, it assumes that the likely matches will be just
1526 * the ASCII range, so half of that is 63
1527 * For /u and there isn't anything matching above the Latin1 range, it
1528 * assumes that that is the only range likely to be matched, and uses
1529 * half that as the cut-off: 127. If anything matches above Latin1,
1530 * it assumes that all of Unicode could match (uniformly), except for
1531 * non-Unicode code points and things in the General Category "Other"
1532 * (unassigned, private use, surrogates, controls and formats). This
1533 * is a much large number. */
1535 const U32 max_match = (LOC)
1539 : (invlist_highest(ssc->invlist) < 256)
1541 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1542 U32 count = 0; /* Running total of number of code points matched by
1544 UV start, end; /* Start and end points of current range in inversion
1547 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1549 invlist_iterinit(ssc->invlist);
1550 while (invlist_iternext(ssc->invlist, &start, &end)) {
1552 /* /u is the only thing that we expect to match above 255; so if not /u
1553 * and even if there are matches above 255, ignore them. This catches
1554 * things like \d under /d which does match the digits above 255, but
1555 * since the pattern is /d, it is not likely to be expecting them */
1556 if (! UNI_SEMANTICS) {
1560 end = MIN(end, 255);
1562 count += end - start + 1;
1563 if (count > max_match) {
1564 invlist_iterfinish(ssc->invlist);
1574 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1576 /* The inversion list in the SSC is marked mortal; now we need a more
1577 * permanent copy, which is stored the same way that is done in a regular
1578 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1581 SV* invlist = invlist_clone(ssc->invlist);
1583 PERL_ARGS_ASSERT_SSC_FINALIZE;
1585 assert(is_ANYOF_SYNTHETIC(ssc));
1587 /* The code in this file assumes that all but these flags aren't relevant
1588 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1589 * by the time we reach here */
1590 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1592 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1594 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1595 NULL, NULL, NULL, FALSE);
1597 /* Make sure is clone-safe */
1598 ssc->invlist = NULL;
1600 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1601 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1604 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1607 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1608 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1609 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1610 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1611 ? (TRIE_LIST_CUR( idx ) - 1) \
1617 dump_trie(trie,widecharmap,revcharmap)
1618 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1619 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1621 These routines dump out a trie in a somewhat readable format.
1622 The _interim_ variants are used for debugging the interim
1623 tables that are used to generate the final compressed
1624 representation which is what dump_trie expects.
1626 Part of the reason for their existence is to provide a form
1627 of documentation as to how the different representations function.
1632 Dumps the final compressed table form of the trie to Perl_debug_log.
1633 Used for debugging make_trie().
1637 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1638 AV *revcharmap, U32 depth)
1641 SV *sv=sv_newmortal();
1642 int colwidth= widecharmap ? 6 : 4;
1644 GET_RE_DEBUG_FLAGS_DECL;
1646 PERL_ARGS_ASSERT_DUMP_TRIE;
1648 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1649 (int)depth * 2 + 2,"",
1650 "Match","Base","Ofs" );
1652 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1653 SV ** const tmp = av_fetch( revcharmap, state, 0);
1655 PerlIO_printf( Perl_debug_log, "%*s",
1657 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1658 PL_colors[0], PL_colors[1],
1659 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1660 PERL_PV_ESCAPE_FIRSTCHAR
1665 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1666 (int)depth * 2 + 2,"");
1668 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1669 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1670 PerlIO_printf( Perl_debug_log, "\n");
1672 for( state = 1 ; state < trie->statecount ; state++ ) {
1673 const U32 base = trie->states[ state ].trans.base;
1675 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1676 (int)depth * 2 + 2,"", (UV)state);
1678 if ( trie->states[ state ].wordnum ) {
1679 PerlIO_printf( Perl_debug_log, " W%4X",
1680 trie->states[ state ].wordnum );
1682 PerlIO_printf( Perl_debug_log, "%6s", "" );
1685 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1690 while( ( base + ofs < trie->uniquecharcount ) ||
1691 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1692 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1696 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1698 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1699 if ( ( base + ofs >= trie->uniquecharcount )
1700 && ( base + ofs - trie->uniquecharcount
1702 && trie->trans[ base + ofs
1703 - trie->uniquecharcount ].check == state )
1705 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1707 (UV)trie->trans[ base + ofs
1708 - trie->uniquecharcount ].next );
1710 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1714 PerlIO_printf( Perl_debug_log, "]");
1717 PerlIO_printf( Perl_debug_log, "\n" );
1719 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1721 for (word=1; word <= trie->wordcount; word++) {
1722 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1723 (int)word, (int)(trie->wordinfo[word].prev),
1724 (int)(trie->wordinfo[word].len));
1726 PerlIO_printf(Perl_debug_log, "\n" );
1729 Dumps a fully constructed but uncompressed trie in list form.
1730 List tries normally only are used for construction when the number of
1731 possible chars (trie->uniquecharcount) is very high.
1732 Used for debugging make_trie().
1735 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1736 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1740 SV *sv=sv_newmortal();
1741 int colwidth= widecharmap ? 6 : 4;
1742 GET_RE_DEBUG_FLAGS_DECL;
1744 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1746 /* print out the table precompression. */
1747 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1748 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1749 "------:-----+-----------------\n" );
1751 for( state=1 ; state < next_alloc ; state ++ ) {
1754 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1755 (int)depth * 2 + 2,"", (UV)state );
1756 if ( ! trie->states[ state ].wordnum ) {
1757 PerlIO_printf( Perl_debug_log, "%5s| ","");
1759 PerlIO_printf( Perl_debug_log, "W%4x| ",
1760 trie->states[ state ].wordnum
1763 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1764 SV ** const tmp = av_fetch( revcharmap,
1765 TRIE_LIST_ITEM(state,charid).forid, 0);
1767 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1769 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1771 PL_colors[0], PL_colors[1],
1772 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1773 | PERL_PV_ESCAPE_FIRSTCHAR
1775 TRIE_LIST_ITEM(state,charid).forid,
1776 (UV)TRIE_LIST_ITEM(state,charid).newstate
1779 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1780 (int)((depth * 2) + 14), "");
1783 PerlIO_printf( Perl_debug_log, "\n");
1788 Dumps a fully constructed but uncompressed trie in table form.
1789 This is the normal DFA style state transition table, with a few
1790 twists to facilitate compression later.
1791 Used for debugging make_trie().
1794 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1795 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1800 SV *sv=sv_newmortal();
1801 int colwidth= widecharmap ? 6 : 4;
1802 GET_RE_DEBUG_FLAGS_DECL;
1804 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1807 print out the table precompression so that we can do a visual check
1808 that they are identical.
1811 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1813 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1814 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1816 PerlIO_printf( Perl_debug_log, "%*s",
1818 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1819 PL_colors[0], PL_colors[1],
1820 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1821 PERL_PV_ESCAPE_FIRSTCHAR
1827 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1829 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1830 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1833 PerlIO_printf( Perl_debug_log, "\n" );
1835 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1837 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1838 (int)depth * 2 + 2,"",
1839 (UV)TRIE_NODENUM( state ) );
1841 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1844 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1846 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1848 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1849 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1850 (UV)trie->trans[ state ].check );
1852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1853 (UV)trie->trans[ state ].check,
1854 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1862 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1863 startbranch: the first branch in the whole branch sequence
1864 first : start branch of sequence of branch-exact nodes.
1865 May be the same as startbranch
1866 last : Thing following the last branch.
1867 May be the same as tail.
1868 tail : item following the branch sequence
1869 count : words in the sequence
1870 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1871 depth : indent depth
1873 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1875 A trie is an N'ary tree where the branches are determined by digital
1876 decomposition of the key. IE, at the root node you look up the 1st character and
1877 follow that branch repeat until you find the end of the branches. Nodes can be
1878 marked as "accepting" meaning they represent a complete word. Eg:
1882 would convert into the following structure. Numbers represent states, letters
1883 following numbers represent valid transitions on the letter from that state, if
1884 the number is in square brackets it represents an accepting state, otherwise it
1885 will be in parenthesis.
1887 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1891 (1) +-i->(6)-+-s->[7]
1893 +-s->(3)-+-h->(4)-+-e->[5]
1895 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1897 This shows that when matching against the string 'hers' we will begin at state 1
1898 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1899 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1900 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1901 single traverse. We store a mapping from accepting to state to which word was
1902 matched, and then when we have multiple possibilities we try to complete the
1903 rest of the regex in the order in which they occured in the alternation.
1905 The only prior NFA like behaviour that would be changed by the TRIE support is
1906 the silent ignoring of duplicate alternations which are of the form:
1908 / (DUPE|DUPE) X? (?{ ... }) Y /x
1910 Thus EVAL blocks following a trie may be called a different number of times with
1911 and without the optimisation. With the optimisations dupes will be silently
1912 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1913 the following demonstrates:
1915 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1917 which prints out 'word' three times, but
1919 'words'=~/(word|word|word)(?{ print $1 })S/
1921 which doesnt print it out at all. This is due to other optimisations kicking in.
1923 Example of what happens on a structural level:
1925 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1927 1: CURLYM[1] {1,32767}(18)
1938 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1939 and should turn into:
1941 1: CURLYM[1] {1,32767}(18)
1943 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1951 Cases where tail != last would be like /(?foo|bar)baz/:
1961 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1962 and would end up looking like:
1965 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1972 d = uvchr_to_utf8_flags(d, uv, 0);
1974 is the recommended Unicode-aware way of saying
1979 #define TRIE_STORE_REVCHAR(val) \
1982 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1983 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1984 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1985 SvCUR_set(zlopp, kapow - flrbbbbb); \
1988 av_push(revcharmap, zlopp); \
1990 char ooooff = (char)val; \
1991 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1995 /* This gets the next character from the input, folding it if not already
1997 #define TRIE_READ_CHAR STMT_START { \
2000 /* if it is UTF then it is either already folded, or does not need \
2002 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2004 else if (folder == PL_fold_latin1) { \
2005 /* This folder implies Unicode rules, which in the range expressible \
2006 * by not UTF is the lower case, with the two exceptions, one of \
2007 * which should have been taken care of before calling this */ \
2008 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2009 uvc = toLOWER_L1(*uc); \
2010 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2013 /* raw data, will be folded later if needed */ \
2021 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2022 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2023 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2024 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2026 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2027 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2028 TRIE_LIST_CUR( state )++; \
2031 #define TRIE_LIST_NEW(state) STMT_START { \
2032 Newxz( trie->states[ state ].trans.list, \
2033 4, reg_trie_trans_le ); \
2034 TRIE_LIST_CUR( state ) = 1; \
2035 TRIE_LIST_LEN( state ) = 4; \
2038 #define TRIE_HANDLE_WORD(state) STMT_START { \
2039 U16 dupe= trie->states[ state ].wordnum; \
2040 regnode * const noper_next = regnext( noper ); \
2043 /* store the word for dumping */ \
2045 if (OP(noper) != NOTHING) \
2046 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2048 tmp = newSVpvn_utf8( "", 0, UTF ); \
2049 av_push( trie_words, tmp ); \
2053 trie->wordinfo[curword].prev = 0; \
2054 trie->wordinfo[curword].len = wordlen; \
2055 trie->wordinfo[curword].accept = state; \
2057 if ( noper_next < tail ) { \
2059 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2061 trie->jump[curword] = (U16)(noper_next - convert); \
2063 jumper = noper_next; \
2065 nextbranch= regnext(cur); \
2069 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2070 /* chain, so that when the bits of chain are later */\
2071 /* linked together, the dups appear in the chain */\
2072 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2073 trie->wordinfo[dupe].prev = curword; \
2075 /* we haven't inserted this word yet. */ \
2076 trie->states[ state ].wordnum = curword; \
2081 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2082 ( ( base + charid >= ucharcount \
2083 && base + charid < ubound \
2084 && state == trie->trans[ base - ucharcount + charid ].check \
2085 && trie->trans[ base - ucharcount + charid ].next ) \
2086 ? trie->trans[ base - ucharcount + charid ].next \
2087 : ( state==1 ? special : 0 ) \
2091 #define MADE_JUMP_TRIE 2
2092 #define MADE_EXACT_TRIE 4
2095 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2096 regnode *first, regnode *last, regnode *tail,
2097 U32 word_count, U32 flags, U32 depth)
2099 /* first pass, loop through and scan words */
2100 reg_trie_data *trie;
2101 HV *widecharmap = NULL;
2102 AV *revcharmap = newAV();
2108 regnode *jumper = NULL;
2109 regnode *nextbranch = NULL;
2110 regnode *convert = NULL;
2111 U32 *prev_states; /* temp array mapping each state to previous one */
2112 /* we just use folder as a flag in utf8 */
2113 const U8 * folder = NULL;
2116 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2117 AV *trie_words = NULL;
2118 /* along with revcharmap, this only used during construction but both are
2119 * useful during debugging so we store them in the struct when debugging.
2122 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2123 STRLEN trie_charcount=0;
2125 SV *re_trie_maxbuff;
2126 GET_RE_DEBUG_FLAGS_DECL;
2128 PERL_ARGS_ASSERT_MAKE_TRIE;
2130 PERL_UNUSED_ARG(depth);
2137 case EXACTFU: folder = PL_fold_latin1; break;
2138 case EXACTF: folder = PL_fold; break;
2139 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2142 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2144 trie->startstate = 1;
2145 trie->wordcount = word_count;
2146 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2147 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2149 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2150 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2151 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2154 trie_words = newAV();
2157 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2158 assert(re_trie_maxbuff);
2159 if (!SvIOK(re_trie_maxbuff)) {
2160 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2162 DEBUG_TRIE_COMPILE_r({
2163 PerlIO_printf( Perl_debug_log,
2164 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2165 (int)depth * 2 + 2, "",
2166 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2167 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2170 /* Find the node we are going to overwrite */
2171 if ( first == startbranch && OP( last ) != BRANCH ) {
2172 /* whole branch chain */
2175 /* branch sub-chain */
2176 convert = NEXTOPER( first );
2179 /* -- First loop and Setup --
2181 We first traverse the branches and scan each word to determine if it
2182 contains widechars, and how many unique chars there are, this is
2183 important as we have to build a table with at least as many columns as we
2186 We use an array of integers to represent the character codes 0..255
2187 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2188 the native representation of the character value as the key and IV's for
2191 *TODO* If we keep track of how many times each character is used we can
2192 remap the columns so that the table compression later on is more
2193 efficient in terms of memory by ensuring the most common value is in the
2194 middle and the least common are on the outside. IMO this would be better
2195 than a most to least common mapping as theres a decent chance the most
2196 common letter will share a node with the least common, meaning the node
2197 will not be compressible. With a middle is most common approach the worst
2198 case is when we have the least common nodes twice.
2202 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2203 regnode *noper = NEXTOPER( cur );
2204 const U8 *uc = (U8*)STRING( noper );
2205 const U8 *e = uc + STR_LEN( noper );
2207 U32 wordlen = 0; /* required init */
2208 STRLEN minchars = 0;
2209 STRLEN maxchars = 0;
2210 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2213 if (OP(noper) == NOTHING) {
2214 regnode *noper_next= regnext(noper);
2215 if (noper_next != tail && OP(noper_next) == flags) {
2217 uc= (U8*)STRING(noper);
2218 e= uc + STR_LEN(noper);
2219 trie->minlen= STR_LEN(noper);
2226 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2227 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2228 regardless of encoding */
2229 if (OP( noper ) == EXACTFU_SS) {
2230 /* false positives are ok, so just set this */
2231 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2234 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2236 TRIE_CHARCOUNT(trie)++;
2239 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2240 * is in effect. Under /i, this character can match itself, or
2241 * anything that folds to it. If not under /i, it can match just
2242 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2243 * all fold to k, and all are single characters. But some folds
2244 * expand to more than one character, so for example LATIN SMALL
2245 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2246 * the string beginning at 'uc' is 'ffi', it could be matched by
2247 * three characters, or just by the one ligature character. (It
2248 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2249 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2250 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2251 * match.) The trie needs to know the minimum and maximum number
2252 * of characters that could match so that it can use size alone to
2253 * quickly reject many match attempts. The max is simple: it is
2254 * the number of folded characters in this branch (since a fold is
2255 * never shorter than what folds to it. */
2259 /* And the min is equal to the max if not under /i (indicated by
2260 * 'folder' being NULL), or there are no multi-character folds. If
2261 * there is a multi-character fold, the min is incremented just
2262 * once, for the character that folds to the sequence. Each
2263 * character in the sequence needs to be added to the list below of
2264 * characters in the trie, but we count only the first towards the
2265 * min number of characters needed. This is done through the
2266 * variable 'foldlen', which is returned by the macros that look
2267 * for these sequences as the number of bytes the sequence
2268 * occupies. Each time through the loop, we decrement 'foldlen' by
2269 * how many bytes the current char occupies. Only when it reaches
2270 * 0 do we increment 'minchars' or look for another multi-character
2272 if (folder == NULL) {
2275 else if (foldlen > 0) {
2276 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2281 /* See if *uc is the beginning of a multi-character fold. If
2282 * so, we decrement the length remaining to look at, to account
2283 * for the current character this iteration. (We can use 'uc'
2284 * instead of the fold returned by TRIE_READ_CHAR because for
2285 * non-UTF, the latin1_safe macro is smart enough to account
2286 * for all the unfolded characters, and because for UTF, the
2287 * string will already have been folded earlier in the
2288 * compilation process */
2290 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2291 foldlen -= UTF8SKIP(uc);
2294 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2299 /* The current character (and any potential folds) should be added
2300 * to the possible matching characters for this position in this
2304 U8 folded= folder[ (U8) uvc ];
2305 if ( !trie->charmap[ folded ] ) {
2306 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2307 TRIE_STORE_REVCHAR( folded );
2310 if ( !trie->charmap[ uvc ] ) {
2311 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2312 TRIE_STORE_REVCHAR( uvc );
2315 /* store the codepoint in the bitmap, and its folded
2317 TRIE_BITMAP_SET(trie, uvc);
2319 /* store the folded codepoint */
2320 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2323 /* store first byte of utf8 representation of
2324 variant codepoints */
2325 if (! UVCHR_IS_INVARIANT(uvc)) {
2326 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2329 set_bit = 0; /* We've done our bit :-) */
2333 /* XXX We could come up with the list of code points that fold
2334 * to this using PL_utf8_foldclosures, except not for
2335 * multi-char folds, as there may be multiple combinations
2336 * there that could work, which needs to wait until runtime to
2337 * resolve (The comment about LIGATURE FFI above is such an
2342 widecharmap = newHV();
2344 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2347 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2349 if ( !SvTRUE( *svpp ) ) {
2350 sv_setiv( *svpp, ++trie->uniquecharcount );
2351 TRIE_STORE_REVCHAR(uvc);
2354 } /* end loop through characters in this branch of the trie */
2356 /* We take the min and max for this branch and combine to find the min
2357 * and max for all branches processed so far */
2358 if( cur == first ) {
2359 trie->minlen = minchars;
2360 trie->maxlen = maxchars;
2361 } else if (minchars < trie->minlen) {
2362 trie->minlen = minchars;
2363 } else if (maxchars > trie->maxlen) {
2364 trie->maxlen = maxchars;
2366 } /* end first pass */
2367 DEBUG_TRIE_COMPILE_r(
2368 PerlIO_printf( Perl_debug_log,
2369 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2370 (int)depth * 2 + 2,"",
2371 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2372 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2373 (int)trie->minlen, (int)trie->maxlen )
2377 We now know what we are dealing with in terms of unique chars and
2378 string sizes so we can calculate how much memory a naive
2379 representation using a flat table will take. If it's over a reasonable
2380 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2381 conservative but potentially much slower representation using an array
2384 At the end we convert both representations into the same compressed
2385 form that will be used in regexec.c for matching with. The latter
2386 is a form that cannot be used to construct with but has memory
2387 properties similar to the list form and access properties similar
2388 to the table form making it both suitable for fast searches and
2389 small enough that its feasable to store for the duration of a program.
2391 See the comment in the code where the compressed table is produced
2392 inplace from the flat tabe representation for an explanation of how
2393 the compression works.
2398 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2401 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2402 > SvIV(re_trie_maxbuff) )
2405 Second Pass -- Array Of Lists Representation
2407 Each state will be represented by a list of charid:state records
2408 (reg_trie_trans_le) the first such element holds the CUR and LEN
2409 points of the allocated array. (See defines above).
2411 We build the initial structure using the lists, and then convert
2412 it into the compressed table form which allows faster lookups
2413 (but cant be modified once converted).
2416 STRLEN transcount = 1;
2418 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2419 "%*sCompiling trie using list compiler\n",
2420 (int)depth * 2 + 2, ""));
2422 trie->states = (reg_trie_state *)
2423 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2424 sizeof(reg_trie_state) );
2428 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2430 regnode *noper = NEXTOPER( cur );
2431 U8 *uc = (U8*)STRING( noper );
2432 const U8 *e = uc + STR_LEN( noper );
2433 U32 state = 1; /* required init */
2434 U16 charid = 0; /* sanity init */
2435 U32 wordlen = 0; /* required init */
2437 if (OP(noper) == NOTHING) {
2438 regnode *noper_next= regnext(noper);
2439 if (noper_next != tail && OP(noper_next) == flags) {
2441 uc= (U8*)STRING(noper);
2442 e= uc + STR_LEN(noper);
2446 if (OP(noper) != NOTHING) {
2447 for ( ; uc < e ; uc += len ) {
2452 charid = trie->charmap[ uvc ];
2454 SV** const svpp = hv_fetch( widecharmap,
2461 charid=(U16)SvIV( *svpp );
2464 /* charid is now 0 if we dont know the char read, or
2465 * nonzero if we do */
2472 if ( !trie->states[ state ].trans.list ) {
2473 TRIE_LIST_NEW( state );
2476 check <= TRIE_LIST_USED( state );
2479 if ( TRIE_LIST_ITEM( state, check ).forid
2482 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2487 newstate = next_alloc++;
2488 prev_states[newstate] = state;
2489 TRIE_LIST_PUSH( state, charid, newstate );
2494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2498 TRIE_HANDLE_WORD(state);
2500 } /* end second pass */
2502 /* next alloc is the NEXT state to be allocated */
2503 trie->statecount = next_alloc;
2504 trie->states = (reg_trie_state *)
2505 PerlMemShared_realloc( trie->states,
2507 * sizeof(reg_trie_state) );
2509 /* and now dump it out before we compress it */
2510 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2511 revcharmap, next_alloc,
2515 trie->trans = (reg_trie_trans *)
2516 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2523 for( state=1 ; state < next_alloc ; state ++ ) {
2527 DEBUG_TRIE_COMPILE_MORE_r(
2528 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2532 if (trie->states[state].trans.list) {
2533 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2537 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2538 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2539 if ( forid < minid ) {
2541 } else if ( forid > maxid ) {
2545 if ( transcount < tp + maxid - minid + 1) {
2547 trie->trans = (reg_trie_trans *)
2548 PerlMemShared_realloc( trie->trans,
2550 * sizeof(reg_trie_trans) );
2551 Zero( trie->trans + (transcount / 2),
2555 base = trie->uniquecharcount + tp - minid;
2556 if ( maxid == minid ) {
2558 for ( ; zp < tp ; zp++ ) {
2559 if ( ! trie->trans[ zp ].next ) {
2560 base = trie->uniquecharcount + zp - minid;
2561 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2563 trie->trans[ zp ].check = state;
2569 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2571 trie->trans[ tp ].check = state;
2576 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2577 const U32 tid = base
2578 - trie->uniquecharcount
2579 + TRIE_LIST_ITEM( state, idx ).forid;
2580 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2582 trie->trans[ tid ].check = state;
2584 tp += ( maxid - minid + 1 );
2586 Safefree(trie->states[ state ].trans.list);
2589 DEBUG_TRIE_COMPILE_MORE_r(
2590 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2593 trie->states[ state ].trans.base=base;
2595 trie->lasttrans = tp + 1;
2599 Second Pass -- Flat Table Representation.
2601 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2602 each. We know that we will need Charcount+1 trans at most to store
2603 the data (one row per char at worst case) So we preallocate both
2604 structures assuming worst case.
2606 We then construct the trie using only the .next slots of the entry
2609 We use the .check field of the first entry of the node temporarily
2610 to make compression both faster and easier by keeping track of how
2611 many non zero fields are in the node.
2613 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2616 There are two terms at use here: state as a TRIE_NODEIDX() which is
2617 a number representing the first entry of the node, and state as a
2618 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2619 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2620 if there are 2 entrys per node. eg:
2628 The table is internally in the right hand, idx form. However as we
2629 also have to deal with the states array which is indexed by nodenum
2630 we have to use TRIE_NODENUM() to convert.
2633 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2634 "%*sCompiling trie using table compiler\n",
2635 (int)depth * 2 + 2, ""));
2637 trie->trans = (reg_trie_trans *)
2638 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2639 * trie->uniquecharcount + 1,
2640 sizeof(reg_trie_trans) );
2641 trie->states = (reg_trie_state *)
2642 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2643 sizeof(reg_trie_state) );
2644 next_alloc = trie->uniquecharcount + 1;
2647 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2649 regnode *noper = NEXTOPER( cur );
2650 const U8 *uc = (U8*)STRING( noper );
2651 const U8 *e = uc + STR_LEN( noper );
2653 U32 state = 1; /* required init */
2655 U16 charid = 0; /* sanity init */
2656 U32 accept_state = 0; /* sanity init */
2658 U32 wordlen = 0; /* required init */
2660 if (OP(noper) == NOTHING) {
2661 regnode *noper_next= regnext(noper);
2662 if (noper_next != tail && OP(noper_next) == flags) {
2664 uc= (U8*)STRING(noper);
2665 e= uc + STR_LEN(noper);
2669 if ( OP(noper) != NOTHING ) {
2670 for ( ; uc < e ; uc += len ) {
2675 charid = trie->charmap[ uvc ];
2677 SV* const * const svpp = hv_fetch( widecharmap,
2681 charid = svpp ? (U16)SvIV(*svpp) : 0;
2685 if ( !trie->trans[ state + charid ].next ) {
2686 trie->trans[ state + charid ].next = next_alloc;
2687 trie->trans[ state ].check++;
2688 prev_states[TRIE_NODENUM(next_alloc)]
2689 = TRIE_NODENUM(state);
2690 next_alloc += trie->uniquecharcount;
2692 state = trie->trans[ state + charid ].next;
2694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2696 /* charid is now 0 if we dont know the char read, or
2697 * nonzero if we do */
2700 accept_state = TRIE_NODENUM( state );
2701 TRIE_HANDLE_WORD(accept_state);
2703 } /* end second pass */
2705 /* and now dump it out before we compress it */
2706 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2708 next_alloc, depth+1));
2712 * Inplace compress the table.*
2714 For sparse data sets the table constructed by the trie algorithm will
2715 be mostly 0/FAIL transitions or to put it another way mostly empty.
2716 (Note that leaf nodes will not contain any transitions.)
2718 This algorithm compresses the tables by eliminating most such
2719 transitions, at the cost of a modest bit of extra work during lookup:
2721 - Each states[] entry contains a .base field which indicates the
2722 index in the state[] array wheres its transition data is stored.
2724 - If .base is 0 there are no valid transitions from that node.
2726 - If .base is nonzero then charid is added to it to find an entry in
2729 -If trans[states[state].base+charid].check!=state then the
2730 transition is taken to be a 0/Fail transition. Thus if there are fail
2731 transitions at the front of the node then the .base offset will point
2732 somewhere inside the previous nodes data (or maybe even into a node
2733 even earlier), but the .check field determines if the transition is
2737 The following process inplace converts the table to the compressed
2738 table: We first do not compress the root node 1,and mark all its
2739 .check pointers as 1 and set its .base pointer as 1 as well. This
2740 allows us to do a DFA construction from the compressed table later,
2741 and ensures that any .base pointers we calculate later are greater
2744 - We set 'pos' to indicate the first entry of the second node.
2746 - We then iterate over the columns of the node, finding the first and
2747 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2748 and set the .check pointers accordingly, and advance pos
2749 appropriately and repreat for the next node. Note that when we copy
2750 the next pointers we have to convert them from the original
2751 NODEIDX form to NODENUM form as the former is not valid post
2754 - If a node has no transitions used we mark its base as 0 and do not
2755 advance the pos pointer.
2757 - If a node only has one transition we use a second pointer into the
2758 structure to fill in allocated fail transitions from other states.
2759 This pointer is independent of the main pointer and scans forward
2760 looking for null transitions that are allocated to a state. When it
2761 finds one it writes the single transition into the "hole". If the
2762 pointer doesnt find one the single transition is appended as normal.
2764 - Once compressed we can Renew/realloc the structures to release the
2767 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2768 specifically Fig 3.47 and the associated pseudocode.
2772 const U32 laststate = TRIE_NODENUM( next_alloc );
2775 trie->statecount = laststate;
2777 for ( state = 1 ; state < laststate ; state++ ) {
2779 const U32 stateidx = TRIE_NODEIDX( state );
2780 const U32 o_used = trie->trans[ stateidx ].check;
2781 U32 used = trie->trans[ stateidx ].check;
2782 trie->trans[ stateidx ].check = 0;
2785 used && charid < trie->uniquecharcount;
2788 if ( flag || trie->trans[ stateidx + charid ].next ) {
2789 if ( trie->trans[ stateidx + charid ].next ) {
2791 for ( ; zp < pos ; zp++ ) {
2792 if ( ! trie->trans[ zp ].next ) {
2796 trie->states[ state ].trans.base
2798 + trie->uniquecharcount
2800 trie->trans[ zp ].next
2801 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2803 trie->trans[ zp ].check = state;
2804 if ( ++zp > pos ) pos = zp;
2811 trie->states[ state ].trans.base
2812 = pos + trie->uniquecharcount - charid ;
2814 trie->trans[ pos ].next
2815 = SAFE_TRIE_NODENUM(
2816 trie->trans[ stateidx + charid ].next );
2817 trie->trans[ pos ].check = state;
2822 trie->lasttrans = pos + 1;
2823 trie->states = (reg_trie_state *)
2824 PerlMemShared_realloc( trie->states, laststate
2825 * sizeof(reg_trie_state) );
2826 DEBUG_TRIE_COMPILE_MORE_r(
2827 PerlIO_printf( Perl_debug_log,
2828 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2829 (int)depth * 2 + 2,"",
2830 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2834 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2837 } /* end table compress */
2839 DEBUG_TRIE_COMPILE_MORE_r(
2840 PerlIO_printf(Perl_debug_log,
2841 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2842 (int)depth * 2 + 2, "",
2843 (UV)trie->statecount,
2844 (UV)trie->lasttrans)
2846 /* resize the trans array to remove unused space */
2847 trie->trans = (reg_trie_trans *)
2848 PerlMemShared_realloc( trie->trans, trie->lasttrans
2849 * sizeof(reg_trie_trans) );
2851 { /* Modify the program and insert the new TRIE node */
2852 U8 nodetype =(U8)(flags & 0xFF);
2856 regnode *optimize = NULL;
2857 #ifdef RE_TRACK_PATTERN_OFFSETS
2860 U32 mjd_nodelen = 0;
2861 #endif /* RE_TRACK_PATTERN_OFFSETS */
2862 #endif /* DEBUGGING */
2864 This means we convert either the first branch or the first Exact,
2865 depending on whether the thing following (in 'last') is a branch
2866 or not and whther first is the startbranch (ie is it a sub part of
2867 the alternation or is it the whole thing.)
2868 Assuming its a sub part we convert the EXACT otherwise we convert
2869 the whole branch sequence, including the first.
2871 /* Find the node we are going to overwrite */
2872 if ( first != startbranch || OP( last ) == BRANCH ) {
2873 /* branch sub-chain */
2874 NEXT_OFF( first ) = (U16)(last - first);
2875 #ifdef RE_TRACK_PATTERN_OFFSETS
2877 mjd_offset= Node_Offset((convert));
2878 mjd_nodelen= Node_Length((convert));
2881 /* whole branch chain */
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2886 const regnode *nop = NEXTOPER( convert );
2887 mjd_offset= Node_Offset((nop));
2888 mjd_nodelen= Node_Length((nop));
2892 PerlIO_printf(Perl_debug_log,
2893 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2894 (int)depth * 2 + 2, "",
2895 (UV)mjd_offset, (UV)mjd_nodelen)
2898 /* But first we check to see if there is a common prefix we can
2899 split out as an EXACT and put in front of the TRIE node. */
2900 trie->startstate= 1;
2901 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2903 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2907 const U32 base = trie->states[ state ].trans.base;
2909 if ( trie->states[state].wordnum )
2912 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2913 if ( ( base + ofs >= trie->uniquecharcount ) &&
2914 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2915 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2917 if ( ++count > 1 ) {
2918 SV **tmp = av_fetch( revcharmap, ofs, 0);
2919 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2920 if ( state == 1 ) break;
2922 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2924 PerlIO_printf(Perl_debug_log,
2925 "%*sNew Start State=%"UVuf" Class: [",
2926 (int)depth * 2 + 2, "",
2929 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2930 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2932 TRIE_BITMAP_SET(trie,*ch);
2934 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2936 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2940 TRIE_BITMAP_SET(trie,*ch);
2942 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2943 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2949 SV **tmp = av_fetch( revcharmap, idx, 0);
2951 char *ch = SvPV( *tmp, len );
2953 SV *sv=sv_newmortal();
2954 PerlIO_printf( Perl_debug_log,
2955 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2956 (int)depth * 2 + 2, "",
2958 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2959 PL_colors[0], PL_colors[1],
2960 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2961 PERL_PV_ESCAPE_FIRSTCHAR
2966 OP( convert ) = nodetype;
2967 str=STRING(convert);
2970 STR_LEN(convert) += len;
2976 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2981 trie->prefixlen = (state-1);
2983 regnode *n = convert+NODE_SZ_STR(convert);
2984 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2985 trie->startstate = state;
2986 trie->minlen -= (state - 1);
2987 trie->maxlen -= (state - 1);
2989 /* At least the UNICOS C compiler choked on this
2990 * being argument to DEBUG_r(), so let's just have
2993 #ifdef PERL_EXT_RE_BUILD
2999 regnode *fix = convert;
3000 U32 word = trie->wordcount;
3002 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3003 while( ++fix < n ) {
3004 Set_Node_Offset_Length(fix, 0, 0);
3007 SV ** const tmp = av_fetch( trie_words, word, 0 );
3009 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3010 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3012 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3020 NEXT_OFF(convert) = (U16)(tail - convert);
3021 DEBUG_r(optimize= n);
3027 if ( trie->maxlen ) {
3028 NEXT_OFF( convert ) = (U16)(tail - convert);
3029 ARG_SET( convert, data_slot );
3030 /* Store the offset to the first unabsorbed branch in
3031 jump[0], which is otherwise unused by the jump logic.
3032 We use this when dumping a trie and during optimisation. */
3034 trie->jump[0] = (U16)(nextbranch - convert);
3036 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3037 * and there is a bitmap
3038 * and the first "jump target" node we found leaves enough room
3039 * then convert the TRIE node into a TRIEC node, with the bitmap
3040 * embedded inline in the opcode - this is hypothetically faster.
3042 if ( !trie->states[trie->startstate].wordnum
3044 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3046 OP( convert ) = TRIEC;
3047 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3048 PerlMemShared_free(trie->bitmap);
3051 OP( convert ) = TRIE;
3053 /* store the type in the flags */
3054 convert->flags = nodetype;
3058 + regarglen[ OP( convert ) ];
3060 /* XXX We really should free up the resource in trie now,
3061 as we won't use them - (which resources?) dmq */
3063 /* needed for dumping*/
3064 DEBUG_r(if (optimize) {
3065 regnode *opt = convert;
3067 while ( ++opt < optimize) {
3068 Set_Node_Offset_Length(opt,0,0);
3071 Try to clean up some of the debris left after the
3074 while( optimize < jumper ) {
3075 mjd_nodelen += Node_Length((optimize));
3076 OP( optimize ) = OPTIMIZED;
3077 Set_Node_Offset_Length(optimize,0,0);
3080 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3082 } /* end node insert */
3084 /* Finish populating the prev field of the wordinfo array. Walk back
3085 * from each accept state until we find another accept state, and if
3086 * so, point the first word's .prev field at the second word. If the
3087 * second already has a .prev field set, stop now. This will be the
3088 * case either if we've already processed that word's accept state,
3089 * or that state had multiple words, and the overspill words were
3090 * already linked up earlier.
3097 for (word=1; word <= trie->wordcount; word++) {
3099 if (trie->wordinfo[word].prev)
3101 state = trie->wordinfo[word].accept;
3103 state = prev_states[state];
3106 prev = trie->states[state].wordnum;
3110 trie->wordinfo[word].prev = prev;
3112 Safefree(prev_states);
3116 /* and now dump out the compressed format */
3117 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3119 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3121 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3122 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3124 SvREFCNT_dec_NN(revcharmap);
3128 : trie->startstate>1
3134 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3136 /* The Trie is constructed and compressed now so we can build a fail array if
3139 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3141 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3145 We find the fail state for each state in the trie, this state is the longest
3146 proper suffix of the current state's 'word' that is also a proper prefix of
3147 another word in our trie. State 1 represents the word '' and is thus the
3148 default fail state. This allows the DFA not to have to restart after its
3149 tried and failed a word at a given point, it simply continues as though it
3150 had been matching the other word in the first place.
3152 'abcdgu'=~/abcdefg|cdgu/
3153 When we get to 'd' we are still matching the first word, we would encounter
3154 'g' which would fail, which would bring us to the state representing 'd' in
3155 the second word where we would try 'g' and succeed, proceeding to match
3158 /* add a fail transition */
3159 const U32 trie_offset = ARG(source);
3160 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3162 const U32 ucharcount = trie->uniquecharcount;
3163 const U32 numstates = trie->statecount;
3164 const U32 ubound = trie->lasttrans + ucharcount;
3168 U32 base = trie->states[ 1 ].trans.base;
3171 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3173 GET_RE_DEBUG_FLAGS_DECL;
3175 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3176 PERL_UNUSED_CONTEXT;
3178 PERL_UNUSED_ARG(depth);
3181 if ( OP(source) == TRIE ) {
3182 struct regnode_1 *op = (struct regnode_1 *)
3183 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3184 StructCopy(source,op,struct regnode_1);
3185 stclass = (regnode *)op;
3187 struct regnode_charclass *op = (struct regnode_charclass *)
3188 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3189 StructCopy(source,op,struct regnode_charclass);
3190 stclass = (regnode *)op;
3192 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3194 ARG_SET( stclass, data_slot );
3195 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3196 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3197 aho->trie=trie_offset;
3198 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3199 Copy( trie->states, aho->states, numstates, reg_trie_state );
3200 Newxz( q, numstates, U32);
3201 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3204 /* initialize fail[0..1] to be 1 so that we always have
3205 a valid final fail state */
3206 fail[ 0 ] = fail[ 1 ] = 1;
3208 for ( charid = 0; charid < ucharcount ; charid++ ) {
3209 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3211 q[ q_write ] = newstate;
3212 /* set to point at the root */
3213 fail[ q[ q_write++ ] ]=1;
3216 while ( q_read < q_write) {
3217 const U32 cur = q[ q_read++ % numstates ];
3218 base = trie->states[ cur ].trans.base;
3220 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3221 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3223 U32 fail_state = cur;
3226 fail_state = fail[ fail_state ];
3227 fail_base = aho->states[ fail_state ].trans.base;
3228 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3230 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3231 fail[ ch_state ] = fail_state;
3232 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3234 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3236 q[ q_write++ % numstates] = ch_state;
3240 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3241 when we fail in state 1, this allows us to use the
3242 charclass scan to find a valid start char. This is based on the principle
3243 that theres a good chance the string being searched contains lots of stuff
3244 that cant be a start char.
3246 fail[ 0 ] = fail[ 1 ] = 0;
3247 DEBUG_TRIE_COMPILE_r({
3248 PerlIO_printf(Perl_debug_log,
3249 "%*sStclass Failtable (%"UVuf" states): 0",
3250 (int)(depth * 2), "", (UV)numstates
3252 for( q_read=1; q_read<numstates; q_read++ ) {
3253 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3255 PerlIO_printf(Perl_debug_log, "\n");
3258 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3263 #define DEBUG_PEEP(str,scan,depth) \
3264 DEBUG_OPTIMISE_r({if (scan){ \
3265 regnode *Next = regnext(scan); \
3266 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3267 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3268 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3269 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3270 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3271 PerlIO_printf(Perl_debug_log, "\n"); \
3274 /* The below joins as many adjacent EXACTish nodes as possible into a single
3275 * one. The regop may be changed if the node(s) contain certain sequences that
3276 * require special handling. The joining is only done if:
3277 * 1) there is room in the current conglomerated node to entirely contain the
3279 * 2) they are the exact same node type
3281 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3282 * these get optimized out
3284 * If a node is to match under /i (folded), the number of characters it matches
3285 * can be different than its character length if it contains a multi-character
3286 * fold. *min_subtract is set to the total delta number of characters of the
3289 * And *unfolded_multi_char is set to indicate whether or not the node contains
3290 * an unfolded multi-char fold. This happens when whether the fold is valid or
3291 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3292 * SMALL LETTER SHARP S, as only if the target string being matched against
3293 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3294 * folding rules depend on the locale in force at runtime. (Multi-char folds
3295 * whose components are all above the Latin1 range are not run-time locale
3296 * dependent, and have already been folded by the time this function is
3299 * This is as good a place as any to discuss the design of handling these
3300 * multi-character fold sequences. It's been wrong in Perl for a very long
3301 * time. There are three code points in Unicode whose multi-character folds
3302 * were long ago discovered to mess things up. The previous designs for
3303 * dealing with these involved assigning a special node for them. This
3304 * approach doesn't always work, as evidenced by this example:
3305 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3306 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3307 * would match just the \xDF, it won't be able to handle the case where a
3308 * successful match would have to cross the node's boundary. The new approach
3309 * that hopefully generally solves the problem generates an EXACTFU_SS node
3310 * that is "sss" in this case.
3312 * It turns out that there are problems with all multi-character folds, and not
3313 * just these three. Now the code is general, for all such cases. The
3314 * approach taken is:
3315 * 1) This routine examines each EXACTFish node that could contain multi-
3316 * character folded sequences. Since a single character can fold into
3317 * such a sequence, the minimum match length for this node is less than
3318 * the number of characters in the node. This routine returns in
3319 * *min_subtract how many characters to subtract from the the actual
3320 * length of the string to get a real minimum match length; it is 0 if
3321 * there are no multi-char foldeds. This delta is used by the caller to
3322 * adjust the min length of the match, and the delta between min and max,
3323 * so that the optimizer doesn't reject these possibilities based on size
3325 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3326 * is used for an EXACTFU node that contains at least one "ss" sequence in
3327 * it. For non-UTF-8 patterns and strings, this is the only case where
3328 * there is a possible fold length change. That means that a regular
3329 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3330 * with length changes, and so can be processed faster. regexec.c takes
3331 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3332 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3333 * known until runtime). This saves effort in regex matching. However,
3334 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3335 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3336 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3337 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3338 * possibilities for the non-UTF8 patterns are quite simple, except for
3339 * the sharp s. All the ones that don't involve a UTF-8 target string are
3340 * members of a fold-pair, and arrays are set up for all of them so that
3341 * the other member of the pair can be found quickly. Code elsewhere in
3342 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3343 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3344 * described in the next item.
3345 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3346 * validity of the fold won't be known until runtime, and so must remain
3347 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3348 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3349 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3350 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3351 * The reason this is a problem is that the optimizer part of regexec.c
3352 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3353 * that a character in the pattern corresponds to at most a single
3354 * character in the target string. (And I do mean character, and not byte
3355 * here, unlike other parts of the documentation that have never been
3356 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3357 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3358 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3359 * nodes, violate the assumption, and they are the only instances where it
3360 * is violated. I'm reluctant to try to change the assumption, as the
3361 * code involved is impenetrable to me (khw), so instead the code here
3362 * punts. This routine examines EXACTFL nodes, and (when the pattern
3363 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3364 * boolean indicating whether or not the node contains such a fold. When
3365 * it is true, the caller sets a flag that later causes the optimizer in
3366 * this file to not set values for the floating and fixed string lengths,
3367 * and thus avoids the optimizer code in regexec.c that makes the invalid
3368 * assumption. Thus, there is no optimization based on string lengths for
3369 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3370 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3371 * assumption is wrong only in these cases is that all other non-UTF-8
3372 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3373 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3374 * EXACTF nodes because we don't know at compile time if it actually
3375 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3376 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3377 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3378 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3379 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3380 * string would require the pattern to be forced into UTF-8, the overhead
3381 * of which we want to avoid. Similarly the unfolded multi-char folds in
3382 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3385 * Similarly, the code that generates tries doesn't currently handle
3386 * not-already-folded multi-char folds, and it looks like a pain to change
3387 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3388 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3389 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3390 * using /iaa matching will be doing so almost entirely with ASCII
3391 * strings, so this should rarely be encountered in practice */
3393 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3394 if (PL_regkind[OP(scan)] == EXACT) \
3395 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3399 UV *min_subtract, bool *unfolded_multi_char,
3400 U32 flags,regnode *val, U32 depth)
3402 /* Merge several consecutive EXACTish nodes into one. */
3403 regnode *n = regnext(scan);
3405 regnode *next = scan + NODE_SZ_STR(scan);
3409 regnode *stop = scan;
3410 GET_RE_DEBUG_FLAGS_DECL;
3412 PERL_UNUSED_ARG(depth);
3415 PERL_ARGS_ASSERT_JOIN_EXACT;
3416 #ifndef EXPERIMENTAL_INPLACESCAN
3417 PERL_UNUSED_ARG(flags);
3418 PERL_UNUSED_ARG(val);
3420 DEBUG_PEEP("join",scan,depth);
3422 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3423 * EXACT ones that are mergeable to the current one. */
3425 && (PL_regkind[OP(n)] == NOTHING
3426 || (stringok && OP(n) == OP(scan)))
3428 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3431 if (OP(n) == TAIL || n > next)
3433 if (PL_regkind[OP(n)] == NOTHING) {
3434 DEBUG_PEEP("skip:",n,depth);
3435 NEXT_OFF(scan) += NEXT_OFF(n);
3436 next = n + NODE_STEP_REGNODE;
3443 else if (stringok) {
3444 const unsigned int oldl = STR_LEN(scan);
3445 regnode * const nnext = regnext(n);
3447 /* XXX I (khw) kind of doubt that this works on platforms (should
3448 * Perl ever run on one) where U8_MAX is above 255 because of lots
3449 * of other assumptions */
3450 /* Don't join if the sum can't fit into a single node */
3451 if (oldl + STR_LEN(n) > U8_MAX)
3454 DEBUG_PEEP("merg",n,depth);
3457 NEXT_OFF(scan) += NEXT_OFF(n);
3458 STR_LEN(scan) += STR_LEN(n);
3459 next = n + NODE_SZ_STR(n);
3460 /* Now we can overwrite *n : */
3461 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3469 #ifdef EXPERIMENTAL_INPLACESCAN
3470 if (flags && !NEXT_OFF(n)) {
3471 DEBUG_PEEP("atch", val, depth);
3472 if (reg_off_by_arg[OP(n)]) {
3473 ARG_SET(n, val - n);
3476 NEXT_OFF(n) = val - n;
3484 *unfolded_multi_char = FALSE;
3486 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3487 * can now analyze for sequences of problematic code points. (Prior to
3488 * this final joining, sequences could have been split over boundaries, and
3489 * hence missed). The sequences only happen in folding, hence for any
3490 * non-EXACT EXACTish node */
3491 if (OP(scan) != EXACT) {
3492 U8* s0 = (U8*) STRING(scan);
3494 U8* s_end = s0 + STR_LEN(scan);
3496 int total_count_delta = 0; /* Total delta number of characters that
3497 multi-char folds expand to */
3499 /* One pass is made over the node's string looking for all the
3500 * possibilities. To avoid some tests in the loop, there are two main
3501 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3506 if (OP(scan) == EXACTFL) {
3509 /* An EXACTFL node would already have been changed to another
3510 * node type unless there is at least one character in it that
3511 * is problematic; likely a character whose fold definition
3512 * won't be known until runtime, and so has yet to be folded.
3513 * For all but the UTF-8 locale, folds are 1-1 in length, but
3514 * to handle the UTF-8 case, we need to create a temporary
3515 * folded copy using UTF-8 locale rules in order to analyze it.
3516 * This is because our macros that look to see if a sequence is
3517 * a multi-char fold assume everything is folded (otherwise the
3518 * tests in those macros would be too complicated and slow).
3519 * Note that here, the non-problematic folds will have already
3520 * been done, so we can just copy such characters. We actually
3521 * don't completely fold the EXACTFL string. We skip the
3522 * unfolded multi-char folds, as that would just create work
3523 * below to figure out the size they already are */
3525 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3528 STRLEN s_len = UTF8SKIP(s);
3529 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3530 Copy(s, d, s_len, U8);
3533 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3534 *unfolded_multi_char = TRUE;
3535 Copy(s, d, s_len, U8);
3538 else if (isASCII(*s)) {
3539 *(d++) = toFOLD(*s);
3543 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3549 /* Point the remainder of the routine to look at our temporary
3553 } /* End of creating folded copy of EXACTFL string */
3555 /* Examine the string for a multi-character fold sequence. UTF-8
3556 * patterns have all characters pre-folded by the time this code is
3558 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3559 length sequence we are looking for is 2 */
3561 int count = 0; /* How many characters in a multi-char fold */
3562 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3563 if (! len) { /* Not a multi-char fold: get next char */
3568 /* Nodes with 'ss' require special handling, except for
3569 * EXACTFA-ish for which there is no multi-char fold to this */
3570 if (len == 2 && *s == 's' && *(s+1) == 's'
3571 && OP(scan) != EXACTFA
3572 && OP(scan) != EXACTFA_NO_TRIE)
3575 if (OP(scan) != EXACTFL) {
3576 OP(scan) = EXACTFU_SS;
3580 else { /* Here is a generic multi-char fold. */
3581 U8* multi_end = s + len;
3583 /* Count how many characters are in it. In the case of
3584 * /aa, no folds which contain ASCII code points are
3585 * allowed, so check for those, and skip if found. */
3586 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3587 count = utf8_length(s, multi_end);
3591 while (s < multi_end) {
3594 goto next_iteration;
3604 /* The delta is how long the sequence is minus 1 (1 is how long
3605 * the character that folds to the sequence is) */
3606 total_count_delta += count - 1;
3610 /* We created a temporary folded copy of the string in EXACTFL
3611 * nodes. Therefore we need to be sure it doesn't go below zero,
3612 * as the real string could be shorter */
3613 if (OP(scan) == EXACTFL) {
3614 int total_chars = utf8_length((U8*) STRING(scan),
3615 (U8*) STRING(scan) + STR_LEN(scan));
3616 if (total_count_delta > total_chars) {
3617 total_count_delta = total_chars;
3621 *min_subtract += total_count_delta;
3624 else if (OP(scan) == EXACTFA) {
3626 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3627 * fold to the ASCII range (and there are no existing ones in the
3628 * upper latin1 range). But, as outlined in the comments preceding
3629 * this function, we need to flag any occurrences of the sharp s.
3630 * This character forbids trie formation (because of added
3633 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3634 OP(scan) = EXACTFA_NO_TRIE;
3635 *unfolded_multi_char = TRUE;
3644 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3645 * folds that are all Latin1. As explained in the comments
3646 * preceding this function, we look also for the sharp s in EXACTF
3647 * and EXACTFL nodes; it can be in the final position. Otherwise
3648 * we can stop looking 1 byte earlier because have to find at least
3649 * two characters for a multi-fold */
3650 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3655 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3656 if (! len) { /* Not a multi-char fold. */
3657 if (*s == LATIN_SMALL_LETTER_SHARP_S
3658 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3660 *unfolded_multi_char = TRUE;
3667 && isALPHA_FOLD_EQ(*s, 's')
3668 && isALPHA_FOLD_EQ(*(s+1), 's'))
3671 /* EXACTF nodes need to know that the minimum length
3672 * changed so that a sharp s in the string can match this
3673 * ss in the pattern, but they remain EXACTF nodes, as they
3674 * won't match this unless the target string is is UTF-8,
3675 * which we don't know until runtime. EXACTFL nodes can't
3676 * transform into EXACTFU nodes */
3677 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3678 OP(scan) = EXACTFU_SS;
3682 *min_subtract += len - 1;
3689 /* Allow dumping but overwriting the collection of skipped
3690 * ops and/or strings with fake optimized ops */
3691 n = scan + NODE_SZ_STR(scan);
3699 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3703 /* REx optimizer. Converts nodes into quicker variants "in place".
3704 Finds fixed substrings. */
3706 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3707 to the position after last scanned or to NULL. */
3709 #define INIT_AND_WITHP \
3710 assert(!and_withp); \
3711 Newx(and_withp,1, regnode_ssc); \
3712 SAVEFREEPV(and_withp)
3716 S_unwind_scan_frames(pTHX_ const void *p)
3718 scan_frame *f= (scan_frame *)p;
3720 scan_frame *n= f->next_frame;
3728 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3729 SSize_t *minlenp, SSize_t *deltap,
3734 regnode_ssc *and_withp,
3735 U32 flags, U32 depth)
3736 /* scanp: Start here (read-write). */
3737 /* deltap: Write maxlen-minlen here. */
3738 /* last: Stop before this one. */
3739 /* data: string data about the pattern */
3740 /* stopparen: treat close N as END */
3741 /* recursed: which subroutines have we recursed into */
3742 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3744 /* There must be at least this number of characters to match */
3747 regnode *scan = *scanp, *next;
3749 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3750 int is_inf_internal = 0; /* The studied chunk is infinite */
3751 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3752 scan_data_t data_fake;
3753 SV *re_trie_maxbuff = NULL;
3754 regnode *first_non_open = scan;
3755 SSize_t stopmin = SSize_t_MAX;
3756 scan_frame *frame = NULL;
3757 GET_RE_DEBUG_FLAGS_DECL;
3759 PERL_ARGS_ASSERT_STUDY_CHUNK;
3763 while (first_non_open && OP(first_non_open) == OPEN)
3764 first_non_open=regnext(first_non_open);
3770 RExC_study_chunk_recursed_count++;
3772 DEBUG_OPTIMISE_MORE_r(
3774 PerlIO_printf(Perl_debug_log,
3775 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3776 (int)(depth*2), "", (long)stopparen,
3777 (unsigned long)RExC_study_chunk_recursed_count,
3778 (unsigned long)depth, (unsigned long)recursed_depth,
3781 if (recursed_depth) {
3784 for ( j = 0 ; j < recursed_depth ; j++ ) {
3785 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3787 PAREN_TEST(RExC_study_chunk_recursed +
3788 ( j * RExC_study_chunk_recursed_bytes), i )
3791 !PAREN_TEST(RExC_study_chunk_recursed +
3792 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3795 PerlIO_printf(Perl_debug_log," %d",(int)i);
3799 if ( j + 1 < recursed_depth ) {
3800 PerlIO_printf(Perl_debug_log, ",");
3804 PerlIO_printf(Perl_debug_log,"\n");
3807 while ( scan && OP(scan) != END && scan < last ){
3808 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3809 node length to get a real minimum (because
3810 the folded version may be shorter) */
3811 bool unfolded_multi_char = FALSE;
3812 /* Peephole optimizer: */
3813 DEBUG_STUDYDATA("Peep:", data, depth);
3814 DEBUG_PEEP("Peep", scan, depth);
3817 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3818 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3819 * by a different invocation of reg() -- Yves
3821 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3823 /* Follow the next-chain of the current node and optimize
3824 away all the NOTHINGs from it. */
3825 if (OP(scan) != CURLYX) {
3826 const int max = (reg_off_by_arg[OP(scan)]
3828 /* I32 may be smaller than U16 on CRAYs! */
3829 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3830 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3834 /* Skip NOTHING and LONGJMP. */
3835 while ((n = regnext(n))
3836 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3837 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3838 && off + noff < max)
3840 if (reg_off_by_arg[OP(scan)])
3843 NEXT_OFF(scan) = off;
3846 /* The principal pseudo-switch. Cannot be a switch, since we
3847 look into several different things. */
3848 if ( OP(scan) == DEFINEP ) {
3850 SSize_t deltanext = 0;
3851 SSize_t fake_last_close = 0;
3852 I32 f = SCF_IN_DEFINE;
3854 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3855 scan = regnext(scan);
3856 assert( OP(scan) == IFTHEN );
3857 DEBUG_PEEP("expect IFTHEN", scan, depth);
3859 data_fake.last_closep= &fake_last_close;
3861 next = regnext(scan);
3862 scan = NEXTOPER(NEXTOPER(scan));
3863 DEBUG_PEEP("scan", scan, depth);
3864 DEBUG_PEEP("next", next, depth);
3866 /* we suppose the run is continuous, last=next...
3867 * NOTE we dont use the return here! */
3868 (void)study_chunk(pRExC_state, &scan, &minlen,
3869 &deltanext, next, &data_fake, stopparen,
3870 recursed_depth, NULL, f, depth+1);
3875 OP(scan) == BRANCH ||
3876 OP(scan) == BRANCHJ ||
3879 next = regnext(scan);
3882 /* The op(next)==code check below is to see if we
3883 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3884 * IFTHEN is special as it might not appear in pairs.
3885 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3886 * we dont handle it cleanly. */
3887 if (OP(next) == code || code == IFTHEN) {
3888 /* NOTE - There is similar code to this block below for
3889 * handling TRIE nodes on a re-study. If you change stuff here
3890 * check there too. */
3891 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3893 regnode * const startbranch=scan;
3895 if (flags & SCF_DO_SUBSTR) {
3896 /* Cannot merge strings after this. */
3897 scan_commit(pRExC_state, data, minlenp, is_inf);
3900 if (flags & SCF_DO_STCLASS)
3901 ssc_init_zero(pRExC_state, &accum);
3903 while (OP(scan) == code) {
3904 SSize_t deltanext, minnext, fake;
3906 regnode_ssc this_class;
3908 DEBUG_PEEP("Branch", scan, depth);
3911 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3913 data_fake.whilem_c = data->whilem_c;
3914 data_fake.last_closep = data->last_closep;
3917 data_fake.last_closep = &fake;
3919 data_fake.pos_delta = delta;
3920 next = regnext(scan);
3922 scan = NEXTOPER(scan); /* everything */
3923 if (code != BRANCH) /* everything but BRANCH */
3924 scan = NEXTOPER(scan);
3926 if (flags & SCF_DO_STCLASS) {
3927 ssc_init(pRExC_state, &this_class);
3928 data_fake.start_class = &this_class;
3929 f = SCF_DO_STCLASS_AND;
3931 if (flags & SCF_WHILEM_VISITED_POS)
3932 f |= SCF_WHILEM_VISITED_POS;
3934 /* we suppose the run is continuous, last=next...*/
3935 minnext = study_chunk(pRExC_state, &scan, minlenp,
3936 &deltanext, next, &data_fake, stopparen,
3937 recursed_depth, NULL, f,depth+1);
3941 if (deltanext == SSize_t_MAX) {
3942 is_inf = is_inf_internal = 1;
3944 } else if (max1 < minnext + deltanext)
3945 max1 = minnext + deltanext;
3947 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3949 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3950 if ( stopmin > minnext)
3951 stopmin = min + min1;
3952 flags &= ~SCF_DO_SUBSTR;
3954 data->flags |= SCF_SEEN_ACCEPT;
3957 if (data_fake.flags & SF_HAS_EVAL)
3958 data->flags |= SF_HAS_EVAL;
3959 data->whilem_c = data_fake.whilem_c;
3961 if (flags & SCF_DO_STCLASS)
3962 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3964 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3966 if (flags & SCF_DO_SUBSTR) {
3967 data->pos_min += min1;
3968 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3969 data->pos_delta = SSize_t_MAX;
3971 data->pos_delta += max1 - min1;
3972 if (max1 != min1 || is_inf)
3973 data->longest = &(data->longest_float);
3976 if (delta == SSize_t_MAX
3977 || SSize_t_MAX - delta - (max1 - min1) < 0)
3978 delta = SSize_t_MAX;
3980 delta += max1 - min1;
3981 if (flags & SCF_DO_STCLASS_OR) {
3982 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3984 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3985 flags &= ~SCF_DO_STCLASS;
3988 else if (flags & SCF_DO_STCLASS_AND) {
3990 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3991 flags &= ~SCF_DO_STCLASS;
3994 /* Switch to OR mode: cache the old value of
3995 * data->start_class */
3997 StructCopy(data->start_class, and_withp, regnode_ssc);
3998 flags &= ~SCF_DO_STCLASS_AND;
3999 StructCopy(&accum, data->start_class, regnode_ssc);
4000 flags |= SCF_DO_STCLASS_OR;
4004 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4005 OP( startbranch ) == BRANCH )
4009 Assuming this was/is a branch we are dealing with: 'scan'
4010 now points at the item that follows the branch sequence,
4011 whatever it is. We now start at the beginning of the
4012 sequence and look for subsequences of
4018 which would be constructed from a pattern like
4021 If we can find such a subsequence we need to turn the first
4022 element into a trie and then add the subsequent branch exact
4023 strings to the trie.
4027 1. patterns where the whole set of branches can be
4030 2. patterns where only a subset can be converted.
4032 In case 1 we can replace the whole set with a single regop
4033 for the trie. In case 2 we need to keep the start and end
4036 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4037 becomes BRANCH TRIE; BRANCH X;
4039 There is an additional case, that being where there is a
4040 common prefix, which gets split out into an EXACT like node
4041 preceding the TRIE node.
4043 If x(1..n)==tail then we can do a simple trie, if not we make
4044 a "jump" trie, such that when we match the appropriate word
4045 we "jump" to the appropriate tail node. Essentially we turn
4046 a nested if into a case structure of sorts.
4051 if (!re_trie_maxbuff) {
4052 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4053 if (!SvIOK(re_trie_maxbuff))
4054 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4056 if ( SvIV(re_trie_maxbuff)>=0 ) {
4058 regnode *first = (regnode *)NULL;
4059 regnode *last = (regnode *)NULL;
4060 regnode *tail = scan;
4064 /* var tail is used because there may be a TAIL
4065 regop in the way. Ie, the exacts will point to the
4066 thing following the TAIL, but the last branch will
4067 point at the TAIL. So we advance tail. If we
4068 have nested (?:) we may have to move through several
4072 while ( OP( tail ) == TAIL ) {
4073 /* this is the TAIL generated by (?:) */
4074 tail = regnext( tail );
4078 DEBUG_TRIE_COMPILE_r({
4079 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4080 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4081 (int)depth * 2 + 2, "",
4082 "Looking for TRIE'able sequences. Tail node is: ",
4083 SvPV_nolen_const( RExC_mysv )
4089 Step through the branches
4090 cur represents each branch,
4091 noper is the first thing to be matched as part
4093 noper_next is the regnext() of that node.
4095 We normally handle a case like this
4096 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4097 support building with NOJUMPTRIE, which restricts
4098 the trie logic to structures like /FOO|BAR/.
4100 If noper is a trieable nodetype then the branch is
4101 a possible optimization target. If we are building
4102 under NOJUMPTRIE then we require that noper_next is
4103 the same as scan (our current position in the regex
4106 Once we have two or more consecutive such branches
4107 we can create a trie of the EXACT's contents and
4108 stitch it in place into the program.
4110 If the sequence represents all of the branches in
4111 the alternation we replace the entire thing with a
4114 Otherwise when it is a subsequence we need to
4115 stitch it in place and replace only the relevant
4116 branches. This means the first branch has to remain
4117 as it is used by the alternation logic, and its
4118 next pointer, and needs to be repointed at the item
4119 on the branch chain following the last branch we
4120 have optimized away.
4122 This could be either a BRANCH, in which case the
4123 subsequence is internal, or it could be the item
4124 following the branch sequence in which case the
4125 subsequence is at the end (which does not
4126 necessarily mean the first node is the start of the
4129 TRIE_TYPE(X) is a define which maps the optype to a
4133 ----------------+-----------
4137 EXACTFU_SS | EXACTFU
4142 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
4143 ( EXACT == (X) ) ? EXACT : \
4144 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
4145 ( EXACTFA == (X) ) ? EXACTFA : \
4148 /* dont use tail as the end marker for this traverse */
4149 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4150 regnode * const noper = NEXTOPER( cur );
4151 U8 noper_type = OP( noper );
4152 U8 noper_trietype = TRIE_TYPE( noper_type );
4153 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4154 regnode * const noper_next = regnext( noper );
4155 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4156 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4159 DEBUG_TRIE_COMPILE_r({
4160 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4161 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4162 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4164 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4165 PerlIO_printf( Perl_debug_log, " -> %s",
4166 SvPV_nolen_const(RExC_mysv));
4169 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4170 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4171 SvPV_nolen_const(RExC_mysv));
4173 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4174 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4175 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4179 /* Is noper a trieable nodetype that can be merged
4180 * with the current trie (if there is one)? */
4184 ( noper_trietype == NOTHING)
4185 || ( trietype == NOTHING )
4186 || ( trietype == noper_trietype )
4189 && noper_next == tail
4193 /* Handle mergable triable node Either we are
4194 * the first node in a new trieable sequence,
4195 * in which case we do some bookkeeping,
4196 * otherwise we update the end pointer. */
4199 if ( noper_trietype == NOTHING ) {
4200 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4201 regnode * const noper_next = regnext( noper );
4202 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4203 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4206 if ( noper_next_trietype ) {
4207 trietype = noper_next_trietype;
4208 } else if (noper_next_type) {
4209 /* a NOTHING regop is 1 regop wide.
4210 * We need at least two for a trie
4211 * so we can't merge this in */
4215 trietype = noper_trietype;
4218 if ( trietype == NOTHING )
4219 trietype = noper_trietype;
4224 } /* end handle mergable triable node */
4226 /* handle unmergable node -
4227 * noper may either be a triable node which can
4228 * not be tried together with the current trie,
4229 * or a non triable node */
4231 /* If last is set and trietype is not
4232 * NOTHING then we have found at least two
4233 * triable branch sequences in a row of a
4234 * similar trietype so we can turn them
4235 * into a trie. If/when we allow NOTHING to
4236 * start a trie sequence this condition
4237 * will be required, and it isn't expensive
4238 * so we leave it in for now. */
4239 if ( trietype && trietype != NOTHING )
4240 make_trie( pRExC_state,
4241 startbranch, first, cur, tail,
4242 count, trietype, depth+1 );
4243 last = NULL; /* note: we clear/update
4244 first, trietype etc below,
4245 so we dont do it here */
4249 && noper_next == tail
4252 /* noper is triable, so we can start a new
4256 trietype = noper_trietype;
4258 /* if we already saw a first but the
4259 * current node is not triable then we have
4260 * to reset the first information. */
4265 } /* end handle unmergable node */
4266 } /* loop over branches */
4267 DEBUG_TRIE_COMPILE_r({
4268 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4269 PerlIO_printf( Perl_debug_log,
4270 "%*s- %s (%d) <SCAN FINISHED>\n",
4272 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4275 if ( last && trietype ) {
4276 if ( trietype != NOTHING ) {
4277 /* the last branch of the sequence was part of
4278 * a trie, so we have to construct it here
4279 * outside of the loop */
4280 made= make_trie( pRExC_state, startbranch,
4281 first, scan, tail, count,
4282 trietype, depth+1 );
4283 #ifdef TRIE_STUDY_OPT
4284 if ( ((made == MADE_EXACT_TRIE &&
4285 startbranch == first)
4286 || ( first_non_open == first )) &&
4288 flags |= SCF_TRIE_RESTUDY;
4289 if ( startbranch == first
4292 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4297 /* at this point we know whatever we have is a
4298 * NOTHING sequence/branch AND if 'startbranch'
4299 * is 'first' then we can turn the whole thing
4302 if ( startbranch == first ) {
4304 /* the entire thing is a NOTHING sequence,
4305 * something like this: (?:|) So we can
4306 * turn it into a plain NOTHING op. */
4307 DEBUG_TRIE_COMPILE_r({
4308 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309 PerlIO_printf( Perl_debug_log,
4310 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4311 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4314 OP(startbranch)= NOTHING;
4315 NEXT_OFF(startbranch)= tail - startbranch;
4316 for ( opt= startbranch + 1; opt < tail ; opt++ )
4320 } /* end if ( last) */
4321 } /* TRIE_MAXBUF is non zero */
4326 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4327 scan = NEXTOPER(NEXTOPER(scan));
4328 } else /* single branch is optimized. */
4329 scan = NEXTOPER(scan);
4331 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4333 regnode *start = NULL;
4334 regnode *end = NULL;
4335 U32 my_recursed_depth= recursed_depth;
4338 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4339 /* Do setup, note this code has side effects beyond
4340 * the rest of this block. Specifically setting
4341 * RExC_recurse[] must happen at least once during
4343 if (OP(scan) == GOSUB) {
4345 RExC_recurse[ARG2L(scan)] = scan;
4346 start = RExC_open_parens[paren-1];
4347 end = RExC_close_parens[paren-1];
4349 start = RExC_rxi->program + 1;
4352 /* NOTE we MUST always execute the above code, even
4353 * if we do nothing with a GOSUB/GOSTART */
4355 ( flags & SCF_IN_DEFINE )
4358 (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4360 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4363 /* no need to do anything here if we are in a define. */
4364 /* or we are after some kind of infinite construct
4365 * so we can skip recursing into this item.
4366 * Since it is infinite we will not change the maxlen
4367 * or delta, and if we miss something that might raise
4368 * the minlen it will merely pessimise a little.
4370 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4371 * might result in a minlen of 1 and not of 4,
4372 * but this doesn't make us mismatch, just try a bit
4373 * harder than we should.
4375 scan= regnext(scan);
4382 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4384 /* it is quite possible that there are more efficient ways
4385 * to do this. We maintain a bitmap per level of recursion
4386 * of which patterns we have entered so we can detect if a
4387 * pattern creates a possible infinite loop. When we
4388 * recurse down a level we copy the previous levels bitmap
4389 * down. When we are at recursion level 0 we zero the top
4390 * level bitmap. It would be nice to implement a different
4391 * more efficient way of doing this. In particular the top
4392 * level bitmap may be unnecessary.
4394 if (!recursed_depth) {
4395 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4397 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4398 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4399 RExC_study_chunk_recursed_bytes, U8);
4401 /* we havent recursed into this paren yet, so recurse into it */
4402 DEBUG_STUDYDATA("set:", data,depth);
4403 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4404 my_recursed_depth= recursed_depth + 1;
4406 DEBUG_STUDYDATA("inf:", data,depth);
4407 /* some form of infinite recursion, assume infinite length
4409 if (flags & SCF_DO_SUBSTR) {
4410 scan_commit(pRExC_state, data, minlenp, is_inf);
4411 data->longest = &(data->longest_float);
4413 is_inf = is_inf_internal = 1;
4414 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4415 ssc_anything(data->start_class);
4416 flags &= ~SCF_DO_STCLASS;
4418 start= NULL; /* reset start so we dont recurse later on. */
4423 end = regnext(scan);
4426 scan_frame *newframe;
4428 if (!RExC_frame_last) {
4429 Newxz(newframe, 1, scan_frame);
4430 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4431 RExC_frame_head= newframe;
4433 } else if (!RExC_frame_last->next_frame) {
4434 Newxz(newframe,1,scan_frame);
4435 RExC_frame_last->next_frame= newframe;
4436 newframe->prev_frame= RExC_frame_last;
4439 newframe= RExC_frame_last->next_frame;
4441 RExC_frame_last= newframe;
4443 newframe->next_regnode = regnext(scan);
4444 newframe->last_regnode = last;
4445 newframe->stopparen = stopparen;
4446 newframe->prev_recursed_depth = recursed_depth;
4447 newframe->this_prev_frame= frame;
4449 DEBUG_STUDYDATA("frame-new:",data,depth);
4450 DEBUG_PEEP("fnew", scan, depth);
4457 recursed_depth= my_recursed_depth;
4462 else if (OP(scan) == EXACT) {
4463 SSize_t l = STR_LEN(scan);
4466 const U8 * const s = (U8*)STRING(scan);
4467 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4468 l = utf8_length(s, s + l);
4470 uc = *((U8*)STRING(scan));
4473 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4474 /* The code below prefers earlier match for fixed
4475 offset, later match for variable offset. */
4476 if (data->last_end == -1) { /* Update the start info. */
4477 data->last_start_min = data->pos_min;
4478 data->last_start_max = is_inf
4479 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4481 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4483 SvUTF8_on(data->last_found);
4485 SV * const sv = data->last_found;
4486 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4487 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4488 if (mg && mg->mg_len >= 0)
4489 mg->mg_len += utf8_length((U8*)STRING(scan),
4490 (U8*)STRING(scan)+STR_LEN(scan));
4492 data->last_end = data->pos_min + l;
4493 data->pos_min += l; /* As in the first entry. */
4494 data->flags &= ~SF_BEFORE_EOL;
4497 /* ANDing the code point leaves at most it, and not in locale, and
4498 * can't match null string */
4499 if (flags & SCF_DO_STCLASS_AND) {
4500 ssc_cp_and(data->start_class, uc);
4501 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4502 ssc_clear_locale(data->start_class);
4504 else if (flags & SCF_DO_STCLASS_OR) {
4505 ssc_add_cp(data->start_class, uc);
4506 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4508 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4509 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4511 flags &= ~SCF_DO_STCLASS;
4513 else if (PL_regkind[OP(scan)] == EXACT) {
4514 /* But OP != EXACT!, so is EXACTFish */
4515 SSize_t l = STR_LEN(scan);
4516 UV uc = *((U8*)STRING(scan));
4517 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4518 separate code points */
4519 const U8 * s = (U8*)STRING(scan);
4521 /* Search for fixed substrings supports EXACT only. */
4522 if (flags & SCF_DO_SUBSTR) {
4524 scan_commit(pRExC_state, data, minlenp, is_inf);
4527 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4528 l = utf8_length(s, s + l);
4530 if (unfolded_multi_char) {
4531 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4533 min += l - min_subtract;
4535 delta += min_subtract;
4536 if (flags & SCF_DO_SUBSTR) {
4537 data->pos_min += l - min_subtract;
4538 if (data->pos_min < 0) {
4541 data->pos_delta += min_subtract;
4543 data->longest = &(data->longest_float);
4547 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4548 ssc_clear_locale(data->start_class);
4553 /* We punt and assume can match anything if the node begins
4554 * with a multi-character fold. Things are complicated. For
4555 * example, /ffi/i could match any of:
4556 * "\N{LATIN SMALL LIGATURE FFI}"
4557 * "\N{LATIN SMALL LIGATURE FF}I"
4558 * "F\N{LATIN SMALL LIGATURE FI}"
4559 * plus several other things; and making sure we have all the
4560 * possibilities is hard. */
4561 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4563 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4567 /* Any Latin1 range character can potentially match any
4568 * other depending on the locale */
4569 if (OP(scan) == EXACTFL) {
4570 _invlist_union(EXACTF_invlist, PL_Latin1,
4574 /* But otherwise, it matches at least itself. We can
4575 * quickly tell if it has a distinct fold, and if so,
4576 * it matches that as well */
4577 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4578 if (IS_IN_SOME_FOLD_L1(uc)) {
4579 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4580 PL_fold_latin1[uc]);
4584 /* Some characters match above-Latin1 ones under /i. This
4585 * is true of EXACTFL ones when the locale is UTF-8 */
4586 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4587 && (! isASCII(uc) || (OP(scan) != EXACTFA
4588 && OP(scan) != EXACTFA_NO_TRIE)))
4590 add_above_Latin1_folds(pRExC_state,
4596 else { /* Pattern is UTF-8 */
4597 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4598 STRLEN foldlen = UTF8SKIP(s);
4599 const U8* e = s + STR_LEN(scan);
4602 /* The only code points that aren't folded in a UTF EXACTFish
4603 * node are are the problematic ones in EXACTFL nodes */
4604 if (OP(scan) == EXACTFL
4605 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4607 /* We need to check for the possibility that this EXACTFL
4608 * node begins with a multi-char fold. Therefore we fold
4609 * the first few characters of it so that we can make that
4614 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4616 *(d++) = (U8) toFOLD(*s);
4621 to_utf8_fold(s, d, &len);
4627 /* And set up so the code below that looks in this folded
4628 * buffer instead of the node's string */
4630 foldlen = UTF8SKIP(folded);
4634 /* When we reach here 's' points to the fold of the first
4635 * character(s) of the node; and 'e' points to far enough along
4636 * the folded string to be just past any possible multi-char
4637 * fold. 'foldlen' is the length in bytes of the first
4640 * Unlike the non-UTF-8 case, the macro for determining if a
4641 * string is a multi-char fold requires all the characters to
4642 * already be folded. This is because of all the complications
4643 * if not. Note that they are folded anyway, except in EXACTFL
4644 * nodes. Like the non-UTF case above, we punt if the node
4645 * begins with a multi-char fold */
4647 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4649 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4651 else { /* Single char fold */
4653 /* It matches all the things that fold to it, which are
4654 * found in PL_utf8_foldclosures (including itself) */
4655 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4656 if (! PL_utf8_foldclosures) {
4657 _load_PL_utf8_foldclosures();
4659 if ((listp = hv_fetch(PL_utf8_foldclosures,
4660 (char *) s, foldlen, FALSE)))
4662 AV* list = (AV*) *listp;
4664 for (k = 0; k <= av_tindex(list); k++) {
4665 SV** c_p = av_fetch(list, k, FALSE);
4671 /* /aa doesn't allow folds between ASCII and non- */
4672 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4673 && isASCII(c) != isASCII(uc))
4678 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4683 if (flags & SCF_DO_STCLASS_AND) {
4684 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685 ANYOF_POSIXL_ZERO(data->start_class);
4686 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4688 else if (flags & SCF_DO_STCLASS_OR) {
4689 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4690 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4692 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4693 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4695 flags &= ~SCF_DO_STCLASS;
4696 SvREFCNT_dec(EXACTF_invlist);
4698 else if (REGNODE_VARIES(OP(scan))) {
4699 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4700 I32 fl = 0, f = flags;
4701 regnode * const oscan = scan;
4702 regnode_ssc this_class;
4703 regnode_ssc *oclass = NULL;
4704 I32 next_is_eval = 0;
4706 switch (PL_regkind[OP(scan)]) {
4707 case WHILEM: /* End of (?:...)* . */
4708 scan = NEXTOPER(scan);
4711 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4712 next = NEXTOPER(scan);
4713 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4715 maxcount = REG_INFTY;
4716 next = regnext(scan);
4717 scan = NEXTOPER(scan);
4721 if (flags & SCF_DO_SUBSTR)
4726 if (flags & SCF_DO_STCLASS) {
4728 maxcount = REG_INFTY;
4729 next = regnext(scan);
4730 scan = NEXTOPER(scan);
4733 if (flags & SCF_DO_SUBSTR) {
4734 scan_commit(pRExC_state, data, minlenp, is_inf);
4735 /* Cannot extend fixed substrings */
4736 data->longest = &(data->longest_float);
4738 is_inf = is_inf_internal = 1;
4739 scan = regnext(scan);
4740 goto optimize_curly_tail;
4742 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4743 && (scan->flags == stopparen))
4748 mincount = ARG1(scan);
4749 maxcount = ARG2(scan);
4751 next = regnext(scan);
4752 if (OP(scan) == CURLYX) {
4753 I32 lp = (data ? *(data->last_closep) : 0);
4754 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4756 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4757 next_is_eval = (OP(scan) == EVAL);
4759 if (flags & SCF_DO_SUBSTR) {
4761 scan_commit(pRExC_state, data, minlenp, is_inf);
4762 /* Cannot extend fixed substrings */
4763 pos_before = data->pos_min;
4767 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4769 data->flags |= SF_IS_INF;
4771 if (flags & SCF_DO_STCLASS) {
4772 ssc_init(pRExC_state, &this_class);
4773 oclass = data->start_class;
4774 data->start_class = &this_class;
4775 f |= SCF_DO_STCLASS_AND;
4776 f &= ~SCF_DO_STCLASS_OR;
4778 /* Exclude from super-linear cache processing any {n,m}
4779 regops for which the combination of input pos and regex
4780 pos is not enough information to determine if a match
4783 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4784 regex pos at the \s*, the prospects for a match depend not
4785 only on the input position but also on how many (bar\s*)
4786 repeats into the {4,8} we are. */
4787 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4788 f &= ~SCF_WHILEM_VISITED_POS;
4790 /* This will finish on WHILEM, setting scan, or on NULL: */
4791 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4792 last, data, stopparen, recursed_depth, NULL,
4794 ? (f & ~SCF_DO_SUBSTR)
4798 if (flags & SCF_DO_STCLASS)
4799 data->start_class = oclass;
4800 if (mincount == 0 || minnext == 0) {
4801 if (flags & SCF_DO_STCLASS_OR) {
4802 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4804 else if (flags & SCF_DO_STCLASS_AND) {
4805 /* Switch to OR mode: cache the old value of
4806 * data->start_class */
4808 StructCopy(data->start_class, and_withp, regnode_ssc);
4809 flags &= ~SCF_DO_STCLASS_AND;
4810 StructCopy(&this_class, data->start_class, regnode_ssc);
4811 flags |= SCF_DO_STCLASS_OR;
4812 ANYOF_FLAGS(data->start_class)
4813 |= SSC_MATCHES_EMPTY_STRING;
4815 } else { /* Non-zero len */
4816 if (flags & SCF_DO_STCLASS_OR) {
4817 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4818 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4820 else if (flags & SCF_DO_STCLASS_AND)
4821 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4822 flags &= ~SCF_DO_STCLASS;
4824 if (!scan) /* It was not CURLYX, but CURLY. */
4826 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4827 /* ? quantifier ok, except for (?{ ... }) */
4828 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4829 && (minnext == 0) && (deltanext == 0)
4830 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4831 && maxcount <= REG_INFTY/3) /* Complement check for big
4834 /* Fatal warnings may leak the regexp without this: */
4835 SAVEFREESV(RExC_rx_sv);
4836 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4837 "Quantifier unexpected on zero-length expression "
4838 "in regex m/%"UTF8f"/",
4839 UTF8fARG(UTF, RExC_end - RExC_precomp,
4841 (void)ReREFCNT_inc(RExC_rx_sv);
4844 min += minnext * mincount;
4845 is_inf_internal |= deltanext == SSize_t_MAX
4846 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4847 is_inf |= is_inf_internal;
4849 delta = SSize_t_MAX;
4851 delta += (minnext + deltanext) * maxcount
4852 - minnext * mincount;
4854 /* Try powerful optimization CURLYX => CURLYN. */
4855 if ( OP(oscan) == CURLYX && data
4856 && data->flags & SF_IN_PAR
4857 && !(data->flags & SF_HAS_EVAL)
4858 && !deltanext && minnext == 1 ) {
4859 /* Try to optimize to CURLYN. */
4860 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4861 regnode * const nxt1 = nxt;
4868 if (!REGNODE_SIMPLE(OP(nxt))
4869 && !(PL_regkind[OP(nxt)] == EXACT
4870 && STR_LEN(nxt) == 1))
4876 if (OP(nxt) != CLOSE)
4878 if (RExC_open_parens) {
4879 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4880 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4882 /* Now we know that nxt2 is the only contents: */
4883 oscan->flags = (U8)ARG(nxt);
4885 OP(nxt1) = NOTHING; /* was OPEN. */
4888 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4889 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4890 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4891 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4892 OP(nxt + 1) = OPTIMIZED; /* was count. */
4893 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4898 /* Try optimization CURLYX => CURLYM. */
4899 if ( OP(oscan) == CURLYX && data
4900 && !(data->flags & SF_HAS_PAR)
4901 && !(data->flags & SF_HAS_EVAL)
4902 && !deltanext /* atom is fixed width */
4903 && minnext != 0 /* CURLYM can't handle zero width */
4905 /* Nor characters whose fold at run-time may be
4906 * multi-character */
4907 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4909 /* XXXX How to optimize if data == 0? */
4910 /* Optimize to a simpler form. */
4911 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4915 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4916 && (OP(nxt2) != WHILEM))
4918 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4919 /* Need to optimize away parenths. */
4920 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4921 /* Set the parenth number. */
4922 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4924 oscan->flags = (U8)ARG(nxt);
4925 if (RExC_open_parens) {
4926 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4927 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4929 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4930 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4933 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4934 OP(nxt + 1) = OPTIMIZED; /* was count. */
4935 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4936 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4939 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4940 regnode *nnxt = regnext(nxt1);
4942 if (reg_off_by_arg[OP(nxt1)])
4943 ARG_SET(nxt1, nxt2 - nxt1);
4944 else if (nxt2 - nxt1 < U16_MAX)
4945 NEXT_OFF(nxt1) = nxt2 - nxt1;
4947 OP(nxt) = NOTHING; /* Cannot beautify */
4952 /* Optimize again: */
4953 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4954 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4959 else if ((OP(oscan) == CURLYX)
4960 && (flags & SCF_WHILEM_VISITED_POS)
4961 /* See the comment on a similar expression above.
4962 However, this time it's not a subexpression
4963 we care about, but the expression itself. */
4964 && (maxcount == REG_INFTY)
4965 && data && ++data->whilem_c < 16) {
4966 /* This stays as CURLYX, we can put the count/of pair. */
4967 /* Find WHILEM (as in regexec.c) */
4968 regnode *nxt = oscan + NEXT_OFF(oscan);
4970 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4972 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4973 | (RExC_whilem_seen << 4)); /* On WHILEM */
4975 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4977 if (flags & SCF_DO_SUBSTR) {
4978 SV *last_str = NULL;
4979 STRLEN last_chrs = 0;
4980 int counted = mincount != 0;
4982 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4984 SSize_t b = pos_before >= data->last_start_min
4985 ? pos_before : data->last_start_min;
4987 const char * const s = SvPV_const(data->last_found, l);
4988 SSize_t old = b - data->last_start_min;
4991 old = utf8_hop((U8*)s, old) - (U8*)s;
4993 /* Get the added string: */
4994 last_str = newSVpvn_utf8(s + old, l, UTF);
4995 last_chrs = UTF ? utf8_length((U8*)(s + old),
4996 (U8*)(s + old + l)) : l;
4997 if (deltanext == 0 && pos_before == b) {
4998 /* What was added is a constant string */
5001 SvGROW(last_str, (mincount * l) + 1);
5002 repeatcpy(SvPVX(last_str) + l,
5003 SvPVX_const(last_str), l,
5005 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5006 /* Add additional parts. */
5007 SvCUR_set(data->last_found,
5008 SvCUR(data->last_found) - l);
5009 sv_catsv(data->last_found, last_str);
5011 SV * sv = data->last_found;
5013 SvUTF8(sv) && SvMAGICAL(sv) ?
5014 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5015 if (mg && mg->mg_len >= 0)
5016 mg->mg_len += last_chrs * (mincount-1);
5018 last_chrs *= mincount;
5019 data->last_end += l * (mincount - 1);
5022 /* start offset must point into the last copy */
5023 data->last_start_min += minnext * (mincount - 1);
5024 data->last_start_max += is_inf ? SSize_t_MAX
5025 : (maxcount - 1) * (minnext + data->pos_delta);
5028 /* It is counted once already... */
5029 data->pos_min += minnext * (mincount - counted);
5031 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5032 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5033 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5034 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5036 if (deltanext != SSize_t_MAX)
5037 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5038 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5039 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5041 if (deltanext == SSize_t_MAX
5042 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5043 data->pos_delta = SSize_t_MAX;
5045 data->pos_delta += - counted * deltanext +
5046 (minnext + deltanext) * maxcount - minnext * mincount;
5047 if (mincount != maxcount) {
5048 /* Cannot extend fixed substrings found inside
5050 scan_commit(pRExC_state, data, minlenp, is_inf);
5051 if (mincount && last_str) {
5052 SV * const sv = data->last_found;
5053 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5054 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5058 sv_setsv(sv, last_str);
5059 data->last_end = data->pos_min;
5060 data->last_start_min = data->pos_min - last_chrs;
5061 data->last_start_max = is_inf
5063 : data->pos_min + data->pos_delta - last_chrs;
5065 data->longest = &(data->longest_float);
5067 SvREFCNT_dec(last_str);
5069 if (data && (fl & SF_HAS_EVAL))
5070 data->flags |= SF_HAS_EVAL;
5071 optimize_curly_tail:
5072 if (OP(oscan) != CURLYX) {
5073 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5075 NEXT_OFF(oscan) += NEXT_OFF(next);
5081 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5086 if (flags & SCF_DO_SUBSTR) {
5087 /* Cannot expect anything... */
5088 scan_commit(pRExC_state, data, minlenp, is_inf);
5089 data->longest = &(data->longest_float);
5091 is_inf = is_inf_internal = 1;
5092 if (flags & SCF_DO_STCLASS_OR) {
5093 if (OP(scan) == CLUMP) {
5094 /* Actually is any start char, but very few code points
5095 * aren't start characters */
5096 ssc_match_all_cp(data->start_class);
5099 ssc_anything(data->start_class);
5102 flags &= ~SCF_DO_STCLASS;
5106 else if (OP(scan) == LNBREAK) {
5107 if (flags & SCF_DO_STCLASS) {
5108 if (flags & SCF_DO_STCLASS_AND) {
5109 ssc_intersection(data->start_class,
5110 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5111 ssc_clear_locale(data->start_class);
5112 ANYOF_FLAGS(data->start_class)
5113 &= ~SSC_MATCHES_EMPTY_STRING;
5115 else if (flags & SCF_DO_STCLASS_OR) {
5116 ssc_union(data->start_class,
5117 PL_XPosix_ptrs[_CC_VERTSPACE],
5119 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5121 /* See commit msg for
5122 * 749e076fceedeb708a624933726e7989f2302f6a */
5123 ANYOF_FLAGS(data->start_class)
5124 &= ~SSC_MATCHES_EMPTY_STRING;
5126 flags &= ~SCF_DO_STCLASS;
5129 delta++; /* Because of the 2 char string cr-lf */
5130 if (flags & SCF_DO_SUBSTR) {
5131 /* Cannot expect anything... */
5132 scan_commit(pRExC_state, data, minlenp, is_inf);
5134 data->pos_delta += 1;
5135 data->longest = &(data->longest_float);
5138 else if (REGNODE_SIMPLE(OP(scan))) {
5140 if (flags & SCF_DO_SUBSTR) {
5141 scan_commit(pRExC_state, data, minlenp, is_inf);
5145 if (flags & SCF_DO_STCLASS) {
5147 SV* my_invlist = sv_2mortal(_new_invlist(0));
5150 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5151 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5153 /* Some of the logic below assumes that switching
5154 locale on will only add false positives. */
5159 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5164 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5165 ssc_match_all_cp(data->start_class);
5170 SV* REG_ANY_invlist = _new_invlist(2);
5171 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5173 if (flags & SCF_DO_STCLASS_OR) {
5174 ssc_union(data->start_class,
5176 TRUE /* TRUE => invert, hence all but \n
5180 else if (flags & SCF_DO_STCLASS_AND) {
5181 ssc_intersection(data->start_class,
5183 TRUE /* TRUE => invert */
5185 ssc_clear_locale(data->start_class);
5187 SvREFCNT_dec_NN(REG_ANY_invlist);
5192 if (flags & SCF_DO_STCLASS_AND)
5193 ssc_and(pRExC_state, data->start_class,
5194 (regnode_charclass *) scan);
5196 ssc_or(pRExC_state, data->start_class,
5197 (regnode_charclass *) scan);
5205 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5206 if (flags & SCF_DO_STCLASS_AND) {
5207 bool was_there = cBOOL(
5208 ANYOF_POSIXL_TEST(data->start_class,
5210 ANYOF_POSIXL_ZERO(data->start_class);
5211 if (was_there) { /* Do an AND */
5212 ANYOF_POSIXL_SET(data->start_class, namedclass);
5214 /* No individual code points can now match */
5215 data->start_class->invlist
5216 = sv_2mortal(_new_invlist(0));
5219 int complement = namedclass + ((invert) ? -1 : 1);
5221 assert(flags & SCF_DO_STCLASS_OR);
5223 /* If the complement of this class was already there,
5224 * the result is that they match all code points,
5225 * (\d + \D == everything). Remove the classes from
5226 * future consideration. Locale is not relevant in
5228 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5229 ssc_match_all_cp(data->start_class);
5230 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5231 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5233 else { /* The usual case; just add this class to the
5235 ANYOF_POSIXL_SET(data->start_class, namedclass);
5240 case NPOSIXA: /* For these, we always know the exact set of
5245 if (FLAGS(scan) == _CC_ASCII) {
5246 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
5249 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5250 PL_XPosix_ptrs[_CC_ASCII],
5261 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5263 /* NPOSIXD matches all upper Latin1 code points unless the
5264 * target string being matched is UTF-8, which is
5265 * unknowable until match time. Since we are going to
5266 * invert, we want to get rid of all of them so that the
5267 * inversion will match all */
5268 if (OP(scan) == NPOSIXD) {
5269 _invlist_subtract(my_invlist, PL_UpperLatin1,
5275 if (flags & SCF_DO_STCLASS_AND) {
5276 ssc_intersection(data->start_class, my_invlist, invert);
5277 ssc_clear_locale(data->start_class);
5280 assert(flags & SCF_DO_STCLASS_OR);
5281 ssc_union(data->start_class, my_invlist, invert);
5284 if (flags & SCF_DO_STCLASS_OR)
5285 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5286 flags &= ~SCF_DO_STCLASS;
5289 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5290 data->flags |= (OP(scan) == MEOL
5293 scan_commit(pRExC_state, data, minlenp, is_inf);
5296 else if ( PL_regkind[OP(scan)] == BRANCHJ
5297 /* Lookbehind, or need to calculate parens/evals/stclass: */
5298 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5299 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5301 if ( OP(scan) == UNLESSM &&
5303 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5304 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5307 regnode *upto= regnext(scan);
5309 DEBUG_STUDYDATA("OPFAIL",data,depth);
5311 /*DEBUG_PARSE_MSG("opfail");*/
5312 regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state);
5313 PerlIO_printf(Perl_debug_log,
5314 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5315 SvPV_nolen_const(RExC_mysv),
5316 (IV)REG_NODE_NUM(upto),
5321 NEXT_OFF(scan) = upto - scan;
5322 for (opt= scan + 1; opt < upto ; opt++)
5323 OP(opt) = OPTIMIZED;
5327 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5328 || OP(scan) == UNLESSM )
5330 /* Negative Lookahead/lookbehind
5331 In this case we can't do fixed string optimisation.
5334 SSize_t deltanext, minnext, fake = 0;
5339 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5341 data_fake.whilem_c = data->whilem_c;
5342 data_fake.last_closep = data->last_closep;
5345 data_fake.last_closep = &fake;
5346 data_fake.pos_delta = delta;
5347 if ( flags & SCF_DO_STCLASS && !scan->flags
5348 && OP(scan) == IFMATCH ) { /* Lookahead */
5349 ssc_init(pRExC_state, &intrnl);
5350 data_fake.start_class = &intrnl;
5351 f |= SCF_DO_STCLASS_AND;
5353 if (flags & SCF_WHILEM_VISITED_POS)
5354 f |= SCF_WHILEM_VISITED_POS;
5355 next = regnext(scan);
5356 nscan = NEXTOPER(NEXTOPER(scan));
5357 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5358 last, &data_fake, stopparen,
5359 recursed_depth, NULL, f, depth+1);
5362 FAIL("Variable length lookbehind not implemented");
5364 else if (minnext > (I32)U8_MAX) {
5365 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5368 scan->flags = (U8)minnext;
5371 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5373 if (data_fake.flags & SF_HAS_EVAL)
5374 data->flags |= SF_HAS_EVAL;
5375 data->whilem_c = data_fake.whilem_c;
5377 if (f & SCF_DO_STCLASS_AND) {
5378 if (flags & SCF_DO_STCLASS_OR) {
5379 /* OR before, AND after: ideally we would recurse with
5380 * data_fake to get the AND applied by study of the
5381 * remainder of the pattern, and then derecurse;
5382 * *** HACK *** for now just treat as "no information".
5383 * See [perl #56690].
5385 ssc_init(pRExC_state, data->start_class);
5387 /* AND before and after: combine and continue. These
5388 * assertions are zero-length, so can match an EMPTY
5390 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5391 ANYOF_FLAGS(data->start_class)
5392 |= SSC_MATCHES_EMPTY_STRING;
5396 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5398 /* Positive Lookahead/lookbehind
5399 In this case we can do fixed string optimisation,
5400 but we must be careful about it. Note in the case of
5401 lookbehind the positions will be offset by the minimum
5402 length of the pattern, something we won't know about
5403 until after the recurse.
5405 SSize_t deltanext, fake = 0;
5409 /* We use SAVEFREEPV so that when the full compile
5410 is finished perl will clean up the allocated
5411 minlens when it's all done. This way we don't
5412 have to worry about freeing them when we know
5413 they wont be used, which would be a pain.
5416 Newx( minnextp, 1, SSize_t );
5417 SAVEFREEPV(minnextp);
5420 StructCopy(data, &data_fake, scan_data_t);
5421 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5424 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5425 data_fake.last_found=newSVsv(data->last_found);
5429 data_fake.last_closep = &fake;
5430 data_fake.flags = 0;
5431 data_fake.pos_delta = delta;
5433 data_fake.flags |= SF_IS_INF;
5434 if ( flags & SCF_DO_STCLASS && !scan->flags
5435 && OP(scan) == IFMATCH ) { /* Lookahead */
5436 ssc_init(pRExC_state, &intrnl);
5437 data_fake.start_class = &intrnl;
5438 f |= SCF_DO_STCLASS_AND;
5440 if (flags & SCF_WHILEM_VISITED_POS)
5441 f |= SCF_WHILEM_VISITED_POS;
5442 next = regnext(scan);
5443 nscan = NEXTOPER(NEXTOPER(scan));
5445 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5446 &deltanext, last, &data_fake,
5447 stopparen, recursed_depth, NULL,
5451 FAIL("Variable length lookbehind not implemented");
5453 else if (*minnextp > (I32)U8_MAX) {
5454 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5457 scan->flags = (U8)*minnextp;
5462 if (f & SCF_DO_STCLASS_AND) {
5463 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5464 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5467 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5469 if (data_fake.flags & SF_HAS_EVAL)
5470 data->flags |= SF_HAS_EVAL;
5471 data->whilem_c = data_fake.whilem_c;
5472 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5473 if (RExC_rx->minlen<*minnextp)
5474 RExC_rx->minlen=*minnextp;
5475 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5476 SvREFCNT_dec_NN(data_fake.last_found);
5478 if ( data_fake.minlen_fixed != minlenp )
5480 data->offset_fixed= data_fake.offset_fixed;
5481 data->minlen_fixed= data_fake.minlen_fixed;
5482 data->lookbehind_fixed+= scan->flags;
5484 if ( data_fake.minlen_float != minlenp )
5486 data->minlen_float= data_fake.minlen_float;
5487 data->offset_float_min=data_fake.offset_float_min;
5488 data->offset_float_max=data_fake.offset_float_max;
5489 data->lookbehind_float+= scan->flags;
5496 else if (OP(scan) == OPEN) {
5497 if (stopparen != (I32)ARG(scan))
5500 else if (OP(scan) == CLOSE) {
5501 if (stopparen == (I32)ARG(scan)) {
5504 if ((I32)ARG(scan) == is_par) {
5505 next = regnext(scan);
5507 if ( next && (OP(next) != WHILEM) && next < last)
5508 is_par = 0; /* Disable optimization */
5511 *(data->last_closep) = ARG(scan);
5513 else if (OP(scan) == EVAL) {
5515 data->flags |= SF_HAS_EVAL;
5517 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5518 if (flags & SCF_DO_SUBSTR) {
5519 scan_commit(pRExC_state, data, minlenp, is_inf);
5520 flags &= ~SCF_DO_SUBSTR;
5522 if (data && OP(scan)==ACCEPT) {
5523 data->flags |= SCF_SEEN_ACCEPT;
5528 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5530 if (flags & SCF_DO_SUBSTR) {
5531 scan_commit(pRExC_state, data, minlenp, is_inf);
5532 data->longest = &(data->longest_float);
5534 is_inf = is_inf_internal = 1;
5535 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5536 ssc_anything(data->start_class);
5537 flags &= ~SCF_DO_STCLASS;
5539 else if (OP(scan) == GPOS) {
5540 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5541 !(delta || is_inf || (data && data->pos_delta)))
5543 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5544 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5545 if (RExC_rx->gofs < (STRLEN)min)
5546 RExC_rx->gofs = min;
5548 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5552 #ifdef TRIE_STUDY_OPT
5553 #ifdef FULL_TRIE_STUDY
5554 else if (PL_regkind[OP(scan)] == TRIE) {
5555 /* NOTE - There is similar code to this block above for handling
5556 BRANCH nodes on the initial study. If you change stuff here
5558 regnode *trie_node= scan;
5559 regnode *tail= regnext(scan);
5560 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5561 SSize_t max1 = 0, min1 = SSize_t_MAX;
5564 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5565 /* Cannot merge strings after this. */
5566 scan_commit(pRExC_state, data, minlenp, is_inf);
5568 if (flags & SCF_DO_STCLASS)
5569 ssc_init_zero(pRExC_state, &accum);
5575 const regnode *nextbranch= NULL;
5578 for ( word=1 ; word <= trie->wordcount ; word++)
5580 SSize_t deltanext=0, minnext=0, f = 0, fake;
5581 regnode_ssc this_class;
5583 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5585 data_fake.whilem_c = data->whilem_c;
5586 data_fake.last_closep = data->last_closep;
5589 data_fake.last_closep = &fake;
5590 data_fake.pos_delta = delta;
5591 if (flags & SCF_DO_STCLASS) {
5592 ssc_init(pRExC_state, &this_class);
5593 data_fake.start_class = &this_class;
5594 f = SCF_DO_STCLASS_AND;
5596 if (flags & SCF_WHILEM_VISITED_POS)
5597 f |= SCF_WHILEM_VISITED_POS;
5599 if (trie->jump[word]) {
5601 nextbranch = trie_node + trie->jump[0];
5602 scan= trie_node + trie->jump[word];
5603 /* We go from the jump point to the branch that follows
5604 it. Note this means we need the vestigal unused
5605 branches even though they arent otherwise used. */
5606 minnext = study_chunk(pRExC_state, &scan, minlenp,
5607 &deltanext, (regnode *)nextbranch, &data_fake,
5608 stopparen, recursed_depth, NULL, f,depth+1);
5610 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5611 nextbranch= regnext((regnode*)nextbranch);
5613 if (min1 > (SSize_t)(minnext + trie->minlen))
5614 min1 = minnext + trie->minlen;
5615 if (deltanext == SSize_t_MAX) {
5616 is_inf = is_inf_internal = 1;
5618 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5619 max1 = minnext + deltanext + trie->maxlen;
5621 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5623 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5624 if ( stopmin > min + min1)
5625 stopmin = min + min1;
5626 flags &= ~SCF_DO_SUBSTR;
5628 data->flags |= SCF_SEEN_ACCEPT;
5631 if (data_fake.flags & SF_HAS_EVAL)
5632 data->flags |= SF_HAS_EVAL;
5633 data->whilem_c = data_fake.whilem_c;
5635 if (flags & SCF_DO_STCLASS)
5636 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5639 if (flags & SCF_DO_SUBSTR) {
5640 data->pos_min += min1;
5641 data->pos_delta += max1 - min1;
5642 if (max1 != min1 || is_inf)
5643 data->longest = &(data->longest_float);
5646 delta += max1 - min1;
5647 if (flags & SCF_DO_STCLASS_OR) {
5648 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5650 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5651 flags &= ~SCF_DO_STCLASS;
5654 else if (flags & SCF_DO_STCLASS_AND) {
5656 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5657 flags &= ~SCF_DO_STCLASS;
5660 /* Switch to OR mode: cache the old value of
5661 * data->start_class */
5663 StructCopy(data->start_class, and_withp, regnode_ssc);
5664 flags &= ~SCF_DO_STCLASS_AND;
5665 StructCopy(&accum, data->start_class, regnode_ssc);
5666 flags |= SCF_DO_STCLASS_OR;
5673 else if (PL_regkind[OP(scan)] == TRIE) {
5674 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5677 min += trie->minlen;
5678 delta += (trie->maxlen - trie->minlen);
5679 flags &= ~SCF_DO_STCLASS; /* xxx */
5680 if (flags & SCF_DO_SUBSTR) {
5681 /* Cannot expect anything... */
5682 scan_commit(pRExC_state, data, minlenp, is_inf);
5683 data->pos_min += trie->minlen;
5684 data->pos_delta += (trie->maxlen - trie->minlen);
5685 if (trie->maxlen != trie->minlen)
5686 data->longest = &(data->longest_float);
5688 if (trie->jump) /* no more substrings -- for now /grr*/
5689 flags &= ~SCF_DO_SUBSTR;
5691 #endif /* old or new */
5692 #endif /* TRIE_STUDY_OPT */
5694 /* Else: zero-length, ignore. */
5695 scan = regnext(scan);
5697 /* If we are exiting a recursion we can unset its recursed bit
5698 * and allow ourselves to enter it again - no danger of an
5699 * infinite loop there.
5700 if (stopparen > -1 && recursed) {
5701 DEBUG_STUDYDATA("unset:", data,depth);
5702 PAREN_UNSET( recursed, stopparen);
5708 DEBUG_STUDYDATA("frame-end:",data,depth);
5709 DEBUG_PEEP("fend", scan, depth);
5711 /* restore previous context */
5712 last = frame->last_regnode;
5713 scan = frame->next_regnode;
5714 stopparen = frame->stopparen;
5715 recursed_depth = frame->prev_recursed_depth;
5717 RExC_frame_last = frame->prev_frame;
5718 frame = frame->this_prev_frame;
5719 goto fake_study_recurse;
5724 DEBUG_STUDYDATA("pre-fin:",data,depth);
5727 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5729 if (flags & SCF_DO_SUBSTR && is_inf)
5730 data->pos_delta = SSize_t_MAX - data->pos_min;
5731 if (is_par > (I32)U8_MAX)
5733 if (is_par && pars==1 && data) {
5734 data->flags |= SF_IN_PAR;
5735 data->flags &= ~SF_HAS_PAR;
5737 else if (pars && data) {
5738 data->flags |= SF_HAS_PAR;
5739 data->flags &= ~SF_IN_PAR;
5741 if (flags & SCF_DO_STCLASS_OR)
5742 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5743 if (flags & SCF_TRIE_RESTUDY)
5744 data->flags |= SCF_TRIE_RESTUDY;
5746 DEBUG_STUDYDATA("post-fin:",data,depth);
5749 SSize_t final_minlen= min < stopmin ? min : stopmin;
5751 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5752 RExC_maxlen = final_minlen + delta;
5754 return final_minlen;
5760 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5762 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5764 PERL_ARGS_ASSERT_ADD_DATA;
5766 Renewc(RExC_rxi->data,
5767 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5768 char, struct reg_data);
5770 Renew(RExC_rxi->data->what, count + n, U8);
5772 Newx(RExC_rxi->data->what, n, U8);
5773 RExC_rxi->data->count = count + n;
5774 Copy(s, RExC_rxi->data->what + count, n, U8);
5778 /*XXX: todo make this not included in a non debugging perl, but appears to be
5779 * used anyway there, in 'use re' */
5780 #ifndef PERL_IN_XSUB_RE
5782 Perl_reginitcolors(pTHX)
5784 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5786 char *t = savepv(s);
5790 t = strchr(t, '\t');
5796 PL_colors[i] = t = (char *)"";
5801 PL_colors[i++] = (char *)"";
5808 #ifdef TRIE_STUDY_OPT
5809 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5812 (data.flags & SCF_TRIE_RESTUDY) \
5820 #define CHECK_RESTUDY_GOTO_butfirst
5824 * pregcomp - compile a regular expression into internal code
5826 * Decides which engine's compiler to call based on the hint currently in
5830 #ifndef PERL_IN_XSUB_RE
5832 /* return the currently in-scope regex engine (or the default if none) */
5834 regexp_engine const *
5835 Perl_current_re_engine(pTHX)
5837 if (IN_PERL_COMPILETIME) {
5838 HV * const table = GvHV(PL_hintgv);
5841 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5842 return &PL_core_reg_engine;
5843 ptr = hv_fetchs(table, "regcomp", FALSE);
5844 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5845 return &PL_core_reg_engine;
5846 return INT2PTR(regexp_engine*,SvIV(*ptr));
5850 if (!PL_curcop->cop_hints_hash)
5851 return &PL_core_reg_engine;
5852 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5853 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5854 return &PL_core_reg_engine;
5855 return INT2PTR(regexp_engine*,SvIV(ptr));
5861 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5863 regexp_engine const *eng = current_re_engine();
5864 GET_RE_DEBUG_FLAGS_DECL;
5866 PERL_ARGS_ASSERT_PREGCOMP;
5868 /* Dispatch a request to compile a regexp to correct regexp engine. */
5870 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5873 return CALLREGCOMP_ENG(eng, pattern, flags);
5877 /* public(ish) entry point for the perl core's own regex compiling code.
5878 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5879 * pattern rather than a list of OPs, and uses the internal engine rather
5880 * than the current one */
5883 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5885 SV *pat = pattern; /* defeat constness! */
5886 PERL_ARGS_ASSERT_RE_COMPILE;
5887 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5888 #ifdef PERL_IN_XSUB_RE
5891 &PL_core_reg_engine,
5893 NULL, NULL, rx_flags, 0);
5897 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5898 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5899 * point to the realloced string and length.
5901 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5905 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5906 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5908 U8 *const src = (U8*)*pat_p;
5913 GET_RE_DEBUG_FLAGS_DECL;
5915 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5916 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5918 Newx(dst, *plen_p * 2 + 1, U8);
5921 while (s < *plen_p) {
5922 append_utf8_from_native_byte(src[s], &d);
5923 if (n < num_code_blocks) {
5924 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5925 pRExC_state->code_blocks[n].start = d - dst - 1;
5926 assert(*(d - 1) == '(');
5929 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5930 pRExC_state->code_blocks[n].end = d - dst - 1;
5931 assert(*(d - 1) == ')');
5940 *pat_p = (char*) dst;
5942 RExC_orig_utf8 = RExC_utf8 = 1;
5947 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5948 * while recording any code block indices, and handling overloading,
5949 * nested qr// objects etc. If pat is null, it will allocate a new
5950 * string, or just return the first arg, if there's only one.
5952 * Returns the malloced/updated pat.
5953 * patternp and pat_count is the array of SVs to be concatted;
5954 * oplist is the optional list of ops that generated the SVs;
5955 * recompile_p is a pointer to a boolean that will be set if
5956 * the regex will need to be recompiled.
5957 * delim, if non-null is an SV that will be inserted between each element
5961 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5962 SV *pat, SV ** const patternp, int pat_count,
5963 OP *oplist, bool *recompile_p, SV *delim)
5967 bool use_delim = FALSE;
5968 bool alloced = FALSE;
5970 /* if we know we have at least two args, create an empty string,
5971 * then concatenate args to that. For no args, return an empty string */
5972 if (!pat && pat_count != 1) {
5978 for (svp = patternp; svp < patternp + pat_count; svp++) {
5981 STRLEN orig_patlen = 0;
5983 SV *msv = use_delim ? delim : *svp;
5984 if (!msv) msv = &PL_sv_undef;
5986 /* if we've got a delimiter, we go round the loop twice for each
5987 * svp slot (except the last), using the delimiter the second
5996 if (SvTYPE(msv) == SVt_PVAV) {
5997 /* we've encountered an interpolated array within
5998 * the pattern, e.g. /...@a..../. Expand the list of elements,
5999 * then recursively append elements.
6000 * The code in this block is based on S_pushav() */
6002 AV *const av = (AV*)msv;
6003 const SSize_t maxarg = AvFILL(av) + 1;
6007 assert(oplist->op_type == OP_PADAV
6008 || oplist->op_type == OP_RV2AV);
6009 oplist = OP_SIBLING(oplist);
6012 if (SvRMAGICAL(av)) {
6015 Newx(array, maxarg, SV*);
6017 for (i=0; i < maxarg; i++) {
6018 SV ** const svp = av_fetch(av, i, FALSE);
6019 array[i] = svp ? *svp : &PL_sv_undef;
6023 array = AvARRAY(av);
6025 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6026 array, maxarg, NULL, recompile_p,
6028 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6034 /* we make the assumption here that each op in the list of
6035 * op_siblings maps to one SV pushed onto the stack,
6036 * except for code blocks, with have both an OP_NULL and
6038 * This allows us to match up the list of SVs against the
6039 * list of OPs to find the next code block.
6041 * Note that PUSHMARK PADSV PADSV ..
6043 * PADRANGE PADSV PADSV ..
6044 * so the alignment still works. */
6047 if (oplist->op_type == OP_NULL
6048 && (oplist->op_flags & OPf_SPECIAL))
6050 assert(n < pRExC_state->num_code_blocks);
6051 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6052 pRExC_state->code_blocks[n].block = oplist;
6053 pRExC_state->code_blocks[n].src_regex = NULL;
6056 oplist = OP_SIBLING(oplist); /* skip CONST */
6059 oplist = OP_SIBLING(oplist);;
6062 /* apply magic and QR overloading to arg */
6065 if (SvROK(msv) && SvAMAGIC(msv)) {
6066 SV *sv = AMG_CALLunary(msv, regexp_amg);
6070 if (SvTYPE(sv) != SVt_REGEXP)
6071 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6076 /* try concatenation overload ... */
6077 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6078 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6081 /* overloading involved: all bets are off over literal
6082 * code. Pretend we haven't seen it */
6083 pRExC_state->num_code_blocks -= n;
6087 /* ... or failing that, try "" overload */
6088 while (SvAMAGIC(msv)
6089 && (sv = AMG_CALLunary(msv, string_amg))
6093 && SvRV(msv) == SvRV(sv))
6098 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6102 /* this is a partially unrolled
6103 * sv_catsv_nomg(pat, msv);
6104 * that allows us to adjust code block indices if
6107 char *dst = SvPV_force_nomg(pat, dlen);
6109 if (SvUTF8(msv) && !SvUTF8(pat)) {
6110 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6111 sv_setpvn(pat, dst, dlen);
6114 sv_catsv_nomg(pat, msv);
6121 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6124 /* extract any code blocks within any embedded qr//'s */
6125 if (rx && SvTYPE(rx) == SVt_REGEXP
6126 && RX_ENGINE((REGEXP*)rx)->op_comp)
6129 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6130 if (ri->num_code_blocks) {
6132 /* the presence of an embedded qr// with code means
6133 * we should always recompile: the text of the
6134 * qr// may not have changed, but it may be a
6135 * different closure than last time */
6137 Renew(pRExC_state->code_blocks,
6138 pRExC_state->num_code_blocks + ri->num_code_blocks,
6139 struct reg_code_block);
6140 pRExC_state->num_code_blocks += ri->num_code_blocks;
6142 for (i=0; i < ri->num_code_blocks; i++) {
6143 struct reg_code_block *src, *dst;
6144 STRLEN offset = orig_patlen
6145 + ReANY((REGEXP *)rx)->pre_prefix;
6146 assert(n < pRExC_state->num_code_blocks);
6147 src = &ri->code_blocks[i];
6148 dst = &pRExC_state->code_blocks[n];
6149 dst->start = src->start + offset;
6150 dst->end = src->end + offset;
6151 dst->block = src->block;
6152 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6161 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6170 /* see if there are any run-time code blocks in the pattern.
6171 * False positives are allowed */
6174 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6175 char *pat, STRLEN plen)
6180 PERL_UNUSED_CONTEXT;
6182 for (s = 0; s < plen; s++) {
6183 if (n < pRExC_state->num_code_blocks
6184 && s == pRExC_state->code_blocks[n].start)
6186 s = pRExC_state->code_blocks[n].end;
6190 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6192 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6194 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6201 /* Handle run-time code blocks. We will already have compiled any direct
6202 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6203 * copy of it, but with any literal code blocks blanked out and
6204 * appropriate chars escaped; then feed it into
6206 * eval "qr'modified_pattern'"
6210 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6214 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6216 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6217 * and merge them with any code blocks of the original regexp.
6219 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6220 * instead, just save the qr and return FALSE; this tells our caller that
6221 * the original pattern needs upgrading to utf8.
6225 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6226 char *pat, STRLEN plen)
6230 GET_RE_DEBUG_FLAGS_DECL;
6232 if (pRExC_state->runtime_code_qr) {
6233 /* this is the second time we've been called; this should
6234 * only happen if the main pattern got upgraded to utf8
6235 * during compilation; re-use the qr we compiled first time
6236 * round (which should be utf8 too)
6238 qr = pRExC_state->runtime_code_qr;
6239 pRExC_state->runtime_code_qr = NULL;
6240 assert(RExC_utf8 && SvUTF8(qr));
6246 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6250 /* determine how many extra chars we need for ' and \ escaping */
6251 for (s = 0; s < plen; s++) {
6252 if (pat[s] == '\'' || pat[s] == '\\')
6256 Newx(newpat, newlen, char);
6258 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6260 for (s = 0; s < plen; s++) {
6261 if (n < pRExC_state->num_code_blocks
6262 && s == pRExC_state->code_blocks[n].start)
6264 /* blank out literal code block */
6265 assert(pat[s] == '(');
6266 while (s <= pRExC_state->code_blocks[n].end) {
6274 if (pat[s] == '\'' || pat[s] == '\\')
6279 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6283 PerlIO_printf(Perl_debug_log,
6284 "%sre-parsing pattern for runtime code:%s %s\n",
6285 PL_colors[4],PL_colors[5],newpat);
6288 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6293 PUSHSTACKi(PERLSI_REQUIRE);
6294 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6295 * parsing qr''; normally only q'' does this. It also alters
6297 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6298 SvREFCNT_dec_NN(sv);
6303 SV * const errsv = ERRSV;
6304 if (SvTRUE_NN(errsv))
6306 Safefree(pRExC_state->code_blocks);
6307 /* use croak_sv ? */
6308 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6311 assert(SvROK(qr_ref));
6313 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6314 /* the leaving below frees the tmp qr_ref.
6315 * Give qr a life of its own */
6323 if (!RExC_utf8 && SvUTF8(qr)) {
6324 /* first time through; the pattern got upgraded; save the
6325 * qr for the next time through */
6326 assert(!pRExC_state->runtime_code_qr);
6327 pRExC_state->runtime_code_qr = qr;
6332 /* extract any code blocks within the returned qr// */
6335 /* merge the main (r1) and run-time (r2) code blocks into one */
6337 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6338 struct reg_code_block *new_block, *dst;
6339 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6342 if (!r2->num_code_blocks) /* we guessed wrong */
6344 SvREFCNT_dec_NN(qr);
6349 r1->num_code_blocks + r2->num_code_blocks,
6350 struct reg_code_block);
6353 while ( i1 < r1->num_code_blocks
6354 || i2 < r2->num_code_blocks)
6356 struct reg_code_block *src;
6359 if (i1 == r1->num_code_blocks) {
6360 src = &r2->code_blocks[i2++];
6363 else if (i2 == r2->num_code_blocks)
6364 src = &r1->code_blocks[i1++];
6365 else if ( r1->code_blocks[i1].start
6366 < r2->code_blocks[i2].start)
6368 src = &r1->code_blocks[i1++];
6369 assert(src->end < r2->code_blocks[i2].start);
6372 assert( r1->code_blocks[i1].start
6373 > r2->code_blocks[i2].start);
6374 src = &r2->code_blocks[i2++];
6376 assert(src->end < r1->code_blocks[i1].start);
6379 assert(pat[src->start] == '(');
6380 assert(pat[src->end] == ')');
6381 dst->start = src->start;
6382 dst->end = src->end;
6383 dst->block = src->block;
6384 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6388 r1->num_code_blocks += r2->num_code_blocks;
6389 Safefree(r1->code_blocks);
6390 r1->code_blocks = new_block;
6393 SvREFCNT_dec_NN(qr);
6399 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6400 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6401 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6402 STRLEN longest_length, bool eol, bool meol)
6404 /* This is the common code for setting up the floating and fixed length
6405 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6406 * as to whether succeeded or not */
6411 if (! (longest_length
6412 || (eol /* Can't have SEOL and MULTI */
6413 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6415 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6416 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6421 /* copy the information about the longest from the reg_scan_data
6422 over to the program. */
6423 if (SvUTF8(sv_longest)) {
6424 *rx_utf8 = sv_longest;
6427 *rx_substr = sv_longest;
6430 /* end_shift is how many chars that must be matched that
6431 follow this item. We calculate it ahead of time as once the
6432 lookbehind offset is added in we lose the ability to correctly
6434 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6435 *rx_end_shift = ml - offset
6436 - longest_length + (SvTAIL(sv_longest) != 0)
6439 t = (eol/* Can't have SEOL and MULTI */
6440 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6441 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6447 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6448 * regular expression into internal code.
6449 * The pattern may be passed either as:
6450 * a list of SVs (patternp plus pat_count)
6451 * a list of OPs (expr)
6452 * If both are passed, the SV list is used, but the OP list indicates
6453 * which SVs are actually pre-compiled code blocks
6455 * The SVs in the list have magic and qr overloading applied to them (and
6456 * the list may be modified in-place with replacement SVs in the latter
6459 * If the pattern hasn't changed from old_re, then old_re will be
6462 * eng is the current engine. If that engine has an op_comp method, then
6463 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6464 * do the initial concatenation of arguments and pass on to the external
6467 * If is_bare_re is not null, set it to a boolean indicating whether the
6468 * arg list reduced (after overloading) to a single bare regex which has
6469 * been returned (i.e. /$qr/).
6471 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6473 * pm_flags contains the PMf_* flags, typically based on those from the
6474 * pm_flags field of the related PMOP. Currently we're only interested in
6475 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6477 * We can't allocate space until we know how big the compiled form will be,
6478 * but we can't compile it (and thus know how big it is) until we've got a
6479 * place to put the code. So we cheat: we compile it twice, once with code
6480 * generation turned off and size counting turned on, and once "for real".
6481 * This also means that we don't allocate space until we are sure that the
6482 * thing really will compile successfully, and we never have to move the
6483 * code and thus invalidate pointers into it. (Note that it has to be in
6484 * one piece because free() must be able to free it all.) [NB: not true in perl]
6486 * Beware that the optimization-preparation code in here knows about some
6487 * of the structure of the compiled regexp. [I'll say.]
6491 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6492 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6493 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6497 regexp_internal *ri;
6505 SV *code_blocksv = NULL;
6506 SV** new_patternp = patternp;
6508 /* these are all flags - maybe they should be turned
6509 * into a single int with different bit masks */
6510 I32 sawlookahead = 0;
6515 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6517 bool runtime_code = 0;
6519 RExC_state_t RExC_state;
6520 RExC_state_t * const pRExC_state = &RExC_state;
6521 #ifdef TRIE_STUDY_OPT
6523 RExC_state_t copyRExC_state;
6525 GET_RE_DEBUG_FLAGS_DECL;
6527 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6529 DEBUG_r(if (!PL_colorset) reginitcolors());
6531 #ifndef PERL_IN_XSUB_RE
6532 /* Initialize these here instead of as-needed, as is quick and avoids
6533 * having to test them each time otherwise */
6534 if (! PL_AboveLatin1) {
6535 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6536 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6537 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6538 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6539 PL_HasMultiCharFold =
6540 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6542 /* This is calculated here, because the Perl program that generates the
6543 * static global ones doesn't currently have access to
6544 * NUM_ANYOF_CODE_POINTS */
6545 PL_InBitmap = _new_invlist(2);
6546 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6547 NUM_ANYOF_CODE_POINTS - 1);
6551 pRExC_state->code_blocks = NULL;
6552 pRExC_state->num_code_blocks = 0;
6555 *is_bare_re = FALSE;
6557 if (expr && (expr->op_type == OP_LIST ||
6558 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6559 /* allocate code_blocks if needed */
6563 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6564 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6565 ncode++; /* count of DO blocks */
6567 pRExC_state->num_code_blocks = ncode;
6568 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6573 /* compile-time pattern with just OP_CONSTs and DO blocks */
6578 /* find how many CONSTs there are */
6581 if (expr->op_type == OP_CONST)
6584 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6585 if (o->op_type == OP_CONST)
6589 /* fake up an SV array */
6591 assert(!new_patternp);
6592 Newx(new_patternp, n, SV*);
6593 SAVEFREEPV(new_patternp);
6597 if (expr->op_type == OP_CONST)
6598 new_patternp[n] = cSVOPx_sv(expr);
6600 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6601 if (o->op_type == OP_CONST)
6602 new_patternp[n++] = cSVOPo_sv;
6607 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6608 "Assembling pattern from %d elements%s\n", pat_count,
6609 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6611 /* set expr to the first arg op */
6613 if (pRExC_state->num_code_blocks
6614 && expr->op_type != OP_CONST)
6616 expr = cLISTOPx(expr)->op_first;
6617 assert( expr->op_type == OP_PUSHMARK
6618 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6619 || expr->op_type == OP_PADRANGE);
6620 expr = OP_SIBLING(expr);
6623 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6624 expr, &recompile, NULL);
6626 /* handle bare (possibly after overloading) regex: foo =~ $re */
6631 if (SvTYPE(re) == SVt_REGEXP) {
6635 Safefree(pRExC_state->code_blocks);
6636 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6637 "Precompiled pattern%s\n",
6638 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6644 exp = SvPV_nomg(pat, plen);
6646 if (!eng->op_comp) {
6647 if ((SvUTF8(pat) && IN_BYTES)
6648 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6650 /* make a temporary copy; either to convert to bytes,
6651 * or to avoid repeating get-magic / overloaded stringify */
6652 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6653 (IN_BYTES ? 0 : SvUTF8(pat)));
6655 Safefree(pRExC_state->code_blocks);
6656 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6659 /* ignore the utf8ness if the pattern is 0 length */
6660 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6661 RExC_uni_semantics = 0;
6662 RExC_contains_locale = 0;
6663 RExC_contains_i = 0;
6664 pRExC_state->runtime_code_qr = NULL;
6665 RExC_frame_head= NULL;
6666 RExC_frame_last= NULL;
6667 RExC_frame_count= 0;
6670 RExC_mysv1= sv_newmortal();
6671 RExC_mysv2= sv_newmortal();
6674 SV *dsv= sv_newmortal();
6675 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6676 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6677 PL_colors[4],PL_colors[5],s);
6681 /* we jump here if we upgrade the pattern to utf8 and have to
6684 if ((pm_flags & PMf_USE_RE_EVAL)
6685 /* this second condition covers the non-regex literal case,
6686 * i.e. $foo =~ '(?{})'. */
6687 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6689 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6691 /* return old regex if pattern hasn't changed */
6692 /* XXX: note in the below we have to check the flags as well as the
6695 * Things get a touch tricky as we have to compare the utf8 flag
6696 * independently from the compile flags. */
6700 && !!RX_UTF8(old_re) == !!RExC_utf8
6701 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6702 && RX_PRECOMP(old_re)
6703 && RX_PRELEN(old_re) == plen
6704 && memEQ(RX_PRECOMP(old_re), exp, plen)
6705 && !runtime_code /* with runtime code, always recompile */ )
6707 Safefree(pRExC_state->code_blocks);
6711 rx_flags = orig_rx_flags;
6713 if (rx_flags & PMf_FOLD) {
6714 RExC_contains_i = 1;
6716 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6718 /* Set to use unicode semantics if the pattern is in utf8 and has the
6719 * 'depends' charset specified, as it means unicode when utf8 */
6720 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6724 RExC_flags = rx_flags;
6725 RExC_pm_flags = pm_flags;
6728 if (TAINTING_get && TAINT_get)
6729 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6731 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6732 /* whoops, we have a non-utf8 pattern, whilst run-time code
6733 * got compiled as utf8. Try again with a utf8 pattern */
6734 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6735 pRExC_state->num_code_blocks);
6736 goto redo_first_pass;
6739 assert(!pRExC_state->runtime_code_qr);
6745 RExC_in_lookbehind = 0;
6746 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6748 RExC_override_recoding = 0;
6749 RExC_in_multi_char_class = 0;
6751 /* First pass: determine size, legality. */
6754 RExC_end = exp + plen;
6759 RExC_emit = (regnode *) &RExC_emit_dummy;
6760 RExC_whilem_seen = 0;
6761 RExC_open_parens = NULL;
6762 RExC_close_parens = NULL;
6764 RExC_paren_names = NULL;
6766 RExC_paren_name_list = NULL;
6768 RExC_recurse = NULL;
6769 RExC_study_chunk_recursed = NULL;
6770 RExC_study_chunk_recursed_bytes= 0;
6771 RExC_recurse_count = 0;
6772 pRExC_state->code_index = 0;
6774 #if 0 /* REGC() is (currently) a NOP at the first pass.
6775 * Clever compilers notice this and complain. --jhi */
6776 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6779 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6781 RExC_lastparse=NULL;
6783 /* reg may croak on us, not giving us a chance to free
6784 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6785 need it to survive as long as the regexp (qr/(?{})/).
6786 We must check that code_blocksv is not already set, because we may
6787 have jumped back to restart the sizing pass. */
6788 if (pRExC_state->code_blocks && !code_blocksv) {
6789 code_blocksv = newSV_type(SVt_PV);
6790 SAVEFREESV(code_blocksv);
6791 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6792 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6794 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6795 /* It's possible to write a regexp in ascii that represents Unicode
6796 codepoints outside of the byte range, such as via \x{100}. If we
6797 detect such a sequence we have to convert the entire pattern to utf8
6798 and then recompile, as our sizing calculation will have been based
6799 on 1 byte == 1 character, but we will need to use utf8 to encode
6800 at least some part of the pattern, and therefore must convert the whole
6803 if (flags & RESTART_UTF8) {
6804 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6805 pRExC_state->num_code_blocks);
6806 goto redo_first_pass;
6808 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6811 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6814 PerlIO_printf(Perl_debug_log,
6815 "Required size %"IVdf" nodes\n"
6816 "Starting second pass (creation)\n",
6819 RExC_lastparse=NULL;
6822 /* The first pass could have found things that force Unicode semantics */
6823 if ((RExC_utf8 || RExC_uni_semantics)
6824 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6826 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6829 /* Small enough for pointer-storage convention?
6830 If extralen==0, this means that we will not need long jumps. */
6831 if (RExC_size >= 0x10000L && RExC_extralen)
6832 RExC_size += RExC_extralen;
6835 if (RExC_whilem_seen > 15)
6836 RExC_whilem_seen = 15;
6838 /* Allocate space and zero-initialize. Note, the two step process
6839 of zeroing when in debug mode, thus anything assigned has to
6840 happen after that */
6841 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6843 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6844 char, regexp_internal);
6845 if ( r == NULL || ri == NULL )
6846 FAIL("Regexp out of space");
6848 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6849 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6852 /* bulk initialize base fields with 0. */
6853 Zero(ri, sizeof(regexp_internal), char);
6856 /* non-zero initialization begins here */
6859 r->extflags = rx_flags;
6860 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6862 if (pm_flags & PMf_IS_QR) {
6863 ri->code_blocks = pRExC_state->code_blocks;
6864 ri->num_code_blocks = pRExC_state->num_code_blocks;
6869 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6870 if (pRExC_state->code_blocks[n].src_regex)
6871 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6872 SAVEFREEPV(pRExC_state->code_blocks);
6876 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6877 bool has_charset = (get_regex_charset(r->extflags)
6878 != REGEX_DEPENDS_CHARSET);
6880 /* The caret is output if there are any defaults: if not all the STD
6881 * flags are set, or if no character set specifier is needed */
6883 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6885 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6886 == REG_RUN_ON_COMMENT_SEEN);
6887 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6888 >> RXf_PMf_STD_PMMOD_SHIFT);
6889 const char *fptr = STD_PAT_MODS; /*"msix"*/
6891 /* Allocate for the worst case, which is all the std flags are turned
6892 * on. If more precision is desired, we could do a population count of
6893 * the flags set. This could be done with a small lookup table, or by
6894 * shifting, masking and adding, or even, when available, assembly
6895 * language for a machine-language population count.
6896 * We never output a minus, as all those are defaults, so are
6897 * covered by the caret */
6898 const STRLEN wraplen = plen + has_p + has_runon
6899 + has_default /* If needs a caret */
6901 /* If needs a character set specifier */
6902 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6903 + (sizeof(STD_PAT_MODS) - 1)
6904 + (sizeof("(?:)") - 1);
6906 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6907 r->xpv_len_u.xpvlenu_pv = p;
6909 SvFLAGS(rx) |= SVf_UTF8;
6912 /* If a default, cover it using the caret */
6914 *p++= DEFAULT_PAT_MOD;
6918 const char* const name = get_regex_charset_name(r->extflags, &len);
6919 Copy(name, p, len, char);
6923 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6926 while((ch = *fptr++)) {
6934 Copy(RExC_precomp, p, plen, char);
6935 assert ((RX_WRAPPED(rx) - p) < 16);
6936 r->pre_prefix = p - RX_WRAPPED(rx);
6942 SvCUR_set(rx, p - RX_WRAPPED(rx));
6946 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6948 /* setup various meta data about recursion, this all requires
6949 * RExC_npar to be correctly set, and a bit later on we clear it */
6950 if (RExC_seen & REG_RECURSE_SEEN) {
6951 Newxz(RExC_open_parens, RExC_npar,regnode *);
6952 SAVEFREEPV(RExC_open_parens);
6953 Newxz(RExC_close_parens,RExC_npar,regnode *);
6954 SAVEFREEPV(RExC_close_parens);
6956 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6957 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6958 * So its 1 if there are no parens. */
6959 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6960 ((RExC_npar & 0x07) != 0);
6961 Newx(RExC_study_chunk_recursed,
6962 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6963 SAVEFREEPV(RExC_study_chunk_recursed);
6966 /* Useful during FAIL. */
6967 #ifdef RE_TRACK_PATTERN_OFFSETS
6968 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6969 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6970 "%s %"UVuf" bytes for offset annotations.\n",
6971 ri->u.offsets ? "Got" : "Couldn't get",
6972 (UV)((2*RExC_size+1) * sizeof(U32))));
6974 SetProgLen(ri,RExC_size);
6979 /* Second pass: emit code. */
6980 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6981 RExC_pm_flags = pm_flags;
6983 RExC_end = exp + plen;
6986 RExC_emit_start = ri->program;
6987 RExC_emit = ri->program;
6988 RExC_emit_bound = ri->program + RExC_size + 1;
6989 pRExC_state->code_index = 0;
6991 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6992 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6994 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6996 /* XXXX To minimize changes to RE engine we always allocate
6997 3-units-long substrs field. */
6998 Newx(r->substrs, 1, struct reg_substr_data);
6999 if (RExC_recurse_count) {
7000 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7001 SAVEFREEPV(RExC_recurse);
7005 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7007 RExC_study_chunk_recursed_count= 0;
7009 Zero(r->substrs, 1, struct reg_substr_data);
7010 if (RExC_study_chunk_recursed) {
7011 Zero(RExC_study_chunk_recursed,
7012 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7016 #ifdef TRIE_STUDY_OPT
7018 StructCopy(&zero_scan_data, &data, scan_data_t);
7019 copyRExC_state = RExC_state;
7022 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7024 RExC_state = copyRExC_state;
7025 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7026 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7028 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7029 StructCopy(&zero_scan_data, &data, scan_data_t);
7032 StructCopy(&zero_scan_data, &data, scan_data_t);
7035 /* Dig out information for optimizations. */
7036 r->extflags = RExC_flags; /* was pm_op */
7037 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7040 SvUTF8_on(rx); /* Unicode in it? */
7041 ri->regstclass = NULL;
7042 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
7043 r->intflags |= PREGf_NAUGHTY;
7044 scan = ri->program + 1; /* First BRANCH. */
7046 /* testing for BRANCH here tells us whether there is "must appear"
7047 data in the pattern. If there is then we can use it for optimisations */
7048 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7051 STRLEN longest_float_length, longest_fixed_length;
7052 regnode_ssc ch_class; /* pointed to by data */
7054 SSize_t last_close = 0; /* pointed to by data */
7055 regnode *first= scan;
7056 regnode *first_next= regnext(first);
7058 * Skip introductions and multiplicators >= 1
7059 * so that we can extract the 'meat' of the pattern that must
7060 * match in the large if() sequence following.
7061 * NOTE that EXACT is NOT covered here, as it is normally
7062 * picked up by the optimiser separately.
7064 * This is unfortunate as the optimiser isnt handling lookahead
7065 * properly currently.
7068 while ((OP(first) == OPEN && (sawopen = 1)) ||
7069 /* An OR of *one* alternative - should not happen now. */
7070 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7071 /* for now we can't handle lookbehind IFMATCH*/
7072 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7073 (OP(first) == PLUS) ||
7074 (OP(first) == MINMOD) ||
7075 /* An {n,m} with n>0 */
7076 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7077 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7080 * the only op that could be a regnode is PLUS, all the rest
7081 * will be regnode_1 or regnode_2.
7083 * (yves doesn't think this is true)
7085 if (OP(first) == PLUS)
7088 if (OP(first) == MINMOD)
7090 first += regarglen[OP(first)];
7092 first = NEXTOPER(first);
7093 first_next= regnext(first);
7096 /* Starting-point info. */
7098 DEBUG_PEEP("first:",first,0);
7099 /* Ignore EXACT as we deal with it later. */
7100 if (PL_regkind[OP(first)] == EXACT) {
7101 if (OP(first) == EXACT)
7102 NOOP; /* Empty, get anchored substr later. */
7104 ri->regstclass = first;
7107 else if (PL_regkind[OP(first)] == TRIE &&
7108 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7110 /* this can happen only on restudy */
7111 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7114 else if (REGNODE_SIMPLE(OP(first)))
7115 ri->regstclass = first;
7116 else if (PL_regkind[OP(first)] == BOUND ||
7117 PL_regkind[OP(first)] == NBOUND)
7118 ri->regstclass = first;
7119 else if (PL_regkind[OP(first)] == BOL) {
7120 r->intflags |= (OP(first) == MBOL
7123 first = NEXTOPER(first);
7126 else if (OP(first) == GPOS) {
7127 r->intflags |= PREGf_ANCH_GPOS;
7128 first = NEXTOPER(first);
7131 else if ((!sawopen || !RExC_sawback) &&
7133 (OP(first) == STAR &&
7134 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7135 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7137 /* turn .* into ^.* with an implied $*=1 */
7139 (OP(NEXTOPER(first)) == REG_ANY)
7142 r->intflags |= (type | PREGf_IMPLICIT);
7143 first = NEXTOPER(first);
7146 if (sawplus && !sawminmod && !sawlookahead
7147 && (!sawopen || !RExC_sawback)
7148 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7149 /* x+ must match at the 1st pos of run of x's */
7150 r->intflags |= PREGf_SKIP;
7152 /* Scan is after the zeroth branch, first is atomic matcher. */
7153 #ifdef TRIE_STUDY_OPT
7156 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7157 (IV)(first - scan + 1))
7161 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7162 (IV)(first - scan + 1))
7168 * If there's something expensive in the r.e., find the
7169 * longest literal string that must appear and make it the
7170 * regmust. Resolve ties in favor of later strings, since
7171 * the regstart check works with the beginning of the r.e.
7172 * and avoiding duplication strengthens checking. Not a
7173 * strong reason, but sufficient in the absence of others.
7174 * [Now we resolve ties in favor of the earlier string if
7175 * it happens that c_offset_min has been invalidated, since the
7176 * earlier string may buy us something the later one won't.]
7179 data.longest_fixed = newSVpvs("");
7180 data.longest_float = newSVpvs("");
7181 data.last_found = newSVpvs("");
7182 data.longest = &(data.longest_fixed);
7183 ENTER_with_name("study_chunk");
7184 SAVEFREESV(data.longest_fixed);
7185 SAVEFREESV(data.longest_float);
7186 SAVEFREESV(data.last_found);
7188 if (!ri->regstclass) {
7189 ssc_init(pRExC_state, &ch_class);
7190 data.start_class = &ch_class;
7191 stclass_flag = SCF_DO_STCLASS_AND;
7192 } else /* XXXX Check for BOUND? */
7194 data.last_closep = &last_close;
7197 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7198 scan + RExC_size, /* Up to end */
7200 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7201 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7205 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7208 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7209 && data.last_start_min == 0 && data.last_end > 0
7210 && !RExC_seen_zerolen
7211 && !(RExC_seen & REG_VERBARG_SEEN)
7212 && !(RExC_seen & REG_GPOS_SEEN)
7214 r->extflags |= RXf_CHECK_ALL;
7216 scan_commit(pRExC_state, &data,&minlen,0);
7218 longest_float_length = CHR_SVLEN(data.longest_float);
7220 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7221 && data.offset_fixed == data.offset_float_min
7222 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7223 && S_setup_longest (aTHX_ pRExC_state,
7227 &(r->float_end_shift),
7228 data.lookbehind_float,
7229 data.offset_float_min,
7231 longest_float_length,
7232 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7233 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7235 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7236 r->float_max_offset = data.offset_float_max;
7237 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7238 r->float_max_offset -= data.lookbehind_float;
7239 SvREFCNT_inc_simple_void_NN(data.longest_float);
7242 r->float_substr = r->float_utf8 = NULL;
7243 longest_float_length = 0;
7246 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7248 if (S_setup_longest (aTHX_ pRExC_state,
7250 &(r->anchored_utf8),
7251 &(r->anchored_substr),
7252 &(r->anchored_end_shift),
7253 data.lookbehind_fixed,
7256 longest_fixed_length,
7257 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7258 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7260 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7261 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7264 r->anchored_substr = r->anchored_utf8 = NULL;
7265 longest_fixed_length = 0;
7267 LEAVE_with_name("study_chunk");
7270 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7271 ri->regstclass = NULL;
7273 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7275 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7276 && is_ssc_worth_it(pRExC_state, data.start_class))
7278 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7280 ssc_finalize(pRExC_state, data.start_class);
7282 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7283 StructCopy(data.start_class,
7284 (regnode_ssc*)RExC_rxi->data->data[n],
7286 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7287 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7288 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7289 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7290 PerlIO_printf(Perl_debug_log,
7291 "synthetic stclass \"%s\".\n",
7292 SvPVX_const(sv));});
7293 data.start_class = NULL;
7296 /* A temporary algorithm prefers floated substr to fixed one to dig
7298 if (longest_fixed_length > longest_float_length) {
7299 r->substrs->check_ix = 0;
7300 r->check_end_shift = r->anchored_end_shift;
7301 r->check_substr = r->anchored_substr;
7302 r->check_utf8 = r->anchored_utf8;
7303 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7304 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7305 r->intflags |= PREGf_NOSCAN;
7308 r->substrs->check_ix = 1;
7309 r->check_end_shift = r->float_end_shift;
7310 r->check_substr = r->float_substr;
7311 r->check_utf8 = r->float_utf8;
7312 r->check_offset_min = r->float_min_offset;
7313 r->check_offset_max = r->float_max_offset;
7315 if ((r->check_substr || r->check_utf8) ) {
7316 r->extflags |= RXf_USE_INTUIT;
7317 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7318 r->extflags |= RXf_INTUIT_TAIL;
7320 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7322 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7323 if ( (STRLEN)minlen < longest_float_length )
7324 minlen= longest_float_length;
7325 if ( (STRLEN)minlen < longest_fixed_length )
7326 minlen= longest_fixed_length;
7330 /* Several toplevels. Best we can is to set minlen. */
7332 regnode_ssc ch_class;
7333 SSize_t last_close = 0;
7335 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7337 scan = ri->program + 1;
7338 ssc_init(pRExC_state, &ch_class);
7339 data.start_class = &ch_class;
7340 data.last_closep = &last_close;
7343 minlen = study_chunk(pRExC_state,
7344 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7345 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7346 ? SCF_TRIE_DOING_RESTUDY
7350 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7352 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7353 = r->float_substr = r->float_utf8 = NULL;
7355 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7356 && is_ssc_worth_it(pRExC_state, data.start_class))
7358 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7360 ssc_finalize(pRExC_state, data.start_class);
7362 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7363 StructCopy(data.start_class,
7364 (regnode_ssc*)RExC_rxi->data->data[n],
7366 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7367 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7368 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7369 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7370 PerlIO_printf(Perl_debug_log,
7371 "synthetic stclass \"%s\".\n",
7372 SvPVX_const(sv));});
7373 data.start_class = NULL;
7377 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7378 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7379 r->maxlen = REG_INFTY;
7382 r->maxlen = RExC_maxlen;
7385 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7386 the "real" pattern. */
7388 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7389 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7391 r->minlenret = minlen;
7392 if (r->minlen < minlen)
7395 if (RExC_seen & REG_GPOS_SEEN)
7396 r->intflags |= PREGf_GPOS_SEEN;
7397 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7398 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7400 if (pRExC_state->num_code_blocks)
7401 r->extflags |= RXf_EVAL_SEEN;
7402 if (RExC_seen & REG_CANY_SEEN)
7403 r->intflags |= PREGf_CANY_SEEN;
7404 if (RExC_seen & REG_VERBARG_SEEN)
7406 r->intflags |= PREGf_VERBARG_SEEN;
7407 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7409 if (RExC_seen & REG_CUTGROUP_SEEN)
7410 r->intflags |= PREGf_CUTGROUP_SEEN;
7411 if (pm_flags & PMf_USE_RE_EVAL)
7412 r->intflags |= PREGf_USE_RE_EVAL;
7413 if (RExC_paren_names)
7414 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7416 RXp_PAREN_NAMES(r) = NULL;
7418 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7419 * so it can be used in pp.c */
7420 if (r->intflags & PREGf_ANCH)
7421 r->extflags |= RXf_IS_ANCHORED;
7425 /* this is used to identify "special" patterns that might result
7426 * in Perl NOT calling the regex engine and instead doing the match "itself",
7427 * particularly special cases in split//. By having the regex compiler
7428 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7429 * we avoid weird issues with equivalent patterns resulting in different behavior,
7430 * AND we allow non Perl engines to get the same optimizations by the setting the
7431 * flags appropriately - Yves */
7432 regnode *first = ri->program + 1;
7434 regnode *next = NEXTOPER(first);
7437 if (PL_regkind[fop] == NOTHING && nop == END)
7438 r->extflags |= RXf_NULL;
7439 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7440 /* when fop is SBOL first->flags will be true only when it was
7441 * produced by parsing /\A/, and not when parsing /^/. This is
7442 * very important for the split code as there we want to
7443 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7444 * See rt #122761 for more details. -- Yves */
7445 r->extflags |= RXf_START_ONLY;
7446 else if (fop == PLUS
7447 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7448 && OP(regnext(first)) == END)
7449 r->extflags |= RXf_WHITE;
7450 else if ( r->extflags & RXf_SPLIT
7452 && STR_LEN(first) == 1
7453 && *(STRING(first)) == ' '
7454 && OP(regnext(first)) == END )
7455 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7459 if (RExC_contains_locale) {
7460 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7464 if (RExC_paren_names) {
7465 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7466 ri->data->data[ri->name_list_idx]
7467 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7470 ri->name_list_idx = 0;
7472 if (RExC_recurse_count) {
7473 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7474 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7475 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7478 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7479 /* assume we don't need to swap parens around before we match */
7481 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7482 (unsigned long)RExC_study_chunk_recursed_count);
7486 PerlIO_printf(Perl_debug_log,"Final program:\n");
7489 #ifdef RE_TRACK_PATTERN_OFFSETS
7490 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7491 const STRLEN len = ri->u.offsets[0];
7493 GET_RE_DEBUG_FLAGS_DECL;
7494 PerlIO_printf(Perl_debug_log,
7495 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7496 for (i = 1; i <= len; i++) {
7497 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7498 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7499 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7501 PerlIO_printf(Perl_debug_log, "\n");
7506 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7507 * by setting the regexp SV to readonly-only instead. If the
7508 * pattern's been recompiled, the USEDness should remain. */
7509 if (old_re && SvREADONLY(old_re))
7517 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7520 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7522 PERL_UNUSED_ARG(value);
7524 if (flags & RXapif_FETCH) {
7525 return reg_named_buff_fetch(rx, key, flags);
7526 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7527 Perl_croak_no_modify();
7529 } else if (flags & RXapif_EXISTS) {
7530 return reg_named_buff_exists(rx, key, flags)
7533 } else if (flags & RXapif_REGNAMES) {
7534 return reg_named_buff_all(rx, flags);
7535 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7536 return reg_named_buff_scalar(rx, flags);
7538 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7544 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7547 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7548 PERL_UNUSED_ARG(lastkey);
7550 if (flags & RXapif_FIRSTKEY)
7551 return reg_named_buff_firstkey(rx, flags);
7552 else if (flags & RXapif_NEXTKEY)
7553 return reg_named_buff_nextkey(rx, flags);
7555 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7562 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7565 AV *retarray = NULL;
7567 struct regexp *const rx = ReANY(r);
7569 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7571 if (flags & RXapif_ALL)
7574 if (rx && RXp_PAREN_NAMES(rx)) {
7575 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7578 SV* sv_dat=HeVAL(he_str);
7579 I32 *nums=(I32*)SvPVX(sv_dat);
7580 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7581 if ((I32)(rx->nparens) >= nums[i]
7582 && rx->offs[nums[i]].start != -1
7583 && rx->offs[nums[i]].end != -1)
7586 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7591 ret = newSVsv(&PL_sv_undef);
7594 av_push(retarray, ret);
7597 return newRV_noinc(MUTABLE_SV(retarray));
7604 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7607 struct regexp *const rx = ReANY(r);
7609 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7611 if (rx && RXp_PAREN_NAMES(rx)) {
7612 if (flags & RXapif_ALL) {
7613 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7615 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7617 SvREFCNT_dec_NN(sv);
7629 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7631 struct regexp *const rx = ReANY(r);
7633 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7635 if ( rx && RXp_PAREN_NAMES(rx) ) {
7636 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7638 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7645 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7647 struct regexp *const rx = ReANY(r);
7648 GET_RE_DEBUG_FLAGS_DECL;
7650 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7652 if (rx && RXp_PAREN_NAMES(rx)) {
7653 HV *hv = RXp_PAREN_NAMES(rx);
7655 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7658 SV* sv_dat = HeVAL(temphe);
7659 I32 *nums = (I32*)SvPVX(sv_dat);
7660 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7661 if ((I32)(rx->lastparen) >= nums[i] &&
7662 rx->offs[nums[i]].start != -1 &&
7663 rx->offs[nums[i]].end != -1)
7669 if (parno || flags & RXapif_ALL) {
7670 return newSVhek(HeKEY_hek(temphe));
7678 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7683 struct regexp *const rx = ReANY(r);
7685 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7687 if (rx && RXp_PAREN_NAMES(rx)) {
7688 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7689 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7690 } else if (flags & RXapif_ONE) {
7691 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7692 av = MUTABLE_AV(SvRV(ret));
7693 length = av_tindex(av);
7694 SvREFCNT_dec_NN(ret);
7695 return newSViv(length + 1);
7697 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7702 return &PL_sv_undef;
7706 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7708 struct regexp *const rx = ReANY(r);
7711 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7713 if (rx && RXp_PAREN_NAMES(rx)) {
7714 HV *hv= RXp_PAREN_NAMES(rx);
7716 (void)hv_iterinit(hv);
7717 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7720 SV* sv_dat = HeVAL(temphe);
7721 I32 *nums = (I32*)SvPVX(sv_dat);
7722 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7723 if ((I32)(rx->lastparen) >= nums[i] &&
7724 rx->offs[nums[i]].start != -1 &&
7725 rx->offs[nums[i]].end != -1)
7731 if (parno || flags & RXapif_ALL) {
7732 av_push(av, newSVhek(HeKEY_hek(temphe)));
7737 return newRV_noinc(MUTABLE_SV(av));
7741 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7744 struct regexp *const rx = ReANY(r);
7750 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7752 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7753 || n == RX_BUFF_IDX_CARET_FULLMATCH
7754 || n == RX_BUFF_IDX_CARET_POSTMATCH
7757 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7759 /* on something like
7762 * the KEEPCOPY is set on the PMOP rather than the regex */
7763 if (PL_curpm && r == PM_GETRE(PL_curpm))
7764 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7773 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7774 /* no need to distinguish between them any more */
7775 n = RX_BUFF_IDX_FULLMATCH;
7777 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7778 && rx->offs[0].start != -1)
7780 /* $`, ${^PREMATCH} */
7781 i = rx->offs[0].start;
7785 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7786 && rx->offs[0].end != -1)
7788 /* $', ${^POSTMATCH} */
7789 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7790 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7793 if ( 0 <= n && n <= (I32)rx->nparens &&
7794 (s1 = rx->offs[n].start) != -1 &&
7795 (t1 = rx->offs[n].end) != -1)
7797 /* $&, ${^MATCH}, $1 ... */
7799 s = rx->subbeg + s1 - rx->suboffset;
7804 assert(s >= rx->subbeg);
7805 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7807 #ifdef NO_TAINT_SUPPORT
7808 sv_setpvn(sv, s, i);
7810 const int oldtainted = TAINT_get;
7812 sv_setpvn(sv, s, i);
7813 TAINT_set(oldtainted);
7815 if ( (rx->intflags & PREGf_CANY_SEEN)
7816 ? (RXp_MATCH_UTF8(rx)
7817 && (!i || is_utf8_string((U8*)s, i)))
7818 : (RXp_MATCH_UTF8(rx)) )
7825 if (RXp_MATCH_TAINTED(rx)) {
7826 if (SvTYPE(sv) >= SVt_PVMG) {
7827 MAGIC* const mg = SvMAGIC(sv);
7830 SvMAGIC_set(sv, mg->mg_moremagic);
7832 if ((mgt = SvMAGIC(sv))) {
7833 mg->mg_moremagic = mgt;
7834 SvMAGIC_set(sv, mg);
7845 sv_setsv(sv,&PL_sv_undef);
7851 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7852 SV const * const value)
7854 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7856 PERL_UNUSED_ARG(rx);
7857 PERL_UNUSED_ARG(paren);
7858 PERL_UNUSED_ARG(value);
7861 Perl_croak_no_modify();
7865 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7868 struct regexp *const rx = ReANY(r);
7872 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7874 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7875 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7876 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7879 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7881 /* on something like
7884 * the KEEPCOPY is set on the PMOP rather than the regex */
7885 if (PL_curpm && r == PM_GETRE(PL_curpm))
7886 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7892 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7894 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7895 case RX_BUFF_IDX_PREMATCH: /* $` */
7896 if (rx->offs[0].start != -1) {
7897 i = rx->offs[0].start;
7906 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7907 case RX_BUFF_IDX_POSTMATCH: /* $' */
7908 if (rx->offs[0].end != -1) {
7909 i = rx->sublen - rx->offs[0].end;
7911 s1 = rx->offs[0].end;
7918 default: /* $& / ${^MATCH}, $1, $2, ... */
7919 if (paren <= (I32)rx->nparens &&
7920 (s1 = rx->offs[paren].start) != -1 &&
7921 (t1 = rx->offs[paren].end) != -1)
7927 if (ckWARN(WARN_UNINITIALIZED))
7928 report_uninit((const SV *)sv);
7933 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7934 const char * const s = rx->subbeg - rx->suboffset + s1;
7939 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7946 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7948 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7949 PERL_UNUSED_ARG(rx);
7953 return newSVpvs("Regexp");
7956 /* Scans the name of a named buffer from the pattern.
7957 * If flags is REG_RSN_RETURN_NULL returns null.
7958 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7959 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7960 * to the parsed name as looked up in the RExC_paren_names hash.
7961 * If there is an error throws a vFAIL().. type exception.
7964 #define REG_RSN_RETURN_NULL 0
7965 #define REG_RSN_RETURN_NAME 1
7966 #define REG_RSN_RETURN_DATA 2
7969 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7971 char *name_start = RExC_parse;
7973 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7975 assert (RExC_parse <= RExC_end);
7976 if (RExC_parse == RExC_end) NOOP;
7977 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7978 /* skip IDFIRST by using do...while */
7981 RExC_parse += UTF8SKIP(RExC_parse);
7982 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7986 } while (isWORDCHAR(*RExC_parse));
7988 RExC_parse++; /* so the <- from the vFAIL is after the offending
7990 vFAIL("Group name must start with a non-digit word character");
7994 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7995 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7996 if ( flags == REG_RSN_RETURN_NAME)
7998 else if (flags==REG_RSN_RETURN_DATA) {
8001 if ( ! sv_name ) /* should not happen*/
8002 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8003 if (RExC_paren_names)
8004 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8006 sv_dat = HeVAL(he_str);
8008 vFAIL("Reference to nonexistent named group");
8012 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8013 (unsigned long) flags);
8015 assert(0); /* NOT REACHED */
8020 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8022 if (RExC_lastparse!=RExC_parse) { \
8023 PerlIO_printf(Perl_debug_log, "%s", \
8024 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8025 RExC_end - RExC_parse, 16, \
8027 PERL_PV_ESCAPE_UNI_DETECT | \
8028 PERL_PV_PRETTY_ELLIPSES | \
8029 PERL_PV_PRETTY_LTGT | \
8030 PERL_PV_ESCAPE_RE | \
8031 PERL_PV_PRETTY_EXACTSIZE \
8035 PerlIO_printf(Perl_debug_log,"%16s",""); \
8038 num = RExC_size + 1; \
8040 num=REG_NODE_NUM(RExC_emit); \
8041 if (RExC_lastnum!=num) \
8042 PerlIO_printf(Perl_debug_log,"|%4d",num); \
8044 PerlIO_printf(Perl_debug_log,"|%4s",""); \
8045 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
8046 (int)((depth*2)), "", \
8050 RExC_lastparse=RExC_parse; \
8055 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8056 DEBUG_PARSE_MSG((funcname)); \
8057 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
8059 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
8060 DEBUG_PARSE_MSG((funcname)); \
8061 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
8064 /* This section of code defines the inversion list object and its methods. The
8065 * interfaces are highly subject to change, so as much as possible is static to
8066 * this file. An inversion list is here implemented as a malloc'd C UV array
8067 * as an SVt_INVLIST scalar.
8069 * An inversion list for Unicode is an array of code points, sorted by ordinal
8070 * number. The zeroth element is the first code point in the list. The 1th
8071 * element is the first element beyond that not in the list. In other words,
8072 * the first range is
8073 * invlist[0]..(invlist[1]-1)
8074 * The other ranges follow. Thus every element whose index is divisible by two
8075 * marks the beginning of a range that is in the list, and every element not
8076 * divisible by two marks the beginning of a range not in the list. A single
8077 * element inversion list that contains the single code point N generally
8078 * consists of two elements
8081 * (The exception is when N is the highest representable value on the
8082 * machine, in which case the list containing just it would be a single
8083 * element, itself. By extension, if the last range in the list extends to
8084 * infinity, then the first element of that range will be in the inversion list
8085 * at a position that is divisible by two, and is the final element in the
8087 * Taking the complement (inverting) an inversion list is quite simple, if the
8088 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8089 * This implementation reserves an element at the beginning of each inversion
8090 * list to always contain 0; there is an additional flag in the header which
8091 * indicates if the list begins at the 0, or is offset to begin at the next
8094 * More about inversion lists can be found in "Unicode Demystified"
8095 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8096 * More will be coming when functionality is added later.
8098 * The inversion list data structure is currently implemented as an SV pointing
8099 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8100 * array of UV whose memory management is automatically handled by the existing
8101 * facilities for SV's.
8103 * Some of the methods should always be private to the implementation, and some
8104 * should eventually be made public */
8106 /* The header definitions are in F<inline_invlist.c> */
8108 PERL_STATIC_INLINE UV*
8109 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8111 /* Returns a pointer to the first element in the inversion list's array.
8112 * This is called upon initialization of an inversion list. Where the
8113 * array begins depends on whether the list has the code point U+0000 in it
8114 * or not. The other parameter tells it whether the code that follows this
8115 * call is about to put a 0 in the inversion list or not. The first
8116 * element is either the element reserved for 0, if TRUE, or the element
8117 * after it, if FALSE */
8119 bool* offset = get_invlist_offset_addr(invlist);
8120 UV* zero_addr = (UV *) SvPVX(invlist);
8122 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8125 assert(! _invlist_len(invlist));
8129 /* 1^1 = 0; 1^0 = 1 */
8130 *offset = 1 ^ will_have_0;
8131 return zero_addr + *offset;
8134 PERL_STATIC_INLINE UV*
8135 S_invlist_array(SV* const invlist)
8137 /* Returns the pointer to the inversion list's array. Every time the
8138 * length changes, this needs to be called in case malloc or realloc moved
8141 PERL_ARGS_ASSERT_INVLIST_ARRAY;
8143 /* Must not be empty. If these fail, you probably didn't check for <len>
8144 * being non-zero before trying to get the array */
8145 assert(_invlist_len(invlist));
8147 /* The very first element always contains zero, The array begins either
8148 * there, or if the inversion list is offset, at the element after it.
8149 * The offset header field determines which; it contains 0 or 1 to indicate
8150 * how much additionally to add */
8151 assert(0 == *(SvPVX(invlist)));
8152 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8155 PERL_STATIC_INLINE void
8156 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8158 /* Sets the current number of elements stored in the inversion list.
8159 * Updates SvCUR correspondingly */
8160 PERL_UNUSED_CONTEXT;
8161 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8163 assert(SvTYPE(invlist) == SVt_INVLIST);
8168 : TO_INTERNAL_SIZE(len + offset));
8169 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8172 PERL_STATIC_INLINE IV*
8173 S_get_invlist_previous_index_addr(SV* invlist)
8175 /* Return the address of the IV that is reserved to hold the cached index
8177 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8179 assert(SvTYPE(invlist) == SVt_INVLIST);
8181 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8184 PERL_STATIC_INLINE IV
8185 S_invlist_previous_index(SV* const invlist)
8187 /* Returns cached index of previous search */
8189 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8191 return *get_invlist_previous_index_addr(invlist);
8194 PERL_STATIC_INLINE void
8195 S_invlist_set_previous_index(SV* const invlist, const IV index)
8197 /* Caches <index> for later retrieval */
8199 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8201 assert(index == 0 || index < (int) _invlist_len(invlist));
8203 *get_invlist_previous_index_addr(invlist) = index;
8206 PERL_STATIC_INLINE UV
8207 S_invlist_max(SV* const invlist)
8209 /* Returns the maximum number of elements storable in the inversion list's
8210 * array, without having to realloc() */
8212 PERL_ARGS_ASSERT_INVLIST_MAX;
8214 assert(SvTYPE(invlist) == SVt_INVLIST);
8216 /* Assumes worst case, in which the 0 element is not counted in the
8217 * inversion list, so subtracts 1 for that */
8218 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8219 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8220 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8223 #ifndef PERL_IN_XSUB_RE
8225 Perl__new_invlist(pTHX_ IV initial_size)
8228 /* Return a pointer to a newly constructed inversion list, with enough
8229 * space to store 'initial_size' elements. If that number is negative, a
8230 * system default is used instead */
8234 if (initial_size < 0) {
8238 /* Allocate the initial space */
8239 new_list = newSV_type(SVt_INVLIST);
8241 /* First 1 is in case the zero element isn't in the list; second 1 is for
8243 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8244 invlist_set_len(new_list, 0, 0);
8246 /* Force iterinit() to be used to get iteration to work */
8247 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8249 *get_invlist_previous_index_addr(new_list) = 0;
8255 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8257 /* Return a pointer to a newly constructed inversion list, initialized to
8258 * point to <list>, which has to be in the exact correct inversion list
8259 * form, including internal fields. Thus this is a dangerous routine that
8260 * should not be used in the wrong hands. The passed in 'list' contains
8261 * several header fields at the beginning that are not part of the
8262 * inversion list body proper */
8264 const STRLEN length = (STRLEN) list[0];
8265 const UV version_id = list[1];
8266 const bool offset = cBOOL(list[2]);
8267 #define HEADER_LENGTH 3
8268 /* If any of the above changes in any way, you must change HEADER_LENGTH
8269 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8270 * perl -E 'say int(rand 2**31-1)'
8272 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8273 data structure type, so that one being
8274 passed in can be validated to be an
8275 inversion list of the correct vintage.
8278 SV* invlist = newSV_type(SVt_INVLIST);
8280 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8282 if (version_id != INVLIST_VERSION_ID) {
8283 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8286 /* The generated array passed in includes header elements that aren't part
8287 * of the list proper, so start it just after them */
8288 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8290 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8291 shouldn't touch it */
8293 *(get_invlist_offset_addr(invlist)) = offset;
8295 /* The 'length' passed to us is the physical number of elements in the
8296 * inversion list. But if there is an offset the logical number is one
8298 invlist_set_len(invlist, length - offset, offset);
8300 invlist_set_previous_index(invlist, 0);
8302 /* Initialize the iteration pointer. */
8303 invlist_iterfinish(invlist);
8305 SvREADONLY_on(invlist);
8309 #endif /* ifndef PERL_IN_XSUB_RE */
8312 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8314 /* Grow the maximum size of an inversion list */
8316 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8318 assert(SvTYPE(invlist) == SVt_INVLIST);
8320 /* Add one to account for the zero element at the beginning which may not
8321 * be counted by the calling parameters */
8322 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8325 PERL_STATIC_INLINE void
8326 S_invlist_trim(SV* const invlist)
8328 PERL_ARGS_ASSERT_INVLIST_TRIM;
8330 assert(SvTYPE(invlist) == SVt_INVLIST);
8332 /* Change the length of the inversion list to how many entries it currently
8334 SvPV_shrink_to_cur((SV *) invlist);
8338 S__append_range_to_invlist(pTHX_ SV* const invlist,
8339 const UV start, const UV end)
8341 /* Subject to change or removal. Append the range from 'start' to 'end' at
8342 * the end of the inversion list. The range must be above any existing
8346 UV max = invlist_max(invlist);
8347 UV len = _invlist_len(invlist);
8350 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8352 if (len == 0) { /* Empty lists must be initialized */
8353 offset = start != 0;
8354 array = _invlist_array_init(invlist, ! offset);
8357 /* Here, the existing list is non-empty. The current max entry in the
8358 * list is generally the first value not in the set, except when the
8359 * set extends to the end of permissible values, in which case it is
8360 * the first entry in that final set, and so this call is an attempt to
8361 * append out-of-order */
8363 UV final_element = len - 1;
8364 array = invlist_array(invlist);
8365 if (array[final_element] > start
8366 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8368 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8369 array[final_element], start,
8370 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8373 /* Here, it is a legal append. If the new range begins with the first
8374 * value not in the set, it is extending the set, so the new first
8375 * value not in the set is one greater than the newly extended range.
8377 offset = *get_invlist_offset_addr(invlist);
8378 if (array[final_element] == start) {
8379 if (end != UV_MAX) {
8380 array[final_element] = end + 1;
8383 /* But if the end is the maximum representable on the machine,
8384 * just let the range that this would extend to have no end */
8385 invlist_set_len(invlist, len - 1, offset);
8391 /* Here the new range doesn't extend any existing set. Add it */
8393 len += 2; /* Includes an element each for the start and end of range */
8395 /* If wll overflow the existing space, extend, which may cause the array to
8398 invlist_extend(invlist, len);
8400 /* Have to set len here to avoid assert failure in invlist_array() */
8401 invlist_set_len(invlist, len, offset);
8403 array = invlist_array(invlist);
8406 invlist_set_len(invlist, len, offset);
8409 /* The next item on the list starts the range, the one after that is
8410 * one past the new range. */
8411 array[len - 2] = start;
8412 if (end != UV_MAX) {
8413 array[len - 1] = end + 1;
8416 /* But if the end is the maximum representable on the machine, just let
8417 * the range have no end */
8418 invlist_set_len(invlist, len - 1, offset);
8422 #ifndef PERL_IN_XSUB_RE
8425 Perl__invlist_search(SV* const invlist, const UV cp)
8427 /* Searches the inversion list for the entry that contains the input code
8428 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8429 * return value is the index into the list's array of the range that
8434 IV high = _invlist_len(invlist);
8435 const IV highest_element = high - 1;
8438 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8440 /* If list is empty, return failure. */
8445 /* (We can't get the array unless we know the list is non-empty) */
8446 array = invlist_array(invlist);
8448 mid = invlist_previous_index(invlist);
8449 assert(mid >=0 && mid <= highest_element);
8451 /* <mid> contains the cache of the result of the previous call to this
8452 * function (0 the first time). See if this call is for the same result,
8453 * or if it is for mid-1. This is under the theory that calls to this
8454 * function will often be for related code points that are near each other.
8455 * And benchmarks show that caching gives better results. We also test
8456 * here if the code point is within the bounds of the list. These tests
8457 * replace others that would have had to be made anyway to make sure that
8458 * the array bounds were not exceeded, and these give us extra information
8459 * at the same time */
8460 if (cp >= array[mid]) {
8461 if (cp >= array[highest_element]) {
8462 return highest_element;
8465 /* Here, array[mid] <= cp < array[highest_element]. This means that
8466 * the final element is not the answer, so can exclude it; it also
8467 * means that <mid> is not the final element, so can refer to 'mid + 1'
8469 if (cp < array[mid + 1]) {
8475 else { /* cp < aray[mid] */
8476 if (cp < array[0]) { /* Fail if outside the array */
8480 if (cp >= array[mid - 1]) {
8485 /* Binary search. What we are looking for is <i> such that
8486 * array[i] <= cp < array[i+1]
8487 * The loop below converges on the i+1. Note that there may not be an
8488 * (i+1)th element in the array, and things work nonetheless */
8489 while (low < high) {
8490 mid = (low + high) / 2;
8491 assert(mid <= highest_element);
8492 if (array[mid] <= cp) { /* cp >= array[mid] */
8495 /* We could do this extra test to exit the loop early.
8496 if (cp < array[low]) {
8501 else { /* cp < array[mid] */
8508 invlist_set_previous_index(invlist, high);
8513 Perl__invlist_populate_swatch(SV* const invlist,
8514 const UV start, const UV end, U8* swatch)
8516 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8517 * but is used when the swash has an inversion list. This makes this much
8518 * faster, as it uses a binary search instead of a linear one. This is
8519 * intimately tied to that function, and perhaps should be in utf8.c,
8520 * except it is intimately tied to inversion lists as well. It assumes
8521 * that <swatch> is all 0's on input */
8524 const IV len = _invlist_len(invlist);
8528 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8530 if (len == 0) { /* Empty inversion list */
8534 array = invlist_array(invlist);
8536 /* Find which element it is */
8537 i = _invlist_search(invlist, start);
8539 /* We populate from <start> to <end> */
8540 while (current < end) {
8543 /* The inversion list gives the results for every possible code point
8544 * after the first one in the list. Only those ranges whose index is
8545 * even are ones that the inversion list matches. For the odd ones,
8546 * and if the initial code point is not in the list, we have to skip
8547 * forward to the next element */
8548 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8550 if (i >= len) { /* Finished if beyond the end of the array */
8554 if (current >= end) { /* Finished if beyond the end of what we
8556 if (LIKELY(end < UV_MAX)) {
8560 /* We get here when the upper bound is the maximum
8561 * representable on the machine, and we are looking for just
8562 * that code point. Have to special case it */
8564 goto join_end_of_list;
8567 assert(current >= start);
8569 /* The current range ends one below the next one, except don't go past
8572 upper = (i < len && array[i] < end) ? array[i] : end;
8574 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8575 * for each code point in it */
8576 for (; current < upper; current++) {
8577 const STRLEN offset = (STRLEN)(current - start);
8578 swatch[offset >> 3] |= 1 << (offset & 7);
8583 /* Quit if at the end of the list */
8586 /* But first, have to deal with the highest possible code point on
8587 * the platform. The previous code assumes that <end> is one
8588 * beyond where we want to populate, but that is impossible at the
8589 * platform's infinity, so have to handle it specially */
8590 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8592 const STRLEN offset = (STRLEN)(end - start);
8593 swatch[offset >> 3] |= 1 << (offset & 7);
8598 /* Advance to the next range, which will be for code points not in the
8607 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8608 const bool complement_b, SV** output)
8610 /* Take the union of two inversion lists and point <output> to it. *output
8611 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8612 * the reference count to that list will be decremented if not already a
8613 * temporary (mortal); otherwise *output will be made correspondingly
8614 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8615 * second list is returned. If <complement_b> is TRUE, the union is taken
8616 * of the complement (inversion) of <b> instead of b itself.
8618 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8619 * Richard Gillam, published by Addison-Wesley, and explained at some
8620 * length there. The preface says to incorporate its examples into your
8621 * code at your own risk.
8623 * The algorithm is like a merge sort.
8625 * XXX A potential performance improvement is to keep track as we go along
8626 * if only one of the inputs contributes to the result, meaning the other
8627 * is a subset of that one. In that case, we can skip the final copy and
8628 * return the larger of the input lists, but then outside code might need
8629 * to keep track of whether to free the input list or not */
8631 const UV* array_a; /* a's array */
8633 UV len_a; /* length of a's array */
8636 SV* u; /* the resulting union */
8640 UV i_a = 0; /* current index into a's array */
8644 /* running count, as explained in the algorithm source book; items are
8645 * stopped accumulating and are output when the count changes to/from 0.
8646 * The count is incremented when we start a range that's in the set, and
8647 * decremented when we start a range that's not in the set. So its range
8648 * is 0 to 2. Only when the count is zero is something not in the set.
8652 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8655 /* If either one is empty, the union is the other one */
8656 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8657 bool make_temp = FALSE; /* Should we mortalize the result? */
8661 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8667 *output = invlist_clone(b);
8669 _invlist_invert(*output);
8671 } /* else *output already = b; */
8674 sv_2mortal(*output);
8678 else if ((len_b = _invlist_len(b)) == 0) {
8679 bool make_temp = FALSE;
8681 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8686 /* The complement of an empty list is a list that has everything in it,
8687 * so the union with <a> includes everything too */
8690 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8694 *output = _new_invlist(1);
8695 _append_range_to_invlist(*output, 0, UV_MAX);
8697 else if (*output != a) {
8698 *output = invlist_clone(a);
8700 /* else *output already = a; */
8703 sv_2mortal(*output);
8708 /* Here both lists exist and are non-empty */
8709 array_a = invlist_array(a);
8710 array_b = invlist_array(b);
8712 /* If are to take the union of 'a' with the complement of b, set it
8713 * up so are looking at b's complement. */
8716 /* To complement, we invert: if the first element is 0, remove it. To
8717 * do this, we just pretend the array starts one later */
8718 if (array_b[0] == 0) {
8724 /* But if the first element is not zero, we pretend the list starts
8725 * at the 0 that is always stored immediately before the array. */
8731 /* Size the union for the worst case: that the sets are completely
8733 u = _new_invlist(len_a + len_b);
8735 /* Will contain U+0000 if either component does */
8736 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8737 || (len_b > 0 && array_b[0] == 0));
8739 /* Go through each list item by item, stopping when exhausted one of
8741 while (i_a < len_a && i_b < len_b) {
8742 UV cp; /* The element to potentially add to the union's array */
8743 bool cp_in_set; /* is it in the the input list's set or not */
8745 /* We need to take one or the other of the two inputs for the union.
8746 * Since we are merging two sorted lists, we take the smaller of the
8747 * next items. In case of a tie, we take the one that is in its set
8748 * first. If we took one not in the set first, it would decrement the
8749 * count, possibly to 0 which would cause it to be output as ending the
8750 * range, and the next time through we would take the same number, and
8751 * output it again as beginning the next range. By doing it the
8752 * opposite way, there is no possibility that the count will be
8753 * momentarily decremented to 0, and thus the two adjoining ranges will
8754 * be seamlessly merged. (In a tie and both are in the set or both not
8755 * in the set, it doesn't matter which we take first.) */
8756 if (array_a[i_a] < array_b[i_b]
8757 || (array_a[i_a] == array_b[i_b]
8758 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8760 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8764 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8765 cp = array_b[i_b++];
8768 /* Here, have chosen which of the two inputs to look at. Only output
8769 * if the running count changes to/from 0, which marks the
8770 * beginning/end of a range in that's in the set */
8773 array_u[i_u++] = cp;
8780 array_u[i_u++] = cp;
8785 /* Here, we are finished going through at least one of the lists, which
8786 * means there is something remaining in at most one. We check if the list
8787 * that hasn't been exhausted is positioned such that we are in the middle
8788 * of a range in its set or not. (i_a and i_b point to the element beyond
8789 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8790 * is potentially more to output.
8791 * There are four cases:
8792 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8793 * in the union is entirely from the non-exhausted set.
8794 * 2) Both were in their sets, count is 2. Nothing further should
8795 * be output, as everything that remains will be in the exhausted
8796 * list's set, hence in the union; decrementing to 1 but not 0 insures
8798 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8799 * Nothing further should be output because the union includes
8800 * everything from the exhausted set. Not decrementing ensures that.
8801 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8802 * decrementing to 0 insures that we look at the remainder of the
8803 * non-exhausted set */
8804 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8805 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8810 /* The final length is what we've output so far, plus what else is about to
8811 * be output. (If 'count' is non-zero, then the input list we exhausted
8812 * has everything remaining up to the machine's limit in its set, and hence
8813 * in the union, so there will be no further output. */
8816 /* At most one of the subexpressions will be non-zero */
8817 len_u += (len_a - i_a) + (len_b - i_b);
8820 /* Set result to final length, which can change the pointer to array_u, so
8822 if (len_u != _invlist_len(u)) {
8823 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8825 array_u = invlist_array(u);
8828 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8829 * the other) ended with everything above it not in its set. That means
8830 * that the remaining part of the union is precisely the same as the
8831 * non-exhausted list, so can just copy it unchanged. (If both list were
8832 * exhausted at the same time, then the operations below will be both 0.)
8835 IV copy_count; /* At most one will have a non-zero copy count */
8836 if ((copy_count = len_a - i_a) > 0) {
8837 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8839 else if ((copy_count = len_b - i_b) > 0) {
8840 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8844 /* We may be removing a reference to one of the inputs. If so, the output
8845 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8846 * count decremented) */
8847 if (a == *output || b == *output) {
8848 assert(! invlist_is_iterating(*output));
8849 if ((SvTEMP(*output))) {
8853 SvREFCNT_dec_NN(*output);
8863 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8864 const bool complement_b, SV** i)
8866 /* Take the intersection of two inversion lists and point <i> to it. *i
8867 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8868 * the reference count to that list will be decremented if not already a
8869 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8870 * The first list, <a>, may be NULL, in which case an empty list is
8871 * returned. If <complement_b> is TRUE, the result will be the
8872 * intersection of <a> and the complement (or inversion) of <b> instead of
8875 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8876 * Richard Gillam, published by Addison-Wesley, and explained at some
8877 * length there. The preface says to incorporate its examples into your
8878 * code at your own risk. In fact, it had bugs
8880 * The algorithm is like a merge sort, and is essentially the same as the
8884 const UV* array_a; /* a's array */
8886 UV len_a; /* length of a's array */
8889 SV* r; /* the resulting intersection */
8893 UV i_a = 0; /* current index into a's array */
8897 /* running count, as explained in the algorithm source book; items are
8898 * stopped accumulating and are output when the count changes to/from 2.
8899 * The count is incremented when we start a range that's in the set, and
8900 * decremented when we start a range that's not in the set. So its range
8901 * is 0 to 2. Only when the count is 2 is something in the intersection.
8905 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8908 /* Special case if either one is empty */
8909 len_a = (a == NULL) ? 0 : _invlist_len(a);
8910 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8911 bool make_temp = FALSE;
8913 if (len_a != 0 && complement_b) {
8915 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8916 * be empty. Here, also we are using 'b's complement, which hence
8917 * must be every possible code point. Thus the intersection is
8921 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8926 *i = invlist_clone(a);
8928 /* else *i is already 'a' */
8936 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8937 * intersection must be empty */
8939 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8944 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8948 *i = _new_invlist(0);
8956 /* Here both lists exist and are non-empty */
8957 array_a = invlist_array(a);
8958 array_b = invlist_array(b);
8960 /* If are to take the intersection of 'a' with the complement of b, set it
8961 * up so are looking at b's complement. */
8964 /* To complement, we invert: if the first element is 0, remove it. To
8965 * do this, we just pretend the array starts one later */
8966 if (array_b[0] == 0) {
8972 /* But if the first element is not zero, we pretend the list starts
8973 * at the 0 that is always stored immediately before the array. */
8979 /* Size the intersection for the worst case: that the intersection ends up
8980 * fragmenting everything to be completely disjoint */
8981 r= _new_invlist(len_a + len_b);
8983 /* Will contain U+0000 iff both components do */
8984 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8985 && len_b > 0 && array_b[0] == 0);
8987 /* Go through each list item by item, stopping when exhausted one of
8989 while (i_a < len_a && i_b < len_b) {
8990 UV cp; /* The element to potentially add to the intersection's
8992 bool cp_in_set; /* Is it in the input list's set or not */
8994 /* We need to take one or the other of the two inputs for the
8995 * intersection. Since we are merging two sorted lists, we take the
8996 * smaller of the next items. In case of a tie, we take the one that
8997 * is not in its set first (a difference from the union algorithm). If
8998 * we took one in the set first, it would increment the count, possibly
8999 * to 2 which would cause it to be output as starting a range in the
9000 * intersection, and the next time through we would take that same
9001 * number, and output it again as ending the set. By doing it the
9002 * opposite of this, there is no possibility that the count will be
9003 * momentarily incremented to 2. (In a tie and both are in the set or
9004 * both not in the set, it doesn't matter which we take first.) */
9005 if (array_a[i_a] < array_b[i_b]
9006 || (array_a[i_a] == array_b[i_b]
9007 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9009 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9013 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9017 /* Here, have chosen which of the two inputs to look at. Only output
9018 * if the running count changes to/from 2, which marks the
9019 * beginning/end of a range that's in the intersection */
9023 array_r[i_r++] = cp;
9028 array_r[i_r++] = cp;
9034 /* Here, we are finished going through at least one of the lists, which
9035 * means there is something remaining in at most one. We check if the list
9036 * that has been exhausted is positioned such that we are in the middle
9037 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
9038 * the ones we care about.) There are four cases:
9039 * 1) Both weren't in their sets, count is 0, and remains 0. There's
9040 * nothing left in the intersection.
9041 * 2) Both were in their sets, count is 2 and perhaps is incremented to
9042 * above 2. What should be output is exactly that which is in the
9043 * non-exhausted set, as everything it has is also in the intersection
9044 * set, and everything it doesn't have can't be in the intersection
9045 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9046 * gets incremented to 2. Like the previous case, the intersection is
9047 * everything that remains in the non-exhausted set.
9048 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9049 * remains 1. And the intersection has nothing more. */
9050 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9051 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9056 /* The final length is what we've output so far plus what else is in the
9057 * intersection. At most one of the subexpressions below will be non-zero
9061 len_r += (len_a - i_a) + (len_b - i_b);
9064 /* Set result to final length, which can change the pointer to array_r, so
9066 if (len_r != _invlist_len(r)) {
9067 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9069 array_r = invlist_array(r);
9072 /* Finish outputting any remaining */
9073 if (count >= 2) { /* At most one will have a non-zero copy count */
9075 if ((copy_count = len_a - i_a) > 0) {
9076 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9078 else if ((copy_count = len_b - i_b) > 0) {
9079 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9083 /* We may be removing a reference to one of the inputs. If so, the output
9084 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
9085 * count decremented) */
9086 if (a == *i || b == *i) {
9087 assert(! invlist_is_iterating(*i));
9092 SvREFCNT_dec_NN(*i);
9102 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9104 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9105 * set. A pointer to the inversion list is returned. This may actually be
9106 * a new list, in which case the passed in one has been destroyed. The
9107 * passed-in inversion list can be NULL, in which case a new one is created
9108 * with just the one range in it */
9113 if (invlist == NULL) {
9114 invlist = _new_invlist(2);
9118 len = _invlist_len(invlist);
9121 /* If comes after the final entry actually in the list, can just append it
9124 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9125 && start >= invlist_array(invlist)[len - 1]))
9127 _append_range_to_invlist(invlist, start, end);
9131 /* Here, can't just append things, create and return a new inversion list
9132 * which is the union of this range and the existing inversion list */
9133 range_invlist = _new_invlist(2);
9134 _append_range_to_invlist(range_invlist, start, end);
9136 _invlist_union(invlist, range_invlist, &invlist);
9138 /* The temporary can be freed */
9139 SvREFCNT_dec_NN(range_invlist);
9145 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9146 UV** other_elements_ptr)
9148 /* Create and return an inversion list whose contents are to be populated
9149 * by the caller. The caller gives the number of elements (in 'size') and
9150 * the very first element ('element0'). This function will set
9151 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9154 * Obviously there is some trust involved that the caller will properly
9155 * fill in the other elements of the array.
9157 * (The first element needs to be passed in, as the underlying code does
9158 * things differently depending on whether it is zero or non-zero) */
9160 SV* invlist = _new_invlist(size);
9163 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9165 _append_range_to_invlist(invlist, element0, element0);
9166 offset = *get_invlist_offset_addr(invlist);
9168 invlist_set_len(invlist, size, offset);
9169 *other_elements_ptr = invlist_array(invlist) + 1;
9175 PERL_STATIC_INLINE SV*
9176 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9177 return _add_range_to_invlist(invlist, cp, cp);
9180 #ifndef PERL_IN_XSUB_RE
9182 Perl__invlist_invert(pTHX_ SV* const invlist)
9184 /* Complement the input inversion list. This adds a 0 if the list didn't
9185 * have a zero; removes it otherwise. As described above, the data
9186 * structure is set up so that this is very efficient */
9188 PERL_ARGS_ASSERT__INVLIST_INVERT;
9190 assert(! invlist_is_iterating(invlist));
9192 /* The inverse of matching nothing is matching everything */
9193 if (_invlist_len(invlist) == 0) {
9194 _append_range_to_invlist(invlist, 0, UV_MAX);
9198 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9203 PERL_STATIC_INLINE SV*
9204 S_invlist_clone(pTHX_ SV* const invlist)
9207 /* Return a new inversion list that is a copy of the input one, which is
9208 * unchanged. The new list will not be mortal even if the old one was. */
9210 /* Need to allocate extra space to accommodate Perl's addition of a
9211 * trailing NUL to SvPV's, since it thinks they are always strings */
9212 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9213 STRLEN physical_length = SvCUR(invlist);
9214 bool offset = *(get_invlist_offset_addr(invlist));
9216 PERL_ARGS_ASSERT_INVLIST_CLONE;
9218 *(get_invlist_offset_addr(new_invlist)) = offset;
9219 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9220 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9225 PERL_STATIC_INLINE STRLEN*
9226 S_get_invlist_iter_addr(SV* invlist)
9228 /* Return the address of the UV that contains the current iteration
9231 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9233 assert(SvTYPE(invlist) == SVt_INVLIST);
9235 return &(((XINVLIST*) SvANY(invlist))->iterator);
9238 PERL_STATIC_INLINE void
9239 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9241 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9243 *get_invlist_iter_addr(invlist) = 0;
9246 PERL_STATIC_INLINE void
9247 S_invlist_iterfinish(SV* invlist)
9249 /* Terminate iterator for invlist. This is to catch development errors.
9250 * Any iteration that is interrupted before completed should call this
9251 * function. Functions that add code points anywhere else but to the end
9252 * of an inversion list assert that they are not in the middle of an
9253 * iteration. If they were, the addition would make the iteration
9254 * problematical: if the iteration hadn't reached the place where things
9255 * were being added, it would be ok */
9257 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9259 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9263 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9265 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9266 * This call sets in <*start> and <*end>, the next range in <invlist>.
9267 * Returns <TRUE> if successful and the next call will return the next
9268 * range; <FALSE> if was already at the end of the list. If the latter,
9269 * <*start> and <*end> are unchanged, and the next call to this function
9270 * will start over at the beginning of the list */
9272 STRLEN* pos = get_invlist_iter_addr(invlist);
9273 UV len = _invlist_len(invlist);
9276 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9279 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9283 array = invlist_array(invlist);
9285 *start = array[(*pos)++];
9291 *end = array[(*pos)++] - 1;
9297 PERL_STATIC_INLINE bool
9298 S_invlist_is_iterating(SV* const invlist)
9300 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9302 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9305 PERL_STATIC_INLINE UV
9306 S_invlist_highest(SV* const invlist)
9308 /* Returns the highest code point that matches an inversion list. This API
9309 * has an ambiguity, as it returns 0 under either the highest is actually
9310 * 0, or if the list is empty. If this distinction matters to you, check
9311 * for emptiness before calling this function */
9313 UV len = _invlist_len(invlist);
9316 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9322 array = invlist_array(invlist);
9324 /* The last element in the array in the inversion list always starts a
9325 * range that goes to infinity. That range may be for code points that are
9326 * matched in the inversion list, or it may be for ones that aren't
9327 * matched. In the latter case, the highest code point in the set is one
9328 * less than the beginning of this range; otherwise it is the final element
9329 * of this range: infinity */
9330 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9332 : array[len - 1] - 1;
9335 #ifndef PERL_IN_XSUB_RE
9337 Perl__invlist_contents(pTHX_ SV* const invlist)
9339 /* Get the contents of an inversion list into a string SV so that they can
9340 * be printed out. It uses the format traditionally done for debug tracing
9344 SV* output = newSVpvs("\n");
9346 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9348 assert(! invlist_is_iterating(invlist));
9350 invlist_iterinit(invlist);
9351 while (invlist_iternext(invlist, &start, &end)) {
9352 if (end == UV_MAX) {
9353 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9355 else if (end != start) {
9356 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9360 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9368 #ifndef PERL_IN_XSUB_RE
9370 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9371 const char * const indent, SV* const invlist)
9373 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9374 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9375 * the string 'indent'. The output looks like this:
9376 [0] 0x000A .. 0x000D
9378 [4] 0x2028 .. 0x2029
9379 [6] 0x3104 .. INFINITY
9380 * This means that the first range of code points matched by the list are
9381 * 0xA through 0xD; the second range contains only the single code point
9382 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9383 * are used to define each range (except if the final range extends to
9384 * infinity, only a single element is needed). The array index of the
9385 * first element for the corresponding range is given in brackets. */
9390 PERL_ARGS_ASSERT__INVLIST_DUMP;
9392 if (invlist_is_iterating(invlist)) {
9393 Perl_dump_indent(aTHX_ level, file,
9394 "%sCan't dump inversion list because is in middle of iterating\n",
9399 invlist_iterinit(invlist);
9400 while (invlist_iternext(invlist, &start, &end)) {
9401 if (end == UV_MAX) {
9402 Perl_dump_indent(aTHX_ level, file,
9403 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9404 indent, (UV)count, start);
9406 else if (end != start) {
9407 Perl_dump_indent(aTHX_ level, file,
9408 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9409 indent, (UV)count, start, end);
9412 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9413 indent, (UV)count, start);
9420 Perl__load_PL_utf8_foldclosures (pTHX)
9422 assert(! PL_utf8_foldclosures);
9424 /* If the folds haven't been read in, call a fold function
9426 if (! PL_utf8_tofold) {
9427 U8 dummy[UTF8_MAXBYTES_CASE+1];
9429 /* This string is just a short named one above \xff */
9430 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9431 assert(PL_utf8_tofold); /* Verify that worked */
9433 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9437 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9439 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9441 /* Return a boolean as to if the two passed in inversion lists are
9442 * identical. The final argument, if TRUE, says to take the complement of
9443 * the second inversion list before doing the comparison */
9445 const UV* array_a = invlist_array(a);
9446 const UV* array_b = invlist_array(b);
9447 UV len_a = _invlist_len(a);
9448 UV len_b = _invlist_len(b);
9450 UV i = 0; /* current index into the arrays */
9451 bool retval = TRUE; /* Assume are identical until proven otherwise */
9453 PERL_ARGS_ASSERT__INVLISTEQ;
9455 /* If are to compare 'a' with the complement of b, set it
9456 * up so are looking at b's complement. */
9459 /* The complement of nothing is everything, so <a> would have to have
9460 * just one element, starting at zero (ending at infinity) */
9462 return (len_a == 1 && array_a[0] == 0);
9464 else if (array_b[0] == 0) {
9466 /* Otherwise, to complement, we invert. Here, the first element is
9467 * 0, just remove it. To do this, we just pretend the array starts
9475 /* But if the first element is not zero, we pretend the list starts
9476 * at the 0 that is always stored immediately before the array. */
9482 /* Make sure that the lengths are the same, as well as the final element
9483 * before looping through the remainder. (Thus we test the length, final,
9484 * and first elements right off the bat) */
9485 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9488 else for (i = 0; i < len_a - 1; i++) {
9489 if (array_a[i] != array_b[i]) {
9499 #undef HEADER_LENGTH
9500 #undef TO_INTERNAL_SIZE
9501 #undef FROM_INTERNAL_SIZE
9502 #undef INVLIST_VERSION_ID
9504 /* End of inversion list object */
9507 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9509 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9510 * constructs, and updates RExC_flags with them. On input, RExC_parse
9511 * should point to the first flag; it is updated on output to point to the
9512 * final ')' or ':'. There needs to be at least one flag, or this will
9515 /* for (?g), (?gc), and (?o) warnings; warning
9516 about (?c) will warn about (?g) -- japhy */
9518 #define WASTED_O 0x01
9519 #define WASTED_G 0x02
9520 #define WASTED_C 0x04
9521 #define WASTED_GC (WASTED_G|WASTED_C)
9522 I32 wastedflags = 0x00;
9523 U32 posflags = 0, negflags = 0;
9524 U32 *flagsp = &posflags;
9525 char has_charset_modifier = '\0';
9527 bool has_use_defaults = FALSE;
9528 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9529 int x_mod_count = 0;
9531 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9533 /* '^' as an initial flag sets certain defaults */
9534 if (UCHARAT(RExC_parse) == '^') {
9536 has_use_defaults = TRUE;
9537 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9538 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9539 ? REGEX_UNICODE_CHARSET
9540 : REGEX_DEPENDS_CHARSET);
9543 cs = get_regex_charset(RExC_flags);
9544 if (cs == REGEX_DEPENDS_CHARSET
9545 && (RExC_utf8 || RExC_uni_semantics))
9547 cs = REGEX_UNICODE_CHARSET;
9550 while (*RExC_parse) {
9551 /* && strchr("iogcmsx", *RExC_parse) */
9552 /* (?g), (?gc) and (?o) are useless here
9553 and must be globally applied -- japhy */
9554 switch (*RExC_parse) {
9556 /* Code for the imsx flags */
9557 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9559 case LOCALE_PAT_MOD:
9560 if (has_charset_modifier) {
9561 goto excess_modifier;
9563 else if (flagsp == &negflags) {
9566 cs = REGEX_LOCALE_CHARSET;
9567 has_charset_modifier = LOCALE_PAT_MOD;
9569 case UNICODE_PAT_MOD:
9570 if (has_charset_modifier) {
9571 goto excess_modifier;
9573 else if (flagsp == &negflags) {
9576 cs = REGEX_UNICODE_CHARSET;
9577 has_charset_modifier = UNICODE_PAT_MOD;
9579 case ASCII_RESTRICT_PAT_MOD:
9580 if (flagsp == &negflags) {
9583 if (has_charset_modifier) {
9584 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9585 goto excess_modifier;
9587 /* Doubled modifier implies more restricted */
9588 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9591 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9593 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9595 case DEPENDS_PAT_MOD:
9596 if (has_use_defaults) {
9597 goto fail_modifiers;
9599 else if (flagsp == &negflags) {
9602 else if (has_charset_modifier) {
9603 goto excess_modifier;
9606 /* The dual charset means unicode semantics if the
9607 * pattern (or target, not known until runtime) are
9608 * utf8, or something in the pattern indicates unicode
9610 cs = (RExC_utf8 || RExC_uni_semantics)
9611 ? REGEX_UNICODE_CHARSET
9612 : REGEX_DEPENDS_CHARSET;
9613 has_charset_modifier = DEPENDS_PAT_MOD;
9617 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9618 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9620 else if (has_charset_modifier == *(RExC_parse - 1)) {
9621 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9625 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9630 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9633 case ONCE_PAT_MOD: /* 'o' */
9634 case GLOBAL_PAT_MOD: /* 'g' */
9635 if (PASS2 && ckWARN(WARN_REGEXP)) {
9636 const I32 wflagbit = *RExC_parse == 'o'
9639 if (! (wastedflags & wflagbit) ) {
9640 wastedflags |= wflagbit;
9641 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9644 "Useless (%s%c) - %suse /%c modifier",
9645 flagsp == &negflags ? "?-" : "?",
9647 flagsp == &negflags ? "don't " : "",
9654 case CONTINUE_PAT_MOD: /* 'c' */
9655 if (PASS2 && ckWARN(WARN_REGEXP)) {
9656 if (! (wastedflags & WASTED_C) ) {
9657 wastedflags |= WASTED_GC;
9658 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9661 "Useless (%sc) - %suse /gc modifier",
9662 flagsp == &negflags ? "?-" : "?",
9663 flagsp == &negflags ? "don't " : ""
9668 case KEEPCOPY_PAT_MOD: /* 'p' */
9669 if (flagsp == &negflags) {
9671 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9673 *flagsp |= RXf_PMf_KEEPCOPY;
9677 /* A flag is a default iff it is following a minus, so
9678 * if there is a minus, it means will be trying to
9679 * re-specify a default which is an error */
9680 if (has_use_defaults || flagsp == &negflags) {
9681 goto fail_modifiers;
9684 wastedflags = 0; /* reset so (?g-c) warns twice */
9688 RExC_flags |= posflags;
9689 RExC_flags &= ~negflags;
9690 set_regex_charset(&RExC_flags, cs);
9691 if (RExC_flags & RXf_PMf_FOLD) {
9692 RExC_contains_i = 1;
9695 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9701 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9702 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9703 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9704 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9712 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9717 - reg - regular expression, i.e. main body or parenthesized thing
9719 * Caller must absorb opening parenthesis.
9721 * Combining parenthesis handling with the base level of regular expression
9722 * is a trifle forced, but the need to tie the tails of the branches to what
9723 * follows makes it hard to avoid.
9725 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9727 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9729 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9732 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9733 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9734 needs to be restarted.
9735 Otherwise would only return NULL if regbranch() returns NULL, which
9738 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9739 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9740 * 2 is like 1, but indicates that nextchar() has been called to advance
9741 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9742 * this flag alerts us to the need to check for that */
9744 regnode *ret; /* Will be the head of the group. */
9747 regnode *ender = NULL;
9750 U32 oregflags = RExC_flags;
9751 bool have_branch = 0;
9753 I32 freeze_paren = 0;
9754 I32 after_freeze = 0;
9755 I32 num; /* numeric backreferences */
9757 char * parse_start = RExC_parse; /* MJD */
9758 char * const oregcomp_parse = RExC_parse;
9760 GET_RE_DEBUG_FLAGS_DECL;
9762 PERL_ARGS_ASSERT_REG;
9763 DEBUG_PARSE("reg ");
9765 *flagp = 0; /* Tentatively. */
9768 /* Make an OPEN node, if parenthesized. */
9771 /* Under /x, space and comments can be gobbled up between the '(' and
9772 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9773 * intervening space, as the sequence is a token, and a token should be
9775 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9777 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9778 char *start_verb = RExC_parse;
9779 STRLEN verb_len = 0;
9780 char *start_arg = NULL;
9781 unsigned char op = 0;
9783 int internal_argval = 0; /* internal_argval is only useful if
9786 if (has_intervening_patws) {
9788 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9790 while ( *RExC_parse && *RExC_parse != ')' ) {
9791 if ( *RExC_parse == ':' ) {
9792 start_arg = RExC_parse + 1;
9798 verb_len = RExC_parse - start_verb;
9801 while ( *RExC_parse && *RExC_parse != ')' )
9803 if ( *RExC_parse != ')' )
9804 vFAIL("Unterminated verb pattern argument");
9805 if ( RExC_parse == start_arg )
9808 if ( *RExC_parse != ')' )
9809 vFAIL("Unterminated verb pattern");
9812 switch ( *start_verb ) {
9813 case 'A': /* (*ACCEPT) */
9814 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9816 internal_argval = RExC_nestroot;
9819 case 'C': /* (*COMMIT) */
9820 if ( memEQs(start_verb,verb_len,"COMMIT") )
9823 case 'F': /* (*FAIL) */
9824 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9829 case ':': /* (*:NAME) */
9830 case 'M': /* (*MARK:NAME) */
9831 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9836 case 'P': /* (*PRUNE) */
9837 if ( memEQs(start_verb,verb_len,"PRUNE") )
9840 case 'S': /* (*SKIP) */
9841 if ( memEQs(start_verb,verb_len,"SKIP") )
9844 case 'T': /* (*THEN) */
9845 /* [19:06] <TimToady> :: is then */
9846 if ( memEQs(start_verb,verb_len,"THEN") ) {
9848 RExC_seen |= REG_CUTGROUP_SEEN;
9853 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9855 "Unknown verb pattern '%"UTF8f"'",
9856 UTF8fARG(UTF, verb_len, start_verb));
9859 if ( start_arg && internal_argval ) {
9860 vFAIL3("Verb pattern '%.*s' may not have an argument",
9861 verb_len, start_verb);
9862 } else if ( argok < 0 && !start_arg ) {
9863 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9864 verb_len, start_verb);
9866 ret = reganode(pRExC_state, op, internal_argval);
9867 if ( ! internal_argval && ! SIZE_ONLY ) {
9869 SV *sv = newSVpvn( start_arg,
9870 RExC_parse - start_arg);
9871 ARG(ret) = add_data( pRExC_state,
9873 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9880 if (!internal_argval)
9881 RExC_seen |= REG_VERBARG_SEEN;
9882 } else if ( start_arg ) {
9883 vFAIL3("Verb pattern '%.*s' may not have an argument",
9884 verb_len, start_verb);
9886 ret = reg_node(pRExC_state, op);
9888 nextchar(pRExC_state);
9891 else if (*RExC_parse == '?') { /* (?...) */
9892 bool is_logical = 0;
9893 const char * const seqstart = RExC_parse;
9894 const char * endptr;
9895 if (has_intervening_patws) {
9897 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9901 paren = *RExC_parse++;
9902 ret = NULL; /* For look-ahead/behind. */
9905 case 'P': /* (?P...) variants for those used to PCRE/Python */
9906 paren = *RExC_parse++;
9907 if ( paren == '<') /* (?P<...>) named capture */
9909 else if (paren == '>') { /* (?P>name) named recursion */
9910 goto named_recursion;
9912 else if (paren == '=') { /* (?P=...) named backref */
9913 /* this pretty much dupes the code for \k<NAME> in
9914 * regatom(), if you change this make sure you change that
9916 char* name_start = RExC_parse;
9918 SV *sv_dat = reg_scan_name(pRExC_state,
9919 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9920 if (RExC_parse == name_start || *RExC_parse != ')')
9921 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9922 vFAIL2("Sequence %.3s... not terminated",parse_start);
9925 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9926 RExC_rxi->data->data[num]=(void*)sv_dat;
9927 SvREFCNT_inc_simple_void(sv_dat);
9930 ret = reganode(pRExC_state,
9933 : (ASCII_FOLD_RESTRICTED)
9935 : (AT_LEAST_UNI_SEMANTICS)
9943 Set_Node_Offset(ret, parse_start+1);
9944 Set_Node_Cur_Length(ret, parse_start);
9946 nextchar(pRExC_state);
9950 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9951 vFAIL3("Sequence (%.*s...) not recognized",
9952 RExC_parse-seqstart, seqstart);
9954 case '<': /* (?<...) */
9955 if (*RExC_parse == '!')
9957 else if (*RExC_parse != '=')
9963 case '\'': /* (?'...') */
9964 name_start= RExC_parse;
9965 svname = reg_scan_name(pRExC_state,
9966 SIZE_ONLY /* reverse test from the others */
9967 ? REG_RSN_RETURN_NAME
9968 : REG_RSN_RETURN_NULL);
9969 if (RExC_parse == name_start || *RExC_parse != paren)
9970 vFAIL2("Sequence (?%c... not terminated",
9971 paren=='>' ? '<' : paren);
9975 if (!svname) /* shouldn't happen */
9977 "panic: reg_scan_name returned NULL");
9978 if (!RExC_paren_names) {
9979 RExC_paren_names= newHV();
9980 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9982 RExC_paren_name_list= newAV();
9983 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9986 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9988 sv_dat = HeVAL(he_str);
9990 /* croak baby croak */
9992 "panic: paren_name hash element allocation failed");
9993 } else if ( SvPOK(sv_dat) ) {
9994 /* (?|...) can mean we have dupes so scan to check
9995 its already been stored. Maybe a flag indicating
9996 we are inside such a construct would be useful,
9997 but the arrays are likely to be quite small, so
9998 for now we punt -- dmq */
9999 IV count = SvIV(sv_dat);
10000 I32 *pv = (I32*)SvPVX(sv_dat);
10002 for ( i = 0 ; i < count ; i++ ) {
10003 if ( pv[i] == RExC_npar ) {
10009 pv = (I32*)SvGROW(sv_dat,
10010 SvCUR(sv_dat) + sizeof(I32)+1);
10011 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10012 pv[count] = RExC_npar;
10013 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10016 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10017 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10020 SvIV_set(sv_dat, 1);
10023 /* Yes this does cause a memory leak in debugging Perls
10025 if (!av_store(RExC_paren_name_list,
10026 RExC_npar, SvREFCNT_inc(svname)))
10027 SvREFCNT_dec_NN(svname);
10030 /*sv_dump(sv_dat);*/
10032 nextchar(pRExC_state);
10034 goto capturing_parens;
10036 RExC_seen |= REG_LOOKBEHIND_SEEN;
10037 RExC_in_lookbehind++;
10040 case '=': /* (?=...) */
10041 RExC_seen_zerolen++;
10043 case '!': /* (?!...) */
10044 RExC_seen_zerolen++;
10045 if (*RExC_parse == ')') {
10046 ret=reg_node(pRExC_state, OPFAIL);
10047 nextchar(pRExC_state);
10051 case '|': /* (?|...) */
10052 /* branch reset, behave like a (?:...) except that
10053 buffers in alternations share the same numbers */
10055 after_freeze = freeze_paren = RExC_npar;
10057 case ':': /* (?:...) */
10058 case '>': /* (?>...) */
10060 case '$': /* (?$...) */
10061 case '@': /* (?@...) */
10062 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10064 case '0' : /* (?0) */
10065 case 'R' : /* (?R) */
10066 if (*RExC_parse != ')')
10067 FAIL("Sequence (?R) not terminated");
10068 ret = reg_node(pRExC_state, GOSTART);
10069 RExC_seen |= REG_GOSTART_SEEN;
10070 *flagp |= POSTPONED;
10071 nextchar(pRExC_state);
10074 /* named and numeric backreferences */
10075 case '&': /* (?&NAME) */
10076 parse_start = RExC_parse - 1;
10079 SV *sv_dat = reg_scan_name(pRExC_state,
10080 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10081 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10083 if (RExC_parse == RExC_end || *RExC_parse != ')')
10084 vFAIL("Sequence (?&... not terminated");
10085 goto gen_recurse_regop;
10086 assert(0); /* NOT REACHED */
10088 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10090 vFAIL("Illegal pattern");
10092 goto parse_recursion;
10094 case '-': /* (?-1) */
10095 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10096 RExC_parse--; /* rewind to let it be handled later */
10100 case '1': case '2': case '3': case '4': /* (?1) */
10101 case '5': case '6': case '7': case '8': case '9':
10105 bool is_neg = FALSE;
10106 parse_start = RExC_parse - 1; /* MJD */
10107 if (*RExC_parse == '-') {
10111 num = grok_atou(RExC_parse, &endptr);
10113 RExC_parse = (char*)endptr;
10115 /* Some limit for num? */
10119 if (*RExC_parse!=')')
10120 vFAIL("Expecting close bracket");
10123 if ( paren == '-' ) {
10125 Diagram of capture buffer numbering.
10126 Top line is the normal capture buffer numbers
10127 Bottom line is the negative indexing as from
10131 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10135 num = RExC_npar + num;
10138 vFAIL("Reference to nonexistent group");
10140 } else if ( paren == '+' ) {
10141 num = RExC_npar + num - 1;
10144 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10146 if (num > (I32)RExC_rx->nparens) {
10148 vFAIL("Reference to nonexistent group");
10150 RExC_recurse_count++;
10151 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10152 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10153 22, "| |", (int)(depth * 2 + 1), "",
10154 (UV)ARG(ret), (IV)ARG2L(ret)));
10156 RExC_seen |= REG_RECURSE_SEEN;
10157 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10158 Set_Node_Offset(ret, parse_start); /* MJD */
10160 *flagp |= POSTPONED;
10161 nextchar(pRExC_state);
10164 assert(0); /* NOT REACHED */
10166 case '?': /* (??...) */
10168 if (*RExC_parse != '{') {
10170 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10172 "Sequence (%"UTF8f"...) not recognized",
10173 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10176 *flagp |= POSTPONED;
10177 paren = *RExC_parse++;
10179 case '{': /* (?{...}) */
10182 struct reg_code_block *cb;
10184 RExC_seen_zerolen++;
10186 if ( !pRExC_state->num_code_blocks
10187 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10188 || pRExC_state->code_blocks[pRExC_state->code_index].start
10189 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10192 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10193 FAIL("panic: Sequence (?{...}): no code block found\n");
10194 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10196 /* this is a pre-compiled code block (?{...}) */
10197 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10198 RExC_parse = RExC_start + cb->end;
10201 if (cb->src_regex) {
10202 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10203 RExC_rxi->data->data[n] =
10204 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10205 RExC_rxi->data->data[n+1] = (void*)o;
10208 n = add_data(pRExC_state,
10209 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10210 RExC_rxi->data->data[n] = (void*)o;
10213 pRExC_state->code_index++;
10214 nextchar(pRExC_state);
10218 ret = reg_node(pRExC_state, LOGICAL);
10220 eval = reg2Lanode(pRExC_state, EVAL,
10223 /* for later propagation into (??{})
10225 RExC_flags & RXf_PMf_COMPILETIME
10230 REGTAIL(pRExC_state, ret, eval);
10231 /* deal with the length of this later - MJD */
10234 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10235 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10236 Set_Node_Offset(ret, parse_start);
10239 case '(': /* (?(?{...})...) and (?(?=...)...) */
10242 const int DEFINE_len = sizeof("DEFINE") - 1;
10243 if (RExC_parse[0] == '?') { /* (?(?...)) */
10244 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10245 || RExC_parse[1] == '<'
10246 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10250 ret = reg_node(pRExC_state, LOGICAL);
10254 tail = reg(pRExC_state, 1, &flag, depth+1);
10255 if (flag & RESTART_UTF8) {
10256 *flagp = RESTART_UTF8;
10259 REGTAIL(pRExC_state, ret, tail);
10262 /* Fall through to ‘Unknown switch condition’ at the
10263 end of the if/else chain. */
10265 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10266 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10268 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10269 char *name_start= RExC_parse++;
10271 SV *sv_dat=reg_scan_name(pRExC_state,
10272 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10273 if (RExC_parse == name_start || *RExC_parse != ch)
10274 vFAIL2("Sequence (?(%c... not terminated",
10275 (ch == '>' ? '<' : ch));
10278 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10279 RExC_rxi->data->data[num]=(void*)sv_dat;
10280 SvREFCNT_inc_simple_void(sv_dat);
10282 ret = reganode(pRExC_state,NGROUPP,num);
10283 goto insert_if_check_paren;
10285 else if (strnEQ(RExC_parse, "DEFINE",
10286 MIN(DEFINE_len, RExC_end - RExC_parse)))
10288 ret = reganode(pRExC_state,DEFINEP,0);
10289 RExC_parse += DEFINE_len;
10291 goto insert_if_check_paren;
10293 else if (RExC_parse[0] == 'R') {
10296 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10297 parno = grok_atou(RExC_parse, &endptr);
10299 RExC_parse = (char*)endptr;
10300 } else if (RExC_parse[0] == '&') {
10303 sv_dat = reg_scan_name(pRExC_state,
10305 ? REG_RSN_RETURN_NULL
10306 : REG_RSN_RETURN_DATA);
10307 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10309 ret = reganode(pRExC_state,INSUBP,parno);
10310 goto insert_if_check_paren;
10312 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10316 parno = grok_atou(RExC_parse, &endptr);
10318 RExC_parse = (char*)endptr;
10319 ret = reganode(pRExC_state, GROUPP, parno);
10321 insert_if_check_paren:
10322 if (*(tmp = nextchar(pRExC_state)) != ')') {
10323 /* nextchar also skips comments, so undo its work
10324 * and skip over the the next character.
10327 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10328 vFAIL("Switch condition not recognized");
10331 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10332 br = regbranch(pRExC_state, &flags, 1,depth+1);
10334 if (flags & RESTART_UTF8) {
10335 *flagp = RESTART_UTF8;
10338 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10341 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10343 c = *nextchar(pRExC_state);
10344 if (flags&HASWIDTH)
10345 *flagp |= HASWIDTH;
10348 vFAIL("(?(DEFINE)....) does not allow branches");
10350 /* Fake one for optimizer. */
10351 lastbr = reganode(pRExC_state, IFTHEN, 0);
10353 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10354 if (flags & RESTART_UTF8) {
10355 *flagp = RESTART_UTF8;
10358 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10361 REGTAIL(pRExC_state, ret, lastbr);
10362 if (flags&HASWIDTH)
10363 *flagp |= HASWIDTH;
10364 c = *nextchar(pRExC_state);
10369 if (RExC_parse>RExC_end)
10370 vFAIL("Switch (?(condition)... not terminated");
10372 vFAIL("Switch (?(condition)... contains too many branches");
10374 ender = reg_node(pRExC_state, TAIL);
10375 REGTAIL(pRExC_state, br, ender);
10377 REGTAIL(pRExC_state, lastbr, ender);
10378 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10381 REGTAIL(pRExC_state, ret, ender);
10382 RExC_size++; /* XXX WHY do we need this?!!
10383 For large programs it seems to be required
10384 but I can't figure out why. -- dmq*/
10387 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10388 vFAIL("Unknown switch condition (?(...))");
10390 case '[': /* (?[ ... ]) */
10391 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10394 RExC_parse--; /* for vFAIL to print correctly */
10395 vFAIL("Sequence (? incomplete");
10397 default: /* e.g., (?i) */
10400 parse_lparen_question_flags(pRExC_state);
10401 if (UCHARAT(RExC_parse) != ':') {
10402 nextchar(pRExC_state);
10407 nextchar(pRExC_state);
10417 ret = reganode(pRExC_state, OPEN, parno);
10419 if (!RExC_nestroot)
10420 RExC_nestroot = parno;
10421 if (RExC_seen & REG_RECURSE_SEEN
10422 && !RExC_open_parens[parno-1])
10424 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10425 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10426 22, "| |", (int)(depth * 2 + 1), "",
10427 (IV)parno, REG_NODE_NUM(ret)));
10428 RExC_open_parens[parno-1]= ret;
10431 Set_Node_Length(ret, 1); /* MJD */
10432 Set_Node_Offset(ret, RExC_parse); /* MJD */
10440 /* Pick up the branches, linking them together. */
10441 parse_start = RExC_parse; /* MJD */
10442 br = regbranch(pRExC_state, &flags, 1,depth+1);
10444 /* branch_len = (paren != 0); */
10447 if (flags & RESTART_UTF8) {
10448 *flagp = RESTART_UTF8;
10451 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10453 if (*RExC_parse == '|') {
10454 if (!SIZE_ONLY && RExC_extralen) {
10455 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10458 reginsert(pRExC_state, BRANCH, br, depth+1);
10459 Set_Node_Length(br, paren != 0);
10460 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10464 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10466 else if (paren == ':') {
10467 *flagp |= flags&SIMPLE;
10469 if (is_open) { /* Starts with OPEN. */
10470 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10472 else if (paren != '?') /* Not Conditional */
10474 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10476 while (*RExC_parse == '|') {
10477 if (!SIZE_ONLY && RExC_extralen) {
10478 ender = reganode(pRExC_state, LONGJMP,0);
10480 /* Append to the previous. */
10481 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10484 RExC_extralen += 2; /* Account for LONGJMP. */
10485 nextchar(pRExC_state);
10486 if (freeze_paren) {
10487 if (RExC_npar > after_freeze)
10488 after_freeze = RExC_npar;
10489 RExC_npar = freeze_paren;
10491 br = regbranch(pRExC_state, &flags, 0, depth+1);
10494 if (flags & RESTART_UTF8) {
10495 *flagp = RESTART_UTF8;
10498 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10500 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10502 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10505 if (have_branch || paren != ':') {
10506 /* Make a closing node, and hook it on the end. */
10509 ender = reg_node(pRExC_state, TAIL);
10512 ender = reganode(pRExC_state, CLOSE, parno);
10513 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10514 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10515 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10516 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10517 RExC_close_parens[parno-1]= ender;
10518 if (RExC_nestroot == parno)
10521 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10522 Set_Node_Length(ender,1); /* MJD */
10528 *flagp &= ~HASWIDTH;
10531 ender = reg_node(pRExC_state, SUCCEED);
10534 ender = reg_node(pRExC_state, END);
10536 assert(!RExC_opend); /* there can only be one! */
10537 RExC_opend = ender;
10541 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10542 DEBUG_PARSE_MSG("lsbr");
10543 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10544 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10545 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10546 SvPV_nolen_const(RExC_mysv1),
10547 (IV)REG_NODE_NUM(lastbr),
10548 SvPV_nolen_const(RExC_mysv2),
10549 (IV)REG_NODE_NUM(ender),
10550 (IV)(ender - lastbr)
10553 REGTAIL(pRExC_state, lastbr, ender);
10555 if (have_branch && !SIZE_ONLY) {
10556 char is_nothing= 1;
10558 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10560 /* Hook the tails of the branches to the closing node. */
10561 for (br = ret; br; br = regnext(br)) {
10562 const U8 op = PL_regkind[OP(br)];
10563 if (op == BRANCH) {
10564 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10565 if ( OP(NEXTOPER(br)) != NOTHING
10566 || regnext(NEXTOPER(br)) != ender)
10569 else if (op == BRANCHJ) {
10570 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10571 /* for now we always disable this optimisation * /
10572 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10573 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10579 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10580 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10581 DEBUG_PARSE_MSG("NADA");
10582 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10583 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10584 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10585 SvPV_nolen_const(RExC_mysv1),
10586 (IV)REG_NODE_NUM(ret),
10587 SvPV_nolen_const(RExC_mysv2),
10588 (IV)REG_NODE_NUM(ender),
10593 if (OP(ender) == TAIL) {
10598 for ( opt= br + 1; opt < ender ; opt++ )
10599 OP(opt)= OPTIMIZED;
10600 NEXT_OFF(br)= ender - br;
10608 static const char parens[] = "=!<,>";
10610 if (paren && (p = strchr(parens, paren))) {
10611 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10612 int flag = (p - parens) > 1;
10615 node = SUSPEND, flag = 0;
10616 reginsert(pRExC_state, node,ret, depth+1);
10617 Set_Node_Cur_Length(ret, parse_start);
10618 Set_Node_Offset(ret, parse_start + 1);
10620 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10624 /* Check for proper termination. */
10626 /* restore original flags, but keep (?p) */
10627 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10628 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10629 RExC_parse = oregcomp_parse;
10630 vFAIL("Unmatched (");
10633 else if (!paren && RExC_parse < RExC_end) {
10634 if (*RExC_parse == ')') {
10636 vFAIL("Unmatched )");
10639 FAIL("Junk on end of regexp"); /* "Can't happen". */
10640 assert(0); /* NOTREACHED */
10643 if (RExC_in_lookbehind) {
10644 RExC_in_lookbehind--;
10646 if (after_freeze > RExC_npar)
10647 RExC_npar = after_freeze;
10652 - regbranch - one alternative of an | operator
10654 * Implements the concatenation operator.
10656 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10660 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10663 regnode *chain = NULL;
10665 I32 flags = 0, c = 0;
10666 GET_RE_DEBUG_FLAGS_DECL;
10668 PERL_ARGS_ASSERT_REGBRANCH;
10670 DEBUG_PARSE("brnc");
10675 if (!SIZE_ONLY && RExC_extralen)
10676 ret = reganode(pRExC_state, BRANCHJ,0);
10678 ret = reg_node(pRExC_state, BRANCH);
10679 Set_Node_Length(ret, 1);
10683 if (!first && SIZE_ONLY)
10684 RExC_extralen += 1; /* BRANCHJ */
10686 *flagp = WORST; /* Tentatively. */
10689 nextchar(pRExC_state);
10690 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10691 flags &= ~TRYAGAIN;
10692 latest = regpiece(pRExC_state, &flags,depth+1);
10693 if (latest == NULL) {
10694 if (flags & TRYAGAIN)
10696 if (flags & RESTART_UTF8) {
10697 *flagp = RESTART_UTF8;
10700 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10702 else if (ret == NULL)
10704 *flagp |= flags&(HASWIDTH|POSTPONED);
10705 if (chain == NULL) /* First piece. */
10706 *flagp |= flags&SPSTART;
10709 REGTAIL(pRExC_state, chain, latest);
10714 if (chain == NULL) { /* Loop ran zero times. */
10715 chain = reg_node(pRExC_state, NOTHING);
10720 *flagp |= flags&SIMPLE;
10727 - regpiece - something followed by possible [*+?]
10729 * Note that the branching code sequences used for ? and the general cases
10730 * of * and + are somewhat optimized: they use the same NOTHING node as
10731 * both the endmarker for their branch list and the body of the last branch.
10732 * It might seem that this node could be dispensed with entirely, but the
10733 * endmarker role is not redundant.
10735 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10737 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10741 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10747 const char * const origparse = RExC_parse;
10749 I32 max = REG_INFTY;
10750 #ifdef RE_TRACK_PATTERN_OFFSETS
10753 const char *maxpos = NULL;
10755 /* Save the original in case we change the emitted regop to a FAIL. */
10756 regnode * const orig_emit = RExC_emit;
10758 GET_RE_DEBUG_FLAGS_DECL;
10760 PERL_ARGS_ASSERT_REGPIECE;
10762 DEBUG_PARSE("piec");
10764 ret = regatom(pRExC_state, &flags,depth+1);
10766 if (flags & (TRYAGAIN|RESTART_UTF8))
10767 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10769 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10775 if (op == '{' && regcurly(RExC_parse)) {
10777 #ifdef RE_TRACK_PATTERN_OFFSETS
10778 parse_start = RExC_parse; /* MJD */
10780 next = RExC_parse + 1;
10781 while (isDIGIT(*next) || *next == ',') {
10782 if (*next == ',') {
10790 if (*next == '}') { /* got one */
10791 const char* endptr;
10795 min = grok_atou(RExC_parse, &endptr);
10796 if (*maxpos == ',')
10799 maxpos = RExC_parse;
10800 max = grok_atou(maxpos, &endptr);
10801 if (!max && *maxpos != '0')
10802 max = REG_INFTY; /* meaning "infinity" */
10803 else if (max >= REG_INFTY)
10804 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10806 nextchar(pRExC_state);
10807 if (max < min) { /* If can't match, warn and optimize to fail
10811 /* We can't back off the size because we have to reserve
10812 * enough space for all the things we are about to throw
10813 * away, but we can shrink it by the ammount we are about
10814 * to re-use here */
10815 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10818 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10819 RExC_emit = orig_emit;
10821 ret = reg_node(pRExC_state, OPFAIL);
10824 else if (min == max
10825 && RExC_parse < RExC_end
10826 && (*RExC_parse == '?' || *RExC_parse == '+'))
10829 ckWARN2reg(RExC_parse + 1,
10830 "Useless use of greediness modifier '%c'",
10833 /* Absorb the modifier, so later code doesn't see nor use
10835 nextchar(pRExC_state);
10839 if ((flags&SIMPLE)) {
10840 RExC_naughty += 2 + RExC_naughty / 2;
10841 reginsert(pRExC_state, CURLY, ret, depth+1);
10842 Set_Node_Offset(ret, parse_start+1); /* MJD */
10843 Set_Node_Cur_Length(ret, parse_start);
10846 regnode * const w = reg_node(pRExC_state, WHILEM);
10849 REGTAIL(pRExC_state, ret, w);
10850 if (!SIZE_ONLY && RExC_extralen) {
10851 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10852 reginsert(pRExC_state, NOTHING,ret, depth+1);
10853 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10855 reginsert(pRExC_state, CURLYX,ret, depth+1);
10857 Set_Node_Offset(ret, parse_start+1);
10858 Set_Node_Length(ret,
10859 op == '{' ? (RExC_parse - parse_start) : 1);
10861 if (!SIZE_ONLY && RExC_extralen)
10862 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10863 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10865 RExC_whilem_seen++, RExC_extralen += 3;
10866 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10873 *flagp |= HASWIDTH;
10875 ARG1_SET(ret, (U16)min);
10876 ARG2_SET(ret, (U16)max);
10878 if (max == REG_INFTY)
10879 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10885 if (!ISMULT1(op)) {
10890 #if 0 /* Now runtime fix should be reliable. */
10892 /* if this is reinstated, don't forget to put this back into perldiag:
10894 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10896 (F) The part of the regexp subject to either the * or + quantifier
10897 could match an empty string. The {#} shows in the regular
10898 expression about where the problem was discovered.
10902 if (!(flags&HASWIDTH) && op != '?')
10903 vFAIL("Regexp *+ operand could be empty");
10906 #ifdef RE_TRACK_PATTERN_OFFSETS
10907 parse_start = RExC_parse;
10909 nextchar(pRExC_state);
10911 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10913 if (op == '*' && (flags&SIMPLE)) {
10914 reginsert(pRExC_state, STAR, ret, depth+1);
10917 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10919 else if (op == '*') {
10923 else if (op == '+' && (flags&SIMPLE)) {
10924 reginsert(pRExC_state, PLUS, ret, depth+1);
10927 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10929 else if (op == '+') {
10933 else if (op == '?') {
10938 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10939 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10940 ckWARN2reg(RExC_parse,
10941 "%"UTF8f" matches null string many times",
10942 UTF8fARG(UTF, (RExC_parse >= origparse
10943 ? RExC_parse - origparse
10946 (void)ReREFCNT_inc(RExC_rx_sv);
10949 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10950 nextchar(pRExC_state);
10951 reginsert(pRExC_state, MINMOD, ret, depth+1);
10952 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10955 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10957 nextchar(pRExC_state);
10958 ender = reg_node(pRExC_state, SUCCEED);
10959 REGTAIL(pRExC_state, ret, ender);
10960 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10962 ender = reg_node(pRExC_state, TAIL);
10963 REGTAIL(pRExC_state, ret, ender);
10966 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10968 vFAIL("Nested quantifiers");
10975 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10976 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10980 /* This is expected to be called by a parser routine that has recognized '\N'
10981 and needs to handle the rest. RExC_parse is expected to point at the first
10982 char following the N at the time of the call. On successful return,
10983 RExC_parse has been updated to point to just after the sequence identified
10984 by this routine, <*flagp> has been updated, and the non-NULL input pointers
10985 have been set appropriately.
10987 The typical case for this is \N{some character name}. This is usually
10988 called while parsing the input, filling in or ready to fill in an EXACTish
10989 node, and the code point for the character should be returned, so that it
10990 can be added to the node, and parsing continued with the next input
10991 character. But it may be that instead of a single character the \N{}
10992 expands to more than one, a named sequence. In this case any following
10993 quantifier applies to the whole sequence, and it is easier, given the code
10994 structure that calls this, to handle it from a different area of the code.
10995 For this reason, the input parameters can be set so that it returns valid
10996 only on one or the other of these cases.
10998 Another possibility is for the input to be an empty \N{}, which for
10999 backwards compatibility we accept, but generate a NOTHING node which should
11000 later get optimized out. This is handled from the area of code which can
11001 handle a named sequence, so if called with the parameters for the other, it
11004 Still another possibility is for the \N to mean [^\n], and not a single
11005 character or explicit sequence at all. This is determined by context.
11006 Again, this is handled from the area of code which can handle a named
11007 sequence, so if called with the parameters for the other, it also fails.
11009 And the final possibility is for the \N to be called from within a bracketed
11010 character class. In this case the [^\n] meaning makes no sense, and so is
11011 an error. Other anomalous situations are left to the calling code to handle.
11013 For non-single-quoted regexes, the tokenizer has attempted to decide which
11014 of the above applies, and in the case of a named sequence, has converted it
11015 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11016 where c1... are the characters in the sequence. For single-quoted regexes,
11017 the tokenizer passes the \N sequence through unchanged; this code will not
11018 attempt to determine this nor expand those, instead raising a syntax error.
11019 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11020 or there is no '}', it signals that this \N occurrence means to match a
11021 non-newline. (This mostly was done because of [perl #56444].)
11023 The API is somewhat convoluted due to historical and the above reasons.
11025 The function raises an error (via vFAIL), and doesn't return for various
11026 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
11027 it returns a count of how many characters were accounted for by it. (This
11028 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11029 points in the sequence. It sets <node_p>, <valuep>, and/or
11030 <substitute_parse> on success.
11032 If <valuep> is non-null, it means the caller can accept an input sequence
11033 consisting of a just a single code point; <*valuep> is set to the value
11034 of the only or first code point in the input.
11036 If <substitute_parse> is non-null, it means the caller can accept an input
11037 sequence consisting of one or more code points; <*substitute_parse> is a
11038 newly created mortal SV* in this case, containing \x{} escapes representing
11041 Both <valuep> and <substitute_parse> can be non-NULL.
11043 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
11044 that the caller can accept any legal sequence other than a single code
11045 point. To wit, <*node_p> is set as follows:
11046 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11047 2) \N{}: points to a new NOTHING node; return is 0
11048 3) otherwise: points to a new EXACT node containing the resolved
11049 string; return is the number of code points in the
11050 string. This will never be 1.
11051 Note that failure is returned for single code point sequences if <valuep> is
11052 null and <node_p> is not.
11055 char * endbrace; /* '}' following the name */
11057 char *endchar; /* Points to '.' or '}' ending cur char in the input
11059 bool has_multiple_chars; /* true if the input stream contains a sequence of
11060 more than one character */
11061 bool in_char_class = substitute_parse != NULL;
11062 STRLEN count = 0; /* Number of characters in this sequence */
11064 GET_RE_DEBUG_FLAGS_DECL;
11066 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11068 GET_RE_DEBUG_FLAGS;
11070 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
11071 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11073 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11074 * modifier. The other meaning does not, so use a temporary until we find
11075 * out which we are being called with */
11076 p = (RExC_flags & RXf_PMf_EXTENDED)
11077 ? regpatws(pRExC_state, RExC_parse,
11078 TRUE) /* means recognize comments */
11081 /* Disambiguate between \N meaning a named character versus \N meaning
11082 * [^\n]. The former is assumed when it can't be the latter. */
11083 if (*p != '{' || regcurly(p)) {
11086 /* no bare \N allowed in a charclass */
11087 if (in_char_class) {
11088 vFAIL("\\N in a character class must be a named character: \\N{...}");
11090 return (STRLEN) -1;
11092 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11094 nextchar(pRExC_state);
11095 *node_p = reg_node(pRExC_state, REG_ANY);
11096 *flagp |= HASWIDTH|SIMPLE;
11098 Set_Node_Length(*node_p, 1); /* MJD */
11102 /* Here, we have decided it should be a named character or sequence */
11104 /* The test above made sure that the next real character is a '{', but
11105 * under the /x modifier, it could be separated by space (or a comment and
11106 * \n) and this is not allowed (for consistency with \x{...} and the
11107 * tokenizer handling of \N{NAME}). */
11108 if (*RExC_parse != '{') {
11109 vFAIL("Missing braces on \\N{}");
11112 RExC_parse++; /* Skip past the '{' */
11114 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11115 || ! (endbrace == RExC_parse /* nothing between the {} */
11116 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
11118 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
11121 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11122 vFAIL("\\N{NAME} must be resolved by the lexer");
11125 if (endbrace == RExC_parse) { /* empty: \N{} */
11127 *node_p = reg_node(pRExC_state,NOTHING);
11129 else if (! in_char_class) {
11130 return (STRLEN) -1;
11132 nextchar(pRExC_state);
11136 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11137 RExC_parse += 2; /* Skip past the 'U+' */
11139 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11141 /* Code points are separated by dots. If none, there is only one code
11142 * point, and is terminated by the brace */
11143 has_multiple_chars = (endchar < endbrace);
11145 /* We get the first code point if we want it, and either there is only one,
11146 * or we can accept both cases of one and more than one */
11147 if (valuep && (substitute_parse || ! has_multiple_chars)) {
11148 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11149 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11150 | PERL_SCAN_DISALLOW_PREFIX
11152 /* No errors in the first pass (See [perl
11153 * #122671].) We let the code below find the
11154 * errors when there are multiple chars. */
11155 | ((SIZE_ONLY || has_multiple_chars)
11156 ? PERL_SCAN_SILENT_ILLDIGIT
11159 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11161 /* The tokenizer should have guaranteed validity, but it's possible to
11162 * bypass it by using single quoting, so check. Don't do the check
11163 * here when there are multiple chars; we do it below anyway. */
11164 if (! has_multiple_chars) {
11165 if (length_of_hex == 0
11166 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11168 RExC_parse += length_of_hex; /* Includes all the valid */
11169 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11170 ? UTF8SKIP(RExC_parse)
11172 /* Guard against malformed utf8 */
11173 if (RExC_parse >= endchar) {
11174 RExC_parse = endchar;
11176 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11179 RExC_parse = endbrace + 1;
11184 /* Here, we should have already handled the case where a single character
11185 * is expected and found. So it is a failure if we aren't expecting
11186 * multiple chars and got them; or didn't get them but wanted them. We
11187 * fail without advancing the parse, so that the caller can try again with
11188 * different acceptance criteria */
11189 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11191 return (STRLEN) -1;
11196 /* What is done here is to convert this to a sub-pattern of the form
11197 * \x{char1}\x{char2}...
11198 * and then either return it in <*substitute_parse> if non-null; or
11199 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
11200 * way, it retains its atomicness, while not having to worry about
11201 * special handling that some code points may have. toke.c has
11202 * converted the original Unicode values to native, so that we can just
11203 * pass on the hex values unchanged. We do have to set a flag to keep
11204 * recoding from happening in the recursion */
11208 char *orig_end = RExC_end;
11211 if (substitute_parse) {
11212 *substitute_parse = newSVpvs("");
11215 substitute_parse = &dummy;
11216 *substitute_parse = newSVpvs("?:");
11218 *substitute_parse = sv_2mortal(*substitute_parse);
11220 while (RExC_parse < endbrace) {
11222 /* Convert to notation the rest of the code understands */
11223 sv_catpv(*substitute_parse, "\\x{");
11224 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11225 sv_catpv(*substitute_parse, "}");
11227 /* Point to the beginning of the next character in the sequence. */
11228 RExC_parse = endchar + 1;
11229 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11233 if (! in_char_class) {
11234 sv_catpv(*substitute_parse, ")");
11237 RExC_parse = SvPV(*substitute_parse, len);
11239 /* Don't allow empty number */
11240 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11241 RExC_parse = endbrace;
11242 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11244 RExC_end = RExC_parse + len;
11246 /* The values are Unicode, and therefore not subject to recoding */
11247 RExC_override_recoding = 1;
11250 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11251 if (flags & RESTART_UTF8) {
11252 *flagp = RESTART_UTF8;
11253 return (STRLEN) -1;
11255 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11258 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11261 RExC_parse = endbrace;
11262 RExC_end = orig_end;
11263 RExC_override_recoding = 0;
11265 nextchar(pRExC_state);
11275 * It returns the code point in utf8 for the value in *encp.
11276 * value: a code value in the source encoding
11277 * encp: a pointer to an Encode object
11279 * If the result from Encode is not a single character,
11280 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11283 S_reg_recode(pTHX_ const char value, SV **encp)
11286 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11287 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11288 const STRLEN newlen = SvCUR(sv);
11289 UV uv = UNICODE_REPLACEMENT;
11291 PERL_ARGS_ASSERT_REG_RECODE;
11295 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11298 if (!newlen || numlen != newlen) {
11299 uv = UNICODE_REPLACEMENT;
11305 PERL_STATIC_INLINE U8
11306 S_compute_EXACTish(RExC_state_t *pRExC_state)
11310 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11316 op = get_regex_charset(RExC_flags);
11317 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11318 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11319 been, so there is no hole */
11322 return op + EXACTF;
11325 PERL_STATIC_INLINE void
11326 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11327 regnode *node, I32* flagp, STRLEN len, UV code_point,
11330 /* This knows the details about sizing an EXACTish node, setting flags for
11331 * it (by setting <*flagp>, and potentially populating it with a single
11334 * If <len> (the length in bytes) is non-zero, this function assumes that
11335 * the node has already been populated, and just does the sizing. In this
11336 * case <code_point> should be the final code point that has already been
11337 * placed into the node. This value will be ignored except that under some
11338 * circumstances <*flagp> is set based on it.
11340 * If <len> is zero, the function assumes that the node is to contain only
11341 * the single character given by <code_point> and calculates what <len>
11342 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11343 * additionally will populate the node's STRING with <code_point> or its
11346 * In both cases <*flagp> is appropriately set
11348 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11349 * 255, must be folded (the former only when the rules indicate it can
11352 * When it does the populating, it looks at the flag 'downgradable'. If
11353 * true with a node that folds, it checks if the single code point
11354 * participates in a fold, and if not downgrades the node to an EXACT.
11355 * This helps the optimizer */
11357 bool len_passed_in = cBOOL(len != 0);
11358 U8 character[UTF8_MAXBYTES_CASE+1];
11360 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11362 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11363 * sizing difference, and is extra work that is thrown away */
11364 if (downgradable && ! PASS2) {
11365 downgradable = FALSE;
11368 if (! len_passed_in) {
11370 if (UVCHR_IS_INVARIANT(code_point)) {
11371 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11372 *character = (U8) code_point;
11374 else { /* Here is /i and not /l. (toFOLD() is defined on just
11375 ASCII, which isn't the same thing as INVARIANT on
11376 EBCDIC, but it works there, as the extra invariants
11377 fold to themselves) */
11378 *character = toFOLD((U8) code_point);
11380 /* We can downgrade to an EXACT node if this character
11381 * isn't a folding one. Note that this assumes that
11382 * nothing above Latin1 folds to some other invariant than
11383 * one of these alphabetics; otherwise we would also have
11385 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11386 * || ASCII_FOLD_RESTRICTED))
11388 if (downgradable && PL_fold[code_point] == code_point) {
11394 else if (FOLD && (! LOC
11395 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11396 { /* Folding, and ok to do so now */
11397 UV folded = _to_uni_fold_flags(
11401 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11402 ? FOLD_FLAGS_NOMIX_ASCII
11405 && folded == code_point /* This quickly rules out many
11406 cases, avoiding the
11407 _invlist_contains_cp() overhead
11409 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11414 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11416 /* Not folding this cp, and can output it directly */
11417 *character = UTF8_TWO_BYTE_HI(code_point);
11418 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11422 uvchr_to_utf8( character, code_point);
11423 len = UTF8SKIP(character);
11425 } /* Else pattern isn't UTF8. */
11427 *character = (U8) code_point;
11429 } /* Else is folded non-UTF8 */
11430 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11432 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11433 * comments at join_exact()); */
11434 *character = (U8) code_point;
11437 /* Can turn into an EXACT node if we know the fold at compile time,
11438 * and it folds to itself and doesn't particpate in other folds */
11441 && PL_fold_latin1[code_point] == code_point
11442 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11443 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11447 } /* else is Sharp s. May need to fold it */
11448 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11450 *(character + 1) = 's';
11454 *character = LATIN_SMALL_LETTER_SHARP_S;
11460 RExC_size += STR_SZ(len);
11463 RExC_emit += STR_SZ(len);
11464 STR_LEN(node) = len;
11465 if (! len_passed_in) {
11466 Copy((char *) character, STRING(node), len, char);
11470 *flagp |= HASWIDTH;
11472 /* A single character node is SIMPLE, except for the special-cased SHARP S
11474 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11475 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11476 || ! FOLD || ! DEPENDS_SEMANTICS))
11481 /* The OP may not be well defined in PASS1 */
11482 if (PASS2 && OP(node) == EXACTFL) {
11483 RExC_contains_locale = 1;
11488 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11489 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11492 S_backref_value(char *p)
11494 const char* endptr;
11495 UV val = grok_atou(p, &endptr);
11496 if (endptr == p || endptr == NULL || val > I32_MAX)
11503 - regatom - the lowest level
11505 Try to identify anything special at the start of the pattern. If there
11506 is, then handle it as required. This may involve generating a single regop,
11507 such as for an assertion; or it may involve recursing, such as to
11508 handle a () structure.
11510 If the string doesn't start with something special then we gobble up
11511 as much literal text as we can.
11513 Once we have been able to handle whatever type of thing started the
11514 sequence, we return.
11516 Note: we have to be careful with escapes, as they can be both literal
11517 and special, and in the case of \10 and friends, context determines which.
11519 A summary of the code structure is:
11521 switch (first_byte) {
11522 cases for each special:
11523 handle this special;
11526 switch (2nd byte) {
11527 cases for each unambiguous special:
11528 handle this special;
11530 cases for each ambigous special/literal:
11532 if (special) handle here
11534 default: // unambiguously literal:
11537 default: // is a literal char
11540 create EXACTish node for literal;
11541 while (more input and node isn't full) {
11542 switch (input_byte) {
11543 cases for each special;
11544 make sure parse pointer is set so that the next call to
11545 regatom will see this special first
11546 goto loopdone; // EXACTish node terminated by prev. char
11548 append char to EXACTISH node;
11550 get next input byte;
11554 return the generated node;
11556 Specifically there are two separate switches for handling
11557 escape sequences, with the one for handling literal escapes requiring
11558 a dummy entry for all of the special escapes that are actually handled
11561 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11563 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11565 Otherwise does not return NULL.
11569 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11571 regnode *ret = NULL;
11573 char *parse_start = RExC_parse;
11578 GET_RE_DEBUG_FLAGS_DECL;
11580 *flagp = WORST; /* Tentatively. */
11582 DEBUG_PARSE("atom");
11584 PERL_ARGS_ASSERT_REGATOM;
11587 switch ((U8)*RExC_parse) {
11589 RExC_seen_zerolen++;
11590 nextchar(pRExC_state);
11591 if (RExC_flags & RXf_PMf_MULTILINE)
11592 ret = reg_node(pRExC_state, MBOL);
11594 ret = reg_node(pRExC_state, SBOL);
11595 Set_Node_Length(ret, 1); /* MJD */
11598 nextchar(pRExC_state);
11600 RExC_seen_zerolen++;
11601 if (RExC_flags & RXf_PMf_MULTILINE)
11602 ret = reg_node(pRExC_state, MEOL);
11604 ret = reg_node(pRExC_state, SEOL);
11605 Set_Node_Length(ret, 1); /* MJD */
11608 nextchar(pRExC_state);
11609 if (RExC_flags & RXf_PMf_SINGLELINE)
11610 ret = reg_node(pRExC_state, SANY);
11612 ret = reg_node(pRExC_state, REG_ANY);
11613 *flagp |= HASWIDTH|SIMPLE;
11615 Set_Node_Length(ret, 1); /* MJD */
11619 char * const oregcomp_parse = ++RExC_parse;
11620 ret = regclass(pRExC_state, flagp,depth+1,
11621 FALSE, /* means parse the whole char class */
11622 TRUE, /* allow multi-char folds */
11623 FALSE, /* don't silence non-portable warnings. */
11625 if (*RExC_parse != ']') {
11626 RExC_parse = oregcomp_parse;
11627 vFAIL("Unmatched [");
11630 if (*flagp & RESTART_UTF8)
11632 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11635 nextchar(pRExC_state);
11636 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11640 nextchar(pRExC_state);
11641 ret = reg(pRExC_state, 2, &flags,depth+1);
11643 if (flags & TRYAGAIN) {
11644 if (RExC_parse == RExC_end) {
11645 /* Make parent create an empty node if needed. */
11646 *flagp |= TRYAGAIN;
11651 if (flags & RESTART_UTF8) {
11652 *flagp = RESTART_UTF8;
11655 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11658 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11662 if (flags & TRYAGAIN) {
11663 *flagp |= TRYAGAIN;
11666 vFAIL("Internal urp");
11667 /* Supposed to be caught earlier. */
11673 vFAIL("Quantifier follows nothing");
11678 This switch handles escape sequences that resolve to some kind
11679 of special regop and not to literal text. Escape sequnces that
11680 resolve to literal text are handled below in the switch marked
11683 Every entry in this switch *must* have a corresponding entry
11684 in the literal escape switch. However, the opposite is not
11685 required, as the default for this switch is to jump to the
11686 literal text handling code.
11688 switch ((U8)*++RExC_parse) {
11689 /* Special Escapes */
11691 RExC_seen_zerolen++;
11692 ret = reg_node(pRExC_state, SBOL);
11693 /* SBOL is shared with /^/ so we set the flags so we can tell
11694 * /\A/ from /^/ in split. We check ret because first pass we
11695 * have no regop struct to set the flags on. */
11699 goto finish_meta_pat;
11701 ret = reg_node(pRExC_state, GPOS);
11702 RExC_seen |= REG_GPOS_SEEN;
11704 goto finish_meta_pat;
11706 RExC_seen_zerolen++;
11707 ret = reg_node(pRExC_state, KEEPS);
11709 /* XXX:dmq : disabling in-place substitution seems to
11710 * be necessary here to avoid cases of memory corruption, as
11711 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11713 RExC_seen |= REG_LOOKBEHIND_SEEN;
11714 goto finish_meta_pat;
11716 ret = reg_node(pRExC_state, SEOL);
11718 RExC_seen_zerolen++; /* Do not optimize RE away */
11719 goto finish_meta_pat;
11721 ret = reg_node(pRExC_state, EOS);
11723 RExC_seen_zerolen++; /* Do not optimize RE away */
11724 goto finish_meta_pat;
11726 ret = reg_node(pRExC_state, CANY);
11727 RExC_seen |= REG_CANY_SEEN;
11728 *flagp |= HASWIDTH|SIMPLE;
11730 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11732 goto finish_meta_pat;
11734 ret = reg_node(pRExC_state, CLUMP);
11735 *flagp |= HASWIDTH;
11736 goto finish_meta_pat;
11742 arg = ANYOF_WORDCHAR;
11746 RExC_seen_zerolen++;
11747 RExC_seen |= REG_LOOKBEHIND_SEEN;
11748 op = BOUND + get_regex_charset(RExC_flags);
11749 if (op > BOUNDA) { /* /aa is same as /a */
11752 else if (op == BOUNDL) {
11753 RExC_contains_locale = 1;
11755 ret = reg_node(pRExC_state, op);
11756 FLAGS(ret) = get_regex_charset(RExC_flags);
11758 if ((U8) *(RExC_parse + 1) == '{') {
11759 /* diag_listed_as: Use "%s" instead of "%s" */
11760 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11762 goto finish_meta_pat;
11764 RExC_seen_zerolen++;
11765 RExC_seen |= REG_LOOKBEHIND_SEEN;
11766 op = NBOUND + get_regex_charset(RExC_flags);
11767 if (op > NBOUNDA) { /* /aa is same as /a */
11770 else if (op == NBOUNDL) {
11771 RExC_contains_locale = 1;
11773 ret = reg_node(pRExC_state, op);
11774 FLAGS(ret) = get_regex_charset(RExC_flags);
11776 if ((U8) *(RExC_parse + 1) == '{') {
11777 /* diag_listed_as: Use "%s" instead of "%s" */
11778 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11780 goto finish_meta_pat;
11790 ret = reg_node(pRExC_state, LNBREAK);
11791 *flagp |= HASWIDTH|SIMPLE;
11792 goto finish_meta_pat;
11800 goto join_posix_op_known;
11806 arg = ANYOF_VERTWS;
11808 goto join_posix_op_known;
11818 op = POSIXD + get_regex_charset(RExC_flags);
11819 if (op > POSIXA) { /* /aa is same as /a */
11822 else if (op == POSIXL) {
11823 RExC_contains_locale = 1;
11826 join_posix_op_known:
11829 op += NPOSIXD - POSIXD;
11832 ret = reg_node(pRExC_state, op);
11834 FLAGS(ret) = namedclass_to_classnum(arg);
11837 *flagp |= HASWIDTH|SIMPLE;
11841 nextchar(pRExC_state);
11842 Set_Node_Length(ret, 2); /* MJD */
11848 char* parse_start = RExC_parse - 2;
11853 ret = regclass(pRExC_state, flagp,depth+1,
11854 TRUE, /* means just parse this element */
11855 FALSE, /* don't allow multi-char folds */
11856 FALSE, /* don't silence non-portable warnings.
11857 It would be a bug if these returned
11860 /* regclass() can only return RESTART_UTF8 if multi-char folds
11863 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11868 Set_Node_Offset(ret, parse_start + 2);
11869 Set_Node_Cur_Length(ret, parse_start);
11870 nextchar(pRExC_state);
11874 /* Handle \N and \N{NAME} with multiple code points here and not
11875 * below because it can be multicharacter. join_exact() will join
11876 * them up later on. Also this makes sure that things like
11877 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11878 * The options to the grok function call causes it to fail if the
11879 * sequence is just a single code point. We then go treat it as
11880 * just another character in the current EXACT node, and hence it
11881 * gets uniform treatment with all the other characters. The
11882 * special treatment for quantifiers is not needed for such single
11883 * character sequences */
11885 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11888 if (*flagp & RESTART_UTF8)
11894 case 'k': /* Handle \k<NAME> and \k'NAME' */
11897 char ch= RExC_parse[1];
11898 if (ch != '<' && ch != '\'' && ch != '{') {
11900 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11901 vFAIL2("Sequence %.2s... not terminated",parse_start);
11903 /* this pretty much dupes the code for (?P=...) in reg(), if
11904 you change this make sure you change that */
11905 char* name_start = (RExC_parse += 2);
11907 SV *sv_dat = reg_scan_name(pRExC_state,
11908 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11909 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11910 if (RExC_parse == name_start || *RExC_parse != ch)
11911 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11912 vFAIL2("Sequence %.3s... not terminated",parse_start);
11915 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11916 RExC_rxi->data->data[num]=(void*)sv_dat;
11917 SvREFCNT_inc_simple_void(sv_dat);
11921 ret = reganode(pRExC_state,
11924 : (ASCII_FOLD_RESTRICTED)
11926 : (AT_LEAST_UNI_SEMANTICS)
11932 *flagp |= HASWIDTH;
11934 /* override incorrect value set in reganode MJD */
11935 Set_Node_Offset(ret, parse_start+1);
11936 Set_Node_Cur_Length(ret, parse_start);
11937 nextchar(pRExC_state);
11943 case '1': case '2': case '3': case '4':
11944 case '5': case '6': case '7': case '8': case '9':
11949 if (*RExC_parse == 'g') {
11953 if (*RExC_parse == '{') {
11957 if (*RExC_parse == '-') {
11961 if (hasbrace && !isDIGIT(*RExC_parse)) {
11962 if (isrel) RExC_parse--;
11964 goto parse_named_seq;
11967 num = S_backref_value(RExC_parse);
11969 vFAIL("Reference to invalid group 0");
11970 else if (num == I32_MAX) {
11971 if (isDIGIT(*RExC_parse))
11972 vFAIL("Reference to nonexistent group");
11974 vFAIL("Unterminated \\g... pattern");
11978 num = RExC_npar - num;
11980 vFAIL("Reference to nonexistent or unclosed group");
11984 num = S_backref_value(RExC_parse);
11985 /* bare \NNN might be backref or octal - if it is larger than or equal
11986 * RExC_npar then it is assumed to be and octal escape.
11987 * Note RExC_npar is +1 from the actual number of parens*/
11988 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11989 && *RExC_parse != '8' && *RExC_parse != '9'))
11991 /* Probably a character specified in octal, e.g. \35 */
11996 /* at this point RExC_parse definitely points to a backref
11999 #ifdef RE_TRACK_PATTERN_OFFSETS
12000 char * const parse_start = RExC_parse - 1; /* MJD */
12002 while (isDIGIT(*RExC_parse))
12005 if (*RExC_parse != '}')
12006 vFAIL("Unterminated \\g{...} pattern");
12010 if (num > (I32)RExC_rx->nparens)
12011 vFAIL("Reference to nonexistent group");
12014 ret = reganode(pRExC_state,
12017 : (ASCII_FOLD_RESTRICTED)
12019 : (AT_LEAST_UNI_SEMANTICS)
12025 *flagp |= HASWIDTH;
12027 /* override incorrect value set in reganode MJD */
12028 Set_Node_Offset(ret, parse_start+1);
12029 Set_Node_Cur_Length(ret, parse_start);
12031 nextchar(pRExC_state);
12036 if (RExC_parse >= RExC_end)
12037 FAIL("Trailing \\");
12040 /* Do not generate "unrecognized" warnings here, we fall
12041 back into the quick-grab loop below */
12048 if (RExC_flags & RXf_PMf_EXTENDED) {
12049 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12050 if (RExC_parse < RExC_end)
12057 parse_start = RExC_parse - 1;
12066 #define MAX_NODE_STRING_SIZE 127
12067 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12069 U8 upper_parse = MAX_NODE_STRING_SIZE;
12070 U8 node_type = compute_EXACTish(pRExC_state);
12071 bool next_is_quantifier;
12072 char * oldp = NULL;
12074 /* We can convert EXACTF nodes to EXACTFU if they contain only
12075 * characters that match identically regardless of the target
12076 * string's UTF8ness. The reason to do this is that EXACTF is not
12077 * trie-able, EXACTFU is.
12079 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12080 * contain only above-Latin1 characters (hence must be in UTF8),
12081 * which don't participate in folds with Latin1-range characters,
12082 * as the latter's folds aren't known until runtime. (We don't
12083 * need to figure this out until pass 2) */
12084 bool maybe_exactfu = PASS2
12085 && (node_type == EXACTF || node_type == EXACTFL);
12087 /* If a folding node contains only code points that don't
12088 * participate in folds, it can be changed into an EXACT node,
12089 * which allows the optimizer more things to look for */
12092 ret = reg_node(pRExC_state, node_type);
12094 /* In pass1, folded, we use a temporary buffer instead of the
12095 * actual node, as the node doesn't exist yet */
12096 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12102 /* We do the EXACTFish to EXACT node only if folding. (And we
12103 * don't need to figure this out until pass 2) */
12104 maybe_exact = FOLD && PASS2;
12106 /* XXX The node can hold up to 255 bytes, yet this only goes to
12107 * 127. I (khw) do not know why. Keeping it somewhat less than
12108 * 255 allows us to not have to worry about overflow due to
12109 * converting to utf8 and fold expansion, but that value is
12110 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12111 * split up by this limit into a single one using the real max of
12112 * 255. Even at 127, this breaks under rare circumstances. If
12113 * folding, we do not want to split a node at a character that is a
12114 * non-final in a multi-char fold, as an input string could just
12115 * happen to want to match across the node boundary. The join
12116 * would solve that problem if the join actually happens. But a
12117 * series of more than two nodes in a row each of 127 would cause
12118 * the first join to succeed to get to 254, but then there wouldn't
12119 * be room for the next one, which could at be one of those split
12120 * multi-char folds. I don't know of any fool-proof solution. One
12121 * could back off to end with only a code point that isn't such a
12122 * non-final, but it is possible for there not to be any in the
12124 for (p = RExC_parse - 1;
12125 len < upper_parse && p < RExC_end;
12130 if (RExC_flags & RXf_PMf_EXTENDED)
12131 p = regpatws(pRExC_state, p,
12132 TRUE); /* means recognize comments */
12143 /* Literal Escapes Switch
12145 This switch is meant to handle escape sequences that
12146 resolve to a literal character.
12148 Every escape sequence that represents something
12149 else, like an assertion or a char class, is handled
12150 in the switch marked 'Special Escapes' above in this
12151 routine, but also has an entry here as anything that
12152 isn't explicitly mentioned here will be treated as
12153 an unescaped equivalent literal.
12156 switch ((U8)*++p) {
12157 /* These are all the special escapes. */
12158 case 'A': /* Start assertion */
12159 case 'b': case 'B': /* Word-boundary assertion*/
12160 case 'C': /* Single char !DANGEROUS! */
12161 case 'd': case 'D': /* digit class */
12162 case 'g': case 'G': /* generic-backref, pos assertion */
12163 case 'h': case 'H': /* HORIZWS */
12164 case 'k': case 'K': /* named backref, keep marker */
12165 case 'p': case 'P': /* Unicode property */
12166 case 'R': /* LNBREAK */
12167 case 's': case 'S': /* space class */
12168 case 'v': case 'V': /* VERTWS */
12169 case 'w': case 'W': /* word class */
12170 case 'X': /* eXtended Unicode "combining
12171 character sequence" */
12172 case 'z': case 'Z': /* End of line/string assertion */
12176 /* Anything after here is an escape that resolves to a
12177 literal. (Except digits, which may or may not)
12183 case 'N': /* Handle a single-code point named character. */
12184 /* The options cause it to fail if a multiple code
12185 * point sequence. Handle those in the switch() above
12187 RExC_parse = p + 1;
12188 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12194 if (*flagp & RESTART_UTF8)
12195 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12196 RExC_parse = p = oldp;
12200 if (ender > 0xff) {
12217 ender = ESC_NATIVE;
12227 const char* error_msg;
12229 bool valid = grok_bslash_o(&p,
12232 PASS2, /* out warnings */
12233 FALSE, /* not strict */
12234 TRUE, /* Output warnings
12239 RExC_parse = p; /* going to die anyway; point
12240 to exact spot of failure */
12244 if (PL_encoding && ender < 0x100) {
12245 goto recode_encoding;
12247 if (ender > 0xff) {
12254 UV result = UV_MAX; /* initialize to erroneous
12256 const char* error_msg;
12258 bool valid = grok_bslash_x(&p,
12261 PASS2, /* out warnings */
12262 FALSE, /* not strict */
12263 TRUE, /* Output warnings
12268 RExC_parse = p; /* going to die anyway; point
12269 to exact spot of failure */
12274 if (PL_encoding && ender < 0x100) {
12275 goto recode_encoding;
12277 if (ender > 0xff) {
12284 ender = grok_bslash_c(*p++, PASS2);
12286 case '8': case '9': /* must be a backreference */
12289 case '1': case '2': case '3':case '4':
12290 case '5': case '6': case '7':
12291 /* When we parse backslash escapes there is ambiguity
12292 * between backreferences and octal escapes. Any escape
12293 * from \1 - \9 is a backreference, any multi-digit
12294 * escape which does not start with 0 and which when
12295 * evaluated as decimal could refer to an already
12296 * parsed capture buffer is a backslash. Anything else
12299 * Note this implies that \118 could be interpreted as
12300 * 118 OR as "\11" . "8" depending on whether there
12301 * were 118 capture buffers defined already in the
12304 /* NOTE, RExC_npar is 1 more than the actual number of
12305 * parens we have seen so far, hence the < RExC_npar below. */
12307 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12308 { /* Not to be treated as an octal constant, go
12316 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12318 ender = grok_oct(p, &numlen, &flags, NULL);
12319 if (ender > 0xff) {
12323 if (PASS2 /* like \08, \178 */
12326 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12328 reg_warn_non_literal_string(
12330 form_short_octal_warning(p, numlen));
12333 if (PL_encoding && ender < 0x100)
12334 goto recode_encoding;
12337 if (! RExC_override_recoding) {
12338 SV* enc = PL_encoding;
12339 ender = reg_recode((const char)(U8)ender, &enc);
12341 ckWARNreg(p, "Invalid escape in the specified encoding");
12347 FAIL("Trailing \\");
12350 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12351 /* Include any { following the alpha to emphasize
12352 * that it could be part of an escape at some point
12354 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12355 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12357 goto normal_default;
12358 } /* End of switch on '\' */
12361 /* Currently we don't warn when the lbrace is at the start
12362 * of a construct. This catches it in the middle of a
12363 * literal string, or when its the first thing after
12364 * something like "\b" */
12366 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12368 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12371 default: /* A literal character */
12373 if (UTF8_IS_START(*p) && UTF) {
12375 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12376 &numlen, UTF8_ALLOW_DEFAULT);
12382 } /* End of switch on the literal */
12384 /* Here, have looked at the literal character and <ender>
12385 * contains its ordinal, <p> points to the character after it
12388 if ( RExC_flags & RXf_PMf_EXTENDED)
12389 p = regpatws(pRExC_state, p,
12390 TRUE); /* means recognize comments */
12392 /* If the next thing is a quantifier, it applies to this
12393 * character only, which means that this character has to be in
12394 * its own node and can't just be appended to the string in an
12395 * existing node, so if there are already other characters in
12396 * the node, close the node with just them, and set up to do
12397 * this character again next time through, when it will be the
12398 * only thing in its new node */
12399 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12405 if (! FOLD /* The simple case, just append the literal */
12406 || (LOC /* Also don't fold for tricky chars under /l */
12407 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12410 const STRLEN unilen = reguni(pRExC_state, ender, s);
12416 /* The loop increments <len> each time, as all but this
12417 * path (and one other) through it add a single byte to
12418 * the EXACTish node. But this one has changed len to
12419 * be the correct final value, so subtract one to
12420 * cancel out the increment that follows */
12424 REGC((char)ender, s++);
12427 /* Can get here if folding only if is one of the /l
12428 * characters whose fold depends on the locale. The
12429 * occurrence of any of these indicate that we can't
12430 * simplify things */
12432 maybe_exact = FALSE;
12433 maybe_exactfu = FALSE;
12438 /* See comments for join_exact() as to why we fold this
12439 * non-UTF at compile time */
12440 || (node_type == EXACTFU
12441 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12443 /* Here, are folding and are not UTF-8 encoded; therefore
12444 * the character must be in the range 0-255, and is not /l
12445 * (Not /l because we already handled these under /l in
12446 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12447 if (IS_IN_SOME_FOLD_L1(ender)) {
12448 maybe_exact = FALSE;
12450 /* See if the character's fold differs between /d and
12451 * /u. This includes the multi-char fold SHARP S to
12454 && (PL_fold[ender] != PL_fold_latin1[ender]
12455 || ender == LATIN_SMALL_LETTER_SHARP_S
12457 && isALPHA_FOLD_EQ(ender, 's')
12458 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12460 maybe_exactfu = FALSE;
12464 /* Even when folding, we store just the input character, as
12465 * we have an array that finds its fold quickly */
12466 *(s++) = (char) ender;
12468 else { /* FOLD and UTF */
12469 /* Unlike the non-fold case, we do actually have to
12470 * calculate the results here in pass 1. This is for two
12471 * reasons, the folded length may be longer than the
12472 * unfolded, and we have to calculate how many EXACTish
12473 * nodes it will take; and we may run out of room in a node
12474 * in the middle of a potential multi-char fold, and have
12475 * to back off accordingly. (Hence we can't use REGC for
12476 * the simple case just below.) */
12479 if (isASCII_uni(ender)) {
12480 folded = toFOLD(ender);
12481 *(s)++ = (U8) folded;
12486 folded = _to_uni_fold_flags(
12490 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12491 ? FOLD_FLAGS_NOMIX_ASCII
12495 /* The loop increments <len> each time, as all but this
12496 * path (and one other) through it add a single byte to
12497 * the EXACTish node. But this one has changed len to
12498 * be the correct final value, so subtract one to
12499 * cancel out the increment that follows */
12500 len += foldlen - 1;
12502 /* If this node only contains non-folding code points so
12503 * far, see if this new one is also non-folding */
12505 if (folded != ender) {
12506 maybe_exact = FALSE;
12509 /* Here the fold is the original; we have to check
12510 * further to see if anything folds to it */
12511 if (_invlist_contains_cp(PL_utf8_foldable,
12514 maybe_exact = FALSE;
12521 if (next_is_quantifier) {
12523 /* Here, the next input is a quantifier, and to get here,
12524 * the current character is the only one in the node.
12525 * Also, here <len> doesn't include the final byte for this
12531 } /* End of loop through literal characters */
12533 /* Here we have either exhausted the input or ran out of room in
12534 * the node. (If we encountered a character that can't be in the
12535 * node, transfer is made directly to <loopdone>, and so we
12536 * wouldn't have fallen off the end of the loop.) In the latter
12537 * case, we artificially have to split the node into two, because
12538 * we just don't have enough space to hold everything. This
12539 * creates a problem if the final character participates in a
12540 * multi-character fold in the non-final position, as a match that
12541 * should have occurred won't, due to the way nodes are matched,
12542 * and our artificial boundary. So back off until we find a non-
12543 * problematic character -- one that isn't at the beginning or
12544 * middle of such a fold. (Either it doesn't participate in any
12545 * folds, or appears only in the final position of all the folds it
12546 * does participate in.) A better solution with far fewer false
12547 * positives, and that would fill the nodes more completely, would
12548 * be to actually have available all the multi-character folds to
12549 * test against, and to back-off only far enough to be sure that
12550 * this node isn't ending with a partial one. <upper_parse> is set
12551 * further below (if we need to reparse the node) to include just
12552 * up through that final non-problematic character that this code
12553 * identifies, so when it is set to less than the full node, we can
12554 * skip the rest of this */
12555 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12557 const STRLEN full_len = len;
12559 assert(len >= MAX_NODE_STRING_SIZE);
12561 /* Here, <s> points to the final byte of the final character.
12562 * Look backwards through the string until find a non-
12563 * problematic character */
12567 /* This has no multi-char folds to non-UTF characters */
12568 if (ASCII_FOLD_RESTRICTED) {
12572 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12576 if (! PL_NonL1NonFinalFold) {
12577 PL_NonL1NonFinalFold = _new_invlist_C_array(
12578 NonL1_Perl_Non_Final_Folds_invlist);
12581 /* Point to the first byte of the final character */
12582 s = (char *) utf8_hop((U8 *) s, -1);
12584 while (s >= s0) { /* Search backwards until find
12585 non-problematic char */
12586 if (UTF8_IS_INVARIANT(*s)) {
12588 /* There are no ascii characters that participate
12589 * in multi-char folds under /aa. In EBCDIC, the
12590 * non-ascii invariants are all control characters,
12591 * so don't ever participate in any folds. */
12592 if (ASCII_FOLD_RESTRICTED
12593 || ! IS_NON_FINAL_FOLD(*s))
12598 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12599 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12605 else if (! _invlist_contains_cp(
12606 PL_NonL1NonFinalFold,
12607 valid_utf8_to_uvchr((U8 *) s, NULL)))
12612 /* Here, the current character is problematic in that
12613 * it does occur in the non-final position of some
12614 * fold, so try the character before it, but have to
12615 * special case the very first byte in the string, so
12616 * we don't read outside the string */
12617 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12618 } /* End of loop backwards through the string */
12620 /* If there were only problematic characters in the string,
12621 * <s> will point to before s0, in which case the length
12622 * should be 0, otherwise include the length of the
12623 * non-problematic character just found */
12624 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12627 /* Here, have found the final character, if any, that is
12628 * non-problematic as far as ending the node without splitting
12629 * it across a potential multi-char fold. <len> contains the
12630 * number of bytes in the node up-to and including that
12631 * character, or is 0 if there is no such character, meaning
12632 * the whole node contains only problematic characters. In
12633 * this case, give up and just take the node as-is. We can't
12638 /* If the node ends in an 's' we make sure it stays EXACTF,
12639 * as if it turns into an EXACTFU, it could later get
12640 * joined with another 's' that would then wrongly match
12642 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12644 maybe_exactfu = FALSE;
12648 /* Here, the node does contain some characters that aren't
12649 * problematic. If one such is the final character in the
12650 * node, we are done */
12651 if (len == full_len) {
12654 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12656 /* If the final character is problematic, but the
12657 * penultimate is not, back-off that last character to
12658 * later start a new node with it */
12663 /* Here, the final non-problematic character is earlier
12664 * in the input than the penultimate character. What we do
12665 * is reparse from the beginning, going up only as far as
12666 * this final ok one, thus guaranteeing that the node ends
12667 * in an acceptable character. The reason we reparse is
12668 * that we know how far in the character is, but we don't
12669 * know how to correlate its position with the input parse.
12670 * An alternate implementation would be to build that
12671 * correlation as we go along during the original parse,
12672 * but that would entail extra work for every node, whereas
12673 * this code gets executed only when the string is too
12674 * large for the node, and the final two characters are
12675 * problematic, an infrequent occurrence. Yet another
12676 * possible strategy would be to save the tail of the
12677 * string, and the next time regatom is called, initialize
12678 * with that. The problem with this is that unless you
12679 * back off one more character, you won't be guaranteed
12680 * regatom will get called again, unless regbranch,
12681 * regpiece ... are also changed. If you do back off that
12682 * extra character, so that there is input guaranteed to
12683 * force calling regatom, you can't handle the case where
12684 * just the first character in the node is acceptable. I
12685 * (khw) decided to try this method which doesn't have that
12686 * pitfall; if performance issues are found, we can do a
12687 * combination of the current approach plus that one */
12693 } /* End of verifying node ends with an appropriate char */
12695 loopdone: /* Jumped to when encounters something that shouldn't be in
12698 /* I (khw) don't know if you can get here with zero length, but the
12699 * old code handled this situation by creating a zero-length EXACT
12700 * node. Might as well be NOTHING instead */
12706 /* If 'maybe_exact' is still set here, means there are no
12707 * code points in the node that participate in folds;
12708 * similarly for 'maybe_exactfu' and code points that match
12709 * differently depending on UTF8ness of the target string
12710 * (for /u), or depending on locale for /l */
12714 else if (maybe_exactfu) {
12718 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12719 FALSE /* Don't look to see if could
12720 be turned into an EXACT
12721 node, as we have already
12726 RExC_parse = p - 1;
12727 Set_Node_Cur_Length(ret, parse_start);
12728 nextchar(pRExC_state);
12730 /* len is STRLEN which is unsigned, need to copy to signed */
12733 vFAIL("Internal disaster");
12736 } /* End of label 'defchar:' */
12738 } /* End of giant switch on input character */
12744 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12746 /* Returns the next non-pattern-white space, non-comment character (the
12747 * latter only if 'recognize_comment is true) in the string p, which is
12748 * ended by RExC_end. See also reg_skipcomment */
12749 const char *e = RExC_end;
12751 PERL_ARGS_ASSERT_REGPATWS;
12755 if ((len = is_PATWS_safe(p, e, UTF))) {
12758 else if (recognize_comment && *p == '#') {
12759 p = reg_skipcomment(pRExC_state, p);
12768 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12770 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12771 * sets up the bitmap and any flags, removing those code points from the
12772 * inversion list, setting it to NULL should it become completely empty */
12774 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12775 assert(PL_regkind[OP(node)] == ANYOF);
12777 ANYOF_BITMAP_ZERO(node);
12778 if (*invlist_ptr) {
12780 /* This gets set if we actually need to modify things */
12781 bool change_invlist = FALSE;
12785 /* Start looking through *invlist_ptr */
12786 invlist_iterinit(*invlist_ptr);
12787 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12791 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12792 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12794 else if (end >= NUM_ANYOF_CODE_POINTS) {
12795 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12798 /* Quit if are above what we should change */
12799 if (start >= NUM_ANYOF_CODE_POINTS) {
12803 change_invlist = TRUE;
12805 /* Set all the bits in the range, up to the max that we are doing */
12806 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12808 : NUM_ANYOF_CODE_POINTS - 1;
12809 for (i = start; i <= (int) high; i++) {
12810 if (! ANYOF_BITMAP_TEST(node, i)) {
12811 ANYOF_BITMAP_SET(node, i);
12815 invlist_iterfinish(*invlist_ptr);
12817 /* Done with loop; remove any code points that are in the bitmap from
12818 * *invlist_ptr; similarly for code points above the bitmap if we have
12819 * a flag to match all of them anyways */
12820 if (change_invlist) {
12821 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12823 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12824 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12827 /* If have completely emptied it, remove it completely */
12828 if (_invlist_len(*invlist_ptr) == 0) {
12829 SvREFCNT_dec_NN(*invlist_ptr);
12830 *invlist_ptr = NULL;
12835 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12836 Character classes ([:foo:]) can also be negated ([:^foo:]).
12837 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12838 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12839 but trigger failures because they are currently unimplemented. */
12841 #define POSIXCC_DONE(c) ((c) == ':')
12842 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12843 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12845 PERL_STATIC_INLINE I32
12846 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12848 I32 namedclass = OOB_NAMEDCLASS;
12850 PERL_ARGS_ASSERT_REGPPOSIXCC;
12852 if (value == '[' && RExC_parse + 1 < RExC_end &&
12853 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12854 POSIXCC(UCHARAT(RExC_parse)))
12856 const char c = UCHARAT(RExC_parse);
12857 char* const s = RExC_parse++;
12859 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12861 if (RExC_parse == RExC_end) {
12864 /* Try to give a better location for the error (than the end of
12865 * the string) by looking for the matching ']' */
12867 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12870 vFAIL2("Unmatched '%c' in POSIX class", c);
12872 /* Grandfather lone [:, [=, [. */
12876 const char* const t = RExC_parse++; /* skip over the c */
12879 if (UCHARAT(RExC_parse) == ']') {
12880 const char *posixcc = s + 1;
12881 RExC_parse++; /* skip over the ending ] */
12884 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12885 const I32 skip = t - posixcc;
12887 /* Initially switch on the length of the name. */
12890 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12891 this is the Perl \w
12893 namedclass = ANYOF_WORDCHAR;
12896 /* Names all of length 5. */
12897 /* alnum alpha ascii blank cntrl digit graph lower
12898 print punct space upper */
12899 /* Offset 4 gives the best switch position. */
12900 switch (posixcc[4]) {
12902 if (memEQ(posixcc, "alph", 4)) /* alpha */
12903 namedclass = ANYOF_ALPHA;
12906 if (memEQ(posixcc, "spac", 4)) /* space */
12907 namedclass = ANYOF_PSXSPC;
12910 if (memEQ(posixcc, "grap", 4)) /* graph */
12911 namedclass = ANYOF_GRAPH;
12914 if (memEQ(posixcc, "asci", 4)) /* ascii */
12915 namedclass = ANYOF_ASCII;
12918 if (memEQ(posixcc, "blan", 4)) /* blank */
12919 namedclass = ANYOF_BLANK;
12922 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12923 namedclass = ANYOF_CNTRL;
12926 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12927 namedclass = ANYOF_ALPHANUMERIC;
12930 if (memEQ(posixcc, "lowe", 4)) /* lower */
12931 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12932 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12933 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12936 if (memEQ(posixcc, "digi", 4)) /* digit */
12937 namedclass = ANYOF_DIGIT;
12938 else if (memEQ(posixcc, "prin", 4)) /* print */
12939 namedclass = ANYOF_PRINT;
12940 else if (memEQ(posixcc, "punc", 4)) /* punct */
12941 namedclass = ANYOF_PUNCT;
12946 if (memEQ(posixcc, "xdigit", 6))
12947 namedclass = ANYOF_XDIGIT;
12951 if (namedclass == OOB_NAMEDCLASS)
12953 "POSIX class [:%"UTF8f":] unknown",
12954 UTF8fARG(UTF, t - s - 1, s + 1));
12956 /* The #defines are structured so each complement is +1 to
12957 * the normal one */
12961 assert (posixcc[skip] == ':');
12962 assert (posixcc[skip+1] == ']');
12963 } else if (!SIZE_ONLY) {
12964 /* [[=foo=]] and [[.foo.]] are still future. */
12966 /* adjust RExC_parse so the warning shows after
12967 the class closes */
12968 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12970 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12973 /* Maternal grandfather:
12974 * "[:" ending in ":" but not in ":]" */
12976 vFAIL("Unmatched '[' in POSIX class");
12979 /* Grandfather lone [:, [=, [. */
12989 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12991 /* This applies some heuristics at the current parse position (which should
12992 * be at a '[') to see if what follows might be intended to be a [:posix:]
12993 * class. It returns true if it really is a posix class, of course, but it
12994 * also can return true if it thinks that what was intended was a posix
12995 * class that didn't quite make it.
12997 * It will return true for
12999 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13000 * ')' indicating the end of the (?[
13001 * [:any garbage including %^&$ punctuation:]
13003 * This is designed to be called only from S_handle_regex_sets; it could be
13004 * easily adapted to be called from the spot at the beginning of regclass()
13005 * that checks to see in a normal bracketed class if the surrounding []
13006 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13007 * change long-standing behavior, so I (khw) didn't do that */
13008 char* p = RExC_parse + 1;
13009 char first_char = *p;
13011 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13013 assert(*(p - 1) == '[');
13015 if (! POSIXCC(first_char)) {
13020 while (p < RExC_end && isWORDCHAR(*p)) p++;
13022 if (p >= RExC_end) {
13026 if (p - RExC_parse > 2 /* Got at least 1 word character */
13027 && (*p == first_char
13028 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13033 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13036 && p - RExC_parse > 2 /* [:] evaluates to colon;
13037 [::] is a bad posix class. */
13038 && first_char == *(p - 1));
13042 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13043 I32 *flagp, U32 depth,
13044 char * const oregcomp_parse)
13046 /* Handle the (?[...]) construct to do set operations */
13049 UV start, end; /* End points of code point ranges */
13051 char *save_end, *save_parse;
13056 const bool save_fold = FOLD;
13058 GET_RE_DEBUG_FLAGS_DECL;
13060 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13063 vFAIL("(?[...]) not valid in locale");
13065 RExC_uni_semantics = 1;
13067 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13068 * (such as EXACT). Thus we can skip most everything if just sizing. We
13069 * call regclass to handle '[]' so as to not have to reinvent its parsing
13070 * rules here (throwing away the size it computes each time). And, we exit
13071 * upon an unescaped ']' that isn't one ending a regclass. To do both
13072 * these things, we need to realize that something preceded by a backslash
13073 * is escaped, so we have to keep track of backslashes */
13075 Perl_ck_warner_d(aTHX_
13076 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13077 "The regex_sets feature is experimental" REPORT_LOCATION,
13078 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13080 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13081 RExC_precomp + (RExC_parse - RExC_precomp)));
13084 UV depth = 0; /* how many nested (?[...]) constructs */
13086 while (RExC_parse < RExC_end) {
13087 SV* current = NULL;
13088 RExC_parse = regpatws(pRExC_state, RExC_parse,
13089 TRUE); /* means recognize comments */
13090 switch (*RExC_parse) {
13092 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13097 /* Skip the next byte (which could cause us to end up in
13098 * the middle of a UTF-8 character, but since none of those
13099 * are confusable with anything we currently handle in this
13100 * switch (invariants all), it's safe. We'll just hit the
13101 * default: case next time and keep on incrementing until
13102 * we find one of the invariants we do handle. */
13107 /* If this looks like it is a [:posix:] class, leave the
13108 * parse pointer at the '[' to fool regclass() into
13109 * thinking it is part of a '[[:posix:]]'. That function
13110 * will use strict checking to force a syntax error if it
13111 * doesn't work out to a legitimate class */
13112 bool is_posix_class
13113 = could_it_be_a_POSIX_class(pRExC_state);
13114 if (! is_posix_class) {
13118 /* regclass() can only return RESTART_UTF8 if multi-char
13119 folds are allowed. */
13120 if (!regclass(pRExC_state, flagp,depth+1,
13121 is_posix_class, /* parse the whole char
13122 class only if not a
13124 FALSE, /* don't allow multi-char folds */
13125 TRUE, /* silence non-portable warnings. */
13127 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13130 /* function call leaves parse pointing to the ']', except
13131 * if we faked it */
13132 if (is_posix_class) {
13136 SvREFCNT_dec(current); /* In case it returned something */
13141 if (depth--) break;
13143 if (RExC_parse < RExC_end
13144 && *RExC_parse == ')')
13146 node = reganode(pRExC_state, ANYOF, 0);
13147 RExC_size += ANYOF_SKIP;
13148 nextchar(pRExC_state);
13149 Set_Node_Length(node,
13150 RExC_parse - oregcomp_parse + 1); /* MJD */
13159 FAIL("Syntax error in (?[...])");
13162 /* Pass 2 only after this. Everything in this construct is a
13163 * metacharacter. Operands begin with either a '\' (for an escape
13164 * sequence), or a '[' for a bracketed character class. Any other
13165 * character should be an operator, or parenthesis for grouping. Both
13166 * types of operands are handled by calling regclass() to parse them. It
13167 * is called with a parameter to indicate to return the computed inversion
13168 * list. The parsing here is implemented via a stack. Each entry on the
13169 * stack is a single character representing one of the operators, or the
13170 * '('; or else a pointer to an operand inversion list. */
13172 #define IS_OPERAND(a) (! SvIOK(a))
13174 /* The stack starts empty. It is a syntax error if the first thing parsed
13175 * is a binary operator; everything else is pushed on the stack. When an
13176 * operand is parsed, the top of the stack is examined. If it is a binary
13177 * operator, the item before it should be an operand, and both are replaced
13178 * by the result of doing that operation on the new operand and the one on
13179 * the stack. Thus a sequence of binary operands is reduced to a single
13180 * one before the next one is parsed.
13182 * A unary operator may immediately follow a binary in the input, for
13185 * When an operand is parsed and the top of the stack is a unary operator,
13186 * the operation is performed, and then the stack is rechecked to see if
13187 * this new operand is part of a binary operation; if so, it is handled as
13190 * A '(' is simply pushed on the stack; it is valid only if the stack is
13191 * empty, or the top element of the stack is an operator or another '('
13192 * (for which the parenthesized expression will become an operand). By the
13193 * time the corresponding ')' is parsed everything in between should have
13194 * been parsed and evaluated to a single operand (or else is a syntax
13195 * error), and is handled as a regular operand */
13197 sv_2mortal((SV *)(stack = newAV()));
13199 while (RExC_parse < RExC_end) {
13200 I32 top_index = av_tindex(stack);
13202 SV* current = NULL;
13204 /* Skip white space */
13205 RExC_parse = regpatws(pRExC_state, RExC_parse,
13206 TRUE /* means recognize comments */ );
13207 if (RExC_parse >= RExC_end) {
13208 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13210 if ((curchar = UCHARAT(RExC_parse)) == ']') {
13217 if (av_tindex(stack) >= 0 /* This makes sure that we can
13218 safely subtract 1 from
13219 RExC_parse in the next clause.
13220 If we have something on the
13221 stack, we have parsed something
13223 && UCHARAT(RExC_parse - 1) == '('
13224 && RExC_parse < RExC_end)
13226 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13227 * This happens when we have some thing like
13229 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13231 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13233 * Here we would be handling the interpolated
13234 * '$thai_or_lao'. We handle this by a recursive call to
13235 * ourselves which returns the inversion list the
13236 * interpolated expression evaluates to. We use the flags
13237 * from the interpolated pattern. */
13238 U32 save_flags = RExC_flags;
13239 const char * const save_parse = ++RExC_parse;
13241 parse_lparen_question_flags(pRExC_state);
13243 if (RExC_parse == save_parse /* Makes sure there was at
13244 least one flag (or this
13245 embedding wasn't compiled)
13247 || RExC_parse >= RExC_end - 4
13248 || UCHARAT(RExC_parse) != ':'
13249 || UCHARAT(++RExC_parse) != '('
13250 || UCHARAT(++RExC_parse) != '?'
13251 || UCHARAT(++RExC_parse) != '[')
13254 /* In combination with the above, this moves the
13255 * pointer to the point just after the first erroneous
13256 * character (or if there are no flags, to where they
13257 * should have been) */
13258 if (RExC_parse >= RExC_end - 4) {
13259 RExC_parse = RExC_end;
13261 else if (RExC_parse != save_parse) {
13262 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13264 vFAIL("Expecting '(?flags:(?[...'");
13267 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13268 depth+1, oregcomp_parse);
13270 /* Here, 'current' contains the embedded expression's
13271 * inversion list, and RExC_parse points to the trailing
13272 * ']'; the next character should be the ')' which will be
13273 * paired with the '(' that has been put on the stack, so
13274 * the whole embedded expression reduces to '(operand)' */
13277 RExC_flags = save_flags;
13278 goto handle_operand;
13283 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13284 vFAIL("Unexpected character");
13287 /* regclass() can only return RESTART_UTF8 if multi-char
13288 folds are allowed. */
13289 if (!regclass(pRExC_state, flagp,depth+1,
13290 TRUE, /* means parse just the next thing */
13291 FALSE, /* don't allow multi-char folds */
13292 FALSE, /* don't silence non-portable warnings. */
13294 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13296 /* regclass() will return with parsing just the \ sequence,
13297 * leaving the parse pointer at the next thing to parse */
13299 goto handle_operand;
13301 case '[': /* Is a bracketed character class */
13303 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13305 if (! is_posix_class) {
13309 /* regclass() can only return RESTART_UTF8 if multi-char
13310 folds are allowed. */
13311 if(!regclass(pRExC_state, flagp,depth+1,
13312 is_posix_class, /* parse the whole char class
13313 only if not a posix class */
13314 FALSE, /* don't allow multi-char folds */
13315 FALSE, /* don't silence non-portable warnings. */
13317 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13319 /* function call leaves parse pointing to the ']', except if we
13321 if (is_posix_class) {
13325 goto handle_operand;
13334 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13335 || ! IS_OPERAND(*top_ptr))
13338 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13340 av_push(stack, newSVuv(curchar));
13344 av_push(stack, newSVuv(curchar));
13348 if (top_index >= 0) {
13349 top_ptr = av_fetch(stack, top_index, FALSE);
13351 if (IS_OPERAND(*top_ptr)) {
13353 vFAIL("Unexpected '(' with no preceding operator");
13356 av_push(stack, newSVuv(curchar));
13363 || ! (current = av_pop(stack))
13364 || ! IS_OPERAND(current)
13365 || ! (lparen = av_pop(stack))
13366 || IS_OPERAND(lparen)
13367 || SvUV(lparen) != '(')
13369 SvREFCNT_dec(current);
13371 vFAIL("Unexpected ')'");
13374 SvREFCNT_dec_NN(lparen);
13381 /* Here, we have an operand to process, in 'current' */
13383 if (top_index < 0) { /* Just push if stack is empty */
13384 av_push(stack, current);
13387 SV* top = av_pop(stack);
13389 char current_operator;
13391 if (IS_OPERAND(top)) {
13392 SvREFCNT_dec_NN(top);
13393 SvREFCNT_dec_NN(current);
13394 vFAIL("Operand with no preceding operator");
13396 current_operator = (char) SvUV(top);
13397 switch (current_operator) {
13398 case '(': /* Push the '(' back on followed by the new
13400 av_push(stack, top);
13401 av_push(stack, current);
13402 SvREFCNT_inc(top); /* Counters the '_dec' done
13403 just after the 'break', so
13404 it doesn't get wrongly freed
13409 _invlist_invert(current);
13411 /* Unlike binary operators, the top of the stack,
13412 * now that this unary one has been popped off, may
13413 * legally be an operator, and we now have operand
13416 SvREFCNT_dec_NN(top);
13417 goto handle_operand;
13420 prev = av_pop(stack);
13421 _invlist_intersection(prev,
13424 av_push(stack, current);
13429 prev = av_pop(stack);
13430 _invlist_union(prev, current, ¤t);
13431 av_push(stack, current);
13435 prev = av_pop(stack);;
13436 _invlist_subtract(prev, current, ¤t);
13437 av_push(stack, current);
13440 case '^': /* The union minus the intersection */
13446 prev = av_pop(stack);
13447 _invlist_union(prev, current, &u);
13448 _invlist_intersection(prev, current, &i);
13449 /* _invlist_subtract will overwrite current
13450 without freeing what it already contains */
13452 _invlist_subtract(u, i, ¤t);
13453 av_push(stack, current);
13454 SvREFCNT_dec_NN(i);
13455 SvREFCNT_dec_NN(u);
13456 SvREFCNT_dec_NN(element);
13461 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13463 SvREFCNT_dec_NN(top);
13464 SvREFCNT_dec(prev);
13468 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13471 if (av_tindex(stack) < 0 /* Was empty */
13472 || ((final = av_pop(stack)) == NULL)
13473 || ! IS_OPERAND(final)
13474 || av_tindex(stack) >= 0) /* More left on stack */
13476 vFAIL("Incomplete expression within '(?[ ])'");
13479 /* Here, 'final' is the resultant inversion list from evaluating the
13480 * expression. Return it if so requested */
13481 if (return_invlist) {
13482 *return_invlist = final;
13486 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13487 * expecting a string of ranges and individual code points */
13488 invlist_iterinit(final);
13489 result_string = newSVpvs("");
13490 while (invlist_iternext(final, &start, &end)) {
13491 if (start == end) {
13492 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13495 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13500 save_parse = RExC_parse;
13501 RExC_parse = SvPV(result_string, len);
13502 save_end = RExC_end;
13503 RExC_end = RExC_parse + len;
13505 /* We turn off folding around the call, as the class we have constructed
13506 * already has all folding taken into consideration, and we don't want
13507 * regclass() to add to that */
13508 RExC_flags &= ~RXf_PMf_FOLD;
13509 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13511 node = regclass(pRExC_state, flagp,depth+1,
13512 FALSE, /* means parse the whole char class */
13513 FALSE, /* don't allow multi-char folds */
13514 TRUE, /* silence non-portable warnings. The above may very
13515 well have generated non-portable code points, but
13516 they're valid on this machine */
13519 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13522 RExC_flags |= RXf_PMf_FOLD;
13524 RExC_parse = save_parse + 1;
13525 RExC_end = save_end;
13526 SvREFCNT_dec_NN(final);
13527 SvREFCNT_dec_NN(result_string);
13529 nextchar(pRExC_state);
13530 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13536 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13538 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13539 * innocent-looking character class, like /[ks]/i won't have to go out to
13540 * disk to find the possible matches.
13542 * This should be called only for a Latin1-range code points, cp, which is
13543 * known to be involved in a simple fold with other code points above
13544 * Latin1. It would give false results if /aa has been specified.
13545 * Multi-char folds are outside the scope of this, and must be handled
13548 * XXX It would be better to generate these via regen, in case a new
13549 * version of the Unicode standard adds new mappings, though that is not
13550 * really likely, and may be caught by the default: case of the switch
13553 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13555 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13561 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13565 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13568 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13569 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13571 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13572 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13573 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13575 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13576 *invlist = add_cp_to_invlist(*invlist,
13577 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13579 case LATIN_SMALL_LETTER_SHARP_S:
13580 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13583 /* Use deprecated warning to increase the chances of this being
13586 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13593 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13595 /* This adds the string scalar <multi_string> to the array
13596 * <multi_char_matches>. <multi_string> is known to have exactly
13597 * <cp_count> code points in it. This is used when constructing a
13598 * bracketed character class and we find something that needs to match more
13599 * than a single character.
13601 * <multi_char_matches> is actually an array of arrays. Each top-level
13602 * element is an array that contains all the strings known so far that are
13603 * the same length. And that length (in number of code points) is the same
13604 * as the index of the top-level array. Hence, the [2] element is an
13605 * array, each element thereof is a string containing TWO code points;
13606 * while element [3] is for strings of THREE characters, and so on. Since
13607 * this is for multi-char strings there can never be a [0] nor [1] element.
13609 * When we rewrite the character class below, we will do so such that the
13610 * longest strings are written first, so that it prefers the longest
13611 * matching strings first. This is done even if it turns out that any
13612 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13613 * Christiansen has agreed that this is ok. This makes the test for the
13614 * ligature 'ffi' come before the test for 'ff', for example */
13617 AV** this_array_ptr;
13619 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13621 if (! multi_char_matches) {
13622 multi_char_matches = newAV();
13625 if (av_exists(multi_char_matches, cp_count)) {
13626 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13627 this_array = *this_array_ptr;
13630 this_array = newAV();
13631 av_store(multi_char_matches, cp_count,
13634 av_push(this_array, multi_string);
13636 return multi_char_matches;
13639 /* The names of properties whose definitions are not known at compile time are
13640 * stored in this SV, after a constant heading. So if the length has been
13641 * changed since initialization, then there is a run-time definition. */
13642 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13643 (SvCUR(listsv) != initial_listsv_len)
13646 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13647 const bool stop_at_1, /* Just parse the next thing, don't
13648 look for a full character class */
13649 bool allow_multi_folds,
13650 const bool silence_non_portable, /* Don't output warnings
13653 SV** ret_invlist) /* Return an inversion list, not a node */
13655 /* parse a bracketed class specification. Most of these will produce an
13656 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13657 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13658 * under /i with multi-character folds: it will be rewritten following the
13659 * paradigm of this example, where the <multi-fold>s are characters which
13660 * fold to multiple character sequences:
13661 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13662 * gets effectively rewritten as:
13663 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13664 * reg() gets called (recursively) on the rewritten version, and this
13665 * function will return what it constructs. (Actually the <multi-fold>s
13666 * aren't physically removed from the [abcdefghi], it's just that they are
13667 * ignored in the recursion by means of a flag:
13668 * <RExC_in_multi_char_class>.)
13670 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13671 * characters, with the corresponding bit set if that character is in the
13672 * list. For characters above this, a range list or swash is used. There
13673 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13674 * determinable at compile time
13676 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13677 * to be restarted. This can only happen if ret_invlist is non-NULL.
13680 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13682 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13685 IV namedclass = OOB_NAMEDCLASS;
13686 char *rangebegin = NULL;
13687 bool need_class = 0;
13689 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13690 than just initialized. */
13691 SV* properties = NULL; /* Code points that match \p{} \P{} */
13692 SV* posixes = NULL; /* Code points that match classes like [:word:],
13693 extended beyond the Latin1 range. These have to
13694 be kept separate from other code points for much
13695 of this function because their handling is
13696 different under /i, and for most classes under
13698 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13699 separate for a while from the non-complemented
13700 versions because of complications with /d
13702 UV element_count = 0; /* Number of distinct elements in the class.
13703 Optimizations may be possible if this is tiny */
13704 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13705 character; used under /i */
13707 char * stop_ptr = RExC_end; /* where to stop parsing */
13708 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13710 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13712 /* Unicode properties are stored in a swash; this holds the current one
13713 * being parsed. If this swash is the only above-latin1 component of the
13714 * character class, an optimization is to pass it directly on to the
13715 * execution engine. Otherwise, it is set to NULL to indicate that there
13716 * are other things in the class that have to be dealt with at execution
13718 SV* swash = NULL; /* Code points that match \p{} \P{} */
13720 /* Set if a component of this character class is user-defined; just passed
13721 * on to the engine */
13722 bool has_user_defined_property = FALSE;
13724 /* inversion list of code points this node matches only when the target
13725 * string is in UTF-8. (Because is under /d) */
13726 SV* depends_list = NULL;
13728 /* Inversion list of code points this node matches regardless of things
13729 * like locale, folding, utf8ness of the target string */
13730 SV* cp_list = NULL;
13732 /* Like cp_list, but code points on this list need to be checked for things
13733 * that fold to/from them under /i */
13734 SV* cp_foldable_list = NULL;
13736 /* Like cp_list, but code points on this list are valid only when the
13737 * runtime locale is UTF-8 */
13738 SV* only_utf8_locale_list = NULL;
13741 /* In a range, counts how many 0-2 of the ends of it came from literals,
13742 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13743 UV literal_endpoint = 0;
13745 bool invert = FALSE; /* Is this class to be complemented */
13747 bool warn_super = ALWAYS_WARN_SUPER;
13749 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13750 case we need to change the emitted regop to an EXACT. */
13751 const char * orig_parse = RExC_parse;
13752 const SSize_t orig_size = RExC_size;
13753 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13754 GET_RE_DEBUG_FLAGS_DECL;
13756 PERL_ARGS_ASSERT_REGCLASS;
13758 PERL_UNUSED_ARG(depth);
13761 DEBUG_PARSE("clas");
13763 /* Assume we are going to generate an ANYOF node. */
13764 ret = reganode(pRExC_state, ANYOF, 0);
13767 RExC_size += ANYOF_SKIP;
13768 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13771 ANYOF_FLAGS(ret) = 0;
13773 RExC_emit += ANYOF_SKIP;
13774 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13775 initial_listsv_len = SvCUR(listsv);
13776 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13780 RExC_parse = regpatws(pRExC_state, RExC_parse,
13781 FALSE /* means don't recognize comments */ );
13784 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13787 allow_multi_folds = FALSE;
13790 RExC_parse = regpatws(pRExC_state, RExC_parse,
13791 FALSE /* means don't recognize comments */ );
13795 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13796 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13797 const char *s = RExC_parse;
13798 const char c = *s++;
13800 while (isWORDCHAR(*s))
13802 if (*s && c == *s && s[1] == ']') {
13803 SAVEFREESV(RExC_rx_sv);
13805 "POSIX syntax [%c %c] belongs inside character classes",
13807 (void)ReREFCNT_inc(RExC_rx_sv);
13811 /* If the caller wants us to just parse a single element, accomplish this
13812 * by faking the loop ending condition */
13813 if (stop_at_1 && RExC_end > RExC_parse) {
13814 stop_ptr = RExC_parse + 1;
13817 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13818 if (UCHARAT(RExC_parse) == ']')
13819 goto charclassloop;
13822 if (RExC_parse >= stop_ptr) {
13827 RExC_parse = regpatws(pRExC_state, RExC_parse,
13828 FALSE /* means don't recognize comments */ );
13831 if (UCHARAT(RExC_parse) == ']') {
13837 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13838 save_value = value;
13839 save_prevvalue = prevvalue;
13842 rangebegin = RExC_parse;
13846 value = utf8n_to_uvchr((U8*)RExC_parse,
13847 RExC_end - RExC_parse,
13848 &numlen, UTF8_ALLOW_DEFAULT);
13849 RExC_parse += numlen;
13852 value = UCHARAT(RExC_parse++);
13855 && RExC_parse < RExC_end
13856 && POSIXCC(UCHARAT(RExC_parse)))
13858 namedclass = regpposixcc(pRExC_state, value, strict);
13860 else if (value != '\\') {
13862 literal_endpoint++;
13866 /* Is a backslash; get the code point of the char after it */
13867 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13868 value = utf8n_to_uvchr((U8*)RExC_parse,
13869 RExC_end - RExC_parse,
13870 &numlen, UTF8_ALLOW_DEFAULT);
13871 RExC_parse += numlen;
13874 value = UCHARAT(RExC_parse++);
13876 /* Some compilers cannot handle switching on 64-bit integer
13877 * values, therefore value cannot be an UV. Yes, this will
13878 * be a problem later if we want switch on Unicode.
13879 * A similar issue a little bit later when switching on
13880 * namedclass. --jhi */
13882 /* If the \ is escaping white space when white space is being
13883 * skipped, it means that that white space is wanted literally, and
13884 * is already in 'value'. Otherwise, need to translate the escape
13885 * into what it signifies. */
13886 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13888 case 'w': namedclass = ANYOF_WORDCHAR; break;
13889 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13890 case 's': namedclass = ANYOF_SPACE; break;
13891 case 'S': namedclass = ANYOF_NSPACE; break;
13892 case 'd': namedclass = ANYOF_DIGIT; break;
13893 case 'D': namedclass = ANYOF_NDIGIT; break;
13894 case 'v': namedclass = ANYOF_VERTWS; break;
13895 case 'V': namedclass = ANYOF_NVERTWS; break;
13896 case 'h': namedclass = ANYOF_HORIZWS; break;
13897 case 'H': namedclass = ANYOF_NHORIZWS; break;
13898 case 'N': /* Handle \N{NAME} in class */
13901 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13902 flagp, depth, &as_text);
13903 if (*flagp & RESTART_UTF8)
13904 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13905 if (cp_count != 1) { /* The typical case drops through */
13906 assert(cp_count != (STRLEN) -1);
13907 if (cp_count == 0) {
13909 RExC_parse++; /* Position after the "}" */
13910 vFAIL("Zero length \\N{}");
13913 ckWARNreg(RExC_parse,
13914 "Ignoring zero length \\N{} in character class");
13917 else { /* cp_count > 1 */
13918 if (! RExC_in_multi_char_class) {
13919 if (invert || range || *RExC_parse == '-') {
13922 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13925 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13930 = add_multi_match(multi_char_matches,
13934 break; /* <value> contains the first code
13935 point. Drop out of the switch to
13938 } /* End of cp_count != 1 */
13940 /* This element should not be processed further in this
13943 value = save_value;
13944 prevvalue = save_prevvalue;
13945 continue; /* Back to top of loop to get next char */
13947 /* Here, is a single code point, and <value> contains it */
13949 /* We consider named characters to be literal characters */
13950 literal_endpoint++;
13959 /* We will handle any undefined properties ourselves */
13960 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13961 /* And we actually would prefer to get
13962 * the straight inversion list of the
13963 * swash, since we will be accessing it
13964 * anyway, to save a little time */
13965 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13967 if (RExC_parse >= RExC_end)
13968 vFAIL2("Empty \\%c{}", (U8)value);
13969 if (*RExC_parse == '{') {
13970 const U8 c = (U8)value;
13971 e = strchr(RExC_parse++, '}');
13973 vFAIL2("Missing right brace on \\%c{}", c);
13974 while (isSPACE(*RExC_parse))
13976 if (e == RExC_parse)
13977 vFAIL2("Empty \\%c{}", c);
13978 n = e - RExC_parse;
13979 while (isSPACE(*(RExC_parse + n - 1)))
13990 if (UCHARAT(RExC_parse) == '^') {
13993 /* toggle. (The rhs xor gets the single bit that
13994 * differs between P and p; the other xor inverts just
13996 value ^= 'P' ^ 'p';
13998 while (isSPACE(*RExC_parse)) {
14003 /* Try to get the definition of the property into
14004 * <invlist>. If /i is in effect, the effective property
14005 * will have its name be <__NAME_i>. The design is
14006 * discussed in commit
14007 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14008 name = savepv(Perl_form(aTHX_
14010 (FOLD) ? "__" : "",
14016 /* Look up the property name, and get its swash and
14017 * inversion list, if the property is found */
14019 SvREFCNT_dec_NN(swash);
14021 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14024 NULL, /* No inversion list */
14027 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14028 HV* curpkg = (IN_PERL_COMPILETIME)
14030 : CopSTASH(PL_curcop);
14032 SvREFCNT_dec_NN(swash);
14036 /* Here didn't find it. It could be a user-defined
14037 * property that will be available at run-time. If we
14038 * accept only compile-time properties, is an error;
14039 * otherwise add it to the list for run-time look up */
14041 RExC_parse = e + 1;
14043 "Property '%"UTF8f"' is unknown",
14044 UTF8fARG(UTF, n, name));
14047 /* If the property name doesn't already have a package
14048 * name, add the current one to it so that it can be
14049 * referred to outside it. [perl #121777] */
14050 if (curpkg && ! instr(name, "::")) {
14051 char* pkgname = HvNAME(curpkg);
14052 if (strNE(pkgname, "main")) {
14053 char* full_name = Perl_form(aTHX_
14057 n = strlen(full_name);
14059 name = savepvn(full_name, n);
14062 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14063 (value == 'p' ? '+' : '!'),
14064 UTF8fARG(UTF, n, name));
14065 has_user_defined_property = TRUE;
14067 /* We don't know yet, so have to assume that the
14068 * property could match something in the Latin1 range,
14069 * hence something that isn't utf8. Note that this
14070 * would cause things in <depends_list> to match
14071 * inappropriately, except that any \p{}, including
14072 * this one forces Unicode semantics, which means there
14073 * is no <depends_list> */
14075 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14079 /* Here, did get the swash and its inversion list. If
14080 * the swash is from a user-defined property, then this
14081 * whole character class should be regarded as such */
14082 if (swash_init_flags
14083 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14085 has_user_defined_property = TRUE;
14088 /* We warn on matching an above-Unicode code point
14089 * if the match would return true, except don't
14090 * warn for \p{All}, which has exactly one element
14092 (_invlist_contains_cp(invlist, 0x110000)
14093 && (! (_invlist_len(invlist) == 1
14094 && *invlist_array(invlist) == 0)))
14100 /* Invert if asking for the complement */
14101 if (value == 'P') {
14102 _invlist_union_complement_2nd(properties,
14106 /* The swash can't be used as-is, because we've
14107 * inverted things; delay removing it to here after
14108 * have copied its invlist above */
14109 SvREFCNT_dec_NN(swash);
14113 _invlist_union(properties, invlist, &properties);
14118 RExC_parse = e + 1;
14119 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14122 /* \p means they want Unicode semantics */
14123 RExC_uni_semantics = 1;
14126 case 'n': value = '\n'; break;
14127 case 'r': value = '\r'; break;
14128 case 't': value = '\t'; break;
14129 case 'f': value = '\f'; break;
14130 case 'b': value = '\b'; break;
14131 case 'e': value = ESC_NATIVE; break;
14132 case 'a': value = '\a'; break;
14134 RExC_parse--; /* function expects to be pointed at the 'o' */
14136 const char* error_msg;
14137 bool valid = grok_bslash_o(&RExC_parse,
14140 PASS2, /* warnings only in
14143 silence_non_portable,
14149 if (PL_encoding && value < 0x100) {
14150 goto recode_encoding;
14154 RExC_parse--; /* function expects to be pointed at the 'x' */
14156 const char* error_msg;
14157 bool valid = grok_bslash_x(&RExC_parse,
14160 PASS2, /* Output warnings */
14162 silence_non_portable,
14168 if (PL_encoding && value < 0x100)
14169 goto recode_encoding;
14172 value = grok_bslash_c(*RExC_parse++, PASS2);
14174 case '0': case '1': case '2': case '3': case '4':
14175 case '5': case '6': case '7':
14177 /* Take 1-3 octal digits */
14178 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14179 numlen = (strict) ? 4 : 3;
14180 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14181 RExC_parse += numlen;
14184 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14185 vFAIL("Need exactly 3 octal digits");
14187 else if (! SIZE_ONLY /* like \08, \178 */
14189 && RExC_parse < RExC_end
14190 && isDIGIT(*RExC_parse)
14191 && ckWARN(WARN_REGEXP))
14193 SAVEFREESV(RExC_rx_sv);
14194 reg_warn_non_literal_string(
14196 form_short_octal_warning(RExC_parse, numlen));
14197 (void)ReREFCNT_inc(RExC_rx_sv);
14200 if (PL_encoding && value < 0x100)
14201 goto recode_encoding;
14205 if (! RExC_override_recoding) {
14206 SV* enc = PL_encoding;
14207 value = reg_recode((const char)(U8)value, &enc);
14210 vFAIL("Invalid escape in the specified encoding");
14213 ckWARNreg(RExC_parse,
14214 "Invalid escape in the specified encoding");
14220 /* Allow \_ to not give an error */
14221 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14223 vFAIL2("Unrecognized escape \\%c in character class",
14227 SAVEFREESV(RExC_rx_sv);
14228 ckWARN2reg(RExC_parse,
14229 "Unrecognized escape \\%c in character class passed through",
14231 (void)ReREFCNT_inc(RExC_rx_sv);
14235 } /* End of switch on char following backslash */
14236 } /* end of handling backslash escape sequences */
14238 /* Here, we have the current token in 'value' */
14240 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14243 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14244 * literal, as is the character that began the false range, i.e.
14245 * the 'a' in the examples */
14248 const int w = (RExC_parse >= rangebegin)
14249 ? RExC_parse - rangebegin
14253 "False [] range \"%"UTF8f"\"",
14254 UTF8fARG(UTF, w, rangebegin));
14257 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14258 ckWARN2reg(RExC_parse,
14259 "False [] range \"%"UTF8f"\"",
14260 UTF8fARG(UTF, w, rangebegin));
14261 (void)ReREFCNT_inc(RExC_rx_sv);
14262 cp_list = add_cp_to_invlist(cp_list, '-');
14263 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14268 range = 0; /* this was not a true range */
14269 element_count += 2; /* So counts for three values */
14272 classnum = namedclass_to_classnum(namedclass);
14274 if (LOC && namedclass < ANYOF_POSIXL_MAX
14275 #ifndef HAS_ISASCII
14276 && classnum != _CC_ASCII
14279 /* What the Posix classes (like \w, [:space:]) match in locale
14280 * isn't knowable under locale until actual match time. Room
14281 * must be reserved (one time per outer bracketed class) to
14282 * store such classes. The space will contain a bit for each
14283 * named class that is to be matched against. This isn't
14284 * needed for \p{} and pseudo-classes, as they are not affected
14285 * by locale, and hence are dealt with separately */
14286 if (! need_class) {
14289 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14292 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14294 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14295 ANYOF_POSIXL_ZERO(ret);
14298 /* Coverity thinks it is possible for this to be negative; both
14299 * jhi and khw think it's not, but be safer */
14300 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14301 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14303 /* See if it already matches the complement of this POSIX
14305 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14306 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14310 posixl_matches_all = TRUE;
14311 break; /* No need to continue. Since it matches both
14312 e.g., \w and \W, it matches everything, and the
14313 bracketed class can be optimized into qr/./s */
14316 /* Add this class to those that should be checked at runtime */
14317 ANYOF_POSIXL_SET(ret, namedclass);
14319 /* The above-Latin1 characters are not subject to locale rules.
14320 * Just add them, in the second pass, to the
14321 * unconditionally-matched list */
14323 SV* scratch_list = NULL;
14325 /* Get the list of the above-Latin1 code points this
14327 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14328 PL_XPosix_ptrs[classnum],
14330 /* Odd numbers are complements, like
14331 * NDIGIT, NASCII, ... */
14332 namedclass % 2 != 0,
14334 /* Checking if 'cp_list' is NULL first saves an extra
14335 * clone. Its reference count will be decremented at the
14336 * next union, etc, or if this is the only instance, at the
14337 * end of the routine */
14339 cp_list = scratch_list;
14342 _invlist_union(cp_list, scratch_list, &cp_list);
14343 SvREFCNT_dec_NN(scratch_list);
14345 continue; /* Go get next character */
14348 else if (! SIZE_ONLY) {
14350 /* Here, not in pass1 (in that pass we skip calculating the
14351 * contents of this class), and is /l, or is a POSIX class for
14352 * which /l doesn't matter (or is a Unicode property, which is
14353 * skipped here). */
14354 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14355 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14357 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14358 * nor /l make a difference in what these match,
14359 * therefore we just add what they match to cp_list. */
14360 if (classnum != _CC_VERTSPACE) {
14361 assert( namedclass == ANYOF_HORIZWS
14362 || namedclass == ANYOF_NHORIZWS);
14364 /* It turns out that \h is just a synonym for
14366 classnum = _CC_BLANK;
14369 _invlist_union_maybe_complement_2nd(
14371 PL_XPosix_ptrs[classnum],
14372 namedclass % 2 != 0, /* Complement if odd
14373 (NHORIZWS, NVERTWS)
14378 else { /* Garden variety class. If is NASCII, NDIGIT, ...
14379 complement and use nposixes */
14380 SV** posixes_ptr = namedclass % 2 == 0
14383 SV** source_ptr = &PL_XPosix_ptrs[classnum];
14384 _invlist_union_maybe_complement_2nd(
14387 namedclass % 2 != 0,
14391 } /* end of namedclass \blah */
14394 RExC_parse = regpatws(pRExC_state, RExC_parse,
14395 FALSE /* means don't recognize comments */ );
14398 /* If 'range' is set, 'value' is the ending of a range--check its
14399 * validity. (If value isn't a single code point in the case of a
14400 * range, we should have figured that out above in the code that
14401 * catches false ranges). Later, we will handle each individual code
14402 * point in the range. If 'range' isn't set, this could be the
14403 * beginning of a range, so check for that by looking ahead to see if
14404 * the next real character to be processed is the range indicator--the
14408 if (prevvalue > value) /* b-a */ {
14409 const int w = RExC_parse - rangebegin;
14411 "Invalid [] range \"%"UTF8f"\"",
14412 UTF8fARG(UTF, w, rangebegin));
14413 range = 0; /* not a valid range */
14417 prevvalue = value; /* save the beginning of the potential range */
14418 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14419 && *RExC_parse == '-')
14421 char* next_char_ptr = RExC_parse + 1;
14422 if (skip_white) { /* Get the next real char after the '-' */
14423 next_char_ptr = regpatws(pRExC_state,
14425 FALSE); /* means don't recognize
14429 /* If the '-' is at the end of the class (just before the ']',
14430 * it is a literal minus; otherwise it is a range */
14431 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14432 RExC_parse = next_char_ptr;
14434 /* a bad range like \w-, [:word:]- ? */
14435 if (namedclass > OOB_NAMEDCLASS) {
14436 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14437 const int w = RExC_parse >= rangebegin
14438 ? RExC_parse - rangebegin
14441 vFAIL4("False [] range \"%*.*s\"",
14446 "False [] range \"%*.*s\"",
14451 cp_list = add_cp_to_invlist(cp_list, '-');
14455 range = 1; /* yeah, it's a range! */
14456 continue; /* but do it the next time */
14461 if (namedclass > OOB_NAMEDCLASS) {
14465 /* Here, we have a single value this time through the loop, and
14466 * <prevvalue> is the beginning of the range, if any; or <value> if
14469 /* non-Latin1 code point implies unicode semantics. Must be set in
14470 * pass1 so is there for the whole of pass 2 */
14472 RExC_uni_semantics = 1;
14475 /* Ready to process either the single value, or the completed range.
14476 * For single-valued non-inverted ranges, we consider the possibility
14477 * of multi-char folds. (We made a conscious decision to not do this
14478 * for the other cases because it can often lead to non-intuitive
14479 * results. For example, you have the peculiar case that:
14480 * "s s" =~ /^[^\xDF]+$/i => Y
14481 * "ss" =~ /^[^\xDF]+$/i => N
14483 * See [perl #89750] */
14484 if (FOLD && allow_multi_folds && value == prevvalue) {
14485 if (value == LATIN_SMALL_LETTER_SHARP_S
14486 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14489 /* Here <value> is indeed a multi-char fold. Get what it is */
14491 U8 foldbuf[UTF8_MAXBYTES_CASE];
14494 UV folded = _to_uni_fold_flags(
14498 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14499 ? FOLD_FLAGS_NOMIX_ASCII
14503 /* Here, <folded> should be the first character of the
14504 * multi-char fold of <value>, with <foldbuf> containing the
14505 * whole thing. But, if this fold is not allowed (because of
14506 * the flags), <fold> will be the same as <value>, and should
14507 * be processed like any other character, so skip the special
14509 if (folded != value) {
14511 /* Skip if we are recursed, currently parsing the class
14512 * again. Otherwise add this character to the list of
14513 * multi-char folds. */
14514 if (! RExC_in_multi_char_class) {
14515 STRLEN cp_count = utf8_length(foldbuf,
14516 foldbuf + foldlen);
14517 SV* multi_fold = sv_2mortal(newSVpvs(""));
14519 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14522 = add_multi_match(multi_char_matches,
14528 /* This element should not be processed further in this
14531 value = save_value;
14532 prevvalue = save_prevvalue;
14538 /* Deal with this element of the class */
14541 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14544 SV* this_range = _new_invlist(1);
14545 _append_range_to_invlist(this_range, prevvalue, value);
14547 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14548 * If this range was specified using something like 'i-j', we want
14549 * to include only the 'i' and the 'j', and not anything in
14550 * between, so exclude non-ASCII, non-alphabetics from it.
14551 * However, if the range was specified with something like
14552 * [\x89-\x91] or [\x89-j], all code points within it should be
14553 * included. literal_endpoint==2 means both ends of the range used
14554 * a literal character, not \x{foo} */
14555 if (literal_endpoint == 2
14556 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14557 || (isUPPER_A(prevvalue) && isUPPER_A(value))))
14559 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14562 /* Since 'this_range' now only contains ascii, the intersection
14563 * of it with anything will still yield only ascii */
14564 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14567 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14568 literal_endpoint = 0;
14569 SvREFCNT_dec_NN(this_range);
14573 range = 0; /* this range (if it was one) is done now */
14574 } /* End of loop through all the text within the brackets */
14576 /* If anything in the class expands to more than one character, we have to
14577 * deal with them by building up a substitute parse string, and recursively
14578 * calling reg() on it, instead of proceeding */
14579 if (multi_char_matches) {
14580 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14583 char *save_end = RExC_end;
14584 char *save_parse = RExC_parse;
14585 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14590 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14591 because too confusing */
14593 sv_catpv(substitute_parse, "(?:");
14597 /* Look at the longest folds first */
14598 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14600 if (av_exists(multi_char_matches, cp_count)) {
14601 AV** this_array_ptr;
14604 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14606 while ((this_sequence = av_pop(*this_array_ptr)) !=
14609 if (! first_time) {
14610 sv_catpv(substitute_parse, "|");
14612 first_time = FALSE;
14614 sv_catpv(substitute_parse, SvPVX(this_sequence));
14619 /* If the character class contains anything else besides these
14620 * multi-character folds, have to include it in recursive parsing */
14621 if (element_count) {
14622 sv_catpv(substitute_parse, "|[");
14623 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14624 sv_catpv(substitute_parse, "]");
14627 sv_catpv(substitute_parse, ")");
14630 /* This is a way to get the parse to skip forward a whole named
14631 * sequence instead of matching the 2nd character when it fails the
14633 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14637 RExC_parse = SvPV(substitute_parse, len);
14638 RExC_end = RExC_parse + len;
14639 RExC_in_multi_char_class = 1;
14640 RExC_override_recoding = 1;
14641 RExC_emit = (regnode *)orig_emit;
14643 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14645 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14647 RExC_parse = save_parse;
14648 RExC_end = save_end;
14649 RExC_in_multi_char_class = 0;
14650 RExC_override_recoding = 0;
14651 SvREFCNT_dec_NN(multi_char_matches);
14655 /* Here, we've gone through the entire class and dealt with multi-char
14656 * folds. We are now in a position that we can do some checks to see if we
14657 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14658 * Currently we only do two checks:
14659 * 1) is in the unlikely event that the user has specified both, eg. \w and
14660 * \W under /l, then the class matches everything. (This optimization
14661 * is done only to make the optimizer code run later work.)
14662 * 2) if the character class contains only a single element (including a
14663 * single range), we see if there is an equivalent node for it.
14664 * Other checks are possible */
14665 if (! ret_invlist /* Can't optimize if returning the constructed
14667 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14672 if (UNLIKELY(posixl_matches_all)) {
14675 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14676 \w or [:digit:] or \p{foo}
14679 /* All named classes are mapped into POSIXish nodes, with its FLAG
14680 * argument giving which class it is */
14681 switch ((I32)namedclass) {
14682 case ANYOF_UNIPROP:
14685 /* These don't depend on the charset modifiers. They always
14686 * match under /u rules */
14687 case ANYOF_NHORIZWS:
14688 case ANYOF_HORIZWS:
14689 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14692 case ANYOF_NVERTWS:
14697 /* The actual POSIXish node for all the rest depends on the
14698 * charset modifier. The ones in the first set depend only on
14699 * ASCII or, if available on this platform, locale */
14703 op = (LOC) ? POSIXL : POSIXA;
14714 /* under /a could be alpha */
14716 if (ASCII_RESTRICTED) {
14717 namedclass = ANYOF_ALPHA + (namedclass % 2);
14725 /* The rest have more possibilities depending on the charset.
14726 * We take advantage of the enum ordering of the charset
14727 * modifiers to get the exact node type, */
14729 op = POSIXD + get_regex_charset(RExC_flags);
14730 if (op > POSIXA) { /* /aa is same as /a */
14735 /* The odd numbered ones are the complements of the
14736 * next-lower even number one */
14737 if (namedclass % 2 == 1) {
14741 arg = namedclass_to_classnum(namedclass);
14745 else if (value == prevvalue) {
14747 /* Here, the class consists of just a single code point */
14750 if (! LOC && value == '\n') {
14751 op = REG_ANY; /* Optimize [^\n] */
14752 *flagp |= HASWIDTH|SIMPLE;
14756 else if (value < 256 || UTF) {
14758 /* Optimize a single value into an EXACTish node, but not if it
14759 * would require converting the pattern to UTF-8. */
14760 op = compute_EXACTish(pRExC_state);
14762 } /* Otherwise is a range */
14763 else if (! LOC) { /* locale could vary these */
14764 if (prevvalue == '0') {
14765 if (value == '9') {
14770 else if (prevvalue == 'A') {
14773 && literal_endpoint == 2
14776 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14780 else if (prevvalue == 'a') {
14783 && literal_endpoint == 2
14786 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14792 /* Here, we have changed <op> away from its initial value iff we found
14793 * an optimization */
14796 /* Throw away this ANYOF regnode, and emit the calculated one,
14797 * which should correspond to the beginning, not current, state of
14799 const char * cur_parse = RExC_parse;
14800 RExC_parse = (char *)orig_parse;
14804 /* To get locale nodes to not use the full ANYOF size would
14805 * require moving the code above that writes the portions
14806 * of it that aren't in other nodes to after this point.
14807 * e.g. ANYOF_POSIXL_SET */
14808 RExC_size = orig_size;
14812 RExC_emit = (regnode *)orig_emit;
14813 if (PL_regkind[op] == POSIXD) {
14814 if (op == POSIXL) {
14815 RExC_contains_locale = 1;
14818 op += NPOSIXD - POSIXD;
14823 ret = reg_node(pRExC_state, op);
14825 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14829 *flagp |= HASWIDTH|SIMPLE;
14831 else if (PL_regkind[op] == EXACT) {
14832 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14833 TRUE /* downgradable to EXACT */
14837 RExC_parse = (char *) cur_parse;
14839 SvREFCNT_dec(posixes);
14840 SvREFCNT_dec(nposixes);
14841 SvREFCNT_dec(cp_list);
14842 SvREFCNT_dec(cp_foldable_list);
14849 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14851 /* If folding, we calculate all characters that could fold to or from the
14852 * ones already on the list */
14853 if (cp_foldable_list) {
14855 UV start, end; /* End points of code point ranges */
14857 SV* fold_intersection = NULL;
14860 /* Our calculated list will be for Unicode rules. For locale
14861 * matching, we have to keep a separate list that is consulted at
14862 * runtime only when the locale indicates Unicode rules. For
14863 * non-locale, we just use to the general list */
14865 use_list = &only_utf8_locale_list;
14868 use_list = &cp_list;
14871 /* Only the characters in this class that participate in folds need
14872 * be checked. Get the intersection of this class and all the
14873 * possible characters that are foldable. This can quickly narrow
14874 * down a large class */
14875 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14876 &fold_intersection);
14878 /* The folds for all the Latin1 characters are hard-coded into this
14879 * program, but we have to go out to disk to get the others. */
14880 if (invlist_highest(cp_foldable_list) >= 256) {
14882 /* This is a hash that for a particular fold gives all
14883 * characters that are involved in it */
14884 if (! PL_utf8_foldclosures) {
14885 _load_PL_utf8_foldclosures();
14889 /* Now look at the foldable characters in this class individually */
14890 invlist_iterinit(fold_intersection);
14891 while (invlist_iternext(fold_intersection, &start, &end)) {
14894 /* Look at every character in the range */
14895 for (j = start; j <= end; j++) {
14896 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14902 if (IS_IN_SOME_FOLD_L1(j)) {
14904 /* ASCII is always matched; non-ASCII is matched
14905 * only under Unicode rules (which could happen
14906 * under /l if the locale is a UTF-8 one */
14907 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14908 *use_list = add_cp_to_invlist(*use_list,
14909 PL_fold_latin1[j]);
14913 add_cp_to_invlist(depends_list,
14914 PL_fold_latin1[j]);
14918 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14919 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14921 add_above_Latin1_folds(pRExC_state,
14928 /* Here is an above Latin1 character. We don't have the
14929 * rules hard-coded for it. First, get its fold. This is
14930 * the simple fold, as the multi-character folds have been
14931 * handled earlier and separated out */
14932 _to_uni_fold_flags(j, foldbuf, &foldlen,
14933 (ASCII_FOLD_RESTRICTED)
14934 ? FOLD_FLAGS_NOMIX_ASCII
14937 /* Single character fold of above Latin1. Add everything in
14938 * its fold closure to the list that this node should match.
14939 * The fold closures data structure is a hash with the keys
14940 * being the UTF-8 of every character that is folded to, like
14941 * 'k', and the values each an array of all code points that
14942 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14943 * Multi-character folds are not included */
14944 if ((listp = hv_fetch(PL_utf8_foldclosures,
14945 (char *) foldbuf, foldlen, FALSE)))
14947 AV* list = (AV*) *listp;
14949 for (k = 0; k <= av_tindex(list); k++) {
14950 SV** c_p = av_fetch(list, k, FALSE);
14956 /* /aa doesn't allow folds between ASCII and non- */
14957 if ((ASCII_FOLD_RESTRICTED
14958 && (isASCII(c) != isASCII(j))))
14963 /* Folds under /l which cross the 255/256 boundary
14964 * are added to a separate list. (These are valid
14965 * only when the locale is UTF-8.) */
14966 if (c < 256 && LOC) {
14967 *use_list = add_cp_to_invlist(*use_list, c);
14971 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14973 cp_list = add_cp_to_invlist(cp_list, c);
14976 /* Similarly folds involving non-ascii Latin1
14977 * characters under /d are added to their list */
14978 depends_list = add_cp_to_invlist(depends_list,
14985 SvREFCNT_dec_NN(fold_intersection);
14988 /* Now that we have finished adding all the folds, there is no reason
14989 * to keep the foldable list separate */
14990 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14991 SvREFCNT_dec_NN(cp_foldable_list);
14994 /* And combine the result (if any) with any inversion list from posix
14995 * classes. The lists are kept separate up to now because we don't want to
14996 * fold the classes (folding of those is automatically handled by the swash
14997 * fetching code) */
14998 if (posixes || nposixes) {
14999 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15000 /* Under /a and /aa, nothing above ASCII matches these */
15001 _invlist_intersection(posixes,
15002 PL_XPosix_ptrs[_CC_ASCII],
15006 if (DEPENDS_SEMANTICS) {
15007 /* Under /d, everything in the upper half of the Latin1 range
15008 * matches these complements */
15009 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15011 else if (AT_LEAST_ASCII_RESTRICTED) {
15012 /* Under /a and /aa, everything above ASCII matches these
15014 _invlist_union_complement_2nd(nposixes,
15015 PL_XPosix_ptrs[_CC_ASCII],
15019 _invlist_union(posixes, nposixes, &posixes);
15020 SvREFCNT_dec_NN(nposixes);
15023 posixes = nposixes;
15026 if (! DEPENDS_SEMANTICS) {
15028 _invlist_union(cp_list, posixes, &cp_list);
15029 SvREFCNT_dec_NN(posixes);
15036 /* Under /d, we put into a separate list the Latin1 things that
15037 * match only when the target string is utf8 */
15038 SV* nonascii_but_latin1_properties = NULL;
15039 _invlist_intersection(posixes, PL_UpperLatin1,
15040 &nonascii_but_latin1_properties);
15041 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15044 _invlist_union(cp_list, posixes, &cp_list);
15045 SvREFCNT_dec_NN(posixes);
15051 if (depends_list) {
15052 _invlist_union(depends_list, nonascii_but_latin1_properties,
15054 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15057 depends_list = nonascii_but_latin1_properties;
15062 /* And combine the result (if any) with any inversion list from properties.
15063 * The lists are kept separate up to now so that we can distinguish the two
15064 * in regards to matching above-Unicode. A run-time warning is generated
15065 * if a Unicode property is matched against a non-Unicode code point. But,
15066 * we allow user-defined properties to match anything, without any warning,
15067 * and we also suppress the warning if there is a portion of the character
15068 * class that isn't a Unicode property, and which matches above Unicode, \W
15069 * or [\x{110000}] for example.
15070 * (Note that in this case, unlike the Posix one above, there is no
15071 * <depends_list>, because having a Unicode property forces Unicode
15076 /* If it matters to the final outcome, see if a non-property
15077 * component of the class matches above Unicode. If so, the
15078 * warning gets suppressed. This is true even if just a single
15079 * such code point is specified, as though not strictly correct if
15080 * another such code point is matched against, the fact that they
15081 * are using above-Unicode code points indicates they should know
15082 * the issues involved */
15084 warn_super = ! (invert
15085 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15088 _invlist_union(properties, cp_list, &cp_list);
15089 SvREFCNT_dec_NN(properties);
15092 cp_list = properties;
15096 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15100 /* Here, we have calculated what code points should be in the character
15103 * Now we can see about various optimizations. Fold calculation (which we
15104 * did above) needs to take place before inversion. Otherwise /[^k]/i
15105 * would invert to include K, which under /i would match k, which it
15106 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15107 * folded until runtime */
15109 /* If we didn't do folding, it's because some information isn't available
15110 * until runtime; set the run-time fold flag for these. (We don't have to
15111 * worry about properties folding, as that is taken care of by the swash
15112 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15113 * locales, or the class matches at least one 0-255 range code point */
15115 if (only_utf8_locale_list) {
15116 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15118 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15121 invlist_iterinit(cp_list);
15122 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15123 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15125 invlist_iterfinish(cp_list);
15129 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15130 * at compile time. Besides not inverting folded locale now, we can't
15131 * invert if there are things such as \w, which aren't known until runtime
15135 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15137 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15139 _invlist_invert(cp_list);
15141 /* Any swash can't be used as-is, because we've inverted things */
15143 SvREFCNT_dec_NN(swash);
15147 /* Clear the invert flag since have just done it here */
15152 *ret_invlist = cp_list;
15153 SvREFCNT_dec(swash);
15155 /* Discard the generated node */
15157 RExC_size = orig_size;
15160 RExC_emit = orig_emit;
15165 /* Some character classes are equivalent to other nodes. Such nodes take
15166 * up less room and generally fewer operations to execute than ANYOF nodes.
15167 * Above, we checked for and optimized into some such equivalents for
15168 * certain common classes that are easy to test. Getting to this point in
15169 * the code means that the class didn't get optimized there. Since this
15170 * code is only executed in Pass 2, it is too late to save space--it has
15171 * been allocated in Pass 1, and currently isn't given back. But turning
15172 * things into an EXACTish node can allow the optimizer to join it to any
15173 * adjacent such nodes. And if the class is equivalent to things like /./,
15174 * expensive run-time swashes can be avoided. Now that we have more
15175 * complete information, we can find things necessarily missed by the
15176 * earlier code. I (khw) am not sure how much to look for here. It would
15177 * be easy, but perhaps too slow, to check any candidates against all the
15178 * node types they could possibly match using _invlistEQ(). */
15183 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15184 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15186 /* We don't optimize if we are supposed to make sure all non-Unicode
15187 * code points raise a warning, as only ANYOF nodes have this check.
15189 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15192 U8 op = END; /* The optimzation node-type */
15193 const char * cur_parse= RExC_parse;
15195 invlist_iterinit(cp_list);
15196 if (! invlist_iternext(cp_list, &start, &end)) {
15198 /* Here, the list is empty. This happens, for example, when a
15199 * Unicode property is the only thing in the character class, and
15200 * it doesn't match anything. (perluniprops.pod notes such
15203 *flagp |= HASWIDTH|SIMPLE;
15205 else if (start == end) { /* The range is a single code point */
15206 if (! invlist_iternext(cp_list, &start, &end)
15208 /* Don't do this optimization if it would require changing
15209 * the pattern to UTF-8 */
15210 && (start < 256 || UTF))
15212 /* Here, the list contains a single code point. Can optimize
15213 * into an EXACTish node */
15222 /* A locale node under folding with one code point can be
15223 * an EXACTFL, as its fold won't be calculated until
15229 /* Here, we are generally folding, but there is only one
15230 * code point to match. If we have to, we use an EXACT
15231 * node, but it would be better for joining with adjacent
15232 * nodes in the optimization pass if we used the same
15233 * EXACTFish node that any such are likely to be. We can
15234 * do this iff the code point doesn't participate in any
15235 * folds. For example, an EXACTF of a colon is the same as
15236 * an EXACT one, since nothing folds to or from a colon. */
15238 if (IS_IN_SOME_FOLD_L1(value)) {
15243 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15248 /* If we haven't found the node type, above, it means we
15249 * can use the prevailing one */
15251 op = compute_EXACTish(pRExC_state);
15256 else if (start == 0) {
15257 if (end == UV_MAX) {
15259 *flagp |= HASWIDTH|SIMPLE;
15262 else if (end == '\n' - 1
15263 && invlist_iternext(cp_list, &start, &end)
15264 && start == '\n' + 1 && end == UV_MAX)
15267 *flagp |= HASWIDTH|SIMPLE;
15271 invlist_iterfinish(cp_list);
15274 RExC_parse = (char *)orig_parse;
15275 RExC_emit = (regnode *)orig_emit;
15277 ret = reg_node(pRExC_state, op);
15279 RExC_parse = (char *)cur_parse;
15281 if (PL_regkind[op] == EXACT) {
15282 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15283 TRUE /* downgradable to EXACT */
15287 SvREFCNT_dec_NN(cp_list);
15292 /* Here, <cp_list> contains all the code points we can determine at
15293 * compile time that match under all conditions. Go through it, and
15294 * for things that belong in the bitmap, put them there, and delete from
15295 * <cp_list>. While we are at it, see if everything above 255 is in the
15296 * list, and if so, set a flag to speed up execution */
15298 populate_ANYOF_from_invlist(ret, &cp_list);
15301 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15304 /* Here, the bitmap has been populated with all the Latin1 code points that
15305 * always match. Can now add to the overall list those that match only
15306 * when the target string is UTF-8 (<depends_list>). */
15307 if (depends_list) {
15309 _invlist_union(cp_list, depends_list, &cp_list);
15310 SvREFCNT_dec_NN(depends_list);
15313 cp_list = depends_list;
15315 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15318 /* If there is a swash and more than one element, we can't use the swash in
15319 * the optimization below. */
15320 if (swash && element_count > 1) {
15321 SvREFCNT_dec_NN(swash);
15325 /* Note that the optimization of using 'swash' if it is the only thing in
15326 * the class doesn't have us change swash at all, so it can include things
15327 * that are also in the bitmap; otherwise we have purposely deleted that
15328 * duplicate information */
15329 set_ANYOF_arg(pRExC_state, ret, cp_list,
15330 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15332 only_utf8_locale_list,
15333 swash, has_user_defined_property);
15335 *flagp |= HASWIDTH|SIMPLE;
15337 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15338 RExC_contains_locale = 1;
15344 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15347 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15348 regnode* const node,
15350 SV* const runtime_defns,
15351 SV* const only_utf8_locale_list,
15353 const bool has_user_defined_property)
15355 /* Sets the arg field of an ANYOF-type node 'node', using information about
15356 * the node passed-in. If there is nothing outside the node's bitmap, the
15357 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15358 * the count returned by add_data(), having allocated and stored an array,
15359 * av, that that count references, as follows:
15360 * av[0] stores the character class description in its textual form.
15361 * This is used later (regexec.c:Perl_regclass_swash()) to
15362 * initialize the appropriate swash, and is also useful for dumping
15363 * the regnode. This is set to &PL_sv_undef if the textual
15364 * description is not needed at run-time (as happens if the other
15365 * elements completely define the class)
15366 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15367 * computed from av[0]. But if no further computation need be done,
15368 * the swash is stored here now (and av[0] is &PL_sv_undef).
15369 * av[2] stores the inversion list of code points that match only if the
15370 * current locale is UTF-8
15371 * av[3] stores the cp_list inversion list for use in addition or instead
15372 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15373 * (Otherwise everything needed is already in av[0] and av[1])
15374 * av[4] is set if any component of the class is from a user-defined
15375 * property; used only if av[3] exists */
15379 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15381 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15382 assert(! (ANYOF_FLAGS(node)
15383 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15384 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15385 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15388 AV * const av = newAV();
15391 assert(ANYOF_FLAGS(node)
15392 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15393 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15395 av_store(av, 0, (runtime_defns)
15396 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15399 av_store(av, 1, swash);
15400 SvREFCNT_dec_NN(cp_list);
15403 av_store(av, 1, &PL_sv_undef);
15405 av_store(av, 3, cp_list);
15406 av_store(av, 4, newSVuv(has_user_defined_property));
15410 if (only_utf8_locale_list) {
15411 av_store(av, 2, only_utf8_locale_list);
15414 av_store(av, 2, &PL_sv_undef);
15417 rv = newRV_noinc(MUTABLE_SV(av));
15418 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15419 RExC_rxi->data->data[n] = (void*)rv;
15424 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15426 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15427 const regnode* node,
15430 SV** only_utf8_locale_ptr,
15434 /* For internal core use only.
15435 * Returns the swash for the input 'node' in the regex 'prog'.
15436 * If <doinit> is 'true', will attempt to create the swash if not already
15438 * If <listsvp> is non-null, will return the printable contents of the
15439 * swash. This can be used to get debugging information even before the
15440 * swash exists, by calling this function with 'doinit' set to false, in
15441 * which case the components that will be used to eventually create the
15442 * swash are returned (in a printable form).
15443 * If <exclude_list> is not NULL, it is an inversion list of things to
15444 * exclude from what's returned in <listsvp>.
15445 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15446 * that, in spite of this function's name, the swash it returns may include
15447 * the bitmap data as well */
15450 SV *si = NULL; /* Input swash initialization string */
15451 SV* invlist = NULL;
15453 RXi_GET_DECL(prog,progi);
15454 const struct reg_data * const data = prog ? progi->data : NULL;
15456 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15458 assert(ANYOF_FLAGS(node)
15459 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15460 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15462 if (data && data->count) {
15463 const U32 n = ARG(node);
15465 if (data->what[n] == 's') {
15466 SV * const rv = MUTABLE_SV(data->data[n]);
15467 AV * const av = MUTABLE_AV(SvRV(rv));
15468 SV **const ary = AvARRAY(av);
15469 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15471 si = *ary; /* ary[0] = the string to initialize the swash with */
15473 /* Elements 3 and 4 are either both present or both absent. [3] is
15474 * any inversion list generated at compile time; [4] indicates if
15475 * that inversion list has any user-defined properties in it. */
15476 if (av_tindex(av) >= 2) {
15477 if (only_utf8_locale_ptr
15479 && ary[2] != &PL_sv_undef)
15481 *only_utf8_locale_ptr = ary[2];
15484 assert(only_utf8_locale_ptr);
15485 *only_utf8_locale_ptr = NULL;
15488 if (av_tindex(av) >= 3) {
15490 if (SvUV(ary[4])) {
15491 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15499 /* Element [1] is reserved for the set-up swash. If already there,
15500 * return it; if not, create it and store it there */
15501 if (ary[1] && SvROK(ary[1])) {
15504 else if (doinit && ((si && si != &PL_sv_undef)
15505 || (invlist && invlist != &PL_sv_undef))) {
15507 sw = _core_swash_init("utf8", /* the utf8 package */
15511 0, /* not from tr/// */
15513 &swash_init_flags);
15514 (void)av_store(av, 1, sw);
15519 /* If requested, return a printable version of what this swash matches */
15521 SV* matches_string = newSVpvs("");
15523 /* The swash should be used, if possible, to get the data, as it
15524 * contains the resolved data. But this function can be called at
15525 * compile-time, before everything gets resolved, in which case we
15526 * return the currently best available information, which is the string
15527 * that will eventually be used to do that resolving, 'si' */
15528 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15529 && (si && si != &PL_sv_undef))
15531 sv_catsv(matches_string, si);
15534 /* Add the inversion list to whatever we have. This may have come from
15535 * the swash, or from an input parameter */
15537 if (exclude_list) {
15538 SV* clone = invlist_clone(invlist);
15539 _invlist_subtract(clone, exclude_list, &clone);
15540 sv_catsv(matches_string, _invlist_contents(clone));
15541 SvREFCNT_dec_NN(clone);
15544 sv_catsv(matches_string, _invlist_contents(invlist));
15547 *listsvp = matches_string;
15552 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15554 /* reg_skipcomment()
15556 Absorbs an /x style # comment from the input stream,
15557 returning a pointer to the first character beyond the comment, or if the
15558 comment terminates the pattern without anything following it, this returns
15559 one past the final character of the pattern (in other words, RExC_end) and
15560 sets the REG_RUN_ON_COMMENT_SEEN flag.
15562 Note it's the callers responsibility to ensure that we are
15563 actually in /x mode
15567 PERL_STATIC_INLINE char*
15568 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15570 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15574 while (p < RExC_end) {
15575 if (*(++p) == '\n') {
15580 /* we ran off the end of the pattern without ending the comment, so we have
15581 * to add an \n when wrapping */
15582 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15588 Advances the parse position, and optionally absorbs
15589 "whitespace" from the inputstream.
15591 Without /x "whitespace" means (?#...) style comments only,
15592 with /x this means (?#...) and # comments and whitespace proper.
15594 Returns the RExC_parse point from BEFORE the scan occurs.
15596 This is the /x friendly way of saying RExC_parse++.
15600 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15602 char* const retval = RExC_parse++;
15604 PERL_ARGS_ASSERT_NEXTCHAR;
15607 if (RExC_end - RExC_parse >= 3
15608 && *RExC_parse == '('
15609 && RExC_parse[1] == '?'
15610 && RExC_parse[2] == '#')
15612 while (*RExC_parse != ')') {
15613 if (RExC_parse == RExC_end)
15614 FAIL("Sequence (?#... not terminated");
15620 if (RExC_flags & RXf_PMf_EXTENDED) {
15621 char * p = regpatws(pRExC_state, RExC_parse,
15622 TRUE); /* means recognize comments */
15623 if (p != RExC_parse) {
15633 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15635 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15636 * space. In pass1, it aligns and increments RExC_size; in pass2,
15639 regnode * const ret = RExC_emit;
15640 GET_RE_DEBUG_FLAGS_DECL;
15642 PERL_ARGS_ASSERT_REGNODE_GUTS;
15644 assert(extra_size >= regarglen[op]);
15647 SIZE_ALIGN(RExC_size);
15648 RExC_size += 1 + extra_size;
15651 if (RExC_emit >= RExC_emit_bound)
15652 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15653 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15655 NODE_ALIGN_FILL(ret);
15656 #ifndef RE_TRACK_PATTERN_OFFSETS
15657 PERL_UNUSED_ARG(name);
15659 if (RExC_offsets) { /* MJD */
15661 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15664 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15665 ? "Overwriting end of array!\n" : "OK",
15666 (UV)(RExC_emit - RExC_emit_start),
15667 (UV)(RExC_parse - RExC_start),
15668 (UV)RExC_offsets[0]));
15669 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15676 - reg_node - emit a node
15678 STATIC regnode * /* Location. */
15679 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15681 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15683 PERL_ARGS_ASSERT_REG_NODE;
15685 assert(regarglen[op] == 0);
15688 regnode *ptr = ret;
15689 FILL_ADVANCE_NODE(ptr, op);
15696 - reganode - emit a node with an argument
15698 STATIC regnode * /* Location. */
15699 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15701 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15703 PERL_ARGS_ASSERT_REGANODE;
15705 assert(regarglen[op] == 1);
15708 regnode *ptr = ret;
15709 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15716 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15718 /* emit a node with U32 and I32 arguments */
15720 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15722 PERL_ARGS_ASSERT_REG2LANODE;
15724 assert(regarglen[op] == 2);
15727 regnode *ptr = ret;
15728 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15735 - reguni - emit (if appropriate) a Unicode character
15737 PERL_STATIC_INLINE STRLEN
15738 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15740 PERL_ARGS_ASSERT_REGUNI;
15742 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15746 - reginsert - insert an operator in front of already-emitted operand
15748 * Means relocating the operand.
15751 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15756 const int offset = regarglen[(U8)op];
15757 const int size = NODE_STEP_REGNODE + offset;
15758 GET_RE_DEBUG_FLAGS_DECL;
15760 PERL_ARGS_ASSERT_REGINSERT;
15761 PERL_UNUSED_CONTEXT;
15762 PERL_UNUSED_ARG(depth);
15763 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15764 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15773 if (RExC_open_parens) {
15775 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15776 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15777 if ( RExC_open_parens[paren] >= opnd ) {
15778 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15779 RExC_open_parens[paren] += size;
15781 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15783 if ( RExC_close_parens[paren] >= opnd ) {
15784 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15785 RExC_close_parens[paren] += size;
15787 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15792 while (src > opnd) {
15793 StructCopy(--src, --dst, regnode);
15794 #ifdef RE_TRACK_PATTERN_OFFSETS
15795 if (RExC_offsets) { /* MJD 20010112 */
15797 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15801 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15802 ? "Overwriting end of array!\n" : "OK",
15803 (UV)(src - RExC_emit_start),
15804 (UV)(dst - RExC_emit_start),
15805 (UV)RExC_offsets[0]));
15806 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15807 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15813 place = opnd; /* Op node, where operand used to be. */
15814 #ifdef RE_TRACK_PATTERN_OFFSETS
15815 if (RExC_offsets) { /* MJD */
15817 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15821 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15822 ? "Overwriting end of array!\n" : "OK",
15823 (UV)(place - RExC_emit_start),
15824 (UV)(RExC_parse - RExC_start),
15825 (UV)RExC_offsets[0]));
15826 Set_Node_Offset(place, RExC_parse);
15827 Set_Node_Length(place, 1);
15830 src = NEXTOPER(place);
15831 FILL_ADVANCE_NODE(place, op);
15832 Zero(src, offset, regnode);
15836 - regtail - set the next-pointer at the end of a node chain of p to val.
15837 - SEE ALSO: regtail_study
15839 /* TODO: All three parms should be const */
15841 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15842 const regnode *val,U32 depth)
15845 GET_RE_DEBUG_FLAGS_DECL;
15847 PERL_ARGS_ASSERT_REGTAIL;
15849 PERL_UNUSED_ARG(depth);
15855 /* Find last node. */
15858 regnode * const temp = regnext(scan);
15860 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15861 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15862 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15863 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15864 (temp == NULL ? "->" : ""),
15865 (temp == NULL ? PL_reg_name[OP(val)] : "")
15873 if (reg_off_by_arg[OP(scan)]) {
15874 ARG_SET(scan, val - scan);
15877 NEXT_OFF(scan) = val - scan;
15883 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15884 - Look for optimizable sequences at the same time.
15885 - currently only looks for EXACT chains.
15887 This is experimental code. The idea is to use this routine to perform
15888 in place optimizations on branches and groups as they are constructed,
15889 with the long term intention of removing optimization from study_chunk so
15890 that it is purely analytical.
15892 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15893 to control which is which.
15896 /* TODO: All four parms should be const */
15899 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15900 const regnode *val,U32 depth)
15904 #ifdef EXPERIMENTAL_INPLACESCAN
15907 GET_RE_DEBUG_FLAGS_DECL;
15909 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15915 /* Find last node. */
15919 regnode * const temp = regnext(scan);
15920 #ifdef EXPERIMENTAL_INPLACESCAN
15921 if (PL_regkind[OP(scan)] == EXACT) {
15922 bool unfolded_multi_char; /* Unexamined in this routine */
15923 if (join_exact(pRExC_state, scan, &min,
15924 &unfolded_multi_char, 1, val, depth+1))
15929 switch (OP(scan)) {
15932 case EXACTFA_NO_TRIE:
15937 if( exact == PSEUDO )
15939 else if ( exact != OP(scan) )
15948 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15949 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15950 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15951 SvPV_nolen_const(RExC_mysv),
15952 REG_NODE_NUM(scan),
15953 PL_reg_name[exact]);
15960 DEBUG_PARSE_MSG("");
15961 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
15962 PerlIO_printf(Perl_debug_log,
15963 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15964 SvPV_nolen_const(RExC_mysv),
15965 (IV)REG_NODE_NUM(val),
15969 if (reg_off_by_arg[OP(scan)]) {
15970 ARG_SET(scan, val - scan);
15973 NEXT_OFF(scan) = val - scan;
15981 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15986 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15991 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15993 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15994 if (flags & (1<<bit)) {
15995 if (!set++ && lead)
15996 PerlIO_printf(Perl_debug_log, "%s",lead);
15997 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16002 PerlIO_printf(Perl_debug_log, "\n");
16004 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16009 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16015 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16017 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16018 if (flags & (1<<bit)) {
16019 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16022 if (!set++ && lead)
16023 PerlIO_printf(Perl_debug_log, "%s",lead);
16024 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16027 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16028 if (!set++ && lead) {
16029 PerlIO_printf(Perl_debug_log, "%s",lead);
16032 case REGEX_UNICODE_CHARSET:
16033 PerlIO_printf(Perl_debug_log, "UNICODE");
16035 case REGEX_LOCALE_CHARSET:
16036 PerlIO_printf(Perl_debug_log, "LOCALE");
16038 case REGEX_ASCII_RESTRICTED_CHARSET:
16039 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16041 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16042 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16045 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16051 PerlIO_printf(Perl_debug_log, "\n");
16053 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16059 Perl_regdump(pTHX_ const regexp *r)
16062 SV * const sv = sv_newmortal();
16063 SV *dsv= sv_newmortal();
16064 RXi_GET_DECL(r,ri);
16065 GET_RE_DEBUG_FLAGS_DECL;
16067 PERL_ARGS_ASSERT_REGDUMP;
16069 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16071 /* Header fields of interest. */
16072 if (r->anchored_substr) {
16073 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16074 RE_SV_DUMPLEN(r->anchored_substr), 30);
16075 PerlIO_printf(Perl_debug_log,
16076 "anchored %s%s at %"IVdf" ",
16077 s, RE_SV_TAIL(r->anchored_substr),
16078 (IV)r->anchored_offset);
16079 } else if (r->anchored_utf8) {
16080 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16081 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16082 PerlIO_printf(Perl_debug_log,
16083 "anchored utf8 %s%s at %"IVdf" ",
16084 s, RE_SV_TAIL(r->anchored_utf8),
16085 (IV)r->anchored_offset);
16087 if (r->float_substr) {
16088 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16089 RE_SV_DUMPLEN(r->float_substr), 30);
16090 PerlIO_printf(Perl_debug_log,
16091 "floating %s%s at %"IVdf"..%"UVuf" ",
16092 s, RE_SV_TAIL(r->float_substr),
16093 (IV)r->float_min_offset, (UV)r->float_max_offset);
16094 } else if (r->float_utf8) {
16095 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16096 RE_SV_DUMPLEN(r->float_utf8), 30);
16097 PerlIO_printf(Perl_debug_log,
16098 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16099 s, RE_SV_TAIL(r->float_utf8),
16100 (IV)r->float_min_offset, (UV)r->float_max_offset);
16102 if (r->check_substr || r->check_utf8)
16103 PerlIO_printf(Perl_debug_log,
16105 (r->check_substr == r->float_substr
16106 && r->check_utf8 == r->float_utf8
16107 ? "(checking floating" : "(checking anchored"));
16108 if (r->intflags & PREGf_NOSCAN)
16109 PerlIO_printf(Perl_debug_log, " noscan");
16110 if (r->extflags & RXf_CHECK_ALL)
16111 PerlIO_printf(Perl_debug_log, " isall");
16112 if (r->check_substr || r->check_utf8)
16113 PerlIO_printf(Perl_debug_log, ") ");
16115 if (ri->regstclass) {
16116 regprop(r, sv, ri->regstclass, NULL, NULL);
16117 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16119 if (r->intflags & PREGf_ANCH) {
16120 PerlIO_printf(Perl_debug_log, "anchored");
16121 if (r->intflags & PREGf_ANCH_MBOL)
16122 PerlIO_printf(Perl_debug_log, "(MBOL)");
16123 if (r->intflags & PREGf_ANCH_SBOL)
16124 PerlIO_printf(Perl_debug_log, "(SBOL)");
16125 if (r->intflags & PREGf_ANCH_GPOS)
16126 PerlIO_printf(Perl_debug_log, "(GPOS)");
16127 PerlIO_putc(Perl_debug_log, ' ');
16129 if (r->intflags & PREGf_GPOS_SEEN)
16130 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16131 if (r->intflags & PREGf_SKIP)
16132 PerlIO_printf(Perl_debug_log, "plus ");
16133 if (r->intflags & PREGf_IMPLICIT)
16134 PerlIO_printf(Perl_debug_log, "implicit ");
16135 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16136 if (r->extflags & RXf_EVAL_SEEN)
16137 PerlIO_printf(Perl_debug_log, "with eval ");
16138 PerlIO_printf(Perl_debug_log, "\n");
16140 regdump_extflags("r->extflags: ",r->extflags);
16141 regdump_intflags("r->intflags: ",r->intflags);
16144 PERL_ARGS_ASSERT_REGDUMP;
16145 PERL_UNUSED_CONTEXT;
16146 PERL_UNUSED_ARG(r);
16147 #endif /* DEBUGGING */
16151 - regprop - printable representation of opcode, with run time support
16155 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16160 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16161 static const char * const anyofs[] = {
16162 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16163 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16164 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16165 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16166 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
16167 || _CC_VERTSPACE != 16
16168 #error Need to adjust order of anyofs[]
16205 RXi_GET_DECL(prog,progi);
16206 GET_RE_DEBUG_FLAGS_DECL;
16208 PERL_ARGS_ASSERT_REGPROP;
16210 sv_setpvn(sv, "", 0);
16212 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16213 /* It would be nice to FAIL() here, but this may be called from
16214 regexec.c, and it would be hard to supply pRExC_state. */
16215 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16216 (int)OP(o), (int)REGNODE_MAX);
16217 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16219 k = PL_regkind[OP(o)];
16222 sv_catpvs(sv, " ");
16223 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16224 * is a crude hack but it may be the best for now since
16225 * we have no flag "this EXACTish node was UTF-8"
16227 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16228 PERL_PV_ESCAPE_UNI_DETECT |
16229 PERL_PV_ESCAPE_NONASCII |
16230 PERL_PV_PRETTY_ELLIPSES |
16231 PERL_PV_PRETTY_LTGT |
16232 PERL_PV_PRETTY_NOCLEAR
16234 } else if (k == TRIE) {
16235 /* print the details of the trie in dumpuntil instead, as
16236 * progi->data isn't available here */
16237 const char op = OP(o);
16238 const U32 n = ARG(o);
16239 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16240 (reg_ac_data *)progi->data->data[n] :
16242 const reg_trie_data * const trie
16243 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16245 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16246 DEBUG_TRIE_COMPILE_r(
16247 Perl_sv_catpvf(aTHX_ sv,
16248 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16249 (UV)trie->startstate,
16250 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16251 (UV)trie->wordcount,
16254 (UV)TRIE_CHARCOUNT(trie),
16255 (UV)trie->uniquecharcount
16258 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16259 sv_catpvs(sv, "[");
16260 (void) put_charclass_bitmap_innards(sv,
16261 (IS_ANYOF_TRIE(op))
16263 : TRIE_BITMAP(trie),
16265 sv_catpvs(sv, "]");
16268 } else if (k == CURLY) {
16269 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16270 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16271 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16273 else if (k == WHILEM && o->flags) /* Ordinal/of */
16274 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16275 else if (k == REF || k == OPEN || k == CLOSE
16276 || k == GROUPP || OP(o)==ACCEPT)
16278 AV *name_list= NULL;
16279 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16280 if ( RXp_PAREN_NAMES(prog) ) {
16281 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16282 } else if ( pRExC_state ) {
16283 name_list= RExC_paren_name_list;
16286 if ( k != REF || (OP(o) < NREF)) {
16287 SV **name= av_fetch(name_list, ARG(o), 0 );
16289 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16292 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16293 I32 *nums=(I32*)SvPVX(sv_dat);
16294 SV **name= av_fetch(name_list, nums[0], 0 );
16297 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16298 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16299 (n ? "," : ""), (IV)nums[n]);
16301 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16305 if ( k == REF && reginfo) {
16306 U32 n = ARG(o); /* which paren pair */
16307 I32 ln = prog->offs[n].start;
16308 if (prog->lastparen < n || ln == -1)
16309 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16310 else if (ln == prog->offs[n].end)
16311 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16313 const char *s = reginfo->strbeg + ln;
16314 Perl_sv_catpvf(aTHX_ sv, ": ");
16315 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16316 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16319 } else if (k == GOSUB) {
16320 AV *name_list= NULL;
16321 if ( RXp_PAREN_NAMES(prog) ) {
16322 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16323 } else if ( pRExC_state ) {
16324 name_list= RExC_paren_name_list;
16327 /* Paren and offset */
16328 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16330 SV **name= av_fetch(name_list, ARG(o), 0 );
16332 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16335 else if (k == VERB) {
16337 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16338 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16339 } else if (k == LOGICAL)
16340 /* 2: embedded, otherwise 1 */
16341 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16342 else if (k == ANYOF) {
16343 const U8 flags = ANYOF_FLAGS(o);
16345 SV* bitmap_invlist; /* Will hold what the bit map contains */
16348 if (flags & ANYOF_LOCALE_FLAGS)
16349 sv_catpvs(sv, "{loc}");
16350 if (flags & ANYOF_LOC_FOLD)
16351 sv_catpvs(sv, "{i}");
16352 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16353 if (flags & ANYOF_INVERT)
16354 sv_catpvs(sv, "^");
16356 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16358 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16361 /* output any special charclass tests (used entirely under use
16363 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16365 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16366 if (ANYOF_POSIXL_TEST(o,i)) {
16367 sv_catpv(sv, anyofs[i]);
16373 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16374 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16375 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16379 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16380 if (flags & ANYOF_INVERT)
16381 /*make sure the invert info is in each */
16382 sv_catpvs(sv, "^");
16385 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16386 sv_catpvs(sv, "{non-utf8-latin1-all}");
16389 /* output information about the unicode matching */
16390 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16391 sv_catpvs(sv, "{above_bitmap_all}");
16392 else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16393 SV *lv; /* Set if there is something outside the bit map. */
16394 bool byte_output = FALSE; /* If something in the bitmap has
16396 SV *only_utf8_locale;
16398 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16399 * is used to guarantee that nothing in the bitmap gets
16401 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16402 &lv, &only_utf8_locale,
16404 if (lv && lv != &PL_sv_undef) {
16405 char *s = savesvpv(lv);
16406 char * const origs = s;
16408 while (*s && *s != '\n')
16412 const char * const t = ++s;
16414 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16415 sv_catpvs(sv, "{outside bitmap}");
16418 sv_catpvs(sv, "{utf8}");
16422 sv_catpvs(sv, " ");
16428 /* Truncate very long output */
16429 if (s - origs > 256) {
16430 Perl_sv_catpvf(aTHX_ sv,
16432 (int) (s - origs - 1),
16438 else if (*s == '\t') {
16452 SvREFCNT_dec_NN(lv);
16455 if ((flags & ANYOF_LOC_FOLD)
16456 && only_utf8_locale
16457 && only_utf8_locale != &PL_sv_undef)
16460 int max_entries = 256;
16462 sv_catpvs(sv, "{utf8 locale}");
16463 invlist_iterinit(only_utf8_locale);
16464 while (invlist_iternext(only_utf8_locale,
16466 put_range(sv, start, end, FALSE);
16468 if (max_entries < 0) {
16469 sv_catpvs(sv, "...");
16473 invlist_iterfinish(only_utf8_locale);
16477 SvREFCNT_dec(bitmap_invlist);
16480 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16482 else if (k == POSIXD || k == NPOSIXD) {
16483 U8 index = FLAGS(o) * 2;
16484 if (index < C_ARRAY_LENGTH(anyofs)) {
16485 if (*anyofs[index] != '[') {
16488 sv_catpv(sv, anyofs[index]);
16489 if (*anyofs[index] != '[') {
16494 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16497 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16498 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16499 else if (OP(o) == SBOL)
16500 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16502 PERL_UNUSED_CONTEXT;
16503 PERL_UNUSED_ARG(sv);
16504 PERL_UNUSED_ARG(o);
16505 PERL_UNUSED_ARG(prog);
16506 PERL_UNUSED_ARG(reginfo);
16507 PERL_UNUSED_ARG(pRExC_state);
16508 #endif /* DEBUGGING */
16514 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16515 { /* Assume that RE_INTUIT is set */
16516 struct regexp *const prog = ReANY(r);
16517 GET_RE_DEBUG_FLAGS_DECL;
16519 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16520 PERL_UNUSED_CONTEXT;
16524 const char * const s = SvPV_nolen_const(prog->check_substr
16525 ? prog->check_substr : prog->check_utf8);
16527 if (!PL_colorset) reginitcolors();
16528 PerlIO_printf(Perl_debug_log,
16529 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16531 prog->check_substr ? "" : "utf8 ",
16532 PL_colors[5],PL_colors[0],
16535 (strlen(s) > 60 ? "..." : ""));
16538 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16544 handles refcounting and freeing the perl core regexp structure. When
16545 it is necessary to actually free the structure the first thing it
16546 does is call the 'free' method of the regexp_engine associated to
16547 the regexp, allowing the handling of the void *pprivate; member
16548 first. (This routine is not overridable by extensions, which is why
16549 the extensions free is called first.)
16551 See regdupe and regdupe_internal if you change anything here.
16553 #ifndef PERL_IN_XSUB_RE
16555 Perl_pregfree(pTHX_ REGEXP *r)
16561 Perl_pregfree2(pTHX_ REGEXP *rx)
16563 struct regexp *const r = ReANY(rx);
16564 GET_RE_DEBUG_FLAGS_DECL;
16566 PERL_ARGS_ASSERT_PREGFREE2;
16568 if (r->mother_re) {
16569 ReREFCNT_dec(r->mother_re);
16571 CALLREGFREE_PVT(rx); /* free the private data */
16572 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16573 Safefree(r->xpv_len_u.xpvlenu_pv);
16576 SvREFCNT_dec(r->anchored_substr);
16577 SvREFCNT_dec(r->anchored_utf8);
16578 SvREFCNT_dec(r->float_substr);
16579 SvREFCNT_dec(r->float_utf8);
16580 Safefree(r->substrs);
16582 RX_MATCH_COPY_FREE(rx);
16583 #ifdef PERL_ANY_COW
16584 SvREFCNT_dec(r->saved_copy);
16587 SvREFCNT_dec(r->qr_anoncv);
16588 rx->sv_u.svu_rx = 0;
16593 This is a hacky workaround to the structural issue of match results
16594 being stored in the regexp structure which is in turn stored in
16595 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16596 could be PL_curpm in multiple contexts, and could require multiple
16597 result sets being associated with the pattern simultaneously, such
16598 as when doing a recursive match with (??{$qr})
16600 The solution is to make a lightweight copy of the regexp structure
16601 when a qr// is returned from the code executed by (??{$qr}) this
16602 lightweight copy doesn't actually own any of its data except for
16603 the starp/end and the actual regexp structure itself.
16609 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16611 struct regexp *ret;
16612 struct regexp *const r = ReANY(rx);
16613 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16615 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16618 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16620 SvOK_off((SV *)ret_x);
16622 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16623 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16624 made both spots point to the same regexp body.) */
16625 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16626 assert(!SvPVX(ret_x));
16627 ret_x->sv_u.svu_rx = temp->sv_any;
16628 temp->sv_any = NULL;
16629 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16630 SvREFCNT_dec_NN(temp);
16631 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16632 ing below will not set it. */
16633 SvCUR_set(ret_x, SvCUR(rx));
16636 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16637 sv_force_normal(sv) is called. */
16639 ret = ReANY(ret_x);
16641 SvFLAGS(ret_x) |= SvUTF8(rx);
16642 /* We share the same string buffer as the original regexp, on which we
16643 hold a reference count, incremented when mother_re is set below.
16644 The string pointer is copied here, being part of the regexp struct.
16646 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16647 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16649 const I32 npar = r->nparens+1;
16650 Newx(ret->offs, npar, regexp_paren_pair);
16651 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16654 Newx(ret->substrs, 1, struct reg_substr_data);
16655 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16657 SvREFCNT_inc_void(ret->anchored_substr);
16658 SvREFCNT_inc_void(ret->anchored_utf8);
16659 SvREFCNT_inc_void(ret->float_substr);
16660 SvREFCNT_inc_void(ret->float_utf8);
16662 /* check_substr and check_utf8, if non-NULL, point to either their
16663 anchored or float namesakes, and don't hold a second reference. */
16665 RX_MATCH_COPIED_off(ret_x);
16666 #ifdef PERL_ANY_COW
16667 ret->saved_copy = NULL;
16669 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16670 SvREFCNT_inc_void(ret->qr_anoncv);
16676 /* regfree_internal()
16678 Free the private data in a regexp. This is overloadable by
16679 extensions. Perl takes care of the regexp structure in pregfree(),
16680 this covers the *pprivate pointer which technically perl doesn't
16681 know about, however of course we have to handle the
16682 regexp_internal structure when no extension is in use.
16684 Note this is called before freeing anything in the regexp
16689 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16691 struct regexp *const r = ReANY(rx);
16692 RXi_GET_DECL(r,ri);
16693 GET_RE_DEBUG_FLAGS_DECL;
16695 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16701 SV *dsv= sv_newmortal();
16702 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16703 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16704 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16705 PL_colors[4],PL_colors[5],s);
16708 #ifdef RE_TRACK_PATTERN_OFFSETS
16710 Safefree(ri->u.offsets); /* 20010421 MJD */
16712 if (ri->code_blocks) {
16714 for (n = 0; n < ri->num_code_blocks; n++)
16715 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16716 Safefree(ri->code_blocks);
16720 int n = ri->data->count;
16723 /* If you add a ->what type here, update the comment in regcomp.h */
16724 switch (ri->data->what[n]) {
16730 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16733 Safefree(ri->data->data[n]);
16739 { /* Aho Corasick add-on structure for a trie node.
16740 Used in stclass optimization only */
16742 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16743 #ifdef USE_ITHREADS
16747 refcount = --aho->refcount;
16750 PerlMemShared_free(aho->states);
16751 PerlMemShared_free(aho->fail);
16752 /* do this last!!!! */
16753 PerlMemShared_free(ri->data->data[n]);
16754 /* we should only ever get called once, so
16755 * assert as much, and also guard the free
16756 * which /might/ happen twice. At the least
16757 * it will make code anlyzers happy and it
16758 * doesn't cost much. - Yves */
16759 assert(ri->regstclass);
16760 if (ri->regstclass) {
16761 PerlMemShared_free(ri->regstclass);
16762 ri->regstclass = 0;
16769 /* trie structure. */
16771 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16772 #ifdef USE_ITHREADS
16776 refcount = --trie->refcount;
16779 PerlMemShared_free(trie->charmap);
16780 PerlMemShared_free(trie->states);
16781 PerlMemShared_free(trie->trans);
16783 PerlMemShared_free(trie->bitmap);
16785 PerlMemShared_free(trie->jump);
16786 PerlMemShared_free(trie->wordinfo);
16787 /* do this last!!!! */
16788 PerlMemShared_free(ri->data->data[n]);
16793 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16794 ri->data->what[n]);
16797 Safefree(ri->data->what);
16798 Safefree(ri->data);
16804 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16805 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16806 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16809 re_dup - duplicate a regexp.
16811 This routine is expected to clone a given regexp structure. It is only
16812 compiled under USE_ITHREADS.
16814 After all of the core data stored in struct regexp is duplicated
16815 the regexp_engine.dupe method is used to copy any private data
16816 stored in the *pprivate pointer. This allows extensions to handle
16817 any duplication it needs to do.
16819 See pregfree() and regfree_internal() if you change anything here.
16821 #if defined(USE_ITHREADS)
16822 #ifndef PERL_IN_XSUB_RE
16824 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16828 const struct regexp *r = ReANY(sstr);
16829 struct regexp *ret = ReANY(dstr);
16831 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16833 npar = r->nparens+1;
16834 Newx(ret->offs, npar, regexp_paren_pair);
16835 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16837 if (ret->substrs) {
16838 /* Do it this way to avoid reading from *r after the StructCopy().
16839 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16840 cache, it doesn't matter. */
16841 const bool anchored = r->check_substr
16842 ? r->check_substr == r->anchored_substr
16843 : r->check_utf8 == r->anchored_utf8;
16844 Newx(ret->substrs, 1, struct reg_substr_data);
16845 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16847 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16848 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16849 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16850 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16852 /* check_substr and check_utf8, if non-NULL, point to either their
16853 anchored or float namesakes, and don't hold a second reference. */
16855 if (ret->check_substr) {
16857 assert(r->check_utf8 == r->anchored_utf8);
16858 ret->check_substr = ret->anchored_substr;
16859 ret->check_utf8 = ret->anchored_utf8;
16861 assert(r->check_substr == r->float_substr);
16862 assert(r->check_utf8 == r->float_utf8);
16863 ret->check_substr = ret->float_substr;
16864 ret->check_utf8 = ret->float_utf8;
16866 } else if (ret->check_utf8) {
16868 ret->check_utf8 = ret->anchored_utf8;
16870 ret->check_utf8 = ret->float_utf8;
16875 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16876 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16879 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16881 if (RX_MATCH_COPIED(dstr))
16882 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16884 ret->subbeg = NULL;
16885 #ifdef PERL_ANY_COW
16886 ret->saved_copy = NULL;
16889 /* Whether mother_re be set or no, we need to copy the string. We
16890 cannot refrain from copying it when the storage points directly to
16891 our mother regexp, because that's
16892 1: a buffer in a different thread
16893 2: something we no longer hold a reference on
16894 so we need to copy it locally. */
16895 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16896 ret->mother_re = NULL;
16898 #endif /* PERL_IN_XSUB_RE */
16903 This is the internal complement to regdupe() which is used to copy
16904 the structure pointed to by the *pprivate pointer in the regexp.
16905 This is the core version of the extension overridable cloning hook.
16906 The regexp structure being duplicated will be copied by perl prior
16907 to this and will be provided as the regexp *r argument, however
16908 with the /old/ structures pprivate pointer value. Thus this routine
16909 may override any copying normally done by perl.
16911 It returns a pointer to the new regexp_internal structure.
16915 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16918 struct regexp *const r = ReANY(rx);
16919 regexp_internal *reti;
16921 RXi_GET_DECL(r,ri);
16923 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16927 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16928 char, regexp_internal);
16929 Copy(ri->program, reti->program, len+1, regnode);
16931 reti->num_code_blocks = ri->num_code_blocks;
16932 if (ri->code_blocks) {
16934 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16935 struct reg_code_block);
16936 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16937 struct reg_code_block);
16938 for (n = 0; n < ri->num_code_blocks; n++)
16939 reti->code_blocks[n].src_regex = (REGEXP*)
16940 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16943 reti->code_blocks = NULL;
16945 reti->regstclass = NULL;
16948 struct reg_data *d;
16949 const int count = ri->data->count;
16952 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16953 char, struct reg_data);
16954 Newx(d->what, count, U8);
16957 for (i = 0; i < count; i++) {
16958 d->what[i] = ri->data->what[i];
16959 switch (d->what[i]) {
16960 /* see also regcomp.h and regfree_internal() */
16961 case 'a': /* actually an AV, but the dup function is identical. */
16965 case 'u': /* actually an HV, but the dup function is identical. */
16966 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16969 /* This is cheating. */
16970 Newx(d->data[i], 1, regnode_ssc);
16971 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16972 reti->regstclass = (regnode*)d->data[i];
16975 /* Trie stclasses are readonly and can thus be shared
16976 * without duplication. We free the stclass in pregfree
16977 * when the corresponding reg_ac_data struct is freed.
16979 reti->regstclass= ri->regstclass;
16983 ((reg_trie_data*)ri->data->data[i])->refcount++;
16988 d->data[i] = ri->data->data[i];
16991 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16992 ri->data->what[i]);
17001 reti->name_list_idx = ri->name_list_idx;
17003 #ifdef RE_TRACK_PATTERN_OFFSETS
17004 if (ri->u.offsets) {
17005 Newx(reti->u.offsets, 2*len+1, U32);
17006 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17009 SetProgLen(reti,len);
17012 return (void*)reti;
17015 #endif /* USE_ITHREADS */
17017 #ifndef PERL_IN_XSUB_RE
17020 - regnext - dig the "next" pointer out of a node
17023 Perl_regnext(pTHX_ regnode *p)
17030 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17031 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17032 (int)OP(p), (int)REGNODE_MAX);
17035 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17044 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17047 STRLEN l1 = strlen(pat1);
17048 STRLEN l2 = strlen(pat2);
17051 const char *message;
17053 PERL_ARGS_ASSERT_RE_CROAK2;
17059 Copy(pat1, buf, l1 , char);
17060 Copy(pat2, buf + l1, l2 , char);
17061 buf[l1 + l2] = '\n';
17062 buf[l1 + l2 + 1] = '\0';
17063 va_start(args, pat2);
17064 msv = vmess(buf, &args);
17066 message = SvPV_const(msv,l1);
17069 Copy(message, buf, l1 , char);
17070 /* l1-1 to avoid \n */
17071 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17075 /* Certain characters are output as a sequence with the first being a
17077 #define isBACKSLASHED_PUNCT(c) \
17078 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17081 S_put_code_point(pTHX_ SV *sv, UV c)
17083 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17086 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17088 else if (isPRINT(c)) {
17089 const char string = (char) c;
17090 if (isBACKSLASHED_PUNCT(c))
17091 sv_catpvs(sv, "\\");
17092 sv_catpvn(sv, &string, 1);
17095 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17097 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17100 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17105 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17108 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17110 /* Appends to 'sv' a displayable version of the range of code points from
17111 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17112 * as-is (though some of these will be escaped by put_code_point()). */
17114 const unsigned int min_range_count = 3;
17116 assert(start <= end);
17118 PERL_ARGS_ASSERT_PUT_RANGE;
17120 while (start <= end) {
17122 const char * format;
17124 if (end - start < min_range_count) {
17126 /* Individual chars in short ranges */
17127 for (; start <= end; start++) {
17128 put_code_point(sv, start);
17133 /* If permitted by the input options, and there is a possibility that
17134 * this range contains a printable literal, look to see if there is
17136 if (allow_literals && start <= MAX_PRINT_A) {
17138 /* If the range begin isn't an ASCII printable, effectively split
17139 * the range into two parts:
17140 * 1) the portion before the first such printable,
17142 * and output them separately. */
17143 if (! isPRINT_A(start)) {
17144 UV temp_end = start + 1;
17146 /* There is no point looking beyond the final possible
17147 * printable, in MAX_PRINT_A */
17148 UV max = MIN(end, MAX_PRINT_A);
17150 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17154 /* Here, temp_end points to one beyond the first printable if
17155 * found, or to one beyond 'max' if not. If none found, make
17156 * sure that we use the entire range */
17157 if (temp_end > MAX_PRINT_A) {
17158 temp_end = end + 1;
17161 /* Output the first part of the split range, the part that
17162 * doesn't have printables, with no looking for literals
17163 * (otherwise we would infinitely recurse) */
17164 put_range(sv, start, temp_end - 1, FALSE);
17166 /* The 2nd part of the range (if any) starts here. */
17169 /* We continue instead of dropping down because even if the 2nd
17170 * part is non-empty, it could be so short that we want to
17171 * output it specially, as tested for at the top of this loop.
17176 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17177 * output a sub-range of just the digits or letters, then process
17178 * the remaining portion as usual. */
17179 if (isALPHANUMERIC_A(start)) {
17180 UV mask = (isDIGIT_A(start))
17185 UV temp_end = start + 1;
17187 /* Find the end of the sub-range that includes just the
17188 * characters in the same class as the first character in it */
17189 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17194 /* For short ranges, don't duplicate the code above to output
17195 * them; just call recursively */
17196 if (temp_end - start < min_range_count) {
17197 put_range(sv, start, temp_end, FALSE);
17199 else { /* Output as a range */
17200 put_code_point(sv, start);
17201 sv_catpvs(sv, "-");
17202 put_code_point(sv, temp_end);
17204 start = temp_end + 1;
17208 /* We output any other printables as individual characters */
17209 if (isPUNCT_A(start) || isSPACE_A(start)) {
17210 while (start <= end && (isPUNCT_A(start)
17211 || isSPACE_A(start)))
17213 put_code_point(sv, start);
17218 } /* End of looking for literals */
17220 /* Here is not to output as a literal. Some control characters have
17221 * mnemonic names. Split off any of those at the beginning and end of
17222 * the range to print mnemonically. It isn't possible for many of
17223 * these to be in a row, so this won't overwhelm with output */
17224 while (isMNEMONIC_CNTRL(start) && start <= end) {
17225 put_code_point(sv, start);
17228 if (start < end && isMNEMONIC_CNTRL(end)) {
17230 /* Here, the final character in the range has a mnemonic name.
17231 * Work backwards from the end to find the final non-mnemonic */
17232 UV temp_end = end - 1;
17233 while (isMNEMONIC_CNTRL(temp_end)) {
17237 /* And separately output the range that doesn't have mnemonics */
17238 put_range(sv, start, temp_end, FALSE);
17240 /* Then output the mnemonic trailing controls */
17241 start = temp_end + 1;
17242 while (start <= end) {
17243 put_code_point(sv, start);
17249 /* As a final resort, output the range or subrange as hex. */
17251 this_end = (end < NUM_ANYOF_CODE_POINTS)
17253 : NUM_ANYOF_CODE_POINTS - 1;
17254 format = (this_end < 256)
17255 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17256 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17257 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17263 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17265 /* Appends to 'sv' a displayable version of the innards of the bracketed
17266 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17267 * output anything, and bitmap_invlist, if not NULL, will point to an
17268 * inversion list of what is in the bit map */
17272 unsigned int punct_count = 0;
17273 SV* invlist = NULL;
17274 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17275 bool allow_literals = TRUE;
17277 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17279 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17281 /* Worst case is exactly every-other code point is in the list */
17282 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17284 /* Convert the bit map to an inversion list, keeping track of how many
17285 * ASCII puncts are set, including an extra amount for the backslashed
17287 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17288 if (BITMAP_TEST(bitmap, i)) {
17289 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17290 if (isPUNCT_A(i)) {
17292 if isBACKSLASHED_PUNCT(i) {
17299 /* Nothing to output */
17300 if (_invlist_len(*invlist_ptr) == 0) {
17301 SvREFCNT_dec(invlist);
17305 /* Generally, it is more readable if printable characters are output as
17306 * literals, but if a range (nearly) spans all of them, it's best to output
17307 * it as a single range. This code will use a single range if all but 2
17308 * printables are in it */
17309 invlist_iterinit(*invlist_ptr);
17310 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17312 /* If range starts beyond final printable, it doesn't have any in it */
17313 if (start > MAX_PRINT_A) {
17317 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17318 * all but two, the range must start and end no later than 2 from
17320 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17321 if (end > MAX_PRINT_A) {
17327 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17328 allow_literals = FALSE;
17333 invlist_iterfinish(*invlist_ptr);
17335 /* The legibility of the output depends mostly on how many punctuation
17336 * characters are output. There are 32 possible ASCII ones, and some have
17337 * an additional backslash, bringing it to currently 36, so if any more
17338 * than 18 are to be output, we can instead output it as its complement,
17339 * yielding fewer puncts, and making it more legible. But give some weight
17340 * to the fact that outputting it as a complement is less legible than a
17341 * straight output, so don't complement unless we are somewhat over the 18
17343 if (allow_literals && punct_count > 22) {
17344 sv_catpvs(sv, "^");
17346 /* Add everything remaining to the list, so when we invert it just
17347 * below, it will be excluded */
17348 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17349 _invlist_invert(*invlist_ptr);
17352 /* Here we have figured things out. Output each range */
17353 invlist_iterinit(*invlist_ptr);
17354 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17355 if (start >= NUM_ANYOF_CODE_POINTS) {
17358 put_range(sv, start, end, allow_literals);
17360 invlist_iterfinish(*invlist_ptr);
17365 #define CLEAR_OPTSTART \
17366 if (optstart) STMT_START { \
17367 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17368 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17372 #define DUMPUNTIL(b,e) \
17374 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17376 STATIC const regnode *
17377 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17378 const regnode *last, const regnode *plast,
17379 SV* sv, I32 indent, U32 depth)
17381 U8 op = PSEUDO; /* Arbitrary non-END op. */
17382 const regnode *next;
17383 const regnode *optstart= NULL;
17385 RXi_GET_DECL(r,ri);
17386 GET_RE_DEBUG_FLAGS_DECL;
17388 PERL_ARGS_ASSERT_DUMPUNTIL;
17390 #ifdef DEBUG_DUMPUNTIL
17391 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17392 last ? last-start : 0,plast ? plast-start : 0);
17395 if (plast && plast < last)
17398 while (PL_regkind[op] != END && (!last || node < last)) {
17400 /* While that wasn't END last time... */
17403 if (op == CLOSE || op == WHILEM)
17405 next = regnext((regnode *)node);
17408 if (OP(node) == OPTIMIZED) {
17409 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17416 regprop(r, sv, node, NULL, NULL);
17417 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17418 (int)(2*indent + 1), "", SvPVX_const(sv));
17420 if (OP(node) != OPTIMIZED) {
17421 if (next == NULL) /* Next ptr. */
17422 PerlIO_printf(Perl_debug_log, " (0)");
17423 else if (PL_regkind[(U8)op] == BRANCH
17424 && PL_regkind[OP(next)] != BRANCH )
17425 PerlIO_printf(Perl_debug_log, " (FAIL)");
17427 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17428 (void)PerlIO_putc(Perl_debug_log, '\n');
17432 if (PL_regkind[(U8)op] == BRANCHJ) {
17435 const regnode *nnode = (OP(next) == LONGJMP
17436 ? regnext((regnode *)next)
17438 if (last && nnode > last)
17440 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17443 else if (PL_regkind[(U8)op] == BRANCH) {
17445 DUMPUNTIL(NEXTOPER(node), next);
17447 else if ( PL_regkind[(U8)op] == TRIE ) {
17448 const regnode *this_trie = node;
17449 const char op = OP(node);
17450 const U32 n = ARG(node);
17451 const reg_ac_data * const ac = op>=AHOCORASICK ?
17452 (reg_ac_data *)ri->data->data[n] :
17454 const reg_trie_data * const trie =
17455 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17457 AV *const trie_words
17458 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17460 const regnode *nextbranch= NULL;
17463 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17464 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17466 PerlIO_printf(Perl_debug_log, "%*s%s ",
17467 (int)(2*(indent+3)), "",
17469 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17470 SvCUR(*elem_ptr), 60,
17471 PL_colors[0], PL_colors[1],
17473 ? PERL_PV_ESCAPE_UNI
17475 | PERL_PV_PRETTY_ELLIPSES
17476 | PERL_PV_PRETTY_LTGT
17481 U16 dist= trie->jump[word_idx+1];
17482 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17483 (UV)((dist ? this_trie + dist : next) - start));
17486 nextbranch= this_trie + trie->jump[0];
17487 DUMPUNTIL(this_trie + dist, nextbranch);
17489 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17490 nextbranch= regnext((regnode *)nextbranch);
17492 PerlIO_printf(Perl_debug_log, "\n");
17495 if (last && next > last)
17500 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17501 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17502 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17504 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17506 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17508 else if ( op == PLUS || op == STAR) {
17509 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17511 else if (PL_regkind[(U8)op] == ANYOF) {
17512 /* arglen 1 + class block */
17513 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17514 ? ANYOF_POSIXL_SKIP
17516 node = NEXTOPER(node);
17518 else if (PL_regkind[(U8)op] == EXACT) {
17519 /* Literal string, where present. */
17520 node += NODE_SZ_STR(node) - 1;
17521 node = NEXTOPER(node);
17524 node = NEXTOPER(node);
17525 node += regarglen[(U8)op];
17527 if (op == CURLYX || op == OPEN)
17531 #ifdef DEBUG_DUMPUNTIL
17532 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17537 #endif /* DEBUGGING */
17541 * c-indentation-style: bsd
17542 * c-basic-offset: 4
17543 * indent-tabs-mode: nil
17546 * ex: set ts=8 sts=4 sw=4 et: