5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
100 #define STATIC static
104 struct RExC_state_t {
105 U32 flags; /* RXf_* are we folding, multilining? */
106 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object
112 char *start; /* Start of input for compile */
113 char *end; /* End of input for compile */
114 char *parse; /* Input-scan pointer. */
115 SSize_t whilem_seen; /* number of WHILEM in this expr */
116 regnode *emit_start; /* Start of emitted-code area */
117 regnode *emit_bound; /* First regnode outside of the
119 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
120 implies compiling, so don't emit */
121 regnode_ssc emit_dummy; /* placeholder for emit to point to;
122 large enough for the largest
123 non-EXACTish node, so can use it as
125 I32 naughty; /* How bad is this pattern? */
126 I32 sawback; /* Did we see \1, ...? */
128 SSize_t size; /* Code size. */
129 I32 npar; /* Capture buffer count, (OPEN) plus
130 one. ("par" 0 is the whole
132 I32 nestroot; /* root parens we are in - used by
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
150 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
152 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
156 I32 override_recoding;
157 I32 in_multi_char_class;
158 struct reg_code_block *code_blocks; /* positions of literal (?{})
160 int num_code_blocks; /* size of code_blocks[] */
161 int code_index; /* next code_blocks[] slot */
163 char *starttry; /* -Dr: where regtry was called. */
164 #define RExC_starttry (pRExC_state->starttry)
166 SV *runtime_code_qr; /* qr with the runtime code blocks */
168 const char *lastparse;
170 AV *paren_name_list; /* idx -> name */
171 #define RExC_lastparse (pRExC_state->lastparse)
172 #define RExC_lastnum (pRExC_state->lastnum)
173 #define RExC_paren_name_list (pRExC_state->paren_name_list)
177 #define RExC_flags (pRExC_state->flags)
178 #define RExC_pm_flags (pRExC_state->pm_flags)
179 #define RExC_precomp (pRExC_state->precomp)
180 #define RExC_rx_sv (pRExC_state->rx_sv)
181 #define RExC_rx (pRExC_state->rx)
182 #define RExC_rxi (pRExC_state->rxi)
183 #define RExC_start (pRExC_state->start)
184 #define RExC_end (pRExC_state->end)
185 #define RExC_parse (pRExC_state->parse)
186 #define RExC_whilem_seen (pRExC_state->whilem_seen)
187 #ifdef RE_TRACK_PATTERN_OFFSETS
188 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
191 #define RExC_emit (pRExC_state->emit)
192 #define RExC_emit_dummy (pRExC_state->emit_dummy)
193 #define RExC_emit_start (pRExC_state->emit_start)
194 #define RExC_emit_bound (pRExC_state->emit_bound)
195 #define RExC_naughty (pRExC_state->naughty)
196 #define RExC_sawback (pRExC_state->sawback)
197 #define RExC_seen (pRExC_state->seen)
198 #define RExC_size (pRExC_state->size)
199 #define RExC_npar (pRExC_state->npar)
200 #define RExC_nestroot (pRExC_state->nestroot)
201 #define RExC_extralen (pRExC_state->extralen)
202 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
203 #define RExC_utf8 (pRExC_state->utf8)
204 #define RExC_uni_semantics (pRExC_state->uni_semantics)
205 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
206 #define RExC_open_parens (pRExC_state->open_parens)
207 #define RExC_close_parens (pRExC_state->close_parens)
208 #define RExC_opend (pRExC_state->opend)
209 #define RExC_paren_names (pRExC_state->paren_names)
210 #define RExC_recurse (pRExC_state->recurse)
211 #define RExC_recurse_count (pRExC_state->recurse_count)
212 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
213 #define RExC_study_chunk_recursed_bytes \
214 (pRExC_state->study_chunk_recursed_bytes)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_contains_i (pRExC_state->contains_i)
218 #define RExC_override_recoding (pRExC_state->override_recoding)
219 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
222 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
223 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
224 ((*s) == '{' && regcurly(s, FALSE)))
227 * Flags to be passed up and down.
229 #define WORST 0 /* Worst case. */
230 #define HASWIDTH 0x01 /* Known to match non-null strings. */
232 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
233 * character. (There needs to be a case: in the switch statement in regexec.c
234 * for any node marked SIMPLE.) Note that this is not the same thing as
237 #define SPSTART 0x04 /* Starts with * or + */
238 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
239 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
240 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
242 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
244 /* whether trie related optimizations are enabled */
245 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
246 #define TRIE_STUDY_OPT
247 #define FULL_TRIE_STUDY
253 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
254 #define PBITVAL(paren) (1 << ((paren) & 7))
255 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
256 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
257 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
259 #define REQUIRE_UTF8 STMT_START { \
261 *flagp = RESTART_UTF8; \
266 /* This converts the named class defined in regcomp.h to its equivalent class
267 * number defined in handy.h. */
268 #define namedclass_to_classnum(class) ((int) ((class) / 2))
269 #define classnum_to_namedclass(classnum) ((classnum) * 2)
271 #define _invlist_union_complement_2nd(a, b, output) \
272 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
273 #define _invlist_intersection_complement_2nd(a, b, output) \
274 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
276 /* About scan_data_t.
278 During optimisation we recurse through the regexp program performing
279 various inplace (keyhole style) optimisations. In addition study_chunk
280 and scan_commit populate this data structure with information about
281 what strings MUST appear in the pattern. We look for the longest
282 string that must appear at a fixed location, and we look for the
283 longest string that may appear at a floating location. So for instance
288 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
289 strings (because they follow a .* construct). study_chunk will identify
290 both FOO and BAR as being the longest fixed and floating strings respectively.
292 The strings can be composites, for instance
296 will result in a composite fixed substring 'foo'.
298 For each string some basic information is maintained:
300 - offset or min_offset
301 This is the position the string must appear at, or not before.
302 It also implicitly (when combined with minlenp) tells us how many
303 characters must match before the string we are searching for.
304 Likewise when combined with minlenp and the length of the string it
305 tells us how many characters must appear after the string we have
309 Only used for floating strings. This is the rightmost point that
310 the string can appear at. If set to SSize_t_MAX it indicates that the
311 string can occur infinitely far to the right.
314 A pointer to the minimum number of characters of the pattern that the
315 string was found inside. This is important as in the case of positive
316 lookahead or positive lookbehind we can have multiple patterns
321 The minimum length of the pattern overall is 3, the minimum length
322 of the lookahead part is 3, but the minimum length of the part that
323 will actually match is 1. So 'FOO's minimum length is 3, but the
324 minimum length for the F is 1. This is important as the minimum length
325 is used to determine offsets in front of and behind the string being
326 looked for. Since strings can be composites this is the length of the
327 pattern at the time it was committed with a scan_commit. Note that
328 the length is calculated by study_chunk, so that the minimum lengths
329 are not known until the full pattern has been compiled, thus the
330 pointer to the value.
334 In the case of lookbehind the string being searched for can be
335 offset past the start point of the final matching string.
336 If this value was just blithely removed from the min_offset it would
337 invalidate some of the calculations for how many chars must match
338 before or after (as they are derived from min_offset and minlen and
339 the length of the string being searched for).
340 When the final pattern is compiled and the data is moved from the
341 scan_data_t structure into the regexp structure the information
342 about lookbehind is factored in, with the information that would
343 have been lost precalculated in the end_shift field for the
346 The fields pos_min and pos_delta are used to store the minimum offset
347 and the delta to the maximum offset at the current point in the pattern.
351 typedef struct scan_data_t {
352 /*I32 len_min; unused */
353 /*I32 len_delta; unused */
357 SSize_t last_end; /* min value, <0 unless valid. */
358 SSize_t last_start_min;
359 SSize_t last_start_max;
360 SV **longest; /* Either &l_fixed, or &l_float. */
361 SV *longest_fixed; /* longest fixed string found in pattern */
362 SSize_t offset_fixed; /* offset where it starts */
363 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
364 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
365 SV *longest_float; /* longest floating string found in pattern */
366 SSize_t offset_float_min; /* earliest point in string it can appear */
367 SSize_t offset_float_max; /* latest point in string it can appear */
368 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
369 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
372 SSize_t *last_closep;
373 regnode_ssc *start_class;
376 /* The below is perhaps overboard, but this allows us to save a test at the
377 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
378 * and 'a' differ by a single bit; the same with the upper and lower case of
379 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
380 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
381 * then inverts it to form a mask, with just a single 0, in the bit position
382 * where the upper- and lowercase differ. XXX There are about 40 other
383 * instances in the Perl core where this micro-optimization could be used.
384 * Should decide if maintenance cost is worse, before changing those
386 * Returns a boolean as to whether or not 'v' is either a lowercase or
387 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
388 * compile-time constant, the generated code is better than some optimizing
389 * compilers figure out, amounting to a mask and test. The results are
390 * meaningless if 'c' is not one of [A-Za-z] */
391 #define isARG2_lower_or_UPPER_ARG1(c, v) \
392 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
395 * Forward declarations for pregcomp()'s friends.
398 static const scan_data_t zero_scan_data =
399 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
401 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
402 #define SF_BEFORE_SEOL 0x0001
403 #define SF_BEFORE_MEOL 0x0002
404 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
405 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
407 #define SF_FIX_SHIFT_EOL (+2)
408 #define SF_FL_SHIFT_EOL (+4)
410 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
411 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
413 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
414 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
415 #define SF_IS_INF 0x0040
416 #define SF_HAS_PAR 0x0080
417 #define SF_IN_PAR 0x0100
418 #define SF_HAS_EVAL 0x0200
419 #define SCF_DO_SUBSTR 0x0400
420 #define SCF_DO_STCLASS_AND 0x0800
421 #define SCF_DO_STCLASS_OR 0x1000
422 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
423 #define SCF_WHILEM_VISITED_POS 0x2000
425 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
426 #define SCF_SEEN_ACCEPT 0x8000
427 #define SCF_TRIE_DOING_RESTUDY 0x10000
429 #define UTF cBOOL(RExC_utf8)
431 /* The enums for all these are ordered so things work out correctly */
432 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
433 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
434 == REGEX_DEPENDS_CHARSET)
435 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
436 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
437 >= REGEX_UNICODE_CHARSET)
438 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
439 == REGEX_ASCII_RESTRICTED_CHARSET)
440 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
441 >= REGEX_ASCII_RESTRICTED_CHARSET)
442 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
443 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
445 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
447 /* For programs that want to be strictly Unicode compatible by dying if any
448 * attempt is made to match a non-Unicode code point against a Unicode
450 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
452 #define OOB_NAMEDCLASS -1
454 /* There is no code point that is out-of-bounds, so this is problematic. But
455 * its only current use is to initialize a variable that is always set before
457 #define OOB_UNICODE 0xDEADBEEF
459 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
460 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
463 /* length of regex to show in messages that don't mark a position within */
464 #define RegexLengthToShowInErrorMessages 127
467 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
468 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
469 * op/pragma/warn/regcomp.
471 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
472 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
474 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
475 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
477 #define REPORT_LOCATION_ARGS(offset) \
478 UTF8fARG(UTF, offset, RExC_precomp), \
479 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
482 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
483 * arg. Show regex, up to a maximum length. If it's too long, chop and add
486 #define _FAIL(code) STMT_START { \
487 const char *ellipses = ""; \
488 IV len = RExC_end - RExC_precomp; \
491 SAVEFREESV(RExC_rx_sv); \
492 if (len > RegexLengthToShowInErrorMessages) { \
493 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
494 len = RegexLengthToShowInErrorMessages - 10; \
500 #define FAIL(msg) _FAIL( \
501 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
502 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
504 #define FAIL2(msg,arg) _FAIL( \
505 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
506 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
509 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
511 #define Simple_vFAIL(m) STMT_START { \
512 const IV offset = RExC_parse - RExC_precomp; \
513 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
514 m, REPORT_LOCATION_ARGS(offset)); \
518 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
520 #define vFAIL(m) STMT_START { \
522 SAVEFREESV(RExC_rx_sv); \
527 * Like Simple_vFAIL(), but accepts two arguments.
529 #define Simple_vFAIL2(m,a1) STMT_START { \
530 const IV offset = RExC_parse - RExC_precomp; \
531 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
532 REPORT_LOCATION_ARGS(offset)); \
536 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
538 #define vFAIL2(m,a1) STMT_START { \
540 SAVEFREESV(RExC_rx_sv); \
541 Simple_vFAIL2(m, a1); \
546 * Like Simple_vFAIL(), but accepts three arguments.
548 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
549 const IV offset = RExC_parse - RExC_precomp; \
550 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
551 REPORT_LOCATION_ARGS(offset)); \
555 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
557 #define vFAIL3(m,a1,a2) STMT_START { \
559 SAVEFREESV(RExC_rx_sv); \
560 Simple_vFAIL3(m, a1, a2); \
564 * Like Simple_vFAIL(), but accepts four arguments.
566 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
567 const IV offset = RExC_parse - RExC_precomp; \
568 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
569 REPORT_LOCATION_ARGS(offset)); \
572 #define vFAIL4(m,a1,a2,a3) STMT_START { \
574 SAVEFREESV(RExC_rx_sv); \
575 Simple_vFAIL4(m, a1, a2, a3); \
578 /* A specialized version of vFAIL2 that works with UTF8f */
579 #define vFAIL2utf8f(m, a1) STMT_START { \
580 const IV offset = RExC_parse - RExC_precomp; \
582 SAVEFREESV(RExC_rx_sv); \
583 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
584 REPORT_LOCATION_ARGS(offset)); \
588 /* m is not necessarily a "literal string", in this macro */
589 #define reg_warn_non_literal_string(loc, m) STMT_START { \
590 const IV offset = loc - RExC_precomp; \
591 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
592 m, REPORT_LOCATION_ARGS(offset)); \
595 #define ckWARNreg(loc,m) STMT_START { \
596 const IV offset = loc - RExC_precomp; \
597 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
598 REPORT_LOCATION_ARGS(offset)); \
601 #define vWARN_dep(loc, m) STMT_START { \
602 const IV offset = loc - RExC_precomp; \
603 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
604 REPORT_LOCATION_ARGS(offset)); \
607 #define ckWARNdep(loc,m) STMT_START { \
608 const IV offset = loc - RExC_precomp; \
609 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
611 REPORT_LOCATION_ARGS(offset)); \
614 #define ckWARNregdep(loc,m) STMT_START { \
615 const IV offset = loc - RExC_precomp; \
616 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
618 REPORT_LOCATION_ARGS(offset)); \
621 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
622 const IV offset = loc - RExC_precomp; \
623 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
625 a1, REPORT_LOCATION_ARGS(offset)); \
628 #define ckWARN2reg(loc, m, a1) STMT_START { \
629 const IV offset = loc - RExC_precomp; \
630 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
631 a1, REPORT_LOCATION_ARGS(offset)); \
634 #define vWARN3(loc, m, a1, a2) STMT_START { \
635 const IV offset = loc - RExC_precomp; \
636 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
637 a1, a2, REPORT_LOCATION_ARGS(offset)); \
640 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
641 const IV offset = loc - RExC_precomp; \
642 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
643 a1, a2, REPORT_LOCATION_ARGS(offset)); \
646 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
647 const IV offset = loc - RExC_precomp; \
648 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
649 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
652 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
653 const IV offset = loc - RExC_precomp; \
654 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
655 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
658 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
659 const IV offset = loc - RExC_precomp; \
660 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
661 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
665 /* Allow for side effects in s */
666 #define REGC(c,s) STMT_START { \
667 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
670 /* Macros for recording node offsets. 20001227 mjd@plover.com
671 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
672 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
673 * Element 0 holds the number n.
674 * Position is 1 indexed.
676 #ifndef RE_TRACK_PATTERN_OFFSETS
677 #define Set_Node_Offset_To_R(node,byte)
678 #define Set_Node_Offset(node,byte)
679 #define Set_Cur_Node_Offset
680 #define Set_Node_Length_To_R(node,len)
681 #define Set_Node_Length(node,len)
682 #define Set_Node_Cur_Length(node,start)
683 #define Node_Offset(n)
684 #define Node_Length(n)
685 #define Set_Node_Offset_Length(node,offset,len)
686 #define ProgLen(ri) ri->u.proglen
687 #define SetProgLen(ri,x) ri->u.proglen = x
689 #define ProgLen(ri) ri->u.offsets[0]
690 #define SetProgLen(ri,x) ri->u.offsets[0] = x
691 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
693 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
694 __LINE__, (int)(node), (int)(byte))); \
696 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
699 RExC_offsets[2*(node)-1] = (byte); \
704 #define Set_Node_Offset(node,byte) \
705 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
706 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
708 #define Set_Node_Length_To_R(node,len) STMT_START { \
710 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
711 __LINE__, (int)(node), (int)(len))); \
713 Perl_croak(aTHX_ "value of node is %d in Length macro", \
716 RExC_offsets[2*(node)] = (len); \
721 #define Set_Node_Length(node,len) \
722 Set_Node_Length_To_R((node)-RExC_emit_start, len)
723 #define Set_Node_Cur_Length(node, start) \
724 Set_Node_Length(node, RExC_parse - start)
726 /* Get offsets and lengths */
727 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
728 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
730 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
731 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
732 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
736 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
737 #define EXPERIMENTAL_INPLACESCAN
738 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
740 #define DEBUG_RExC_seen() \
741 DEBUG_OPTIMISE_MORE_r({ \
742 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
744 if (RExC_seen & REG_SEEN_ZERO_LEN) \
745 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \
747 if (RExC_seen & REG_SEEN_LOOKBEHIND) \
748 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \
750 if (RExC_seen & REG_SEEN_GPOS) \
751 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \
753 if (RExC_seen & REG_SEEN_CANY) \
754 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \
756 if (RExC_seen & REG_SEEN_RECURSE) \
757 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \
759 if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \
760 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \
762 if (RExC_seen & REG_SEEN_VERBARG) \
763 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \
765 if (RExC_seen & REG_SEEN_CUTGROUP) \
766 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \
768 if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \
769 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \
771 if (RExC_seen & REG_SEEN_UNFOLDED_MULTI) \
772 PerlIO_printf(Perl_debug_log,"REG_SEEN_UNFOLDED_MULTI "); \
774 if (RExC_seen & REG_SEEN_GOSTART) \
775 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \
777 PerlIO_printf(Perl_debug_log,"\n"); \
780 #define DEBUG_STUDYDATA(str,data,depth) \
781 DEBUG_OPTIMISE_MORE_r(if(data){ \
782 PerlIO_printf(Perl_debug_log, \
783 "%*s" str "Pos:%"IVdf"/%"IVdf \
784 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
785 (int)(depth)*2, "", \
786 (IV)((data)->pos_min), \
787 (IV)((data)->pos_delta), \
788 (UV)((data)->flags), \
789 (IV)((data)->whilem_c), \
790 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
791 is_inf ? "INF " : "" \
793 if ((data)->last_found) \
794 PerlIO_printf(Perl_debug_log, \
795 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
796 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
797 SvPVX_const((data)->last_found), \
798 (IV)((data)->last_end), \
799 (IV)((data)->last_start_min), \
800 (IV)((data)->last_start_max), \
801 ((data)->longest && \
802 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
803 SvPVX_const((data)->longest_fixed), \
804 (IV)((data)->offset_fixed), \
805 ((data)->longest && \
806 (data)->longest==&((data)->longest_float)) ? "*" : "", \
807 SvPVX_const((data)->longest_float), \
808 (IV)((data)->offset_float_min), \
809 (IV)((data)->offset_float_max) \
811 PerlIO_printf(Perl_debug_log,"\n"); \
814 /* Mark that we cannot extend a found fixed substring at this point.
815 Update the longest found anchored substring and the longest found
816 floating substrings if needed. */
819 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
820 SSize_t *minlenp, int is_inf)
822 const STRLEN l = CHR_SVLEN(data->last_found);
823 const STRLEN old_l = CHR_SVLEN(*data->longest);
824 GET_RE_DEBUG_FLAGS_DECL;
826 PERL_ARGS_ASSERT_SCAN_COMMIT;
828 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
829 SvSetMagicSV(*data->longest, data->last_found);
830 if (*data->longest == data->longest_fixed) {
831 data->offset_fixed = l ? data->last_start_min : data->pos_min;
832 if (data->flags & SF_BEFORE_EOL)
834 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
836 data->flags &= ~SF_FIX_BEFORE_EOL;
837 data->minlen_fixed=minlenp;
838 data->lookbehind_fixed=0;
840 else { /* *data->longest == data->longest_float */
841 data->offset_float_min = l ? data->last_start_min : data->pos_min;
842 data->offset_float_max = (l
843 ? data->last_start_max
844 : (data->pos_delta == SSize_t_MAX
846 : data->pos_min + data->pos_delta));
848 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
849 data->offset_float_max = SSize_t_MAX;
850 if (data->flags & SF_BEFORE_EOL)
852 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
854 data->flags &= ~SF_FL_BEFORE_EOL;
855 data->minlen_float=minlenp;
856 data->lookbehind_float=0;
859 SvCUR_set(data->last_found, 0);
861 SV * const sv = data->last_found;
862 if (SvUTF8(sv) && SvMAGICAL(sv)) {
863 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
869 data->flags &= ~SF_BEFORE_EOL;
870 DEBUG_STUDYDATA("commit: ",data,0);
873 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
874 * list that describes which code points it matches */
877 S_ssc_anything(pTHX_ regnode_ssc *ssc)
879 /* Set the SSC 'ssc' to match an empty string or any code point */
881 PERL_ARGS_ASSERT_SSC_ANYTHING;
883 assert(is_ANYOF_SYNTHETIC(ssc));
885 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
886 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
887 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
891 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
893 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
894 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
895 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
896 * in any way, so there's no point in using it */
901 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
903 assert(is_ANYOF_SYNTHETIC(ssc));
905 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
909 /* See if the list consists solely of the range 0 - Infinity */
910 invlist_iterinit(ssc->invlist);
911 ret = invlist_iternext(ssc->invlist, &start, &end)
915 invlist_iterfinish(ssc->invlist);
921 /* If e.g., both \w and \W are set, matches everything */
922 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
924 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
925 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
935 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
937 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
938 * string, any code point, or any posix class under locale */
940 PERL_ARGS_ASSERT_SSC_INIT;
942 Zero(ssc, 1, regnode_ssc);
943 set_ANYOF_SYNTHETIC(ssc);
944 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
947 /* If any portion of the regex is to operate under locale rules,
948 * initialization includes it. The reason this isn't done for all regexes
949 * is that the optimizer was written under the assumption that locale was
950 * all-or-nothing. Given the complexity and lack of documentation in the
951 * optimizer, and that there are inadequate test cases for locale, many
952 * parts of it may not work properly, it is safest to avoid locale unless
954 if (RExC_contains_locale) {
955 ANYOF_POSIXL_SETALL(ssc);
956 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
959 ANYOF_POSIXL_ZERO(ssc);
964 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
965 const regnode_ssc *ssc)
967 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
968 * to the list of code points matched, and locale posix classes; hence does
969 * not check its flags) */
974 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
976 assert(is_ANYOF_SYNTHETIC(ssc));
978 invlist_iterinit(ssc->invlist);
979 ret = invlist_iternext(ssc->invlist, &start, &end)
983 invlist_iterfinish(ssc->invlist);
989 if (RExC_contains_locale
990 && ! ((ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
991 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
992 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc)))
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002 const regnode_charclass_posixl_fold* const node)
1004 /* Returns a mortal inversion list defining which code points are matched
1005 * by 'node', which is of type ANYOF. Handles complementing the result if
1006 * appropriate. If some code points aren't knowable at this time, the
1007 * returned list must, and will, contain every code point that is a
1010 SV* invlist = sv_2mortal(_new_invlist(0));
1012 const U32 n = ARG(node);
1013 bool new_node_has_latin1 = FALSE;
1015 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017 /* Look at the data structure created by S_set_ANYOF_arg() */
1018 if (n != ANYOF_NONBITMAP_EMPTY) {
1019 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1020 AV * const av = MUTABLE_AV(SvRV(rv));
1021 SV **const ary = AvARRAY(av);
1022 assert(RExC_rxi->data->what[n] == 's');
1024 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1025 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027 else if (ary[0] && ary[0] != &PL_sv_undef) {
1029 /* Here, no compile-time swash, and there are things that won't be
1030 * known until runtime -- we have to assume it could be anything */
1031 return _add_range_to_invlist(invlist, 0, UV_MAX);
1035 /* Here no compile-time swash, and no run-time only data. Use the
1036 * node's inversion list */
1037 invlist = sv_2mortal(invlist_clone(ary[2]));
1041 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1042 * inversion list for the others, but if there are code points that should
1043 * match only conditionally on the target string being UTF-8, those are
1044 * placed in the inversion list, and not the bitmap. Since there are
1045 * circumstances under which they could match, they are included in the
1046 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1047 * here, so that when we invert below, the end result actually does include
1048 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1049 * before we add the unconditionally matched code points */
1050 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1051 _invlist_intersection_complement_2nd(invlist,
1056 /* Add in the points from the bit map */
1057 for (i = 0; i < 256; i++) {
1058 if (ANYOF_BITMAP_TEST(node, i)) {
1059 invlist = add_cp_to_invlist(invlist, i);
1060 new_node_has_latin1 = TRUE;
1064 /* If this can match all upper Latin1 code points, have to add them
1066 if (OP(node) == ANYOF_NON_UTF8_NON_ASCII_ALL) {
1067 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1070 /* Similarly for these */
1071 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1072 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1075 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1076 _invlist_invert(invlist);
1078 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1080 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1081 * locale. We can skip this if there are no 0-255 at all. */
1082 _invlist_union(invlist, PL_Latin1, &invlist);
1085 /* Similarly add the UTF-8 locale possible matches */
1086 if (ANYOF_FLAGS(node) & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(node))
1088 _invlist_union_maybe_complement_2nd(invlist,
1089 ANYOF_UTF8_LOCALE_INVLIST(node),
1090 ANYOF_FLAGS(node) & ANYOF_INVERT,
1097 /* These two functions currently do the exact same thing */
1098 #define ssc_init_zero ssc_init
1100 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1101 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1104 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1106 /* Take the flags 'and_with' and accumulate them anded into the flags for
1107 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1108 * The flags 'and_with' should not come from another SSC (otherwise the
1109 * EMPTY_STRING flag won't work) */
1111 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS;
1113 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1115 /* Use just the SSC-related flags from 'and_with' */
1116 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS);
1117 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1120 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1121 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1122 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1125 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1126 const regnode_ssc *and_with)
1128 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1129 * another SSC or a regular ANYOF class. Can create false positives. */
1134 PERL_ARGS_ASSERT_SSC_AND;
1136 assert(is_ANYOF_SYNTHETIC(ssc));
1138 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1139 * the code point inversion list and just the relevant flags */
1140 if (is_ANYOF_SYNTHETIC(and_with)) {
1141 anded_cp_list = and_with->invlist;
1142 anded_flags = ANYOF_FLAGS(and_with);
1144 /* XXX This is a kludge around what appears to be deficiencies in the
1145 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1146 * there are paths through the optimizer where it doesn't get weeded
1147 * out when it should. And if we don't make some extra provision for
1148 * it like the code just below, it doesn't get added when it should.
1149 * This solution is to add it only when AND'ing, which is here, and
1150 * only when what is being AND'ed is the pristine, original node
1151 * matching anything. Thus it is like adding it to ssc_anything() but
1152 * only when the result is to be AND'ed. Probably the same solution
1153 * could be adopted for the same problem we have with /l matching,
1154 * which is solved differently in S_ssc_init(), and that would lead to
1155 * fewer false positives than that solution has. But if this solution
1156 * creates bugs, the consequences are only that a warning isn't raised
1157 * that should be; while the consequences for having /l bugs is
1158 * incorrect matches */
1159 if (ssc_is_anything(and_with)) {
1160 anded_flags |= ANYOF_WARN_SUPER;
1164 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1165 (regnode_charclass_posixl_fold*) and_with);
1166 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1169 ANYOF_FLAGS(ssc) &= anded_flags;
1171 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1172 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1173 * 'and_with' may be inverted. When not inverted, we have the situation of
1175 * (C1 | P1) & (C2 | P2)
1176 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1177 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1178 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1179 * <= ((C1 & C2) | P1 | P2)
1180 * Alternatively, the last few steps could be:
1181 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1182 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1183 * <= (C1 | C2 | (P1 & P2))
1184 * We favor the second approach if either P1 or P2 is non-empty. This is
1185 * because these components are a barrier to doing optimizations, as what
1186 * they match cannot be known until the moment of matching as they are
1187 * dependent on the current locale, 'AND"ing them likely will reduce or
1189 * But we can do better if we know that C1,P1 are in their initial state (a
1190 * frequent occurrence), each matching everything:
1191 * (<everything>) & (C2 | P2) = C2 | P2
1192 * Similarly, if C2,P2 are in their initial state (again a frequent
1193 * occurrence), the result is a no-op
1194 * (C1 | P1) & (<everything>) = C1 | P1
1197 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1198 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1199 * <= (C1 & ~C2) | (P1 & ~P2)
1202 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1203 && ! is_ANYOF_SYNTHETIC(and_with))
1207 ssc_intersection(ssc,
1209 FALSE /* Has already been inverted */
1212 /* If either P1 or P2 is empty, the intersection will be also; can skip
1214 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1215 ANYOF_POSIXL_ZERO(ssc);
1217 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1219 /* Note that the Posix class component P from 'and_with' actually
1221 * P = Pa | Pb | ... | Pn
1222 * where each component is one posix class, such as in [\w\s].
1224 * ~P = ~(Pa | Pb | ... | Pn)
1225 * = ~Pa & ~Pb & ... & ~Pn
1226 * <= ~Pa | ~Pb | ... | ~Pn
1227 * The last is something we can easily calculate, but unfortunately
1228 * is likely to have many false positives. We could do better
1229 * in some (but certainly not all) instances if two classes in
1230 * P have known relationships. For example
1231 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1233 * :lower: & :print: = :lower:
1234 * And similarly for classes that must be disjoint. For example,
1235 * since \s and \w can have no elements in common based on rules in
1236 * the POSIX standard,
1237 * \w & ^\S = nothing
1238 * Unfortunately, some vendor locales do not meet the Posix
1239 * standard, in particular almost everything by Microsoft.
1240 * The loop below just changes e.g., \w into \W and vice versa */
1242 regnode_charclass_posixl_fold temp;
1243 int add = 1; /* To calculate the index of the complement */
1245 ANYOF_POSIXL_ZERO(&temp);
1246 for (i = 0; i < ANYOF_MAX; i++) {
1248 || ! ANYOF_POSIXL_TEST(and_with, i)
1249 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1251 if (ANYOF_POSIXL_TEST(and_with, i)) {
1252 ANYOF_POSIXL_SET(&temp, i + add);
1254 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1256 ANYOF_POSIXL_AND(&temp, ssc);
1258 } /* else ssc already has no posixes */
1259 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1260 in its initial state */
1261 else if (! is_ANYOF_SYNTHETIC(and_with)
1262 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1264 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1265 * copy it over 'ssc' */
1266 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1267 if (is_ANYOF_SYNTHETIC(and_with)) {
1268 StructCopy(and_with, ssc, regnode_ssc);
1271 ssc->invlist = anded_cp_list;
1272 ANYOF_POSIXL_ZERO(ssc);
1273 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274 ANYOF_POSIXL_OR(and_with, ssc);
1278 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1279 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1281 /* One or the other of P1, P2 is non-empty. */
1282 ANYOF_POSIXL_AND(and_with, ssc);
1283 ssc_union(ssc, anded_cp_list, FALSE);
1285 else { /* P1 = P2 = empty */
1286 ssc_intersection(ssc, anded_cp_list, FALSE);
1292 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1293 const regnode_ssc *or_with)
1295 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1296 * another SSC or a regular ANYOF class. Can create false positives if
1297 * 'or_with' is to be inverted. */
1302 PERL_ARGS_ASSERT_SSC_OR;
1304 assert(is_ANYOF_SYNTHETIC(ssc));
1306 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1307 * the code point inversion list and just the relevant flags */
1308 if (is_ANYOF_SYNTHETIC(or_with)) {
1309 ored_cp_list = or_with->invlist;
1310 ored_flags = ANYOF_FLAGS(or_with);
1313 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1314 (regnode_charclass_posixl_fold*) or_with);
1315 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1318 ANYOF_FLAGS(ssc) |= ored_flags;
1320 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1321 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1322 * 'or_with' may be inverted. When not inverted, we have the simple
1323 * situation of computing:
1324 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1325 * If P1|P2 yields a situation with both a class and its complement are
1326 * set, like having both \w and \W, this matches all code points, and we
1327 * can delete these from the P component of the ssc going forward. XXX We
1328 * might be able to delete all the P components, but I (khw) am not certain
1329 * about this, and it is better to be safe.
1332 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1333 * <= (C1 | P1) | ~C2
1334 * <= (C1 | ~C2) | P1
1335 * (which results in actually simpler code than the non-inverted case)
1338 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1339 && ! is_ANYOF_SYNTHETIC(or_with))
1341 /* We ignore P2, leaving P1 going forward */
1343 else { /* Not inverted */
1344 ANYOF_POSIXL_OR(or_with, ssc);
1345 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1347 for (i = 0; i < ANYOF_MAX; i += 2) {
1348 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1350 ssc_match_all_cp(ssc);
1351 ANYOF_POSIXL_CLEAR(ssc, i);
1352 ANYOF_POSIXL_CLEAR(ssc, i+1);
1353 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1354 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1363 FALSE /* Already has been inverted */
1367 PERL_STATIC_INLINE void
1368 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1370 PERL_ARGS_ASSERT_SSC_UNION;
1372 assert(is_ANYOF_SYNTHETIC(ssc));
1374 _invlist_union_maybe_complement_2nd(ssc->invlist,
1380 PERL_STATIC_INLINE void
1381 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1383 const bool invert2nd)
1385 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1387 assert(is_ANYOF_SYNTHETIC(ssc));
1389 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1395 PERL_STATIC_INLINE void
1396 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1398 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1400 assert(is_ANYOF_SYNTHETIC(ssc));
1402 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1405 PERL_STATIC_INLINE void
1406 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1408 /* AND just the single code point 'cp' into the SSC 'ssc' */
1410 SV* cp_list = _new_invlist(2);
1412 PERL_ARGS_ASSERT_SSC_CP_AND;
1414 assert(is_ANYOF_SYNTHETIC(ssc));
1416 cp_list = add_cp_to_invlist(cp_list, cp);
1417 ssc_intersection(ssc, cp_list,
1418 FALSE /* Not inverted */
1420 SvREFCNT_dec_NN(cp_list);
1423 PERL_STATIC_INLINE void
1424 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1426 /* Set the SSC 'ssc' to not match any locale things */
1428 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1430 assert(is_ANYOF_SYNTHETIC(ssc));
1432 ANYOF_POSIXL_ZERO(ssc);
1433 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1437 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1439 /* The inversion list in the SSC is marked mortal; now we need a more
1440 * permanent copy, which is stored the same way that is done in a regular
1441 * ANYOF node, with the first 256 code points in a bit map */
1443 SV* invlist = invlist_clone(ssc->invlist);
1445 PERL_ARGS_ASSERT_SSC_FINALIZE;
1447 assert(is_ANYOF_SYNTHETIC(ssc));
1449 /* The code in this file assumes that all but these flags aren't relevant
1450 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1451 * time we reach here */
1452 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1454 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1456 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1458 /* The code points that could match under /li are already incorporated into
1459 * the inversion list and bit map */
1460 ANYOF_FLAGS(ssc) &= ~ANYOF_LOC_FOLD;
1462 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1465 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1466 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1467 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1468 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1469 ? (TRIE_LIST_CUR( idx ) - 1) \
1475 dump_trie(trie,widecharmap,revcharmap)
1476 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1477 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1479 These routines dump out a trie in a somewhat readable format.
1480 The _interim_ variants are used for debugging the interim
1481 tables that are used to generate the final compressed
1482 representation which is what dump_trie expects.
1484 Part of the reason for their existence is to provide a form
1485 of documentation as to how the different representations function.
1490 Dumps the final compressed table form of the trie to Perl_debug_log.
1491 Used for debugging make_trie().
1495 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1496 AV *revcharmap, U32 depth)
1499 SV *sv=sv_newmortal();
1500 int colwidth= widecharmap ? 6 : 4;
1502 GET_RE_DEBUG_FLAGS_DECL;
1504 PERL_ARGS_ASSERT_DUMP_TRIE;
1506 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1507 (int)depth * 2 + 2,"",
1508 "Match","Base","Ofs" );
1510 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1511 SV ** const tmp = av_fetch( revcharmap, state, 0);
1513 PerlIO_printf( Perl_debug_log, "%*s",
1515 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1516 PL_colors[0], PL_colors[1],
1517 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1518 PERL_PV_ESCAPE_FIRSTCHAR
1523 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1524 (int)depth * 2 + 2,"");
1526 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1527 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1528 PerlIO_printf( Perl_debug_log, "\n");
1530 for( state = 1 ; state < trie->statecount ; state++ ) {
1531 const U32 base = trie->states[ state ].trans.base;
1533 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1534 (int)depth * 2 + 2,"", (UV)state);
1536 if ( trie->states[ state ].wordnum ) {
1537 PerlIO_printf( Perl_debug_log, " W%4X",
1538 trie->states[ state ].wordnum );
1540 PerlIO_printf( Perl_debug_log, "%6s", "" );
1543 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1548 while( ( base + ofs < trie->uniquecharcount ) ||
1549 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1550 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1554 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1556 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1557 if ( ( base + ofs >= trie->uniquecharcount )
1558 && ( base + ofs - trie->uniquecharcount
1560 && trie->trans[ base + ofs
1561 - trie->uniquecharcount ].check == state )
1563 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1565 (UV)trie->trans[ base + ofs
1566 - trie->uniquecharcount ].next );
1568 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1572 PerlIO_printf( Perl_debug_log, "]");
1575 PerlIO_printf( Perl_debug_log, "\n" );
1577 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1579 for (word=1; word <= trie->wordcount; word++) {
1580 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1581 (int)word, (int)(trie->wordinfo[word].prev),
1582 (int)(trie->wordinfo[word].len));
1584 PerlIO_printf(Perl_debug_log, "\n" );
1587 Dumps a fully constructed but uncompressed trie in list form.
1588 List tries normally only are used for construction when the number of
1589 possible chars (trie->uniquecharcount) is very high.
1590 Used for debugging make_trie().
1593 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1594 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1598 SV *sv=sv_newmortal();
1599 int colwidth= widecharmap ? 6 : 4;
1600 GET_RE_DEBUG_FLAGS_DECL;
1602 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1604 /* print out the table precompression. */
1605 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1606 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1607 "------:-----+-----------------\n" );
1609 for( state=1 ; state < next_alloc ; state ++ ) {
1612 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1613 (int)depth * 2 + 2,"", (UV)state );
1614 if ( ! trie->states[ state ].wordnum ) {
1615 PerlIO_printf( Perl_debug_log, "%5s| ","");
1617 PerlIO_printf( Perl_debug_log, "W%4x| ",
1618 trie->states[ state ].wordnum
1621 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1622 SV ** const tmp = av_fetch( revcharmap,
1623 TRIE_LIST_ITEM(state,charid).forid, 0);
1625 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1627 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1629 PL_colors[0], PL_colors[1],
1630 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1631 | PERL_PV_ESCAPE_FIRSTCHAR
1633 TRIE_LIST_ITEM(state,charid).forid,
1634 (UV)TRIE_LIST_ITEM(state,charid).newstate
1637 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1638 (int)((depth * 2) + 14), "");
1641 PerlIO_printf( Perl_debug_log, "\n");
1646 Dumps a fully constructed but uncompressed trie in table form.
1647 This is the normal DFA style state transition table, with a few
1648 twists to facilitate compression later.
1649 Used for debugging make_trie().
1652 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1653 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1658 SV *sv=sv_newmortal();
1659 int colwidth= widecharmap ? 6 : 4;
1660 GET_RE_DEBUG_FLAGS_DECL;
1662 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1665 print out the table precompression so that we can do a visual check
1666 that they are identical.
1669 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1671 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1672 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1674 PerlIO_printf( Perl_debug_log, "%*s",
1676 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1677 PL_colors[0], PL_colors[1],
1678 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1679 PERL_PV_ESCAPE_FIRSTCHAR
1685 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1687 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1688 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1691 PerlIO_printf( Perl_debug_log, "\n" );
1693 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1695 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1696 (int)depth * 2 + 2,"",
1697 (UV)TRIE_NODENUM( state ) );
1699 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1700 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1702 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1704 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1706 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1707 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1708 (UV)trie->trans[ state ].check );
1710 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1711 (UV)trie->trans[ state ].check,
1712 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1720 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1721 startbranch: the first branch in the whole branch sequence
1722 first : start branch of sequence of branch-exact nodes.
1723 May be the same as startbranch
1724 last : Thing following the last branch.
1725 May be the same as tail.
1726 tail : item following the branch sequence
1727 count : words in the sequence
1728 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1729 depth : indent depth
1731 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1733 A trie is an N'ary tree where the branches are determined by digital
1734 decomposition of the key. IE, at the root node you look up the 1st character and
1735 follow that branch repeat until you find the end of the branches. Nodes can be
1736 marked as "accepting" meaning they represent a complete word. Eg:
1740 would convert into the following structure. Numbers represent states, letters
1741 following numbers represent valid transitions on the letter from that state, if
1742 the number is in square brackets it represents an accepting state, otherwise it
1743 will be in parenthesis.
1745 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1749 (1) +-i->(6)-+-s->[7]
1751 +-s->(3)-+-h->(4)-+-e->[5]
1753 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1755 This shows that when matching against the string 'hers' we will begin at state 1
1756 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1757 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1758 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1759 single traverse. We store a mapping from accepting to state to which word was
1760 matched, and then when we have multiple possibilities we try to complete the
1761 rest of the regex in the order in which they occured in the alternation.
1763 The only prior NFA like behaviour that would be changed by the TRIE support is
1764 the silent ignoring of duplicate alternations which are of the form:
1766 / (DUPE|DUPE) X? (?{ ... }) Y /x
1768 Thus EVAL blocks following a trie may be called a different number of times with
1769 and without the optimisation. With the optimisations dupes will be silently
1770 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1771 the following demonstrates:
1773 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1775 which prints out 'word' three times, but
1777 'words'=~/(word|word|word)(?{ print $1 })S/
1779 which doesnt print it out at all. This is due to other optimisations kicking in.
1781 Example of what happens on a structural level:
1783 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1785 1: CURLYM[1] {1,32767}(18)
1796 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1797 and should turn into:
1799 1: CURLYM[1] {1,32767}(18)
1801 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1809 Cases where tail != last would be like /(?foo|bar)baz/:
1819 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1820 and would end up looking like:
1823 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1830 d = uvchr_to_utf8_flags(d, uv, 0);
1832 is the recommended Unicode-aware way of saying
1837 #define TRIE_STORE_REVCHAR(val) \
1840 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1841 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1842 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1843 SvCUR_set(zlopp, kapow - flrbbbbb); \
1846 av_push(revcharmap, zlopp); \
1848 char ooooff = (char)val; \
1849 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1853 /* This gets the next character from the input, folding it if not already
1855 #define TRIE_READ_CHAR STMT_START { \
1858 /* if it is UTF then it is either already folded, or does not need \
1860 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1862 else if (folder == PL_fold_latin1) { \
1863 /* This folder implies Unicode rules, which in the range expressible \
1864 * by not UTF is the lower case, with the two exceptions, one of \
1865 * which should have been taken care of before calling this */ \
1866 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1867 uvc = toLOWER_L1(*uc); \
1868 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1871 /* raw data, will be folded later if needed */ \
1879 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1880 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1881 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1882 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1884 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1885 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1886 TRIE_LIST_CUR( state )++; \
1889 #define TRIE_LIST_NEW(state) STMT_START { \
1890 Newxz( trie->states[ state ].trans.list, \
1891 4, reg_trie_trans_le ); \
1892 TRIE_LIST_CUR( state ) = 1; \
1893 TRIE_LIST_LEN( state ) = 4; \
1896 #define TRIE_HANDLE_WORD(state) STMT_START { \
1897 U16 dupe= trie->states[ state ].wordnum; \
1898 regnode * const noper_next = regnext( noper ); \
1901 /* store the word for dumping */ \
1903 if (OP(noper) != NOTHING) \
1904 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1906 tmp = newSVpvn_utf8( "", 0, UTF ); \
1907 av_push( trie_words, tmp ); \
1911 trie->wordinfo[curword].prev = 0; \
1912 trie->wordinfo[curword].len = wordlen; \
1913 trie->wordinfo[curword].accept = state; \
1915 if ( noper_next < tail ) { \
1917 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1919 trie->jump[curword] = (U16)(noper_next - convert); \
1921 jumper = noper_next; \
1923 nextbranch= regnext(cur); \
1927 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1928 /* chain, so that when the bits of chain are later */\
1929 /* linked together, the dups appear in the chain */\
1930 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1931 trie->wordinfo[dupe].prev = curword; \
1933 /* we haven't inserted this word yet. */ \
1934 trie->states[ state ].wordnum = curword; \
1939 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1940 ( ( base + charid >= ucharcount \
1941 && base + charid < ubound \
1942 && state == trie->trans[ base - ucharcount + charid ].check \
1943 && trie->trans[ base - ucharcount + charid ].next ) \
1944 ? trie->trans[ base - ucharcount + charid ].next \
1945 : ( state==1 ? special : 0 ) \
1949 #define MADE_JUMP_TRIE 2
1950 #define MADE_EXACT_TRIE 4
1953 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1954 regnode *first, regnode *last, regnode *tail,
1955 U32 word_count, U32 flags, U32 depth)
1958 /* first pass, loop through and scan words */
1959 reg_trie_data *trie;
1960 HV *widecharmap = NULL;
1961 AV *revcharmap = newAV();
1967 regnode *jumper = NULL;
1968 regnode *nextbranch = NULL;
1969 regnode *convert = NULL;
1970 U32 *prev_states; /* temp array mapping each state to previous one */
1971 /* we just use folder as a flag in utf8 */
1972 const U8 * folder = NULL;
1975 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1976 AV *trie_words = NULL;
1977 /* along with revcharmap, this only used during construction but both are
1978 * useful during debugging so we store them in the struct when debugging.
1981 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1982 STRLEN trie_charcount=0;
1984 SV *re_trie_maxbuff;
1985 GET_RE_DEBUG_FLAGS_DECL;
1987 PERL_ARGS_ASSERT_MAKE_TRIE;
1989 PERL_UNUSED_ARG(depth);
1996 case EXACTFU: folder = PL_fold_latin1; break;
1997 case EXACTF: folder = PL_fold; break;
1998 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2001 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2003 trie->startstate = 1;
2004 trie->wordcount = word_count;
2005 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2006 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2008 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2009 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2010 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2013 trie_words = newAV();
2016 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2017 if (!SvIOK(re_trie_maxbuff)) {
2018 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2020 DEBUG_TRIE_COMPILE_r({
2021 PerlIO_printf( Perl_debug_log,
2022 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2023 (int)depth * 2 + 2, "",
2024 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2025 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2028 /* Find the node we are going to overwrite */
2029 if ( first == startbranch && OP( last ) != BRANCH ) {
2030 /* whole branch chain */
2033 /* branch sub-chain */
2034 convert = NEXTOPER( first );
2037 /* -- First loop and Setup --
2039 We first traverse the branches and scan each word to determine if it
2040 contains widechars, and how many unique chars there are, this is
2041 important as we have to build a table with at least as many columns as we
2044 We use an array of integers to represent the character codes 0..255
2045 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2046 the native representation of the character value as the key and IV's for
2049 *TODO* If we keep track of how many times each character is used we can
2050 remap the columns so that the table compression later on is more
2051 efficient in terms of memory by ensuring the most common value is in the
2052 middle and the least common are on the outside. IMO this would be better
2053 than a most to least common mapping as theres a decent chance the most
2054 common letter will share a node with the least common, meaning the node
2055 will not be compressible. With a middle is most common approach the worst
2056 case is when we have the least common nodes twice.
2060 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2061 regnode *noper = NEXTOPER( cur );
2062 const U8 *uc = (U8*)STRING( noper );
2063 const U8 *e = uc + STR_LEN( noper );
2065 U32 wordlen = 0; /* required init */
2066 STRLEN minbytes = 0;
2067 STRLEN maxbytes = 0;
2068 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2071 if (OP(noper) == NOTHING) {
2072 regnode *noper_next= regnext(noper);
2073 if (noper_next != tail && OP(noper_next) == flags) {
2075 uc= (U8*)STRING(noper);
2076 e= uc + STR_LEN(noper);
2077 trie->minlen= STR_LEN(noper);
2084 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2085 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2086 regardless of encoding */
2087 if (OP( noper ) == EXACTFU_SS) {
2088 /* false positives are ok, so just set this */
2089 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2092 for ( ; uc < e ; uc += len ) {
2093 TRIE_CHARCOUNT(trie)++;
2096 /* Acummulate to the current values, the range in the number of
2097 * bytes that this character could match. The max is presumed to
2098 * be the same as the folded input (which TRIE_READ_CHAR returns),
2099 * except that when this is not in UTF-8, it could be matched
2100 * against a string which is UTF-8, and the variant characters
2101 * could be 2 bytes instead of the 1 here. Likewise, for the
2102 * minimum number of bytes when not folded. When folding, the min
2103 * is assumed to be 1 byte could fold to match the single character
2104 * here, or in the case of a multi-char fold, 1 byte can fold to
2105 * the whole sequence. 'foldlen' is used to denote whether we are
2106 * in such a sequence, skipping the min setting if so. XXX TODO
2107 * Use the exact list of what folds to each character, from
2108 * PL_utf8_foldclosures */
2110 maxbytes += UTF8SKIP(uc);
2112 /* A non-UTF-8 string could be 1 byte to match our 2 */
2113 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2119 foldlen -= UTF8SKIP(uc);
2122 foldlen = is_MULTI_CHAR_FOLD_utf8(uc);
2128 maxbytes += (UNI_IS_INVARIANT(*uc))
2139 foldlen = is_MULTI_CHAR_FOLD_latin1(uc);
2146 U8 folded= folder[ (U8) uvc ];
2147 if ( !trie->charmap[ folded ] ) {
2148 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2149 TRIE_STORE_REVCHAR( folded );
2152 if ( !trie->charmap[ uvc ] ) {
2153 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2154 TRIE_STORE_REVCHAR( uvc );
2157 /* store the codepoint in the bitmap, and its folded
2159 TRIE_BITMAP_SET(trie, uvc);
2161 /* store the folded codepoint */
2162 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2165 /* store first byte of utf8 representation of
2166 variant codepoints */
2167 if (! UVCHR_IS_INVARIANT(uvc)) {
2168 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2171 set_bit = 0; /* We've done our bit :-) */
2176 widecharmap = newHV();
2178 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2181 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2183 if ( !SvTRUE( *svpp ) ) {
2184 sv_setiv( *svpp, ++trie->uniquecharcount );
2185 TRIE_STORE_REVCHAR(uvc);
2189 if( cur == first ) {
2190 trie->minlen = minbytes;
2191 trie->maxlen = maxbytes;
2192 } else if (minbytes < trie->minlen) {
2193 trie->minlen = minbytes;
2194 } else if (maxbytes > trie->maxlen) {
2195 trie->maxlen = maxbytes;
2197 } /* end first pass */
2198 DEBUG_TRIE_COMPILE_r(
2199 PerlIO_printf( Perl_debug_log,
2200 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2201 (int)depth * 2 + 2,"",
2202 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2203 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2204 (int)trie->minlen, (int)trie->maxlen )
2208 We now know what we are dealing with in terms of unique chars and
2209 string sizes so we can calculate how much memory a naive
2210 representation using a flat table will take. If it's over a reasonable
2211 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2212 conservative but potentially much slower representation using an array
2215 At the end we convert both representations into the same compressed
2216 form that will be used in regexec.c for matching with. The latter
2217 is a form that cannot be used to construct with but has memory
2218 properties similar to the list form and access properties similar
2219 to the table form making it both suitable for fast searches and
2220 small enough that its feasable to store for the duration of a program.
2222 See the comment in the code where the compressed table is produced
2223 inplace from the flat tabe representation for an explanation of how
2224 the compression works.
2229 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2232 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2233 > SvIV(re_trie_maxbuff) )
2236 Second Pass -- Array Of Lists Representation
2238 Each state will be represented by a list of charid:state records
2239 (reg_trie_trans_le) the first such element holds the CUR and LEN
2240 points of the allocated array. (See defines above).
2242 We build the initial structure using the lists, and then convert
2243 it into the compressed table form which allows faster lookups
2244 (but cant be modified once converted).
2247 STRLEN transcount = 1;
2249 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2250 "%*sCompiling trie using list compiler\n",
2251 (int)depth * 2 + 2, ""));
2253 trie->states = (reg_trie_state *)
2254 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2255 sizeof(reg_trie_state) );
2259 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2261 regnode *noper = NEXTOPER( cur );
2262 U8 *uc = (U8*)STRING( noper );
2263 const U8 *e = uc + STR_LEN( noper );
2264 U32 state = 1; /* required init */
2265 U16 charid = 0; /* sanity init */
2266 U32 wordlen = 0; /* required init */
2268 if (OP(noper) == NOTHING) {
2269 regnode *noper_next= regnext(noper);
2270 if (noper_next != tail && OP(noper_next) == flags) {
2272 uc= (U8*)STRING(noper);
2273 e= uc + STR_LEN(noper);
2277 if (OP(noper) != NOTHING) {
2278 for ( ; uc < e ; uc += len ) {
2283 charid = trie->charmap[ uvc ];
2285 SV** const svpp = hv_fetch( widecharmap,
2292 charid=(U16)SvIV( *svpp );
2295 /* charid is now 0 if we dont know the char read, or
2296 * nonzero if we do */
2303 if ( !trie->states[ state ].trans.list ) {
2304 TRIE_LIST_NEW( state );
2307 check <= TRIE_LIST_USED( state );
2310 if ( TRIE_LIST_ITEM( state, check ).forid
2313 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2318 newstate = next_alloc++;
2319 prev_states[newstate] = state;
2320 TRIE_LIST_PUSH( state, charid, newstate );
2325 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2329 TRIE_HANDLE_WORD(state);
2331 } /* end second pass */
2333 /* next alloc is the NEXT state to be allocated */
2334 trie->statecount = next_alloc;
2335 trie->states = (reg_trie_state *)
2336 PerlMemShared_realloc( trie->states,
2338 * sizeof(reg_trie_state) );
2340 /* and now dump it out before we compress it */
2341 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2342 revcharmap, next_alloc,
2346 trie->trans = (reg_trie_trans *)
2347 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2354 for( state=1 ; state < next_alloc ; state ++ ) {
2358 DEBUG_TRIE_COMPILE_MORE_r(
2359 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2363 if (trie->states[state].trans.list) {
2364 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2368 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2369 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2370 if ( forid < minid ) {
2372 } else if ( forid > maxid ) {
2376 if ( transcount < tp + maxid - minid + 1) {
2378 trie->trans = (reg_trie_trans *)
2379 PerlMemShared_realloc( trie->trans,
2381 * sizeof(reg_trie_trans) );
2382 Zero( trie->trans + (transcount / 2),
2386 base = trie->uniquecharcount + tp - minid;
2387 if ( maxid == minid ) {
2389 for ( ; zp < tp ; zp++ ) {
2390 if ( ! trie->trans[ zp ].next ) {
2391 base = trie->uniquecharcount + zp - minid;
2392 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2394 trie->trans[ zp ].check = state;
2400 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2402 trie->trans[ tp ].check = state;
2407 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2408 const U32 tid = base
2409 - trie->uniquecharcount
2410 + TRIE_LIST_ITEM( state, idx ).forid;
2411 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2413 trie->trans[ tid ].check = state;
2415 tp += ( maxid - minid + 1 );
2417 Safefree(trie->states[ state ].trans.list);
2420 DEBUG_TRIE_COMPILE_MORE_r(
2421 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2424 trie->states[ state ].trans.base=base;
2426 trie->lasttrans = tp + 1;
2430 Second Pass -- Flat Table Representation.
2432 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2433 each. We know that we will need Charcount+1 trans at most to store
2434 the data (one row per char at worst case) So we preallocate both
2435 structures assuming worst case.
2437 We then construct the trie using only the .next slots of the entry
2440 We use the .check field of the first entry of the node temporarily
2441 to make compression both faster and easier by keeping track of how
2442 many non zero fields are in the node.
2444 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2447 There are two terms at use here: state as a TRIE_NODEIDX() which is
2448 a number representing the first entry of the node, and state as a
2449 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2450 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2451 if there are 2 entrys per node. eg:
2459 The table is internally in the right hand, idx form. However as we
2460 also have to deal with the states array which is indexed by nodenum
2461 we have to use TRIE_NODENUM() to convert.
2464 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2465 "%*sCompiling trie using table compiler\n",
2466 (int)depth * 2 + 2, ""));
2468 trie->trans = (reg_trie_trans *)
2469 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2470 * trie->uniquecharcount + 1,
2471 sizeof(reg_trie_trans) );
2472 trie->states = (reg_trie_state *)
2473 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2474 sizeof(reg_trie_state) );
2475 next_alloc = trie->uniquecharcount + 1;
2478 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2480 regnode *noper = NEXTOPER( cur );
2481 const U8 *uc = (U8*)STRING( noper );
2482 const U8 *e = uc + STR_LEN( noper );
2484 U32 state = 1; /* required init */
2486 U16 charid = 0; /* sanity init */
2487 U32 accept_state = 0; /* sanity init */
2489 U32 wordlen = 0; /* required init */
2491 if (OP(noper) == NOTHING) {
2492 regnode *noper_next= regnext(noper);
2493 if (noper_next != tail && OP(noper_next) == flags) {
2495 uc= (U8*)STRING(noper);
2496 e= uc + STR_LEN(noper);
2500 if ( OP(noper) != NOTHING ) {
2501 for ( ; uc < e ; uc += len ) {
2506 charid = trie->charmap[ uvc ];
2508 SV* const * const svpp = hv_fetch( widecharmap,
2512 charid = svpp ? (U16)SvIV(*svpp) : 0;
2516 if ( !trie->trans[ state + charid ].next ) {
2517 trie->trans[ state + charid ].next = next_alloc;
2518 trie->trans[ state ].check++;
2519 prev_states[TRIE_NODENUM(next_alloc)]
2520 = TRIE_NODENUM(state);
2521 next_alloc += trie->uniquecharcount;
2523 state = trie->trans[ state + charid ].next;
2525 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2527 /* charid is now 0 if we dont know the char read, or
2528 * nonzero if we do */
2531 accept_state = TRIE_NODENUM( state );
2532 TRIE_HANDLE_WORD(accept_state);
2534 } /* end second pass */
2536 /* and now dump it out before we compress it */
2537 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2539 next_alloc, depth+1));
2543 * Inplace compress the table.*
2545 For sparse data sets the table constructed by the trie algorithm will
2546 be mostly 0/FAIL transitions or to put it another way mostly empty.
2547 (Note that leaf nodes will not contain any transitions.)
2549 This algorithm compresses the tables by eliminating most such
2550 transitions, at the cost of a modest bit of extra work during lookup:
2552 - Each states[] entry contains a .base field which indicates the
2553 index in the state[] array wheres its transition data is stored.
2555 - If .base is 0 there are no valid transitions from that node.
2557 - If .base is nonzero then charid is added to it to find an entry in
2560 -If trans[states[state].base+charid].check!=state then the
2561 transition is taken to be a 0/Fail transition. Thus if there are fail
2562 transitions at the front of the node then the .base offset will point
2563 somewhere inside the previous nodes data (or maybe even into a node
2564 even earlier), but the .check field determines if the transition is
2568 The following process inplace converts the table to the compressed
2569 table: We first do not compress the root node 1,and mark all its
2570 .check pointers as 1 and set its .base pointer as 1 as well. This
2571 allows us to do a DFA construction from the compressed table later,
2572 and ensures that any .base pointers we calculate later are greater
2575 - We set 'pos' to indicate the first entry of the second node.
2577 - We then iterate over the columns of the node, finding the first and
2578 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2579 and set the .check pointers accordingly, and advance pos
2580 appropriately and repreat for the next node. Note that when we copy
2581 the next pointers we have to convert them from the original
2582 NODEIDX form to NODENUM form as the former is not valid post
2585 - If a node has no transitions used we mark its base as 0 and do not
2586 advance the pos pointer.
2588 - If a node only has one transition we use a second pointer into the
2589 structure to fill in allocated fail transitions from other states.
2590 This pointer is independent of the main pointer and scans forward
2591 looking for null transitions that are allocated to a state. When it
2592 finds one it writes the single transition into the "hole". If the
2593 pointer doesnt find one the single transition is appended as normal.
2595 - Once compressed we can Renew/realloc the structures to release the
2598 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2599 specifically Fig 3.47 and the associated pseudocode.
2603 const U32 laststate = TRIE_NODENUM( next_alloc );
2606 trie->statecount = laststate;
2608 for ( state = 1 ; state < laststate ; state++ ) {
2610 const U32 stateidx = TRIE_NODEIDX( state );
2611 const U32 o_used = trie->trans[ stateidx ].check;
2612 U32 used = trie->trans[ stateidx ].check;
2613 trie->trans[ stateidx ].check = 0;
2616 used && charid < trie->uniquecharcount;
2619 if ( flag || trie->trans[ stateidx + charid ].next ) {
2620 if ( trie->trans[ stateidx + charid ].next ) {
2622 for ( ; zp < pos ; zp++ ) {
2623 if ( ! trie->trans[ zp ].next ) {
2627 trie->states[ state ].trans.base
2629 + trie->uniquecharcount
2631 trie->trans[ zp ].next
2632 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2634 trie->trans[ zp ].check = state;
2635 if ( ++zp > pos ) pos = zp;
2642 trie->states[ state ].trans.base
2643 = pos + trie->uniquecharcount - charid ;
2645 trie->trans[ pos ].next
2646 = SAFE_TRIE_NODENUM(
2647 trie->trans[ stateidx + charid ].next );
2648 trie->trans[ pos ].check = state;
2653 trie->lasttrans = pos + 1;
2654 trie->states = (reg_trie_state *)
2655 PerlMemShared_realloc( trie->states, laststate
2656 * sizeof(reg_trie_state) );
2657 DEBUG_TRIE_COMPILE_MORE_r(
2658 PerlIO_printf( Perl_debug_log,
2659 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2660 (int)depth * 2 + 2,"",
2661 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2665 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2668 } /* end table compress */
2670 DEBUG_TRIE_COMPILE_MORE_r(
2671 PerlIO_printf(Perl_debug_log,
2672 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2673 (int)depth * 2 + 2, "",
2674 (UV)trie->statecount,
2675 (UV)trie->lasttrans)
2677 /* resize the trans array to remove unused space */
2678 trie->trans = (reg_trie_trans *)
2679 PerlMemShared_realloc( trie->trans, trie->lasttrans
2680 * sizeof(reg_trie_trans) );
2682 { /* Modify the program and insert the new TRIE node */
2683 U8 nodetype =(U8)(flags & 0xFF);
2687 regnode *optimize = NULL;
2688 #ifdef RE_TRACK_PATTERN_OFFSETS
2691 U32 mjd_nodelen = 0;
2692 #endif /* RE_TRACK_PATTERN_OFFSETS */
2693 #endif /* DEBUGGING */
2695 This means we convert either the first branch or the first Exact,
2696 depending on whether the thing following (in 'last') is a branch
2697 or not and whther first is the startbranch (ie is it a sub part of
2698 the alternation or is it the whole thing.)
2699 Assuming its a sub part we convert the EXACT otherwise we convert
2700 the whole branch sequence, including the first.
2702 /* Find the node we are going to overwrite */
2703 if ( first != startbranch || OP( last ) == BRANCH ) {
2704 /* branch sub-chain */
2705 NEXT_OFF( first ) = (U16)(last - first);
2706 #ifdef RE_TRACK_PATTERN_OFFSETS
2708 mjd_offset= Node_Offset((convert));
2709 mjd_nodelen= Node_Length((convert));
2712 /* whole branch chain */
2714 #ifdef RE_TRACK_PATTERN_OFFSETS
2717 const regnode *nop = NEXTOPER( convert );
2718 mjd_offset= Node_Offset((nop));
2719 mjd_nodelen= Node_Length((nop));
2723 PerlIO_printf(Perl_debug_log,
2724 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2725 (int)depth * 2 + 2, "",
2726 (UV)mjd_offset, (UV)mjd_nodelen)
2729 /* But first we check to see if there is a common prefix we can
2730 split out as an EXACT and put in front of the TRIE node. */
2731 trie->startstate= 1;
2732 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2734 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2738 const U32 base = trie->states[ state ].trans.base;
2740 if ( trie->states[state].wordnum )
2743 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2744 if ( ( base + ofs >= trie->uniquecharcount ) &&
2745 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2746 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2748 if ( ++count > 1 ) {
2749 SV **tmp = av_fetch( revcharmap, ofs, 0);
2750 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2751 if ( state == 1 ) break;
2753 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2755 PerlIO_printf(Perl_debug_log,
2756 "%*sNew Start State=%"UVuf" Class: [",
2757 (int)depth * 2 + 2, "",
2760 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2761 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2763 TRIE_BITMAP_SET(trie,*ch);
2765 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2767 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2771 TRIE_BITMAP_SET(trie,*ch);
2773 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2774 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2780 SV **tmp = av_fetch( revcharmap, idx, 0);
2782 char *ch = SvPV( *tmp, len );
2784 SV *sv=sv_newmortal();
2785 PerlIO_printf( Perl_debug_log,
2786 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2787 (int)depth * 2 + 2, "",
2789 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2790 PL_colors[0], PL_colors[1],
2791 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2792 PERL_PV_ESCAPE_FIRSTCHAR
2797 OP( convert ) = nodetype;
2798 str=STRING(convert);
2801 STR_LEN(convert) += len;
2807 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2812 trie->prefixlen = (state-1);
2814 regnode *n = convert+NODE_SZ_STR(convert);
2815 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2816 trie->startstate = state;
2817 trie->minlen -= (state - 1);
2818 trie->maxlen -= (state - 1);
2820 /* At least the UNICOS C compiler choked on this
2821 * being argument to DEBUG_r(), so let's just have
2824 #ifdef PERL_EXT_RE_BUILD
2830 regnode *fix = convert;
2831 U32 word = trie->wordcount;
2833 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2834 while( ++fix < n ) {
2835 Set_Node_Offset_Length(fix, 0, 0);
2838 SV ** const tmp = av_fetch( trie_words, word, 0 );
2840 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2841 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2843 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2851 NEXT_OFF(convert) = (U16)(tail - convert);
2852 DEBUG_r(optimize= n);
2858 if ( trie->maxlen ) {
2859 NEXT_OFF( convert ) = (U16)(tail - convert);
2860 ARG_SET( convert, data_slot );
2861 /* Store the offset to the first unabsorbed branch in
2862 jump[0], which is otherwise unused by the jump logic.
2863 We use this when dumping a trie and during optimisation. */
2865 trie->jump[0] = (U16)(nextbranch - convert);
2867 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2868 * and there is a bitmap
2869 * and the first "jump target" node we found leaves enough room
2870 * then convert the TRIE node into a TRIEC node, with the bitmap
2871 * embedded inline in the opcode - this is hypothetically faster.
2873 if ( !trie->states[trie->startstate].wordnum
2875 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2877 OP( convert ) = TRIEC;
2878 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2879 PerlMemShared_free(trie->bitmap);
2882 OP( convert ) = TRIE;
2884 /* store the type in the flags */
2885 convert->flags = nodetype;
2889 + regarglen[ OP( convert ) ];
2891 /* XXX We really should free up the resource in trie now,
2892 as we won't use them - (which resources?) dmq */
2894 /* needed for dumping*/
2895 DEBUG_r(if (optimize) {
2896 regnode *opt = convert;
2898 while ( ++opt < optimize) {
2899 Set_Node_Offset_Length(opt,0,0);
2902 Try to clean up some of the debris left after the
2905 while( optimize < jumper ) {
2906 mjd_nodelen += Node_Length((optimize));
2907 OP( optimize ) = OPTIMIZED;
2908 Set_Node_Offset_Length(optimize,0,0);
2911 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2913 } /* end node insert */
2915 /* Finish populating the prev field of the wordinfo array. Walk back
2916 * from each accept state until we find another accept state, and if
2917 * so, point the first word's .prev field at the second word. If the
2918 * second already has a .prev field set, stop now. This will be the
2919 * case either if we've already processed that word's accept state,
2920 * or that state had multiple words, and the overspill words were
2921 * already linked up earlier.
2928 for (word=1; word <= trie->wordcount; word++) {
2930 if (trie->wordinfo[word].prev)
2932 state = trie->wordinfo[word].accept;
2934 state = prev_states[state];
2937 prev = trie->states[state].wordnum;
2941 trie->wordinfo[word].prev = prev;
2943 Safefree(prev_states);
2947 /* and now dump out the compressed format */
2948 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2950 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2952 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2953 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2955 SvREFCNT_dec_NN(revcharmap);
2959 : trie->startstate>1
2965 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2967 /* The Trie is constructed and compressed now so we can build a fail array if
2970 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2972 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2976 We find the fail state for each state in the trie, this state is the longest
2977 proper suffix of the current state's 'word' that is also a proper prefix of
2978 another word in our trie. State 1 represents the word '' and is thus the
2979 default fail state. This allows the DFA not to have to restart after its
2980 tried and failed a word at a given point, it simply continues as though it
2981 had been matching the other word in the first place.
2983 'abcdgu'=~/abcdefg|cdgu/
2984 When we get to 'd' we are still matching the first word, we would encounter
2985 'g' which would fail, which would bring us to the state representing 'd' in
2986 the second word where we would try 'g' and succeed, proceeding to match
2989 /* add a fail transition */
2990 const U32 trie_offset = ARG(source);
2991 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2993 const U32 ucharcount = trie->uniquecharcount;
2994 const U32 numstates = trie->statecount;
2995 const U32 ubound = trie->lasttrans + ucharcount;
2999 U32 base = trie->states[ 1 ].trans.base;
3002 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3003 GET_RE_DEBUG_FLAGS_DECL;
3005 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3007 PERL_UNUSED_ARG(depth);
3011 ARG_SET( stclass, data_slot );
3012 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3013 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3014 aho->trie=trie_offset;
3015 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3016 Copy( trie->states, aho->states, numstates, reg_trie_state );
3017 Newxz( q, numstates, U32);
3018 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3021 /* initialize fail[0..1] to be 1 so that we always have
3022 a valid final fail state */
3023 fail[ 0 ] = fail[ 1 ] = 1;
3025 for ( charid = 0; charid < ucharcount ; charid++ ) {
3026 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3028 q[ q_write ] = newstate;
3029 /* set to point at the root */
3030 fail[ q[ q_write++ ] ]=1;
3033 while ( q_read < q_write) {
3034 const U32 cur = q[ q_read++ % numstates ];
3035 base = trie->states[ cur ].trans.base;
3037 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3038 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3040 U32 fail_state = cur;
3043 fail_state = fail[ fail_state ];
3044 fail_base = aho->states[ fail_state ].trans.base;
3045 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3047 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3048 fail[ ch_state ] = fail_state;
3049 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3051 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3053 q[ q_write++ % numstates] = ch_state;
3057 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3058 when we fail in state 1, this allows us to use the
3059 charclass scan to find a valid start char. This is based on the principle
3060 that theres a good chance the string being searched contains lots of stuff
3061 that cant be a start char.
3063 fail[ 0 ] = fail[ 1 ] = 0;
3064 DEBUG_TRIE_COMPILE_r({
3065 PerlIO_printf(Perl_debug_log,
3066 "%*sStclass Failtable (%"UVuf" states): 0",
3067 (int)(depth * 2), "", (UV)numstates
3069 for( q_read=1; q_read<numstates; q_read++ ) {
3070 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3072 PerlIO_printf(Perl_debug_log, "\n");
3075 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
3079 #define DEBUG_PEEP(str,scan,depth) \
3080 DEBUG_OPTIMISE_r({if (scan){ \
3081 SV * const mysv=sv_newmortal(); \
3082 regnode *Next = regnext(scan); \
3083 regprop(RExC_rx, mysv, scan); \
3084 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3085 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3086 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3090 /* The below joins as many adjacent EXACTish nodes as possible into a single
3091 * one. The regop may be changed if the node(s) contain certain sequences that
3092 * require special handling. The joining is only done if:
3093 * 1) there is room in the current conglomerated node to entirely contain the
3095 * 2) they are the exact same node type
3097 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3098 * these get optimized out
3100 * If a node is to match under /i (folded), the number of characters it matches
3101 * can be different than its character length if it contains a multi-character
3102 * fold. *min_subtract is set to the total delta number of characters of the
3105 * And *unfolded_multi_char is set to indicate whether or not the node contains
3106 * an unfolded multi-char fold. This happens when whether the fold is valid or
3107 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3108 * SMALL LETTER SHARP S, as only if the target string being matched against
3109 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3110 * folding rules depend on the locale in force at runtime. (Multi-char folds
3111 * whose components are all above the Latin1 range are not run-time locale
3112 * dependent, and have already been folded by the time this function is
3115 * This is as good a place as any to discuss the design of handling these
3116 * multi-character fold sequences. It's been wrong in Perl for a very long
3117 * time. There are three code points in Unicode whose multi-character folds
3118 * were long ago discovered to mess things up. The previous designs for
3119 * dealing with these involved assigning a special node for them. This
3120 * approach doesn't always work, as evidenced by this example:
3121 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3122 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3123 * would match just the \xDF, it won't be able to handle the case where a
3124 * successful match would have to cross the node's boundary. The new approach
3125 * that hopefully generally solves the problem generates an EXACTFU_SS node
3126 * that is "sss" in this case.
3128 * It turns out that there are problems with all multi-character folds, and not
3129 * just these three. Now the code is general, for all such cases. The
3130 * approach taken is:
3131 * 1) This routine examines each EXACTFish node that could contain multi-
3132 * character folded sequences. Since a single character can fold into
3133 * such a sequence, the minimum match length for this node is less than
3134 * the number of characters in the node. This routine returns in
3135 * *min_subtract how many characters to subtract from the the actual
3136 * length of the string to get a real minimum match length; it is 0 if
3137 * there are no multi-char foldeds. This delta is used by the caller to
3138 * adjust the min length of the match, and the delta between min and max,
3139 * so that the optimizer doesn't reject these possibilities based on size
3141 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3142 * is used for an EXACTFU node that contains at least one "ss" sequence in
3143 * it. For non-UTF-8 patterns and strings, this is the only case where
3144 * there is a possible fold length change. That means that a regular
3145 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3146 * with length changes, and so can be processed faster. regexec.c takes
3147 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3148 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3149 * known until runtime). This saves effort in regex matching. However,
3150 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3151 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3152 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3153 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3154 * possibilities for the non-UTF8 patterns are quite simple, except for
3155 * the sharp s. All the ones that don't involve a UTF-8 target string are
3156 * members of a fold-pair, and arrays are set up for all of them so that
3157 * the other member of the pair can be found quickly. Code elsewhere in
3158 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3159 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3160 * described in the next item.
3161 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3162 * validity of the fold won't be known until runtime, and so must remain
3163 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3164 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3165 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3166 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3167 * The reason this is a problem is that the optimizer part of regexec.c
3168 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3169 * that a character in the pattern corresponds to at most a single
3170 * character in the target string. (And I do mean character, and not byte
3171 * here, unlike other parts of the documentation that have never been
3172 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3173 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3174 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3175 * nodes, violate the assumption, and they are the only instances where it
3176 * is violated. I'm reluctant to try to change the assumption, as the
3177 * code involved is impenetrable to me (khw), so instead the code here
3178 * punts. This routine examines EXACTFL nodes, and (when the pattern
3179 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3180 * boolean indicating whether or not the node contains such a fold. When
3181 * it is true, the caller sets a flag that later causes the optimizer in
3182 * this file to not set values for the floating and fixed string lengths,
3183 * and thus avoids the optimizer code in regexec.c that makes the invalid
3184 * assumption. Thus, there is no optimization based on string lengths for
3185 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3186 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3187 * assumption is wrong only in these cases is that all other non-UTF-8
3188 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3189 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3190 * EXACTF nodes because we don't know at compile time if it actually
3191 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3192 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3193 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3194 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3195 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3196 * string would require the pattern to be forced into UTF-8, the overhead
3197 * of which we want to avoid. Similarly the unfolded multi-char folds in
3198 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3201 * Similarly, the code that generates tries doesn't currently handle
3202 * not-already-folded multi-char folds, and it looks like a pain to change
3203 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3204 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3205 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3206 * using /iaa matching will be doing so almost entirely with ASCII
3207 * strings, so this should rarely be encountered in practice */
3209 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3210 if (PL_regkind[OP(scan)] == EXACT) \
3211 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3214 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3215 UV *min_subtract, bool *unfolded_multi_char,
3216 U32 flags,regnode *val, U32 depth)
3218 /* Merge several consecutive EXACTish nodes into one. */
3219 regnode *n = regnext(scan);
3221 regnode *next = scan + NODE_SZ_STR(scan);
3225 regnode *stop = scan;
3226 GET_RE_DEBUG_FLAGS_DECL;
3228 PERL_UNUSED_ARG(depth);
3231 PERL_ARGS_ASSERT_JOIN_EXACT;
3232 #ifndef EXPERIMENTAL_INPLACESCAN
3233 PERL_UNUSED_ARG(flags);
3234 PERL_UNUSED_ARG(val);
3236 DEBUG_PEEP("join",scan,depth);
3238 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3239 * EXACT ones that are mergeable to the current one. */
3241 && (PL_regkind[OP(n)] == NOTHING
3242 || (stringok && OP(n) == OP(scan)))
3244 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3247 if (OP(n) == TAIL || n > next)
3249 if (PL_regkind[OP(n)] == NOTHING) {
3250 DEBUG_PEEP("skip:",n,depth);
3251 NEXT_OFF(scan) += NEXT_OFF(n);
3252 next = n + NODE_STEP_REGNODE;
3259 else if (stringok) {
3260 const unsigned int oldl = STR_LEN(scan);
3261 regnode * const nnext = regnext(n);
3263 /* XXX I (khw) kind of doubt that this works on platforms (should
3264 * Perl ever run on one) where U8_MAX is above 255 because of lots
3265 * of other assumptions */
3266 /* Don't join if the sum can't fit into a single node */
3267 if (oldl + STR_LEN(n) > U8_MAX)
3270 DEBUG_PEEP("merg",n,depth);
3273 NEXT_OFF(scan) += NEXT_OFF(n);
3274 STR_LEN(scan) += STR_LEN(n);
3275 next = n + NODE_SZ_STR(n);
3276 /* Now we can overwrite *n : */
3277 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3285 #ifdef EXPERIMENTAL_INPLACESCAN
3286 if (flags && !NEXT_OFF(n)) {
3287 DEBUG_PEEP("atch", val, depth);
3288 if (reg_off_by_arg[OP(n)]) {
3289 ARG_SET(n, val - n);
3292 NEXT_OFF(n) = val - n;
3300 *unfolded_multi_char = FALSE;
3302 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3303 * can now analyze for sequences of problematic code points. (Prior to
3304 * this final joining, sequences could have been split over boundaries, and
3305 * hence missed). The sequences only happen in folding, hence for any
3306 * non-EXACT EXACTish node */
3307 if (OP(scan) != EXACT) {
3308 U8* s0 = (U8*) STRING(scan);
3310 U8* s_end = s0 + STR_LEN(scan);
3312 int total_count_delta = 0; /* Total delta number of characters that
3313 multi-char folds expand to */
3315 /* One pass is made over the node's string looking for all the
3316 * possibilities. To avoid some tests in the loop, there are two main
3317 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3322 if (OP(scan) == EXACTFL) {
3325 /* An EXACTFL node would already have been changed to another
3326 * node type unless there is at least one character in it that
3327 * is problematic; likely a character whose fold definition
3328 * won't be known until runtime, and so has yet to be folded.
3329 * For all but the UTF-8 locale, folds are 1-1 in length, but
3330 * to handle the UTF-8 case, we need to create a temporary
3331 * folded copy using UTF-8 locale rules in order to analyze it.
3332 * This is because our macros that look to see if a sequence is
3333 * a multi-char fold assume everything is folded (otherwise the
3334 * tests in those macros would be too complicated and slow).
3335 * Note that here, the non-problematic folds will have already
3336 * been done, so we can just copy such characters. We actually
3337 * don't completely fold the EXACTFL string. We skip the
3338 * unfolded multi-char folds, as that would just create work
3339 * below to figure out the size they already are */
3341 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3344 STRLEN s_len = UTF8SKIP(s);
3345 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3346 Copy(s, d, s_len, U8);
3349 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3350 *unfolded_multi_char = TRUE;
3351 Copy(s, d, s_len, U8);
3354 else if (isASCII(*s)) {
3355 *(d++) = toFOLD(*s);
3359 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3365 /* Point the remainder of the routine to look at our temporary
3369 } /* End of creating folded copy of EXACTFL string */
3371 /* Examine the string for a multi-character fold sequence. UTF-8
3372 * patterns have all characters pre-folded by the time this code is
3374 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3375 length sequence we are looking for is 2 */
3377 int count = 0; /* How many characters in a multi-char fold */
3378 int len = is_MULTI_CHAR_FOLD_utf8(s);
3379 if (! len) { /* Not a multi-char fold: get next char */
3384 /* Nodes with 'ss' require special handling, except for
3385 * EXACTFA-ish for which there is no multi-char fold to this */
3386 if (len == 2 && *s == 's' && *(s+1) == 's'
3387 && OP(scan) != EXACTFA
3388 && OP(scan) != EXACTFA_NO_TRIE)
3391 if (OP(scan) != EXACTFL) {
3392 OP(scan) = EXACTFU_SS;
3396 else { /* Here is a generic multi-char fold. */
3397 U8* multi_end = s + len;
3399 /* Count how many characters in it. In the case of /aa, no
3400 * folds which contain ASCII code points are allowed, so
3401 * check for those, and skip if found. */
3402 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3403 count = utf8_length(s, multi_end);
3407 while (s < multi_end) {
3410 goto next_iteration;
3420 /* The delta is how long the sequence is minus 1 (1 is how long
3421 * the character that folds to the sequence is) */
3422 total_count_delta += count - 1;
3426 /* We created a temporary folded copy of the string in EXACTFL
3427 * nodes. Therefore we need to be sure it doesn't go below zero,
3428 * as the real string could be shorter */
3429 if (OP(scan) == EXACTFL) {
3430 int total_chars = utf8_length((U8*) STRING(scan),
3431 (U8*) STRING(scan) + STR_LEN(scan));
3432 if (total_count_delta > total_chars) {
3433 total_count_delta = total_chars;
3437 *min_subtract += total_count_delta;
3440 else if (OP(scan) == EXACTFA) {
3442 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3443 * fold to the ASCII range (and there are no existing ones in the
3444 * upper latin1 range). But, as outlined in the comments preceding
3445 * this function, we need to flag any occurrences of the sharp s.
3446 * This character forbids trie formation (because of added
3449 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3450 OP(scan) = EXACTFA_NO_TRIE;
3451 *unfolded_multi_char = TRUE;
3460 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3461 * folds that are all Latin1. As explained in the comments
3462 * preceding this function, we look also for the sharp s in EXACTF
3463 * and EXACTFL nodes; it can be in the final position. Otherwise
3464 * we can stop looking 1 byte earlier because have to find at least
3465 * two characters for a multi-fold */
3466 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3471 int len = is_MULTI_CHAR_FOLD_latin1(s);
3472 if (! len) { /* Not a multi-char fold. */
3473 if (*s == LATIN_SMALL_LETTER_SHARP_S
3474 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3476 *unfolded_multi_char = TRUE;
3483 && isARG2_lower_or_UPPER_ARG1('s', *s)
3484 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3487 /* EXACTF nodes need to know that the minimum length
3488 * changed so that a sharp s in the string can match this
3489 * ss in the pattern, but they remain EXACTF nodes, as they
3490 * won't match this unless the target string is is UTF-8,
3491 * which we don't know until runtime. EXACTFL nodes can't
3492 * transform into EXACTFU nodes */
3493 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3494 OP(scan) = EXACTFU_SS;
3498 *min_subtract += len - 1;
3505 /* Allow dumping but overwriting the collection of skipped
3506 * ops and/or strings with fake optimized ops */
3507 n = scan + NODE_SZ_STR(scan);
3515 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3519 /* REx optimizer. Converts nodes into quicker variants "in place".
3520 Finds fixed substrings. */
3522 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3523 to the position after last scanned or to NULL. */
3525 #define INIT_AND_WITHP \
3526 assert(!and_withp); \
3527 Newx(and_withp,1, regnode_ssc); \
3528 SAVEFREEPV(and_withp)
3530 /* this is a chain of data about sub patterns we are processing that
3531 need to be handled separately/specially in study_chunk. Its so
3532 we can simulate recursion without losing state. */
3534 typedef struct scan_frame {
3535 regnode *last; /* last node to process in this frame */
3536 regnode *next; /* next node to process when last is reached */
3537 struct scan_frame *prev; /*previous frame*/
3538 U32 prev_recursed_depth;
3539 I32 stop; /* what stopparen do we use */
3543 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3546 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3547 SSize_t *minlenp, SSize_t *deltap,
3552 regnode_ssc *and_withp,
3553 U32 flags, U32 depth)
3554 /* scanp: Start here (read-write). */
3555 /* deltap: Write maxlen-minlen here. */
3556 /* last: Stop before this one. */
3557 /* data: string data about the pattern */
3558 /* stopparen: treat close N as END */
3559 /* recursed: which subroutines have we recursed into */
3560 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3563 /* There must be at least this number of characters to match */
3566 regnode *scan = *scanp, *next;
3568 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3569 int is_inf_internal = 0; /* The studied chunk is infinite */
3570 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3571 scan_data_t data_fake;
3572 SV *re_trie_maxbuff = NULL;
3573 regnode *first_non_open = scan;
3574 SSize_t stopmin = SSize_t_MAX;
3575 scan_frame *frame = NULL;
3576 GET_RE_DEBUG_FLAGS_DECL;
3578 PERL_ARGS_ASSERT_STUDY_CHUNK;
3581 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3584 while (first_non_open && OP(first_non_open) == OPEN)
3585 first_non_open=regnext(first_non_open);
3590 while ( scan && OP(scan) != END && scan < last ){
3591 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3592 node length to get a real minimum (because
3593 the folded version may be shorter) */
3594 bool unfolded_multi_char = FALSE;
3595 /* Peephole optimizer: */
3596 DEBUG_OPTIMISE_MORE_r(
3598 PerlIO_printf(Perl_debug_log,
3599 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3600 ((int) depth*2), "", (long)stopparen,
3601 (unsigned long)depth, (unsigned long)recursed_depth);
3602 if (recursed_depth) {
3605 for ( j = 0 ; j < recursed_depth ; j++ ) {
3606 PerlIO_printf(Perl_debug_log,"[");
3607 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3608 PerlIO_printf(Perl_debug_log,"%d",
3609 PAREN_TEST(RExC_study_chunk_recursed +
3610 (j * RExC_study_chunk_recursed_bytes), i)
3613 PerlIO_printf(Perl_debug_log,"]");
3616 PerlIO_printf(Perl_debug_log,"\n");
3619 DEBUG_STUDYDATA("Peep:", data, depth);
3620 DEBUG_PEEP("Peep", scan, depth);
3623 /* Its not clear to khw or hv why this is done here, and not in the
3624 * clauses that deal with EXACT nodes. khw's guess is that it's
3625 * because of a previous design */
3626 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3628 /* Follow the next-chain of the current node and optimize
3629 away all the NOTHINGs from it. */
3630 if (OP(scan) != CURLYX) {
3631 const int max = (reg_off_by_arg[OP(scan)]
3633 /* I32 may be smaller than U16 on CRAYs! */
3634 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3635 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3639 /* Skip NOTHING and LONGJMP. */
3640 while ((n = regnext(n))
3641 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3642 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3643 && off + noff < max)
3645 if (reg_off_by_arg[OP(scan)])
3648 NEXT_OFF(scan) = off;
3653 /* The principal pseudo-switch. Cannot be a switch, since we
3654 look into several different things. */
3655 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3656 || OP(scan) == IFTHEN) {
3657 next = regnext(scan);
3659 /* demq: the op(next)==code check is to see if we have
3660 * "branch-branch" AFAICT */
3662 if (OP(next) == code || code == IFTHEN) {
3663 /* NOTE - There is similar code to this block below for
3664 * handling TRIE nodes on a re-study. If you change stuff here
3665 * check there too. */
3666 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3668 regnode * const startbranch=scan;
3670 if (flags & SCF_DO_SUBSTR)
3671 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge
3674 if (flags & SCF_DO_STCLASS)
3675 ssc_init_zero(pRExC_state, &accum);
3677 while (OP(scan) == code) {
3678 SSize_t deltanext, minnext, fake;
3680 regnode_ssc this_class;
3683 data_fake.flags = 0;
3685 data_fake.whilem_c = data->whilem_c;
3686 data_fake.last_closep = data->last_closep;
3689 data_fake.last_closep = &fake;
3691 data_fake.pos_delta = delta;
3692 next = regnext(scan);
3693 scan = NEXTOPER(scan);
3695 scan = NEXTOPER(scan);
3696 if (flags & SCF_DO_STCLASS) {
3697 ssc_init(pRExC_state, &this_class);
3698 data_fake.start_class = &this_class;
3699 f = SCF_DO_STCLASS_AND;
3701 if (flags & SCF_WHILEM_VISITED_POS)
3702 f |= SCF_WHILEM_VISITED_POS;
3704 /* we suppose the run is continuous, last=next...*/
3705 minnext = study_chunk(pRExC_state, &scan, minlenp,
3706 &deltanext, next, &data_fake, stopparen,
3707 recursed_depth, NULL, f,depth+1);
3710 if (deltanext == SSize_t_MAX) {
3711 is_inf = is_inf_internal = 1;
3713 } else if (max1 < minnext + deltanext)
3714 max1 = minnext + deltanext;
3716 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3718 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3719 if ( stopmin > minnext)
3720 stopmin = min + min1;
3721 flags &= ~SCF_DO_SUBSTR;
3723 data->flags |= SCF_SEEN_ACCEPT;
3726 if (data_fake.flags & SF_HAS_EVAL)
3727 data->flags |= SF_HAS_EVAL;
3728 data->whilem_c = data_fake.whilem_c;
3730 if (flags & SCF_DO_STCLASS)
3731 ssc_or(pRExC_state, &accum, &this_class);
3733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3735 if (flags & SCF_DO_SUBSTR) {
3736 data->pos_min += min1;
3737 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3738 data->pos_delta = SSize_t_MAX;
3740 data->pos_delta += max1 - min1;
3741 if (max1 != min1 || is_inf)
3742 data->longest = &(data->longest_float);
3745 if (delta == SSize_t_MAX
3746 || SSize_t_MAX - delta - (max1 - min1) < 0)
3747 delta = SSize_t_MAX;
3749 delta += max1 - min1;
3750 if (flags & SCF_DO_STCLASS_OR) {
3751 ssc_or(pRExC_state, data->start_class, &accum);
3753 ssc_and(pRExC_state, data->start_class, and_withp);
3754 flags &= ~SCF_DO_STCLASS;
3757 else if (flags & SCF_DO_STCLASS_AND) {
3759 ssc_and(pRExC_state, data->start_class, &accum);
3760 flags &= ~SCF_DO_STCLASS;
3763 /* Switch to OR mode: cache the old value of
3764 * data->start_class */
3766 StructCopy(data->start_class, and_withp, regnode_ssc);
3767 flags &= ~SCF_DO_STCLASS_AND;
3768 StructCopy(&accum, data->start_class, regnode_ssc);
3769 flags |= SCF_DO_STCLASS_OR;
3773 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch )
3778 Assuming this was/is a branch we are dealing with: 'scan'
3779 now points at the item that follows the branch sequence,
3780 whatever it is. We now start at the beginning of the
3781 sequence and look for subsequences of
3787 which would be constructed from a pattern like
3790 If we can find such a subsequence we need to turn the first
3791 element into a trie and then add the subsequent branch exact
3792 strings to the trie.
3796 1. patterns where the whole set of branches can be
3799 2. patterns where only a subset can be converted.
3801 In case 1 we can replace the whole set with a single regop
3802 for the trie. In case 2 we need to keep the start and end
3805 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3806 becomes BRANCH TRIE; BRANCH X;
3808 There is an additional case, that being where there is a
3809 common prefix, which gets split out into an EXACT like node
3810 preceding the TRIE node.
3812 If x(1..n)==tail then we can do a simple trie, if not we make
3813 a "jump" trie, such that when we match the appropriate word
3814 we "jump" to the appropriate tail node. Essentially we turn
3815 a nested if into a case structure of sorts.
3820 if (!re_trie_maxbuff) {
3821 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3822 if (!SvIOK(re_trie_maxbuff))
3823 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3825 if ( SvIV(re_trie_maxbuff)>=0 ) {
3827 regnode *first = (regnode *)NULL;
3828 regnode *last = (regnode *)NULL;
3829 regnode *tail = scan;
3834 SV * const mysv = sv_newmortal(); /* for dumping */
3836 /* var tail is used because there may be a TAIL
3837 regop in the way. Ie, the exacts will point to the
3838 thing following the TAIL, but the last branch will
3839 point at the TAIL. So we advance tail. If we
3840 have nested (?:) we may have to move through several
3844 while ( OP( tail ) == TAIL ) {
3845 /* this is the TAIL generated by (?:) */
3846 tail = regnext( tail );
3850 DEBUG_TRIE_COMPILE_r({
3851 regprop(RExC_rx, mysv, tail );
3852 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3853 (int)depth * 2 + 2, "",
3854 "Looking for TRIE'able sequences. Tail node is: ",
3855 SvPV_nolen_const( mysv )
3861 Step through the branches
3862 cur represents each branch,
3863 noper is the first thing to be matched as part
3865 noper_next is the regnext() of that node.
3867 We normally handle a case like this
3868 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3869 support building with NOJUMPTRIE, which restricts
3870 the trie logic to structures like /FOO|BAR/.
3872 If noper is a trieable nodetype then the branch is
3873 a possible optimization target. If we are building
3874 under NOJUMPTRIE then we require that noper_next is
3875 the same as scan (our current position in the regex
3878 Once we have two or more consecutive such branches
3879 we can create a trie of the EXACT's contents and
3880 stitch it in place into the program.
3882 If the sequence represents all of the branches in
3883 the alternation we replace the entire thing with a
3886 Otherwise when it is a subsequence we need to
3887 stitch it in place and replace only the relevant
3888 branches. This means the first branch has to remain
3889 as it is used by the alternation logic, and its
3890 next pointer, and needs to be repointed at the item
3891 on the branch chain following the last branch we
3892 have optimized away.
3894 This could be either a BRANCH, in which case the
3895 subsequence is internal, or it could be the item
3896 following the branch sequence in which case the
3897 subsequence is at the end (which does not
3898 necessarily mean the first node is the start of the
3901 TRIE_TYPE(X) is a define which maps the optype to a
3905 ----------------+-----------
3909 EXACTFU_SS | EXACTFU
3914 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3915 ( EXACT == (X) ) ? EXACT : \
3916 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3917 ( EXACTFA == (X) ) ? EXACTFA : \
3920 /* dont use tail as the end marker for this traverse */
3921 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3922 regnode * const noper = NEXTOPER( cur );
3923 U8 noper_type = OP( noper );
3924 U8 noper_trietype = TRIE_TYPE( noper_type );
3925 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3926 regnode * const noper_next = regnext( noper );
3927 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3928 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3931 DEBUG_TRIE_COMPILE_r({
3932 regprop(RExC_rx, mysv, cur);
3933 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3934 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3936 regprop(RExC_rx, mysv, noper);
3937 PerlIO_printf( Perl_debug_log, " -> %s",
3938 SvPV_nolen_const(mysv));
3941 regprop(RExC_rx, mysv, noper_next );
3942 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3943 SvPV_nolen_const(mysv));
3945 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3946 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3947 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3951 /* Is noper a trieable nodetype that can be merged
3952 * with the current trie (if there is one)? */
3956 ( noper_trietype == NOTHING)
3957 || ( trietype == NOTHING )
3958 || ( trietype == noper_trietype )
3961 && noper_next == tail
3965 /* Handle mergable triable node Either we are
3966 * the first node in a new trieable sequence,
3967 * in which case we do some bookkeeping,
3968 * otherwise we update the end pointer. */
3971 if ( noper_trietype == NOTHING ) {
3972 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3973 regnode * const noper_next = regnext( noper );
3974 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3975 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3978 if ( noper_next_trietype ) {
3979 trietype = noper_next_trietype;
3980 } else if (noper_next_type) {
3981 /* a NOTHING regop is 1 regop wide.
3982 * We need at least two for a trie
3983 * so we can't merge this in */
3987 trietype = noper_trietype;
3990 if ( trietype == NOTHING )
3991 trietype = noper_trietype;
3996 } /* end handle mergable triable node */
3998 /* handle unmergable node -
3999 * noper may either be a triable node which can
4000 * not be tried together with the current trie,
4001 * or a non triable node */
4003 /* If last is set and trietype is not
4004 * NOTHING then we have found at least two
4005 * triable branch sequences in a row of a
4006 * similar trietype so we can turn them
4007 * into a trie. If/when we allow NOTHING to
4008 * start a trie sequence this condition
4009 * will be required, and it isn't expensive
4010 * so we leave it in for now. */
4011 if ( trietype && trietype != NOTHING )
4012 make_trie( pRExC_state,
4013 startbranch, first, cur, tail,
4014 count, trietype, depth+1 );
4015 last = NULL; /* note: we clear/update
4016 first, trietype etc below,
4017 so we dont do it here */
4021 && noper_next == tail
4024 /* noper is triable, so we can start a new
4028 trietype = noper_trietype;
4030 /* if we already saw a first but the
4031 * current node is not triable then we have
4032 * to reset the first information. */
4037 } /* end handle unmergable node */
4038 } /* loop over branches */
4039 DEBUG_TRIE_COMPILE_r({
4040 regprop(RExC_rx, mysv, cur);
4041 PerlIO_printf( Perl_debug_log,
4042 "%*s- %s (%d) <SCAN FINISHED>\n",
4044 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4047 if ( last && trietype ) {
4048 if ( trietype != NOTHING ) {
4049 /* the last branch of the sequence was part of
4050 * a trie, so we have to construct it here
4051 * outside of the loop */
4052 made= make_trie( pRExC_state, startbranch,
4053 first, scan, tail, count,
4054 trietype, depth+1 );
4055 #ifdef TRIE_STUDY_OPT
4056 if ( ((made == MADE_EXACT_TRIE &&
4057 startbranch == first)
4058 || ( first_non_open == first )) &&
4060 flags |= SCF_TRIE_RESTUDY;
4061 if ( startbranch == first
4064 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
4069 /* at this point we know whatever we have is a
4070 * NOTHING sequence/branch AND if 'startbranch'
4071 * is 'first' then we can turn the whole thing
4074 if ( startbranch == first ) {
4076 /* the entire thing is a NOTHING sequence,
4077 * something like this: (?:|) So we can
4078 * turn it into a plain NOTHING op. */
4079 DEBUG_TRIE_COMPILE_r({
4080 regprop(RExC_rx, mysv, cur);
4081 PerlIO_printf( Perl_debug_log,
4082 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4083 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4086 OP(startbranch)= NOTHING;
4087 NEXT_OFF(startbranch)= tail - startbranch;
4088 for ( opt= startbranch + 1; opt < tail ; opt++ )
4092 } /* end if ( last) */
4093 } /* TRIE_MAXBUF is non zero */
4098 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4099 scan = NEXTOPER(NEXTOPER(scan));
4100 } else /* single branch is optimized. */
4101 scan = NEXTOPER(scan);
4103 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4104 scan_frame *newframe = NULL;
4108 U32 my_recursed_depth= recursed_depth;
4110 if (OP(scan) != SUSPEND) {
4111 /* set the pointer */
4112 if (OP(scan) == GOSUB) {
4114 RExC_recurse[ARG2L(scan)] = scan;
4115 start = RExC_open_parens[paren-1];
4116 end = RExC_close_parens[paren-1];
4119 start = RExC_rxi->program + 1;
4124 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4126 if (!recursed_depth) {
4127 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4129 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4130 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4131 RExC_study_chunk_recursed_bytes, U8);
4133 /* we havent recursed into this paren yet, so recurse into it */
4134 DEBUG_STUDYDATA("set:", data,depth);
4135 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4136 my_recursed_depth= recursed_depth + 1;
4137 Newx(newframe,1,scan_frame);
4139 DEBUG_STUDYDATA("inf:", data,depth);
4140 /* some form of infinite recursion, assume infinite length
4142 if (flags & SCF_DO_SUBSTR) {
4143 SCAN_COMMIT(pRExC_state,data,minlenp);
4144 data->longest = &(data->longest_float);
4146 is_inf = is_inf_internal = 1;
4147 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4148 ssc_anything(data->start_class);
4149 flags &= ~SCF_DO_STCLASS;
4152 Newx(newframe,1,scan_frame);
4155 end = regnext(scan);
4160 SAVEFREEPV(newframe);
4161 newframe->next = regnext(scan);
4162 newframe->last = last;
4163 newframe->stop = stopparen;
4164 newframe->prev = frame;
4165 newframe->prev_recursed_depth = recursed_depth;
4167 DEBUG_STUDYDATA("frame-new:",data,depth);
4168 DEBUG_PEEP("fnew", scan, depth);
4175 recursed_depth= my_recursed_depth;
4180 else if (OP(scan) == EXACT) {
4181 SSize_t l = STR_LEN(scan);
4184 const U8 * const s = (U8*)STRING(scan);
4185 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4186 l = utf8_length(s, s + l);
4188 uc = *((U8*)STRING(scan));
4191 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4192 /* The code below prefers earlier match for fixed
4193 offset, later match for variable offset. */
4194 if (data->last_end == -1) { /* Update the start info. */
4195 data->last_start_min = data->pos_min;
4196 data->last_start_max = is_inf
4197 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4199 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4201 SvUTF8_on(data->last_found);
4203 SV * const sv = data->last_found;
4204 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4205 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4206 if (mg && mg->mg_len >= 0)
4207 mg->mg_len += utf8_length((U8*)STRING(scan),
4208 (U8*)STRING(scan)+STR_LEN(scan));
4210 data->last_end = data->pos_min + l;
4211 data->pos_min += l; /* As in the first entry. */
4212 data->flags &= ~SF_BEFORE_EOL;
4215 /* ANDing the code point leaves at most it, and not in locale, and
4216 * can't match null string */
4217 if (flags & SCF_DO_STCLASS_AND) {
4218 ssc_cp_and(data->start_class, uc);
4219 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4220 ssc_clear_locale(data->start_class);
4222 else if (flags & SCF_DO_STCLASS_OR) {
4223 ssc_add_cp(data->start_class, uc);
4224 ssc_and(pRExC_state, data->start_class, and_withp);
4226 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4227 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4229 flags &= ~SCF_DO_STCLASS;
4231 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4232 SSize_t l = STR_LEN(scan);
4233 UV uc = *((U8*)STRING(scan));
4234 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4235 separate code points */
4237 /* Search for fixed substrings supports EXACT only. */
4238 if (flags & SCF_DO_SUBSTR) {
4240 SCAN_COMMIT(pRExC_state, data, minlenp);
4243 const U8 * const s = (U8 *)STRING(scan);
4244 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4245 l = utf8_length(s, s + l);
4247 if (unfolded_multi_char) {
4248 RExC_seen |= REG_SEEN_UNFOLDED_MULTI;
4250 min += l - min_subtract;
4252 delta += min_subtract;
4253 if (flags & SCF_DO_SUBSTR) {
4254 data->pos_min += l - min_subtract;
4255 if (data->pos_min < 0) {
4258 data->pos_delta += min_subtract;
4260 data->longest = &(data->longest_float);
4263 if (OP(scan) == EXACTFL) {
4264 if (flags & SCF_DO_STCLASS_AND) {
4265 ssc_flags_and(data->start_class, ANYOF_LOCALE);
4267 else if (flags & SCF_DO_STCLASS_OR) {
4268 ANYOF_FLAGS(data->start_class) |= ANYOF_LOCALE;
4271 /* We don't know what the folds are; it could be anything. XXX
4272 * Actually, we only support UTF-8 encoding for code points
4273 * above Latin1, so we could know what those folds are. */
4274 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4278 else { /* Non-locale EXACTFish */
4279 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4280 if (flags & SCF_DO_STCLASS_AND) {
4281 ssc_clear_locale(data->start_class);
4283 if (uc < 256) { /* We know what the Latin1 folds are ... */
4284 if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
4285 know if anything folds
4287 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4288 PL_fold_latin1[uc]);
4289 if (OP(scan) != EXACTFA) { /* The folds below aren't
4291 if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4293 = add_cp_to_invlist(EXACTF_invlist,
4294 LATIN_SMALL_LETTER_SHARP_S);
4296 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4298 = add_cp_to_invlist(EXACTF_invlist, 's');
4300 = add_cp_to_invlist(EXACTF_invlist, 'S');
4304 /* We also know if there are above-Latin1 code points
4305 * that fold to this (none legal for ASCII and /iaa) */
4306 if ((! isASCII(uc) || OP(scan) != EXACTFA)
4307 && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4309 /* XXX We could know exactly what does fold to this
4310 * if the reverse folds are loaded, as currently in
4312 _invlist_union(EXACTF_invlist,
4318 else { /* Non-locale, above Latin1. XXX We don't currently
4319 know what participates in folds with this, so have
4320 to assume anything could */
4322 /* XXX We could know exactly what does fold to this if the
4323 * reverse folds are loaded, as currently in S_regclass().
4324 * But we do know that under /iaa nothing in the ASCII
4325 * range can participate */
4326 if (OP(scan) == EXACTFA) {
4327 _invlist_union_complement_2nd(EXACTF_invlist,
4328 PL_XPosix_ptrs[_CC_ASCII],
4332 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4337 if (flags & SCF_DO_STCLASS_AND) {
4338 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4339 ANYOF_POSIXL_ZERO(data->start_class);
4340 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4342 else if (flags & SCF_DO_STCLASS_OR) {
4343 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4344 ssc_and(pRExC_state, data->start_class, and_withp);
4346 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4347 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4349 flags &= ~SCF_DO_STCLASS;
4350 SvREFCNT_dec(EXACTF_invlist);
4352 else if (REGNODE_VARIES(OP(scan))) {
4353 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4354 I32 fl = 0, f = flags;
4355 regnode * const oscan = scan;
4356 regnode_ssc this_class;
4357 regnode_ssc *oclass = NULL;
4358 I32 next_is_eval = 0;
4360 switch (PL_regkind[OP(scan)]) {
4361 case WHILEM: /* End of (?:...)* . */
4362 scan = NEXTOPER(scan);
4365 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4366 next = NEXTOPER(scan);
4367 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4369 maxcount = REG_INFTY;
4370 next = regnext(scan);
4371 scan = NEXTOPER(scan);
4375 if (flags & SCF_DO_SUBSTR)
4380 if (flags & SCF_DO_STCLASS) {
4382 maxcount = REG_INFTY;
4383 next = regnext(scan);
4384 scan = NEXTOPER(scan);
4387 is_inf = is_inf_internal = 1;
4388 scan = regnext(scan);
4389 if (flags & SCF_DO_SUBSTR) {
4390 SCAN_COMMIT(pRExC_state, data, minlenp);
4391 /* Cannot extend fixed substrings */
4392 data->longest = &(data->longest_float);
4394 goto optimize_curly_tail;
4396 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4397 && (scan->flags == stopparen))
4402 mincount = ARG1(scan);
4403 maxcount = ARG2(scan);
4405 next = regnext(scan);
4406 if (OP(scan) == CURLYX) {
4407 I32 lp = (data ? *(data->last_closep) : 0);
4408 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4410 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4411 next_is_eval = (OP(scan) == EVAL);
4413 if (flags & SCF_DO_SUBSTR) {
4414 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp);
4415 /* Cannot extend fixed substrings */
4416 pos_before = data->pos_min;
4420 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4422 data->flags |= SF_IS_INF;
4424 if (flags & SCF_DO_STCLASS) {
4425 ssc_init(pRExC_state, &this_class);
4426 oclass = data->start_class;
4427 data->start_class = &this_class;
4428 f |= SCF_DO_STCLASS_AND;
4429 f &= ~SCF_DO_STCLASS_OR;
4431 /* Exclude from super-linear cache processing any {n,m}
4432 regops for which the combination of input pos and regex
4433 pos is not enough information to determine if a match
4436 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4437 regex pos at the \s*, the prospects for a match depend not
4438 only on the input position but also on how many (bar\s*)
4439 repeats into the {4,8} we are. */
4440 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4441 f &= ~SCF_WHILEM_VISITED_POS;
4443 /* This will finish on WHILEM, setting scan, or on NULL: */
4444 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4445 last, data, stopparen, recursed_depth, NULL,
4447 ? (f & ~SCF_DO_SUBSTR)
4451 if (flags & SCF_DO_STCLASS)
4452 data->start_class = oclass;
4453 if (mincount == 0 || minnext == 0) {
4454 if (flags & SCF_DO_STCLASS_OR) {
4455 ssc_or(pRExC_state, data->start_class, &this_class);
4457 else if (flags & SCF_DO_STCLASS_AND) {
4458 /* Switch to OR mode: cache the old value of
4459 * data->start_class */
4461 StructCopy(data->start_class, and_withp, regnode_ssc);
4462 flags &= ~SCF_DO_STCLASS_AND;
4463 StructCopy(&this_class, data->start_class, regnode_ssc);
4464 flags |= SCF_DO_STCLASS_OR;
4465 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4467 } else { /* Non-zero len */
4468 if (flags & SCF_DO_STCLASS_OR) {
4469 ssc_or(pRExC_state, data->start_class, &this_class);
4470 ssc_and(pRExC_state, data->start_class, and_withp);
4472 else if (flags & SCF_DO_STCLASS_AND)
4473 ssc_and(pRExC_state, data->start_class, &this_class);
4474 flags &= ~SCF_DO_STCLASS;
4476 if (!scan) /* It was not CURLYX, but CURLY. */
4478 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4479 /* ? quantifier ok, except for (?{ ... }) */
4480 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4481 && (minnext == 0) && (deltanext == 0)
4482 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4483 && maxcount <= REG_INFTY/3) /* Complement check for big
4486 /* Fatal warnings may leak the regexp without this: */
4487 SAVEFREESV(RExC_rx_sv);
4488 ckWARNreg(RExC_parse,
4489 "Quantifier unexpected on zero-length expression");
4490 (void)ReREFCNT_inc(RExC_rx_sv);
4493 min += minnext * mincount;
4494 is_inf_internal |= deltanext == SSize_t_MAX
4495 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4496 is_inf |= is_inf_internal;
4498 delta = SSize_t_MAX;
4500 delta += (minnext + deltanext) * maxcount
4501 - minnext * mincount;
4503 /* Try powerful optimization CURLYX => CURLYN. */
4504 if ( OP(oscan) == CURLYX && data
4505 && data->flags & SF_IN_PAR
4506 && !(data->flags & SF_HAS_EVAL)
4507 && !deltanext && minnext == 1 ) {
4508 /* Try to optimize to CURLYN. */
4509 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4510 regnode * const nxt1 = nxt;
4517 if (!REGNODE_SIMPLE(OP(nxt))
4518 && !(PL_regkind[OP(nxt)] == EXACT
4519 && STR_LEN(nxt) == 1))
4525 if (OP(nxt) != CLOSE)
4527 if (RExC_open_parens) {
4528 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4529 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4531 /* Now we know that nxt2 is the only contents: */
4532 oscan->flags = (U8)ARG(nxt);
4534 OP(nxt1) = NOTHING; /* was OPEN. */
4537 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4538 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4539 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4540 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4541 OP(nxt + 1) = OPTIMIZED; /* was count. */
4542 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4547 /* Try optimization CURLYX => CURLYM. */
4548 if ( OP(oscan) == CURLYX && data
4549 && !(data->flags & SF_HAS_PAR)
4550 && !(data->flags & SF_HAS_EVAL)
4551 && !deltanext /* atom is fixed width */
4552 && minnext != 0 /* CURLYM can't handle zero width */
4554 /* Nor characters whose fold at run-time may be
4555 * multi-character */
4556 && ! (RExC_seen & REG_SEEN_UNFOLDED_MULTI)
4558 /* XXXX How to optimize if data == 0? */
4559 /* Optimize to a simpler form. */
4560 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4564 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4565 && (OP(nxt2) != WHILEM))
4567 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4568 /* Need to optimize away parenths. */
4569 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4570 /* Set the parenth number. */
4571 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4573 oscan->flags = (U8)ARG(nxt);
4574 if (RExC_open_parens) {
4575 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4576 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4578 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4579 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4582 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4583 OP(nxt + 1) = OPTIMIZED; /* was count. */
4584 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4585 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4588 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4589 regnode *nnxt = regnext(nxt1);
4591 if (reg_off_by_arg[OP(nxt1)])
4592 ARG_SET(nxt1, nxt2 - nxt1);
4593 else if (nxt2 - nxt1 < U16_MAX)
4594 NEXT_OFF(nxt1) = nxt2 - nxt1;
4596 OP(nxt) = NOTHING; /* Cannot beautify */
4601 /* Optimize again: */
4602 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4603 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4608 else if ((OP(oscan) == CURLYX)
4609 && (flags & SCF_WHILEM_VISITED_POS)
4610 /* See the comment on a similar expression above.
4611 However, this time it's not a subexpression
4612 we care about, but the expression itself. */
4613 && (maxcount == REG_INFTY)
4614 && data && ++data->whilem_c < 16) {
4615 /* This stays as CURLYX, we can put the count/of pair. */
4616 /* Find WHILEM (as in regexec.c) */
4617 regnode *nxt = oscan + NEXT_OFF(oscan);
4619 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4621 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4622 | (RExC_whilem_seen << 4)); /* On WHILEM */
4624 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4626 if (flags & SCF_DO_SUBSTR) {
4627 SV *last_str = NULL;
4628 int counted = mincount != 0;
4630 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4632 SSize_t b = pos_before >= data->last_start_min
4633 ? pos_before : data->last_start_min;
4635 const char * const s = SvPV_const(data->last_found, l);
4636 SSize_t old = b - data->last_start_min;
4639 old = utf8_hop((U8*)s, old) - (U8*)s;
4641 /* Get the added string: */
4642 last_str = newSVpvn_utf8(s + old, l, UTF);
4643 if (deltanext == 0 && pos_before == b) {
4644 /* What was added is a constant string */
4646 SvGROW(last_str, (mincount * l) + 1);
4647 repeatcpy(SvPVX(last_str) + l,
4648 SvPVX_const(last_str), l,
4650 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4651 /* Add additional parts. */
4652 SvCUR_set(data->last_found,
4653 SvCUR(data->last_found) - l);
4654 sv_catsv(data->last_found, last_str);
4656 SV * sv = data->last_found;
4658 SvUTF8(sv) && SvMAGICAL(sv) ?
4659 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4660 if (mg && mg->mg_len >= 0)
4661 mg->mg_len += CHR_SVLEN(last_str) - l;
4663 data->last_end += l * (mincount - 1);
4666 /* start offset must point into the last copy */
4667 data->last_start_min += minnext * (mincount - 1);
4668 data->last_start_max += is_inf ? SSize_t_MAX
4669 : (maxcount - 1) * (minnext + data->pos_delta);
4672 /* It is counted once already... */
4673 data->pos_min += minnext * (mincount - counted);
4675 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4676 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4677 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4678 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4680 if (deltanext != SSize_t_MAX)
4681 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4682 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4683 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4685 if (deltanext == SSize_t_MAX
4686 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4687 data->pos_delta = SSize_t_MAX;
4689 data->pos_delta += - counted * deltanext +
4690 (minnext + deltanext) * maxcount - minnext * mincount;
4691 if (mincount != maxcount) {
4692 /* Cannot extend fixed substrings found inside
4694 SCAN_COMMIT(pRExC_state,data,minlenp);
4695 if (mincount && last_str) {
4696 SV * const sv = data->last_found;
4697 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4698 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4702 sv_setsv(sv, last_str);
4703 data->last_end = data->pos_min;
4704 data->last_start_min =
4705 data->pos_min - CHR_SVLEN(last_str);
4706 data->last_start_max = is_inf
4708 : data->pos_min + data->pos_delta
4709 - CHR_SVLEN(last_str);
4711 data->longest = &(data->longest_float);
4713 SvREFCNT_dec(last_str);
4715 if (data && (fl & SF_HAS_EVAL))
4716 data->flags |= SF_HAS_EVAL;
4717 optimize_curly_tail:
4718 if (OP(oscan) != CURLYX) {
4719 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4721 NEXT_OFF(oscan) += NEXT_OFF(next);
4727 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4732 if (flags & SCF_DO_SUBSTR) {
4733 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect
4735 data->longest = &(data->longest_float);
4737 is_inf = is_inf_internal = 1;
4738 if (flags & SCF_DO_STCLASS_OR) {
4739 if (OP(scan) == CLUMP) {
4740 /* Actually is any start char, but very few code points
4741 * aren't start characters */
4742 ssc_match_all_cp(data->start_class);
4745 ssc_anything(data->start_class);
4748 flags &= ~SCF_DO_STCLASS;
4752 else if (OP(scan) == LNBREAK) {
4753 if (flags & SCF_DO_STCLASS) {
4754 if (flags & SCF_DO_STCLASS_AND) {
4755 ssc_intersection(data->start_class,
4756 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4757 ssc_clear_locale(data->start_class);
4758 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4760 else if (flags & SCF_DO_STCLASS_OR) {
4761 ssc_union(data->start_class,
4762 PL_XPosix_ptrs[_CC_VERTSPACE],
4764 ssc_and(pRExC_state, data->start_class, and_withp);
4766 /* See commit msg for
4767 * 749e076fceedeb708a624933726e7989f2302f6a */
4768 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4770 flags &= ~SCF_DO_STCLASS;
4773 delta++; /* Because of the 2 char string cr-lf */
4774 if (flags & SCF_DO_SUBSTR) {
4775 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect
4778 data->pos_delta += 1;
4779 data->longest = &(data->longest_float);
4782 else if (REGNODE_SIMPLE(OP(scan))) {
4784 if (flags & SCF_DO_SUBSTR) {
4785 SCAN_COMMIT(pRExC_state,data,minlenp);
4789 if (flags & SCF_DO_STCLASS) {
4791 SV* my_invlist = sv_2mortal(_new_invlist(0));
4794 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4795 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4797 /* Some of the logic below assumes that switching
4798 locale on will only add false positives. */
4803 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4808 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4809 ssc_match_all_cp(data->start_class);
4814 SV* REG_ANY_invlist = _new_invlist(2);
4815 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4817 if (flags & SCF_DO_STCLASS_OR) {
4818 ssc_union(data->start_class,
4820 TRUE /* TRUE => invert, hence all but \n
4824 else if (flags & SCF_DO_STCLASS_AND) {
4825 ssc_intersection(data->start_class,
4827 TRUE /* TRUE => invert */
4829 ssc_clear_locale(data->start_class);
4831 SvREFCNT_dec_NN(REG_ANY_invlist);
4836 if (flags & SCF_DO_STCLASS_AND)
4837 ssc_and(pRExC_state, data->start_class,
4838 (regnode_ssc*) scan);
4840 ssc_or(pRExC_state, data->start_class,
4841 (regnode_ssc*)scan);
4849 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4850 if (flags & SCF_DO_STCLASS_AND) {
4851 bool was_there = cBOOL(
4852 ANYOF_POSIXL_TEST(data->start_class,
4854 ANYOF_POSIXL_ZERO(data->start_class);
4855 if (was_there) { /* Do an AND */
4856 ANYOF_POSIXL_SET(data->start_class, namedclass);
4858 /* No individual code points can now match */
4859 data->start_class->invlist
4860 = sv_2mortal(_new_invlist(0));
4863 int complement = namedclass + ((invert) ? -1 : 1);
4865 assert(flags & SCF_DO_STCLASS_OR);
4867 /* If the complement of this class was already there,
4868 * the result is that they match all code points,
4869 * (\d + \D == everything). Remove the classes from
4870 * future consideration. Locale is not relevant in
4872 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4873 ssc_match_all_cp(data->start_class);
4874 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4875 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4876 if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4878 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4881 else { /* The usual case; just add this class to the
4883 ANYOF_POSIXL_SET(data->start_class, namedclass);
4884 ANYOF_FLAGS(data->start_class)
4885 |= ANYOF_LOCALE|ANYOF_POSIXL;
4890 case NPOSIXA: /* For these, we always know the exact set of
4895 if (FLAGS(scan) == _CC_ASCII) {
4896 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4899 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4900 PL_XPosix_ptrs[_CC_ASCII],
4911 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4913 /* NPOSIXD matches all upper Latin1 code points unless the
4914 * target string being matched is UTF-8, which is
4915 * unknowable until match time */
4916 if (PL_regkind[OP(scan)] == NPOSIXD) {
4917 _invlist_union_complement_2nd(my_invlist,
4918 PL_XPosix_ptrs[_CC_ASCII], &my_invlist);
4923 if (flags & SCF_DO_STCLASS_AND) {
4924 ssc_intersection(data->start_class, my_invlist, invert);
4925 ssc_clear_locale(data->start_class);
4928 assert(flags & SCF_DO_STCLASS_OR);
4929 ssc_union(data->start_class, my_invlist, invert);
4932 if (flags & SCF_DO_STCLASS_OR)
4933 ssc_and(pRExC_state, data->start_class, and_withp);
4934 flags &= ~SCF_DO_STCLASS;
4937 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4938 data->flags |= (OP(scan) == MEOL
4941 SCAN_COMMIT(pRExC_state, data, minlenp);
4944 else if ( PL_regkind[OP(scan)] == BRANCHJ
4945 /* Lookbehind, or need to calculate parens/evals/stclass: */
4946 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4947 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4948 if ( OP(scan) == UNLESSM &&
4950 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4951 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4954 regnode *upto= regnext(scan);
4956 SV * const mysv_val=sv_newmortal();
4957 DEBUG_STUDYDATA("OPFAIL",data,depth);
4959 /*DEBUG_PARSE_MSG("opfail");*/
4960 regprop(RExC_rx, mysv_val, upto);
4961 PerlIO_printf(Perl_debug_log,
4962 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4963 SvPV_nolen_const(mysv_val),
4964 (IV)REG_NODE_NUM(upto),
4969 NEXT_OFF(scan) = upto - scan;
4970 for (opt= scan + 1; opt < upto ; opt++)
4971 OP(opt) = OPTIMIZED;
4975 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4976 || OP(scan) == UNLESSM )
4978 /* Negative Lookahead/lookbehind
4979 In this case we can't do fixed string optimisation.
4982 SSize_t deltanext, minnext, fake = 0;
4987 data_fake.flags = 0;
4989 data_fake.whilem_c = data->whilem_c;
4990 data_fake.last_closep = data->last_closep;
4993 data_fake.last_closep = &fake;
4994 data_fake.pos_delta = delta;
4995 if ( flags & SCF_DO_STCLASS && !scan->flags
4996 && OP(scan) == IFMATCH ) { /* Lookahead */
4997 ssc_init(pRExC_state, &intrnl);
4998 data_fake.start_class = &intrnl;
4999 f |= SCF_DO_STCLASS_AND;
5001 if (flags & SCF_WHILEM_VISITED_POS)
5002 f |= SCF_WHILEM_VISITED_POS;
5003 next = regnext(scan);
5004 nscan = NEXTOPER(NEXTOPER(scan));
5005 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5006 last, &data_fake, stopparen,
5007 recursed_depth, NULL, f, depth+1);
5010 FAIL("Variable length lookbehind not implemented");
5012 else if (minnext > (I32)U8_MAX) {
5013 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5016 scan->flags = (U8)minnext;
5019 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5021 if (data_fake.flags & SF_HAS_EVAL)
5022 data->flags |= SF_HAS_EVAL;
5023 data->whilem_c = data_fake.whilem_c;
5025 if (f & SCF_DO_STCLASS_AND) {
5026 if (flags & SCF_DO_STCLASS_OR) {
5027 /* OR before, AND after: ideally we would recurse with
5028 * data_fake to get the AND applied by study of the
5029 * remainder of the pattern, and then derecurse;
5030 * *** HACK *** for now just treat as "no information".
5031 * See [perl #56690].
5033 ssc_init(pRExC_state, data->start_class);
5035 /* AND before and after: combine and continue */
5036 ssc_and(pRExC_state, data->start_class, &intrnl);
5040 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5042 /* Positive Lookahead/lookbehind
5043 In this case we can do fixed string optimisation,
5044 but we must be careful about it. Note in the case of
5045 lookbehind the positions will be offset by the minimum
5046 length of the pattern, something we won't know about
5047 until after the recurse.
5049 SSize_t deltanext, fake = 0;
5053 /* We use SAVEFREEPV so that when the full compile
5054 is finished perl will clean up the allocated
5055 minlens when it's all done. This way we don't
5056 have to worry about freeing them when we know
5057 they wont be used, which would be a pain.
5060 Newx( minnextp, 1, SSize_t );
5061 SAVEFREEPV(minnextp);
5064 StructCopy(data, &data_fake, scan_data_t);
5065 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5068 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
5069 data_fake.last_found=newSVsv(data->last_found);
5073 data_fake.last_closep = &fake;
5074 data_fake.flags = 0;
5075 data_fake.pos_delta = delta;
5077 data_fake.flags |= SF_IS_INF;
5078 if ( flags & SCF_DO_STCLASS && !scan->flags
5079 && OP(scan) == IFMATCH ) { /* Lookahead */
5080 ssc_init(pRExC_state, &intrnl);
5081 data_fake.start_class = &intrnl;
5082 f |= SCF_DO_STCLASS_AND;
5084 if (flags & SCF_WHILEM_VISITED_POS)
5085 f |= SCF_WHILEM_VISITED_POS;
5086 next = regnext(scan);
5087 nscan = NEXTOPER(NEXTOPER(scan));
5089 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5090 &deltanext, last, &data_fake,
5091 stopparen, recursed_depth, NULL,
5095 FAIL("Variable length lookbehind not implemented");
5097 else if (*minnextp > (I32)U8_MAX) {
5098 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5101 scan->flags = (U8)*minnextp;
5106 if (f & SCF_DO_STCLASS_AND) {
5107 ssc_and(pRExC_state, data->start_class, &intrnl);
5110 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5112 if (data_fake.flags & SF_HAS_EVAL)
5113 data->flags |= SF_HAS_EVAL;
5114 data->whilem_c = data_fake.whilem_c;
5115 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5116 if (RExC_rx->minlen<*minnextp)
5117 RExC_rx->minlen=*minnextp;
5118 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
5119 SvREFCNT_dec_NN(data_fake.last_found);
5121 if ( data_fake.minlen_fixed != minlenp )
5123 data->offset_fixed= data_fake.offset_fixed;
5124 data->minlen_fixed= data_fake.minlen_fixed;
5125 data->lookbehind_fixed+= scan->flags;
5127 if ( data_fake.minlen_float != minlenp )
5129 data->minlen_float= data_fake.minlen_float;
5130 data->offset_float_min=data_fake.offset_float_min;
5131 data->offset_float_max=data_fake.offset_float_max;
5132 data->lookbehind_float+= scan->flags;
5139 else if (OP(scan) == OPEN) {
5140 if (stopparen != (I32)ARG(scan))
5143 else if (OP(scan) == CLOSE) {
5144 if (stopparen == (I32)ARG(scan)) {
5147 if ((I32)ARG(scan) == is_par) {
5148 next = regnext(scan);
5150 if ( next && (OP(next) != WHILEM) && next < last)
5151 is_par = 0; /* Disable optimization */
5154 *(data->last_closep) = ARG(scan);
5156 else if (OP(scan) == EVAL) {
5158 data->flags |= SF_HAS_EVAL;
5160 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5161 if (flags & SCF_DO_SUBSTR) {
5162 SCAN_COMMIT(pRExC_state,data,minlenp);
5163 flags &= ~SCF_DO_SUBSTR;
5165 if (data && OP(scan)==ACCEPT) {
5166 data->flags |= SCF_SEEN_ACCEPT;
5171 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5173 if (flags & SCF_DO_SUBSTR) {
5174 SCAN_COMMIT(pRExC_state,data,minlenp);
5175 data->longest = &(data->longest_float);
5177 is_inf = is_inf_internal = 1;
5178 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5179 ssc_anything(data->start_class);
5180 flags &= ~SCF_DO_STCLASS;
5182 else if (OP(scan) == GPOS) {
5183 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
5184 !(delta || is_inf || (data && data->pos_delta)))
5186 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
5187 RExC_rx->extflags |= RXf_ANCH_GPOS;
5188 if (RExC_rx->gofs < (STRLEN)min)
5189 RExC_rx->gofs = min;
5191 RExC_rx->extflags |= RXf_GPOS_FLOAT;
5195 #ifdef TRIE_STUDY_OPT
5196 #ifdef FULL_TRIE_STUDY
5197 else if (PL_regkind[OP(scan)] == TRIE) {
5198 /* NOTE - There is similar code to this block above for handling
5199 BRANCH nodes on the initial study. If you change stuff here
5201 regnode *trie_node= scan;
5202 regnode *tail= regnext(scan);
5203 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5204 SSize_t max1 = 0, min1 = SSize_t_MAX;
5207 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
5208 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings
5210 if (flags & SCF_DO_STCLASS)
5211 ssc_init_zero(pRExC_state, &accum);
5217 const regnode *nextbranch= NULL;
5220 for ( word=1 ; word <= trie->wordcount ; word++)
5222 SSize_t deltanext=0, minnext=0, f = 0, fake;
5223 regnode_ssc this_class;
5225 data_fake.flags = 0;
5227 data_fake.whilem_c = data->whilem_c;
5228 data_fake.last_closep = data->last_closep;
5231 data_fake.last_closep = &fake;
5232 data_fake.pos_delta = delta;
5233 if (flags & SCF_DO_STCLASS) {
5234 ssc_init(pRExC_state, &this_class);
5235 data_fake.start_class = &this_class;
5236 f = SCF_DO_STCLASS_AND;
5238 if (flags & SCF_WHILEM_VISITED_POS)
5239 f |= SCF_WHILEM_VISITED_POS;
5241 if (trie->jump[word]) {
5243 nextbranch = trie_node + trie->jump[0];
5244 scan= trie_node + trie->jump[word];
5245 /* We go from the jump point to the branch that follows
5246 it. Note this means we need the vestigal unused
5247 branches even though they arent otherwise used. */
5248 minnext = study_chunk(pRExC_state, &scan, minlenp,
5249 &deltanext, (regnode *)nextbranch, &data_fake,
5250 stopparen, recursed_depth, NULL, f,depth+1);
5252 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5253 nextbranch= regnext((regnode*)nextbranch);
5255 if (min1 > (SSize_t)(minnext + trie->minlen))
5256 min1 = minnext + trie->minlen;
5257 if (deltanext == SSize_t_MAX) {
5258 is_inf = is_inf_internal = 1;
5260 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5261 max1 = minnext + deltanext + trie->maxlen;
5263 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5265 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5266 if ( stopmin > min + min1)
5267 stopmin = min + min1;
5268 flags &= ~SCF_DO_SUBSTR;
5270 data->flags |= SCF_SEEN_ACCEPT;
5273 if (data_fake.flags & SF_HAS_EVAL)
5274 data->flags |= SF_HAS_EVAL;
5275 data->whilem_c = data_fake.whilem_c;
5277 if (flags & SCF_DO_STCLASS)
5278 ssc_or(pRExC_state, &accum, &this_class);
5281 if (flags & SCF_DO_SUBSTR) {
5282 data->pos_min += min1;
5283 data->pos_delta += max1 - min1;
5284 if (max1 != min1 || is_inf)
5285 data->longest = &(data->longest_float);
5288 delta += max1 - min1;
5289 if (flags & SCF_DO_STCLASS_OR) {
5290 ssc_or(pRExC_state, data->start_class, &accum);
5292 ssc_and(pRExC_state, data->start_class, and_withp);
5293 flags &= ~SCF_DO_STCLASS;
5296 else if (flags & SCF_DO_STCLASS_AND) {
5298 ssc_and(pRExC_state, data->start_class, &accum);
5299 flags &= ~SCF_DO_STCLASS;
5302 /* Switch to OR mode: cache the old value of
5303 * data->start_class */
5305 StructCopy(data->start_class, and_withp, regnode_ssc);
5306 flags &= ~SCF_DO_STCLASS_AND;
5307 StructCopy(&accum, data->start_class, regnode_ssc);
5308 flags |= SCF_DO_STCLASS_OR;
5315 else if (PL_regkind[OP(scan)] == TRIE) {
5316 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5319 min += trie->minlen;
5320 delta += (trie->maxlen - trie->minlen);
5321 flags &= ~SCF_DO_STCLASS; /* xxx */
5322 if (flags & SCF_DO_SUBSTR) {
5323 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect
5325 data->pos_min += trie->minlen;
5326 data->pos_delta += (trie->maxlen - trie->minlen);
5327 if (trie->maxlen != trie->minlen)
5328 data->longest = &(data->longest_float);
5330 if (trie->jump) /* no more substrings -- for now /grr*/
5331 flags &= ~SCF_DO_SUBSTR;
5333 #endif /* old or new */
5334 #endif /* TRIE_STUDY_OPT */
5336 /* Else: zero-length, ignore. */
5337 scan = regnext(scan);
5339 /* If we are exiting a recursion we can unset its recursed bit
5340 * and allow ourselves to enter it again - no danger of an
5341 * infinite loop there.
5342 if (stopparen > -1 && recursed) {
5343 DEBUG_STUDYDATA("unset:", data,depth);
5344 PAREN_UNSET( recursed, stopparen);
5348 DEBUG_STUDYDATA("frame-end:",data,depth);
5349 DEBUG_PEEP("fend", scan, depth);
5350 /* restore previous context */
5353 stopparen = frame->stop;
5354 recursed_depth = frame->prev_recursed_depth;
5357 frame = frame->prev;
5358 goto fake_study_recurse;
5363 DEBUG_STUDYDATA("pre-fin:",data,depth);
5366 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5367 if (flags & SCF_DO_SUBSTR && is_inf)
5368 data->pos_delta = SSize_t_MAX - data->pos_min;
5369 if (is_par > (I32)U8_MAX)
5371 if (is_par && pars==1 && data) {
5372 data->flags |= SF_IN_PAR;
5373 data->flags &= ~SF_HAS_PAR;
5375 else if (pars && data) {
5376 data->flags |= SF_HAS_PAR;
5377 data->flags &= ~SF_IN_PAR;
5379 if (flags & SCF_DO_STCLASS_OR)
5380 ssc_and(pRExC_state, data->start_class, and_withp);
5381 if (flags & SCF_TRIE_RESTUDY)
5382 data->flags |= SCF_TRIE_RESTUDY;
5384 DEBUG_STUDYDATA("post-fin:",data,depth);
5386 return min < stopmin ? min : stopmin;
5390 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5392 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5394 PERL_ARGS_ASSERT_ADD_DATA;
5396 Renewc(RExC_rxi->data,
5397 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5398 char, struct reg_data);
5400 Renew(RExC_rxi->data->what, count + n, U8);
5402 Newx(RExC_rxi->data->what, n, U8);
5403 RExC_rxi->data->count = count + n;
5404 Copy(s, RExC_rxi->data->what + count, n, U8);
5408 /*XXX: todo make this not included in a non debugging perl */
5409 #ifndef PERL_IN_XSUB_RE
5411 Perl_reginitcolors(pTHX)
5414 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5416 char *t = savepv(s);
5420 t = strchr(t, '\t');
5426 PL_colors[i] = t = (char *)"";
5431 PL_colors[i++] = (char *)"";
5438 #ifdef TRIE_STUDY_OPT
5439 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5442 (data.flags & SCF_TRIE_RESTUDY) \
5450 #define CHECK_RESTUDY_GOTO_butfirst
5454 * pregcomp - compile a regular expression into internal code
5456 * Decides which engine's compiler to call based on the hint currently in
5460 #ifndef PERL_IN_XSUB_RE
5462 /* return the currently in-scope regex engine (or the default if none) */
5464 regexp_engine const *
5465 Perl_current_re_engine(pTHX)
5469 if (IN_PERL_COMPILETIME) {
5470 HV * const table = GvHV(PL_hintgv);
5473 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5474 return &PL_core_reg_engine;
5475 ptr = hv_fetchs(table, "regcomp", FALSE);
5476 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5477 return &PL_core_reg_engine;
5478 return INT2PTR(regexp_engine*,SvIV(*ptr));
5482 if (!PL_curcop->cop_hints_hash)
5483 return &PL_core_reg_engine;
5484 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5485 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5486 return &PL_core_reg_engine;
5487 return INT2PTR(regexp_engine*,SvIV(ptr));
5493 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5496 regexp_engine const *eng = current_re_engine();
5497 GET_RE_DEBUG_FLAGS_DECL;
5499 PERL_ARGS_ASSERT_PREGCOMP;
5501 /* Dispatch a request to compile a regexp to correct regexp engine. */
5503 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5506 return CALLREGCOMP_ENG(eng, pattern, flags);
5510 /* public(ish) entry point for the perl core's own regex compiling code.
5511 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5512 * pattern rather than a list of OPs, and uses the internal engine rather
5513 * than the current one */
5516 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5518 SV *pat = pattern; /* defeat constness! */
5519 PERL_ARGS_ASSERT_RE_COMPILE;
5520 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5521 #ifdef PERL_IN_XSUB_RE
5524 &PL_core_reg_engine,
5526 NULL, NULL, rx_flags, 0);
5530 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5531 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5532 * point to the realloced string and length.
5534 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5538 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5539 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5541 U8 *const src = (U8*)*pat_p;
5544 STRLEN s = 0, d = 0;
5546 GET_RE_DEBUG_FLAGS_DECL;
5548 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5549 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5551 Newx(dst, *plen_p * 2 + 1, U8);
5553 while (s < *plen_p) {
5554 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5557 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5558 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5560 if (n < num_code_blocks) {
5561 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5562 pRExC_state->code_blocks[n].start = d;
5563 assert(dst[d] == '(');
5566 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5567 pRExC_state->code_blocks[n].end = d;
5568 assert(dst[d] == ')');
5578 *pat_p = (char*) dst;
5580 RExC_orig_utf8 = RExC_utf8 = 1;
5585 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5586 * while recording any code block indices, and handling overloading,
5587 * nested qr// objects etc. If pat is null, it will allocate a new
5588 * string, or just return the first arg, if there's only one.
5590 * Returns the malloced/updated pat.
5591 * patternp and pat_count is the array of SVs to be concatted;
5592 * oplist is the optional list of ops that generated the SVs;
5593 * recompile_p is a pointer to a boolean that will be set if
5594 * the regex will need to be recompiled.
5595 * delim, if non-null is an SV that will be inserted between each element
5599 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5600 SV *pat, SV ** const patternp, int pat_count,
5601 OP *oplist, bool *recompile_p, SV *delim)
5605 bool use_delim = FALSE;
5606 bool alloced = FALSE;
5608 /* if we know we have at least two args, create an empty string,
5609 * then concatenate args to that. For no args, return an empty string */
5610 if (!pat && pat_count != 1) {
5611 pat = newSVpvn("", 0);
5616 for (svp = patternp; svp < patternp + pat_count; svp++) {
5619 STRLEN orig_patlen = 0;
5621 SV *msv = use_delim ? delim : *svp;
5622 if (!msv) msv = &PL_sv_undef;
5624 /* if we've got a delimiter, we go round the loop twice for each
5625 * svp slot (except the last), using the delimiter the second
5634 if (SvTYPE(msv) == SVt_PVAV) {
5635 /* we've encountered an interpolated array within
5636 * the pattern, e.g. /...@a..../. Expand the list of elements,
5637 * then recursively append elements.
5638 * The code in this block is based on S_pushav() */
5640 AV *const av = (AV*)msv;
5641 const SSize_t maxarg = AvFILL(av) + 1;
5645 assert(oplist->op_type == OP_PADAV
5646 || oplist->op_type == OP_RV2AV);
5647 oplist = oplist->op_sibling;;
5650 if (SvRMAGICAL(av)) {
5653 Newx(array, maxarg, SV*);
5655 for (i=0; i < maxarg; i++) {
5656 SV ** const svp = av_fetch(av, i, FALSE);
5657 array[i] = svp ? *svp : &PL_sv_undef;
5661 array = AvARRAY(av);
5663 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5664 array, maxarg, NULL, recompile_p,
5666 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5672 /* we make the assumption here that each op in the list of
5673 * op_siblings maps to one SV pushed onto the stack,
5674 * except for code blocks, with have both an OP_NULL and
5676 * This allows us to match up the list of SVs against the
5677 * list of OPs to find the next code block.
5679 * Note that PUSHMARK PADSV PADSV ..
5681 * PADRANGE PADSV PADSV ..
5682 * so the alignment still works. */
5685 if (oplist->op_type == OP_NULL
5686 && (oplist->op_flags & OPf_SPECIAL))
5688 assert(n < pRExC_state->num_code_blocks);
5689 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5690 pRExC_state->code_blocks[n].block = oplist;
5691 pRExC_state->code_blocks[n].src_regex = NULL;
5694 oplist = oplist->op_sibling; /* skip CONST */
5697 oplist = oplist->op_sibling;;
5700 /* apply magic and QR overloading to arg */
5703 if (SvROK(msv) && SvAMAGIC(msv)) {
5704 SV *sv = AMG_CALLunary(msv, regexp_amg);
5708 if (SvTYPE(sv) != SVt_REGEXP)
5709 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5714 /* try concatenation overload ... */
5715 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5716 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5719 /* overloading involved: all bets are off over literal
5720 * code. Pretend we haven't seen it */
5721 pRExC_state->num_code_blocks -= n;
5725 /* ... or failing that, try "" overload */
5726 while (SvAMAGIC(msv)
5727 && (sv = AMG_CALLunary(msv, string_amg))
5731 && SvRV(msv) == SvRV(sv))
5736 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5740 /* this is a partially unrolled
5741 * sv_catsv_nomg(pat, msv);
5742 * that allows us to adjust code block indices if
5745 char *dst = SvPV_force_nomg(pat, dlen);
5747 if (SvUTF8(msv) && !SvUTF8(pat)) {
5748 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5749 sv_setpvn(pat, dst, dlen);
5752 sv_catsv_nomg(pat, msv);
5759 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5762 /* extract any code blocks within any embedded qr//'s */
5763 if (rx && SvTYPE(rx) == SVt_REGEXP
5764 && RX_ENGINE((REGEXP*)rx)->op_comp)
5767 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5768 if (ri->num_code_blocks) {
5770 /* the presence of an embedded qr// with code means
5771 * we should always recompile: the text of the
5772 * qr// may not have changed, but it may be a
5773 * different closure than last time */
5775 Renew(pRExC_state->code_blocks,
5776 pRExC_state->num_code_blocks + ri->num_code_blocks,
5777 struct reg_code_block);
5778 pRExC_state->num_code_blocks += ri->num_code_blocks;
5780 for (i=0; i < ri->num_code_blocks; i++) {
5781 struct reg_code_block *src, *dst;
5782 STRLEN offset = orig_patlen
5783 + ReANY((REGEXP *)rx)->pre_prefix;
5784 assert(n < pRExC_state->num_code_blocks);
5785 src = &ri->code_blocks[i];
5786 dst = &pRExC_state->code_blocks[n];
5787 dst->start = src->start + offset;
5788 dst->end = src->end + offset;
5789 dst->block = src->block;
5790 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5799 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5808 /* see if there are any run-time code blocks in the pattern.
5809 * False positives are allowed */
5812 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5813 char *pat, STRLEN plen)
5818 for (s = 0; s < plen; s++) {
5819 if (n < pRExC_state->num_code_blocks
5820 && s == pRExC_state->code_blocks[n].start)
5822 s = pRExC_state->code_blocks[n].end;
5826 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5828 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5830 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5837 /* Handle run-time code blocks. We will already have compiled any direct
5838 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5839 * copy of it, but with any literal code blocks blanked out and
5840 * appropriate chars escaped; then feed it into
5842 * eval "qr'modified_pattern'"
5846 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5850 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5852 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5853 * and merge them with any code blocks of the original regexp.
5855 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5856 * instead, just save the qr and return FALSE; this tells our caller that
5857 * the original pattern needs upgrading to utf8.
5861 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5862 char *pat, STRLEN plen)
5866 GET_RE_DEBUG_FLAGS_DECL;
5868 if (pRExC_state->runtime_code_qr) {
5869 /* this is the second time we've been called; this should
5870 * only happen if the main pattern got upgraded to utf8
5871 * during compilation; re-use the qr we compiled first time
5872 * round (which should be utf8 too)
5874 qr = pRExC_state->runtime_code_qr;
5875 pRExC_state->runtime_code_qr = NULL;
5876 assert(RExC_utf8 && SvUTF8(qr));
5882 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5886 /* determine how many extra chars we need for ' and \ escaping */
5887 for (s = 0; s < plen; s++) {
5888 if (pat[s] == '\'' || pat[s] == '\\')
5892 Newx(newpat, newlen, char);
5894 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5896 for (s = 0; s < plen; s++) {
5897 if (n < pRExC_state->num_code_blocks
5898 && s == pRExC_state->code_blocks[n].start)
5900 /* blank out literal code block */
5901 assert(pat[s] == '(');
5902 while (s <= pRExC_state->code_blocks[n].end) {
5910 if (pat[s] == '\'' || pat[s] == '\\')
5915 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5919 PerlIO_printf(Perl_debug_log,
5920 "%sre-parsing pattern for runtime code:%s %s\n",
5921 PL_colors[4],PL_colors[5],newpat);
5924 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5930 PUSHSTACKi(PERLSI_REQUIRE);
5931 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5932 * parsing qr''; normally only q'' does this. It also alters
5934 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5935 SvREFCNT_dec_NN(sv);
5940 SV * const errsv = ERRSV;
5941 if (SvTRUE_NN(errsv))
5943 Safefree(pRExC_state->code_blocks);
5944 /* use croak_sv ? */
5945 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5948 assert(SvROK(qr_ref));
5950 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5951 /* the leaving below frees the tmp qr_ref.
5952 * Give qr a life of its own */
5960 if (!RExC_utf8 && SvUTF8(qr)) {
5961 /* first time through; the pattern got upgraded; save the
5962 * qr for the next time through */
5963 assert(!pRExC_state->runtime_code_qr);
5964 pRExC_state->runtime_code_qr = qr;
5969 /* extract any code blocks within the returned qr// */
5972 /* merge the main (r1) and run-time (r2) code blocks into one */
5974 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5975 struct reg_code_block *new_block, *dst;
5976 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5979 if (!r2->num_code_blocks) /* we guessed wrong */
5981 SvREFCNT_dec_NN(qr);
5986 r1->num_code_blocks + r2->num_code_blocks,
5987 struct reg_code_block);
5990 while ( i1 < r1->num_code_blocks
5991 || i2 < r2->num_code_blocks)
5993 struct reg_code_block *src;
5996 if (i1 == r1->num_code_blocks) {
5997 src = &r2->code_blocks[i2++];
6000 else if (i2 == r2->num_code_blocks)
6001 src = &r1->code_blocks[i1++];
6002 else if ( r1->code_blocks[i1].start
6003 < r2->code_blocks[i2].start)
6005 src = &r1->code_blocks[i1++];
6006 assert(src->end < r2->code_blocks[i2].start);
6009 assert( r1->code_blocks[i1].start
6010 > r2->code_blocks[i2].start);
6011 src = &r2->code_blocks[i2++];
6013 assert(src->end < r1->code_blocks[i1].start);
6016 assert(pat[src->start] == '(');
6017 assert(pat[src->end] == ')');
6018 dst->start = src->start;
6019 dst->end = src->end;
6020 dst->block = src->block;
6021 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6025 r1->num_code_blocks += r2->num_code_blocks;
6026 Safefree(r1->code_blocks);
6027 r1->code_blocks = new_block;
6030 SvREFCNT_dec_NN(qr);
6036 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6037 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6038 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6039 STRLEN longest_length, bool eol, bool meol)
6041 /* This is the common code for setting up the floating and fixed length
6042 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6043 * as to whether succeeded or not */
6048 if (! (longest_length
6049 || (eol /* Can't have SEOL and MULTI */
6050 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6052 /* See comments for join_exact for why REG_SEEN_UNFOLDED_MULTI */
6053 || (RExC_seen & REG_SEEN_UNFOLDED_MULTI))
6058 /* copy the information about the longest from the reg_scan_data
6059 over to the program. */
6060 if (SvUTF8(sv_longest)) {
6061 *rx_utf8 = sv_longest;
6064 *rx_substr = sv_longest;
6067 /* end_shift is how many chars that must be matched that
6068 follow this item. We calculate it ahead of time as once the
6069 lookbehind offset is added in we lose the ability to correctly
6071 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6072 *rx_end_shift = ml - offset
6073 - longest_length + (SvTAIL(sv_longest) != 0)
6076 t = (eol/* Can't have SEOL and MULTI */
6077 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6078 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6084 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6085 * regular expression into internal code.
6086 * The pattern may be passed either as:
6087 * a list of SVs (patternp plus pat_count)
6088 * a list of OPs (expr)
6089 * If both are passed, the SV list is used, but the OP list indicates
6090 * which SVs are actually pre-compiled code blocks
6092 * The SVs in the list have magic and qr overloading applied to them (and
6093 * the list may be modified in-place with replacement SVs in the latter
6096 * If the pattern hasn't changed from old_re, then old_re will be
6099 * eng is the current engine. If that engine has an op_comp method, then
6100 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6101 * do the initial concatenation of arguments and pass on to the external
6104 * If is_bare_re is not null, set it to a boolean indicating whether the
6105 * arg list reduced (after overloading) to a single bare regex which has
6106 * been returned (i.e. /$qr/).
6108 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6110 * pm_flags contains the PMf_* flags, typically based on those from the
6111 * pm_flags field of the related PMOP. Currently we're only interested in
6112 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6114 * We can't allocate space until we know how big the compiled form will be,
6115 * but we can't compile it (and thus know how big it is) until we've got a
6116 * place to put the code. So we cheat: we compile it twice, once with code
6117 * generation turned off and size counting turned on, and once "for real".
6118 * This also means that we don't allocate space until we are sure that the
6119 * thing really will compile successfully, and we never have to move the
6120 * code and thus invalidate pointers into it. (Note that it has to be in
6121 * one piece because free() must be able to free it all.) [NB: not true in perl]
6123 * Beware that the optimization-preparation code in here knows about some
6124 * of the structure of the compiled regexp. [I'll say.]
6128 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6129 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6130 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6135 regexp_internal *ri;
6143 SV *code_blocksv = NULL;
6144 SV** new_patternp = patternp;
6146 /* these are all flags - maybe they should be turned
6147 * into a single int with different bit masks */
6148 I32 sawlookahead = 0;
6153 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6155 bool runtime_code = 0;
6157 RExC_state_t RExC_state;
6158 RExC_state_t * const pRExC_state = &RExC_state;
6159 #ifdef TRIE_STUDY_OPT
6161 RExC_state_t copyRExC_state;
6163 GET_RE_DEBUG_FLAGS_DECL;
6165 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6167 DEBUG_r(if (!PL_colorset) reginitcolors());
6169 #ifndef PERL_IN_XSUB_RE
6170 /* Initialize these here instead of as-needed, as is quick and avoids
6171 * having to test them each time otherwise */
6172 if (! PL_AboveLatin1) {
6173 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6174 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6175 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6176 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6177 PL_HasMultiCharFold =
6178 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6182 pRExC_state->code_blocks = NULL;
6183 pRExC_state->num_code_blocks = 0;
6186 *is_bare_re = FALSE;
6188 if (expr && (expr->op_type == OP_LIST ||
6189 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6190 /* allocate code_blocks if needed */
6194 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6195 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6196 ncode++; /* count of DO blocks */
6198 pRExC_state->num_code_blocks = ncode;
6199 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6204 /* compile-time pattern with just OP_CONSTs and DO blocks */
6209 /* find how many CONSTs there are */
6212 if (expr->op_type == OP_CONST)
6215 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6216 if (o->op_type == OP_CONST)
6220 /* fake up an SV array */
6222 assert(!new_patternp);
6223 Newx(new_patternp, n, SV*);
6224 SAVEFREEPV(new_patternp);
6228 if (expr->op_type == OP_CONST)
6229 new_patternp[n] = cSVOPx_sv(expr);
6231 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6232 if (o->op_type == OP_CONST)
6233 new_patternp[n++] = cSVOPo_sv;
6238 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6239 "Assembling pattern from %d elements%s\n", pat_count,
6240 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6242 /* set expr to the first arg op */
6244 if (pRExC_state->num_code_blocks
6245 && expr->op_type != OP_CONST)
6247 expr = cLISTOPx(expr)->op_first;
6248 assert( expr->op_type == OP_PUSHMARK
6249 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6250 || expr->op_type == OP_PADRANGE);
6251 expr = expr->op_sibling;
6254 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6255 expr, &recompile, NULL);
6257 /* handle bare (possibly after overloading) regex: foo =~ $re */
6262 if (SvTYPE(re) == SVt_REGEXP) {
6266 Safefree(pRExC_state->code_blocks);
6267 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6268 "Precompiled pattern%s\n",
6269 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6275 exp = SvPV_nomg(pat, plen);
6277 if (!eng->op_comp) {
6278 if ((SvUTF8(pat) && IN_BYTES)
6279 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6281 /* make a temporary copy; either to convert to bytes,
6282 * or to avoid repeating get-magic / overloaded stringify */
6283 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6284 (IN_BYTES ? 0 : SvUTF8(pat)));
6286 Safefree(pRExC_state->code_blocks);
6287 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6290 /* ignore the utf8ness if the pattern is 0 length */
6291 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6292 RExC_uni_semantics = 0;
6293 RExC_contains_locale = 0;
6294 RExC_contains_i = 0;
6295 pRExC_state->runtime_code_qr = NULL;
6298 SV *dsv= sv_newmortal();
6299 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6300 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6301 PL_colors[4],PL_colors[5],s);
6305 /* we jump here if we upgrade the pattern to utf8 and have to
6308 if ((pm_flags & PMf_USE_RE_EVAL)
6309 /* this second condition covers the non-regex literal case,
6310 * i.e. $foo =~ '(?{})'. */
6311 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6313 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6315 /* return old regex if pattern hasn't changed */
6316 /* XXX: note in the below we have to check the flags as well as the
6319 * Things get a touch tricky as we have to compare the utf8 flag
6320 * independently from the compile flags. */
6324 && !!RX_UTF8(old_re) == !!RExC_utf8
6325 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6326 && RX_PRECOMP(old_re)
6327 && RX_PRELEN(old_re) == plen
6328 && memEQ(RX_PRECOMP(old_re), exp, plen)
6329 && !runtime_code /* with runtime code, always recompile */ )
6331 Safefree(pRExC_state->code_blocks);
6335 rx_flags = orig_rx_flags;
6337 if (rx_flags & PMf_FOLD) {
6338 RExC_contains_i = 1;
6340 if (initial_charset == REGEX_LOCALE_CHARSET) {
6341 RExC_contains_locale = 1;
6343 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6345 /* Set to use unicode semantics if the pattern is in utf8 and has the
6346 * 'depends' charset specified, as it means unicode when utf8 */
6347 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6351 RExC_flags = rx_flags;
6352 RExC_pm_flags = pm_flags;
6355 if (TAINTING_get && TAINT_get)
6356 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6358 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6359 /* whoops, we have a non-utf8 pattern, whilst run-time code
6360 * got compiled as utf8. Try again with a utf8 pattern */
6361 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6362 pRExC_state->num_code_blocks);
6363 goto redo_first_pass;
6366 assert(!pRExC_state->runtime_code_qr);
6371 RExC_in_lookbehind = 0;
6372 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6374 RExC_override_recoding = 0;
6375 RExC_in_multi_char_class = 0;
6377 /* First pass: determine size, legality. */
6380 RExC_end = exp + plen;
6385 RExC_emit = (regnode *) &RExC_emit_dummy;
6386 RExC_whilem_seen = 0;
6387 RExC_open_parens = NULL;
6388 RExC_close_parens = NULL;
6390 RExC_paren_names = NULL;
6392 RExC_paren_name_list = NULL;
6394 RExC_recurse = NULL;
6395 RExC_study_chunk_recursed = NULL;
6396 RExC_study_chunk_recursed_bytes= 0;
6397 RExC_recurse_count = 0;
6398 pRExC_state->code_index = 0;
6400 #if 0 /* REGC() is (currently) a NOP at the first pass.
6401 * Clever compilers notice this and complain. --jhi */
6402 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6405 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6407 RExC_lastparse=NULL;
6409 /* reg may croak on us, not giving us a chance to free
6410 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6411 need it to survive as long as the regexp (qr/(?{})/).
6412 We must check that code_blocksv is not already set, because we may
6413 have jumped back to restart the sizing pass. */
6414 if (pRExC_state->code_blocks && !code_blocksv) {
6415 code_blocksv = newSV_type(SVt_PV);
6416 SAVEFREESV(code_blocksv);
6417 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6418 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6420 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6421 /* It's possible to write a regexp in ascii that represents Unicode
6422 codepoints outside of the byte range, such as via \x{100}. If we
6423 detect such a sequence we have to convert the entire pattern to utf8
6424 and then recompile, as our sizing calculation will have been based
6425 on 1 byte == 1 character, but we will need to use utf8 to encode
6426 at least some part of the pattern, and therefore must convert the whole
6429 if (flags & RESTART_UTF8) {
6430 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6431 pRExC_state->num_code_blocks);
6432 goto redo_first_pass;
6434 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6437 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6440 PerlIO_printf(Perl_debug_log,
6441 "Required size %"IVdf" nodes\n"
6442 "Starting second pass (creation)\n",
6445 RExC_lastparse=NULL;
6448 /* The first pass could have found things that force Unicode semantics */
6449 if ((RExC_utf8 || RExC_uni_semantics)
6450 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6452 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6455 /* Small enough for pointer-storage convention?
6456 If extralen==0, this means that we will not need long jumps. */
6457 if (RExC_size >= 0x10000L && RExC_extralen)
6458 RExC_size += RExC_extralen;
6461 if (RExC_whilem_seen > 15)
6462 RExC_whilem_seen = 15;
6464 /* Allocate space and zero-initialize. Note, the two step process
6465 of zeroing when in debug mode, thus anything assigned has to
6466 happen after that */
6467 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6469 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6470 char, regexp_internal);
6471 if ( r == NULL || ri == NULL )
6472 FAIL("Regexp out of space");
6474 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6475 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6478 /* bulk initialize base fields with 0. */
6479 Zero(ri, sizeof(regexp_internal), char);
6482 /* non-zero initialization begins here */
6485 r->extflags = rx_flags;
6486 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6488 if (pm_flags & PMf_IS_QR) {
6489 ri->code_blocks = pRExC_state->code_blocks;
6490 ri->num_code_blocks = pRExC_state->num_code_blocks;
6495 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6496 if (pRExC_state->code_blocks[n].src_regex)
6497 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6498 SAVEFREEPV(pRExC_state->code_blocks);
6502 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6503 bool has_charset = (get_regex_charset(r->extflags)
6504 != REGEX_DEPENDS_CHARSET);
6506 /* The caret is output if there are any defaults: if not all the STD
6507 * flags are set, or if no character set specifier is needed */
6509 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6511 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)
6512 == REG_SEEN_RUN_ON_COMMENT);
6513 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6514 >> RXf_PMf_STD_PMMOD_SHIFT);
6515 const char *fptr = STD_PAT_MODS; /*"msix"*/
6517 /* Allocate for the worst case, which is all the std flags are turned
6518 * on. If more precision is desired, we could do a population count of
6519 * the flags set. This could be done with a small lookup table, or by
6520 * shifting, masking and adding, or even, when available, assembly
6521 * language for a machine-language population count.
6522 * We never output a minus, as all those are defaults, so are
6523 * covered by the caret */
6524 const STRLEN wraplen = plen + has_p + has_runon
6525 + has_default /* If needs a caret */
6527 /* If needs a character set specifier */
6528 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6529 + (sizeof(STD_PAT_MODS) - 1)
6530 + (sizeof("(?:)") - 1);
6532 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6533 r->xpv_len_u.xpvlenu_pv = p;
6535 SvFLAGS(rx) |= SVf_UTF8;
6538 /* If a default, cover it using the caret */
6540 *p++= DEFAULT_PAT_MOD;
6544 const char* const name = get_regex_charset_name(r->extflags, &len);
6545 Copy(name, p, len, char);
6549 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6552 while((ch = *fptr++)) {
6560 Copy(RExC_precomp, p, plen, char);
6561 assert ((RX_WRAPPED(rx) - p) < 16);
6562 r->pre_prefix = p - RX_WRAPPED(rx);
6568 SvCUR_set(rx, p - RX_WRAPPED(rx));
6572 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6574 /* setup various meta data about recursion, this all requires
6575 * RExC_npar to be correctly set, and a bit later on we clear it */
6576 if (RExC_seen & REG_SEEN_RECURSE) {
6577 Newxz(RExC_open_parens, RExC_npar,regnode *);
6578 SAVEFREEPV(RExC_open_parens);
6579 Newxz(RExC_close_parens,RExC_npar,regnode *);
6580 SAVEFREEPV(RExC_close_parens);
6582 if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) {
6583 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6584 * So its 1 if there are no parens. */
6585 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6586 ((RExC_npar & 0x07) != 0);
6587 Newx(RExC_study_chunk_recursed,
6588 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6589 SAVEFREEPV(RExC_study_chunk_recursed);
6592 /* Useful during FAIL. */
6593 #ifdef RE_TRACK_PATTERN_OFFSETS
6594 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6595 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6596 "%s %"UVuf" bytes for offset annotations.\n",
6597 ri->u.offsets ? "Got" : "Couldn't get",
6598 (UV)((2*RExC_size+1) * sizeof(U32))));
6600 SetProgLen(ri,RExC_size);
6605 /* Second pass: emit code. */
6606 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6607 RExC_pm_flags = pm_flags;
6609 RExC_end = exp + plen;
6612 RExC_emit_start = ri->program;
6613 RExC_emit = ri->program;
6614 RExC_emit_bound = ri->program + RExC_size + 1;
6615 pRExC_state->code_index = 0;
6617 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6618 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6620 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6622 /* XXXX To minimize changes to RE engine we always allocate
6623 3-units-long substrs field. */
6624 Newx(r->substrs, 1, struct reg_substr_data);
6625 if (RExC_recurse_count) {
6626 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6627 SAVEFREEPV(RExC_recurse);
6631 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6632 Zero(r->substrs, 1, struct reg_substr_data);
6633 if (RExC_study_chunk_recursed)
6634 Zero(RExC_study_chunk_recursed,
6635 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6637 #ifdef TRIE_STUDY_OPT
6639 StructCopy(&zero_scan_data, &data, scan_data_t);
6640 copyRExC_state = RExC_state;
6643 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6645 RExC_state = copyRExC_state;
6646 if (seen & REG_TOP_LEVEL_BRANCHES)
6647 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6649 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6650 StructCopy(&zero_scan_data, &data, scan_data_t);
6653 StructCopy(&zero_scan_data, &data, scan_data_t);
6656 /* Dig out information for optimizations. */
6657 r->extflags = RExC_flags; /* was pm_op */
6658 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6661 SvUTF8_on(rx); /* Unicode in it? */
6662 ri->regstclass = NULL;
6663 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6664 r->intflags |= PREGf_NAUGHTY;
6665 scan = ri->program + 1; /* First BRANCH. */
6667 /* testing for BRANCH here tells us whether there is "must appear"
6668 data in the pattern. If there is then we can use it for optimisations */
6669 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice.
6672 STRLEN longest_float_length, longest_fixed_length;
6673 regnode_ssc ch_class; /* pointed to by data */
6675 SSize_t last_close = 0; /* pointed to by data */
6676 regnode *first= scan;
6677 regnode *first_next= regnext(first);
6679 * Skip introductions and multiplicators >= 1
6680 * so that we can extract the 'meat' of the pattern that must
6681 * match in the large if() sequence following.
6682 * NOTE that EXACT is NOT covered here, as it is normally
6683 * picked up by the optimiser separately.
6685 * This is unfortunate as the optimiser isnt handling lookahead
6686 * properly currently.
6689 while ((OP(first) == OPEN && (sawopen = 1)) ||
6690 /* An OR of *one* alternative - should not happen now. */
6691 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6692 /* for now we can't handle lookbehind IFMATCH*/
6693 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6694 (OP(first) == PLUS) ||
6695 (OP(first) == MINMOD) ||
6696 /* An {n,m} with n>0 */
6697 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6698 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6701 * the only op that could be a regnode is PLUS, all the rest
6702 * will be regnode_1 or regnode_2.
6704 * (yves doesn't think this is true)
6706 if (OP(first) == PLUS)
6709 if (OP(first) == MINMOD)
6711 first += regarglen[OP(first)];
6713 first = NEXTOPER(first);
6714 first_next= regnext(first);
6717 /* Starting-point info. */
6719 DEBUG_PEEP("first:",first,0);
6720 /* Ignore EXACT as we deal with it later. */
6721 if (PL_regkind[OP(first)] == EXACT) {
6722 if (OP(first) == EXACT)
6723 NOOP; /* Empty, get anchored substr later. */
6725 ri->regstclass = first;
6728 else if (PL_regkind[OP(first)] == TRIE &&
6729 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6732 /* this can happen only on restudy */
6733 if ( OP(first) == TRIE ) {
6734 struct regnode_1 *trieop = (struct regnode_1 *)
6735 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6736 StructCopy(first,trieop,struct regnode_1);
6737 trie_op=(regnode *)trieop;
6739 struct regnode_charclass *trieop = (struct regnode_charclass *)
6740 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6741 StructCopy(first,trieop,struct regnode_charclass);
6742 trie_op=(regnode *)trieop;
6745 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6746 ri->regstclass = trie_op;
6749 else if (REGNODE_SIMPLE(OP(first)))
6750 ri->regstclass = first;
6751 else if (PL_regkind[OP(first)] == BOUND ||
6752 PL_regkind[OP(first)] == NBOUND)
6753 ri->regstclass = first;
6754 else if (PL_regkind[OP(first)] == BOL) {
6755 r->extflags |= (OP(first) == MBOL
6757 : (OP(first) == SBOL
6760 first = NEXTOPER(first);
6763 else if (OP(first) == GPOS) {
6764 r->extflags |= RXf_ANCH_GPOS;
6765 first = NEXTOPER(first);
6768 else if ((!sawopen || !RExC_sawback) &&
6769 (OP(first) == STAR &&
6770 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6771 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6773 /* turn .* into ^.* with an implied $*=1 */
6775 (OP(NEXTOPER(first)) == REG_ANY)
6778 r->extflags |= type;
6779 r->intflags |= PREGf_IMPLICIT;
6780 first = NEXTOPER(first);
6783 if (sawplus && !sawminmod && !sawlookahead
6784 && (!sawopen || !RExC_sawback)
6785 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6786 /* x+ must match at the 1st pos of run of x's */
6787 r->intflags |= PREGf_SKIP;
6789 /* Scan is after the zeroth branch, first is atomic matcher. */
6790 #ifdef TRIE_STUDY_OPT
6793 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6794 (IV)(first - scan + 1))
6798 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6799 (IV)(first - scan + 1))
6805 * If there's something expensive in the r.e., find the
6806 * longest literal string that must appear and make it the
6807 * regmust. Resolve ties in favor of later strings, since
6808 * the regstart check works with the beginning of the r.e.
6809 * and avoiding duplication strengthens checking. Not a
6810 * strong reason, but sufficient in the absence of others.
6811 * [Now we resolve ties in favor of the earlier string if
6812 * it happens that c_offset_min has been invalidated, since the
6813 * earlier string may buy us something the later one won't.]
6816 data.longest_fixed = newSVpvs("");
6817 data.longest_float = newSVpvs("");
6818 data.last_found = newSVpvs("");
6819 data.longest = &(data.longest_fixed);
6820 ENTER_with_name("study_chunk");
6821 SAVEFREESV(data.longest_fixed);
6822 SAVEFREESV(data.longest_float);
6823 SAVEFREESV(data.last_found);
6825 if (!ri->regstclass) {
6826 ssc_init(pRExC_state, &ch_class);
6827 data.start_class = &ch_class;
6828 stclass_flag = SCF_DO_STCLASS_AND;
6829 } else /* XXXX Check for BOUND? */
6831 data.last_closep = &last_close;
6834 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6835 scan + RExC_size, /* Up to end */
6837 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6838 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6842 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6845 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6846 && data.last_start_min == 0 && data.last_end > 0
6847 && !RExC_seen_zerolen
6848 && !(RExC_seen & REG_SEEN_VERBARG)
6849 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6850 r->extflags |= RXf_CHECK_ALL;
6851 scan_commit(pRExC_state, &data,&minlen,0);
6853 longest_float_length = CHR_SVLEN(data.longest_float);
6855 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6856 && data.offset_fixed == data.offset_float_min
6857 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6858 && S_setup_longest (aTHX_ pRExC_state,
6862 &(r->float_end_shift),
6863 data.lookbehind_float,
6864 data.offset_float_min,
6866 longest_float_length,
6867 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6868 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6870 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6871 r->float_max_offset = data.offset_float_max;
6872 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6873 r->float_max_offset -= data.lookbehind_float;
6874 SvREFCNT_inc_simple_void_NN(data.longest_float);
6877 r->float_substr = r->float_utf8 = NULL;
6878 longest_float_length = 0;
6881 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6883 if (S_setup_longest (aTHX_ pRExC_state,
6885 &(r->anchored_utf8),
6886 &(r->anchored_substr),
6887 &(r->anchored_end_shift),
6888 data.lookbehind_fixed,
6891 longest_fixed_length,
6892 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6893 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6895 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6896 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6899 r->anchored_substr = r->anchored_utf8 = NULL;
6900 longest_fixed_length = 0;
6902 LEAVE_with_name("study_chunk");
6905 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6906 ri->regstclass = NULL;
6908 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6910 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6911 && !ssc_is_anything(data.start_class))
6913 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6915 ssc_finalize(pRExC_state, data.start_class);
6917 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6918 StructCopy(data.start_class,
6919 (regnode_ssc*)RExC_rxi->data->data[n],
6921 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6922 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6923 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6924 regprop(r, sv, (regnode*)data.start_class);
6925 PerlIO_printf(Perl_debug_log,
6926 "synthetic stclass \"%s\".\n",
6927 SvPVX_const(sv));});
6928 data.start_class = NULL;
6931 /* A temporary algorithm prefers floated substr to fixed one to dig
6933 if (longest_fixed_length > longest_float_length) {
6934 r->check_end_shift = r->anchored_end_shift;
6935 r->check_substr = r->anchored_substr;
6936 r->check_utf8 = r->anchored_utf8;
6937 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6938 if (r->extflags & RXf_ANCH_SINGLE)
6939 r->extflags |= RXf_NOSCAN;
6942 r->check_end_shift = r->float_end_shift;
6943 r->check_substr = r->float_substr;
6944 r->check_utf8 = r->float_utf8;
6945 r->check_offset_min = r->float_min_offset;
6946 r->check_offset_max = r->float_max_offset;
6948 if ((r->check_substr || r->check_utf8) ) {
6949 r->extflags |= RXf_USE_INTUIT;
6950 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6951 r->extflags |= RXf_INTUIT_TAIL;
6953 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6954 if ( (STRLEN)minlen < longest_float_length )
6955 minlen= longest_float_length;
6956 if ( (STRLEN)minlen < longest_fixed_length )
6957 minlen= longest_fixed_length;
6961 /* Several toplevels. Best we can is to set minlen. */
6963 regnode_ssc ch_class;
6964 SSize_t last_close = 0;
6966 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6968 scan = ri->program + 1;
6969 ssc_init(pRExC_state, &ch_class);
6970 data.start_class = &ch_class;
6971 data.last_closep = &last_close;
6974 minlen = study_chunk(pRExC_state,
6975 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
6976 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
6977 ? SCF_TRIE_DOING_RESTUDY
6981 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6983 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6984 = r->float_substr = r->float_utf8 = NULL;
6986 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6987 && ! ssc_is_anything(data.start_class))
6989 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6991 ssc_finalize(pRExC_state, data.start_class);
6993 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6994 StructCopy(data.start_class,
6995 (regnode_ssc*)RExC_rxi->data->data[n],
6997 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6998 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6999 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7000 regprop(r, sv, (regnode*)data.start_class);
7001 PerlIO_printf(Perl_debug_log,
7002 "synthetic stclass \"%s\".\n",
7003 SvPVX_const(sv));});
7004 data.start_class = NULL;
7008 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7009 the "real" pattern. */
7011 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
7012 (IV)minlen, (IV)r->minlen);
7014 r->minlenret = minlen;
7015 if (r->minlen < minlen)
7018 if (RExC_seen & REG_SEEN_GPOS)
7019 r->extflags |= RXf_GPOS_SEEN;
7020 if (RExC_seen & REG_SEEN_LOOKBEHIND)
7021 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7023 if (pRExC_state->num_code_blocks)
7024 r->extflags |= RXf_EVAL_SEEN;
7025 if (RExC_seen & REG_SEEN_CANY)
7026 r->extflags |= RXf_CANY_SEEN;
7027 if (RExC_seen & REG_SEEN_VERBARG)
7029 r->intflags |= PREGf_VERBARG_SEEN;
7030 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7032 if (RExC_seen & REG_SEEN_CUTGROUP)
7033 r->intflags |= PREGf_CUTGROUP_SEEN;
7034 if (pm_flags & PMf_USE_RE_EVAL)
7035 r->intflags |= PREGf_USE_RE_EVAL;
7036 if (RExC_paren_names)
7037 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7039 RXp_PAREN_NAMES(r) = NULL;
7042 regnode *first = ri->program + 1;
7044 regnode *next = NEXTOPER(first);
7047 if (PL_regkind[fop] == NOTHING && nop == END)
7048 r->extflags |= RXf_NULL;
7049 else if (PL_regkind[fop] == BOL && nop == END)
7050 r->extflags |= RXf_START_ONLY;
7051 else if (fop == PLUS
7052 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7053 && OP(regnext(first)) == END)
7054 r->extflags |= RXf_WHITE;
7055 else if ( r->extflags & RXf_SPLIT
7057 && STR_LEN(first) == 1
7058 && *(STRING(first)) == ' '
7059 && OP(regnext(first)) == END )
7060 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7064 if (RExC_paren_names) {
7065 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7066 ri->data->data[ri->name_list_idx]
7067 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7070 ri->name_list_idx = 0;
7072 if (RExC_recurse_count) {
7073 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7074 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7075 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7078 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7079 /* assume we don't need to swap parens around before we match */
7083 PerlIO_printf(Perl_debug_log,"Final program:\n");
7086 #ifdef RE_TRACK_PATTERN_OFFSETS
7087 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7088 const STRLEN len = ri->u.offsets[0];
7090 GET_RE_DEBUG_FLAGS_DECL;
7091 PerlIO_printf(Perl_debug_log,
7092 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7093 for (i = 1; i <= len; i++) {
7094 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7095 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7096 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7098 PerlIO_printf(Perl_debug_log, "\n");
7103 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7104 * by setting the regexp SV to readonly-only instead. If the
7105 * pattern's been recompiled, the USEDness should remain. */
7106 if (old_re && SvREADONLY(old_re))
7114 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7117 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7119 PERL_UNUSED_ARG(value);
7121 if (flags & RXapif_FETCH) {
7122 return reg_named_buff_fetch(rx, key, flags);
7123 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7124 Perl_croak_no_modify();
7126 } else if (flags & RXapif_EXISTS) {
7127 return reg_named_buff_exists(rx, key, flags)
7130 } else if (flags & RXapif_REGNAMES) {
7131 return reg_named_buff_all(rx, flags);
7132 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7133 return reg_named_buff_scalar(rx, flags);
7135 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7141 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7144 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7145 PERL_UNUSED_ARG(lastkey);
7147 if (flags & RXapif_FIRSTKEY)
7148 return reg_named_buff_firstkey(rx, flags);
7149 else if (flags & RXapif_NEXTKEY)
7150 return reg_named_buff_nextkey(rx, flags);
7152 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7159 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7162 AV *retarray = NULL;
7164 struct regexp *const rx = ReANY(r);
7166 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7168 if (flags & RXapif_ALL)
7171 if (rx && RXp_PAREN_NAMES(rx)) {
7172 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7175 SV* sv_dat=HeVAL(he_str);
7176 I32 *nums=(I32*)SvPVX(sv_dat);
7177 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7178 if ((I32)(rx->nparens) >= nums[i]
7179 && rx->offs[nums[i]].start != -1
7180 && rx->offs[nums[i]].end != -1)
7183 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7188 ret = newSVsv(&PL_sv_undef);
7191 av_push(retarray, ret);
7194 return newRV_noinc(MUTABLE_SV(retarray));
7201 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7204 struct regexp *const rx = ReANY(r);
7206 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7208 if (rx && RXp_PAREN_NAMES(rx)) {
7209 if (flags & RXapif_ALL) {
7210 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7212 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7214 SvREFCNT_dec_NN(sv);
7226 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7228 struct regexp *const rx = ReANY(r);
7230 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7232 if ( rx && RXp_PAREN_NAMES(rx) ) {
7233 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7235 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7242 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7244 struct regexp *const rx = ReANY(r);
7245 GET_RE_DEBUG_FLAGS_DECL;
7247 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7249 if (rx && RXp_PAREN_NAMES(rx)) {
7250 HV *hv = RXp_PAREN_NAMES(rx);
7252 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7255 SV* sv_dat = HeVAL(temphe);
7256 I32 *nums = (I32*)SvPVX(sv_dat);
7257 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7258 if ((I32)(rx->lastparen) >= nums[i] &&
7259 rx->offs[nums[i]].start != -1 &&
7260 rx->offs[nums[i]].end != -1)
7266 if (parno || flags & RXapif_ALL) {
7267 return newSVhek(HeKEY_hek(temphe));
7275 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7280 struct regexp *const rx = ReANY(r);
7282 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7284 if (rx && RXp_PAREN_NAMES(rx)) {
7285 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7286 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7287 } else if (flags & RXapif_ONE) {
7288 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7289 av = MUTABLE_AV(SvRV(ret));
7290 length = av_len(av);
7291 SvREFCNT_dec_NN(ret);
7292 return newSViv(length + 1);
7294 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7299 return &PL_sv_undef;
7303 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7305 struct regexp *const rx = ReANY(r);
7308 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7310 if (rx && RXp_PAREN_NAMES(rx)) {
7311 HV *hv= RXp_PAREN_NAMES(rx);
7313 (void)hv_iterinit(hv);
7314 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7317 SV* sv_dat = HeVAL(temphe);
7318 I32 *nums = (I32*)SvPVX(sv_dat);
7319 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7320 if ((I32)(rx->lastparen) >= nums[i] &&
7321 rx->offs[nums[i]].start != -1 &&
7322 rx->offs[nums[i]].end != -1)
7328 if (parno || flags & RXapif_ALL) {
7329 av_push(av, newSVhek(HeKEY_hek(temphe)));
7334 return newRV_noinc(MUTABLE_SV(av));
7338 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7341 struct regexp *const rx = ReANY(r);
7347 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7349 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7350 || n == RX_BUFF_IDX_CARET_FULLMATCH
7351 || n == RX_BUFF_IDX_CARET_POSTMATCH
7354 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7356 /* on something like
7359 * the KEEPCOPY is set on the PMOP rather than the regex */
7360 if (PL_curpm && r == PM_GETRE(PL_curpm))
7361 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7370 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7371 /* no need to distinguish between them any more */
7372 n = RX_BUFF_IDX_FULLMATCH;
7374 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7375 && rx->offs[0].start != -1)
7377 /* $`, ${^PREMATCH} */
7378 i = rx->offs[0].start;
7382 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7383 && rx->offs[0].end != -1)
7385 /* $', ${^POSTMATCH} */
7386 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7387 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7390 if ( 0 <= n && n <= (I32)rx->nparens &&
7391 (s1 = rx->offs[n].start) != -1 &&
7392 (t1 = rx->offs[n].end) != -1)
7394 /* $&, ${^MATCH}, $1 ... */
7396 s = rx->subbeg + s1 - rx->suboffset;
7401 assert(s >= rx->subbeg);
7402 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7404 #if NO_TAINT_SUPPORT
7405 sv_setpvn(sv, s, i);
7407 const int oldtainted = TAINT_get;
7409 sv_setpvn(sv, s, i);
7410 TAINT_set(oldtainted);
7412 if ( (rx->extflags & RXf_CANY_SEEN)
7413 ? (RXp_MATCH_UTF8(rx)
7414 && (!i || is_utf8_string((U8*)s, i)))
7415 : (RXp_MATCH_UTF8(rx)) )
7422 if (RXp_MATCH_TAINTED(rx)) {
7423 if (SvTYPE(sv) >= SVt_PVMG) {
7424 MAGIC* const mg = SvMAGIC(sv);
7427 SvMAGIC_set(sv, mg->mg_moremagic);
7429 if ((mgt = SvMAGIC(sv))) {
7430 mg->mg_moremagic = mgt;
7431 SvMAGIC_set(sv, mg);
7442 sv_setsv(sv,&PL_sv_undef);
7448 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7449 SV const * const value)
7451 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7453 PERL_UNUSED_ARG(rx);
7454 PERL_UNUSED_ARG(paren);
7455 PERL_UNUSED_ARG(value);
7458 Perl_croak_no_modify();
7462 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7465 struct regexp *const rx = ReANY(r);
7469 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7471 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7472 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7473 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7476 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7478 /* on something like
7481 * the KEEPCOPY is set on the PMOP rather than the regex */
7482 if (PL_curpm && r == PM_GETRE(PL_curpm))
7483 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7489 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7491 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7492 case RX_BUFF_IDX_PREMATCH: /* $` */
7493 if (rx->offs[0].start != -1) {
7494 i = rx->offs[0].start;
7503 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7504 case RX_BUFF_IDX_POSTMATCH: /* $' */
7505 if (rx->offs[0].end != -1) {
7506 i = rx->sublen - rx->offs[0].end;
7508 s1 = rx->offs[0].end;
7515 default: /* $& / ${^MATCH}, $1, $2, ... */
7516 if (paren <= (I32)rx->nparens &&
7517 (s1 = rx->offs[paren].start) != -1 &&
7518 (t1 = rx->offs[paren].end) != -1)
7524 if (ckWARN(WARN_UNINITIALIZED))
7525 report_uninit((const SV *)sv);
7530 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7531 const char * const s = rx->subbeg - rx->suboffset + s1;
7536 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7543 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7545 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7546 PERL_UNUSED_ARG(rx);
7550 return newSVpvs("Regexp");
7553 /* Scans the name of a named buffer from the pattern.
7554 * If flags is REG_RSN_RETURN_NULL returns null.
7555 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7556 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7557 * to the parsed name as looked up in the RExC_paren_names hash.
7558 * If there is an error throws a vFAIL().. type exception.
7561 #define REG_RSN_RETURN_NULL 0
7562 #define REG_RSN_RETURN_NAME 1
7563 #define REG_RSN_RETURN_DATA 2
7566 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7568 char *name_start = RExC_parse;
7570 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7572 assert (RExC_parse <= RExC_end);
7573 if (RExC_parse == RExC_end) NOOP;
7574 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7575 /* skip IDFIRST by using do...while */
7578 RExC_parse += UTF8SKIP(RExC_parse);
7579 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7583 } while (isWORDCHAR(*RExC_parse));
7585 RExC_parse++; /* so the <- from the vFAIL is after the offending
7587 vFAIL("Group name must start with a non-digit word character");
7591 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7592 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7593 if ( flags == REG_RSN_RETURN_NAME)
7595 else if (flags==REG_RSN_RETURN_DATA) {
7598 if ( ! sv_name ) /* should not happen*/
7599 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7600 if (RExC_paren_names)
7601 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7603 sv_dat = HeVAL(he_str);
7605 vFAIL("Reference to nonexistent named group");
7609 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7610 (unsigned long) flags);
7612 assert(0); /* NOT REACHED */
7617 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7618 int rem=(int)(RExC_end - RExC_parse); \
7627 if (RExC_lastparse!=RExC_parse) \
7628 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7631 iscut ? "..." : "<" \
7634 PerlIO_printf(Perl_debug_log,"%16s",""); \
7637 num = RExC_size + 1; \
7639 num=REG_NODE_NUM(RExC_emit); \
7640 if (RExC_lastnum!=num) \
7641 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7643 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7644 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7645 (int)((depth*2)), "", \
7649 RExC_lastparse=RExC_parse; \
7654 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7655 DEBUG_PARSE_MSG((funcname)); \
7656 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7658 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7659 DEBUG_PARSE_MSG((funcname)); \
7660 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7663 /* This section of code defines the inversion list object and its methods. The
7664 * interfaces are highly subject to change, so as much as possible is static to
7665 * this file. An inversion list is here implemented as a malloc'd C UV array
7666 * as an SVt_INVLIST scalar.
7668 * An inversion list for Unicode is an array of code points, sorted by ordinal
7669 * number. The zeroth element is the first code point in the list. The 1th
7670 * element is the first element beyond that not in the list. In other words,
7671 * the first range is
7672 * invlist[0]..(invlist[1]-1)
7673 * The other ranges follow. Thus every element whose index is divisible by two
7674 * marks the beginning of a range that is in the list, and every element not
7675 * divisible by two marks the beginning of a range not in the list. A single
7676 * element inversion list that contains the single code point N generally
7677 * consists of two elements
7680 * (The exception is when N is the highest representable value on the
7681 * machine, in which case the list containing just it would be a single
7682 * element, itself. By extension, if the last range in the list extends to
7683 * infinity, then the first element of that range will be in the inversion list
7684 * at a position that is divisible by two, and is the final element in the
7686 * Taking the complement (inverting) an inversion list is quite simple, if the
7687 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7688 * This implementation reserves an element at the beginning of each inversion
7689 * list to always contain 0; there is an additional flag in the header which
7690 * indicates if the list begins at the 0, or is offset to begin at the next
7693 * More about inversion lists can be found in "Unicode Demystified"
7694 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7695 * More will be coming when functionality is added later.
7697 * The inversion list data structure is currently implemented as an SV pointing
7698 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7699 * array of UV whose memory management is automatically handled by the existing
7700 * facilities for SV's.
7702 * Some of the methods should always be private to the implementation, and some
7703 * should eventually be made public */
7705 /* The header definitions are in F<inline_invlist.c> */
7707 PERL_STATIC_INLINE UV*
7708 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7710 /* Returns a pointer to the first element in the inversion list's array.
7711 * This is called upon initialization of an inversion list. Where the
7712 * array begins depends on whether the list has the code point U+0000 in it
7713 * or not. The other parameter tells it whether the code that follows this
7714 * call is about to put a 0 in the inversion list or not. The first
7715 * element is either the element reserved for 0, if TRUE, or the element
7716 * after it, if FALSE */
7718 bool* offset = get_invlist_offset_addr(invlist);
7719 UV* zero_addr = (UV *) SvPVX(invlist);
7721 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7724 assert(! _invlist_len(invlist));
7728 /* 1^1 = 0; 1^0 = 1 */
7729 *offset = 1 ^ will_have_0;
7730 return zero_addr + *offset;
7733 PERL_STATIC_INLINE UV*
7734 S_invlist_array(pTHX_ SV* const invlist)
7736 /* Returns the pointer to the inversion list's array. Every time the
7737 * length changes, this needs to be called in case malloc or realloc moved
7740 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7742 /* Must not be empty. If these fail, you probably didn't check for <len>
7743 * being non-zero before trying to get the array */
7744 assert(_invlist_len(invlist));
7746 /* The very first element always contains zero, The array begins either
7747 * there, or if the inversion list is offset, at the element after it.
7748 * The offset header field determines which; it contains 0 or 1 to indicate
7749 * how much additionally to add */
7750 assert(0 == *(SvPVX(invlist)));
7751 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7754 PERL_STATIC_INLINE void
7755 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7757 /* Sets the current number of elements stored in the inversion list.
7758 * Updates SvCUR correspondingly */
7760 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7762 assert(SvTYPE(invlist) == SVt_INVLIST);
7767 : TO_INTERNAL_SIZE(len + offset));
7768 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7771 PERL_STATIC_INLINE IV*
7772 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7774 /* Return the address of the IV that is reserved to hold the cached index
7777 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7779 assert(SvTYPE(invlist) == SVt_INVLIST);
7781 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7784 PERL_STATIC_INLINE IV
7785 S_invlist_previous_index(pTHX_ SV* const invlist)
7787 /* Returns cached index of previous search */
7789 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7791 return *get_invlist_previous_index_addr(invlist);
7794 PERL_STATIC_INLINE void
7795 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7797 /* Caches <index> for later retrieval */
7799 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7801 assert(index == 0 || index < (int) _invlist_len(invlist));
7803 *get_invlist_previous_index_addr(invlist) = index;
7806 PERL_STATIC_INLINE UV
7807 S_invlist_max(pTHX_ SV* const invlist)
7809 /* Returns the maximum number of elements storable in the inversion list's
7810 * array, without having to realloc() */
7812 PERL_ARGS_ASSERT_INVLIST_MAX;
7814 assert(SvTYPE(invlist) == SVt_INVLIST);
7816 /* Assumes worst case, in which the 0 element is not counted in the
7817 * inversion list, so subtracts 1 for that */
7818 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7819 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7820 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7823 #ifndef PERL_IN_XSUB_RE
7825 Perl__new_invlist(pTHX_ IV initial_size)
7828 /* Return a pointer to a newly constructed inversion list, with enough
7829 * space to store 'initial_size' elements. If that number is negative, a
7830 * system default is used instead */
7834 if (initial_size < 0) {
7838 /* Allocate the initial space */
7839 new_list = newSV_type(SVt_INVLIST);
7841 /* First 1 is in case the zero element isn't in the list; second 1 is for
7843 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7844 invlist_set_len(new_list, 0, 0);
7846 /* Force iterinit() to be used to get iteration to work */
7847 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7849 *get_invlist_previous_index_addr(new_list) = 0;
7855 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7857 /* Return a pointer to a newly constructed inversion list, initialized to
7858 * point to <list>, which has to be in the exact correct inversion list
7859 * form, including internal fields. Thus this is a dangerous routine that
7860 * should not be used in the wrong hands. The passed in 'list' contains
7861 * several header fields at the beginning that are not part of the
7862 * inversion list body proper */
7864 const STRLEN length = (STRLEN) list[0];
7865 const UV version_id = list[1];
7866 const bool offset = cBOOL(list[2]);
7867 #define HEADER_LENGTH 3
7868 /* If any of the above changes in any way, you must change HEADER_LENGTH
7869 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7870 * perl -E 'say int(rand 2**31-1)'
7872 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7873 data structure type, so that one being
7874 passed in can be validated to be an
7875 inversion list of the correct vintage.
7878 SV* invlist = newSV_type(SVt_INVLIST);
7880 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7882 if (version_id != INVLIST_VERSION_ID) {
7883 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7886 /* The generated array passed in includes header elements that aren't part
7887 * of the list proper, so start it just after them */
7888 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7890 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7891 shouldn't touch it */
7893 *(get_invlist_offset_addr(invlist)) = offset;
7895 /* The 'length' passed to us is the physical number of elements in the
7896 * inversion list. But if there is an offset the logical number is one
7898 invlist_set_len(invlist, length - offset, offset);
7900 invlist_set_previous_index(invlist, 0);
7902 /* Initialize the iteration pointer. */
7903 invlist_iterfinish(invlist);
7905 SvREADONLY_on(invlist);
7909 #endif /* ifndef PERL_IN_XSUB_RE */
7912 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7914 /* Grow the maximum size of an inversion list */
7916 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7918 assert(SvTYPE(invlist) == SVt_INVLIST);
7920 /* Add one to account for the zero element at the beginning which may not
7921 * be counted by the calling parameters */
7922 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7925 PERL_STATIC_INLINE void
7926 S_invlist_trim(pTHX_ SV* const invlist)
7928 PERL_ARGS_ASSERT_INVLIST_TRIM;
7930 assert(SvTYPE(invlist) == SVt_INVLIST);
7932 /* Change the length of the inversion list to how many entries it currently
7934 SvPV_shrink_to_cur((SV *) invlist);
7938 S__append_range_to_invlist(pTHX_ SV* const invlist,
7939 const UV start, const UV end)
7941 /* Subject to change or removal. Append the range from 'start' to 'end' at
7942 * the end of the inversion list. The range must be above any existing
7946 UV max = invlist_max(invlist);
7947 UV len = _invlist_len(invlist);
7950 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7952 if (len == 0) { /* Empty lists must be initialized */
7953 offset = start != 0;
7954 array = _invlist_array_init(invlist, ! offset);
7957 /* Here, the existing list is non-empty. The current max entry in the
7958 * list is generally the first value not in the set, except when the
7959 * set extends to the end of permissible values, in which case it is
7960 * the first entry in that final set, and so this call is an attempt to
7961 * append out-of-order */
7963 UV final_element = len - 1;
7964 array = invlist_array(invlist);
7965 if (array[final_element] > start
7966 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7968 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7969 array[final_element], start,
7970 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7973 /* Here, it is a legal append. If the new range begins with the first
7974 * value not in the set, it is extending the set, so the new first
7975 * value not in the set is one greater than the newly extended range.
7977 offset = *get_invlist_offset_addr(invlist);
7978 if (array[final_element] == start) {
7979 if (end != UV_MAX) {
7980 array[final_element] = end + 1;
7983 /* But if the end is the maximum representable on the machine,
7984 * just let the range that this would extend to have no end */
7985 invlist_set_len(invlist, len - 1, offset);
7991 /* Here the new range doesn't extend any existing set. Add it */
7993 len += 2; /* Includes an element each for the start and end of range */
7995 /* If wll overflow the existing space, extend, which may cause the array to
7998 invlist_extend(invlist, len);
8000 /* Have to set len here to avoid assert failure in invlist_array() */
8001 invlist_set_len(invlist, len, offset);
8003 array = invlist_array(invlist);
8006 invlist_set_len(invlist, len, offset);
8009 /* The next item on the list starts the range, the one after that is
8010 * one past the new range. */
8011 array[len - 2] = start;
8012 if (end != UV_MAX) {
8013 array[len - 1] = end + 1;
8016 /* But if the end is the maximum representable on the machine, just let
8017 * the range have no end */
8018 invlist_set_len(invlist, len - 1, offset);
8022 #ifndef PERL_IN_XSUB_RE
8025 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8027 /* Searches the inversion list for the entry that contains the input code
8028 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8029 * return value is the index into the list's array of the range that
8034 IV high = _invlist_len(invlist);
8035 const IV highest_element = high - 1;
8038 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8040 /* If list is empty, return failure. */
8045 /* (We can't get the array unless we know the list is non-empty) */
8046 array = invlist_array(invlist);
8048 mid = invlist_previous_index(invlist);
8049 assert(mid >=0 && mid <= highest_element);
8051 /* <mid> contains the cache of the result of the previous call to this
8052 * function (0 the first time). See if this call is for the same result,
8053 * or if it is for mid-1. This is under the theory that calls to this
8054 * function will often be for related code points that are near each other.
8055 * And benchmarks show that caching gives better results. We also test
8056 * here if the code point is within the bounds of the list. These tests
8057 * replace others that would have had to be made anyway to make sure that
8058 * the array bounds were not exceeded, and these give us extra information
8059 * at the same time */
8060 if (cp >= array[mid]) {
8061 if (cp >= array[highest_element]) {
8062 return highest_element;
8065 /* Here, array[mid] <= cp < array[highest_element]. This means that
8066 * the final element is not the answer, so can exclude it; it also
8067 * means that <mid> is not the final element, so can refer to 'mid + 1'
8069 if (cp < array[mid + 1]) {
8075 else { /* cp < aray[mid] */
8076 if (cp < array[0]) { /* Fail if outside the array */
8080 if (cp >= array[mid - 1]) {
8085 /* Binary search. What we are looking for is <i> such that
8086 * array[i] <= cp < array[i+1]
8087 * The loop below converges on the i+1. Note that there may not be an
8088 * (i+1)th element in the array, and things work nonetheless */
8089 while (low < high) {
8090 mid = (low + high) / 2;
8091 assert(mid <= highest_element);
8092 if (array[mid] <= cp) { /* cp >= array[mid] */
8095 /* We could do this extra test to exit the loop early.
8096 if (cp < array[low]) {
8101 else { /* cp < array[mid] */
8108 invlist_set_previous_index(invlist, high);
8113 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8114 const UV start, const UV end, U8* swatch)
8116 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8117 * but is used when the swash has an inversion list. This makes this much
8118 * faster, as it uses a binary search instead of a linear one. This is
8119 * intimately tied to that function, and perhaps should be in utf8.c,
8120 * except it is intimately tied to inversion lists as well. It assumes
8121 * that <swatch> is all 0's on input */
8124 const IV len = _invlist_len(invlist);
8128 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8130 if (len == 0) { /* Empty inversion list */
8134 array = invlist_array(invlist);
8136 /* Find which element it is */
8137 i = _invlist_search(invlist, start);
8139 /* We populate from <start> to <end> */
8140 while (current < end) {
8143 /* The inversion list gives the results for every possible code point
8144 * after the first one in the list. Only those ranges whose index is
8145 * even are ones that the inversion list matches. For the odd ones,
8146 * and if the initial code point is not in the list, we have to skip
8147 * forward to the next element */
8148 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8150 if (i >= len) { /* Finished if beyond the end of the array */
8154 if (current >= end) { /* Finished if beyond the end of what we
8156 if (LIKELY(end < UV_MAX)) {
8160 /* We get here when the upper bound is the maximum
8161 * representable on the machine, and we are looking for just
8162 * that code point. Have to special case it */
8164 goto join_end_of_list;
8167 assert(current >= start);
8169 /* The current range ends one below the next one, except don't go past
8172 upper = (i < len && array[i] < end) ? array[i] : end;
8174 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8175 * for each code point in it */
8176 for (; current < upper; current++) {
8177 const STRLEN offset = (STRLEN)(current - start);
8178 swatch[offset >> 3] |= 1 << (offset & 7);
8183 /* Quit if at the end of the list */
8186 /* But first, have to deal with the highest possible code point on
8187 * the platform. The previous code assumes that <end> is one
8188 * beyond where we want to populate, but that is impossible at the
8189 * platform's infinity, so have to handle it specially */
8190 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8192 const STRLEN offset = (STRLEN)(end - start);
8193 swatch[offset >> 3] |= 1 << (offset & 7);
8198 /* Advance to the next range, which will be for code points not in the
8207 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8208 const bool complement_b, SV** output)
8210 /* Take the union of two inversion lists and point <output> to it. *output
8211 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8212 * the reference count to that list will be decremented if not already a
8213 * temporary (mortal); otherwise *output will be made correspondingly
8214 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8215 * second list is returned. If <complement_b> is TRUE, the union is taken
8216 * of the complement (inversion) of <b> instead of b itself.
8218 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8219 * Richard Gillam, published by Addison-Wesley, and explained at some
8220 * length there. The preface says to incorporate its examples into your
8221 * code at your own risk.
8223 * The algorithm is like a merge sort.
8225 * XXX A potential performance improvement is to keep track as we go along
8226 * if only one of the inputs contributes to the result, meaning the other
8227 * is a subset of that one. In that case, we can skip the final copy and
8228 * return the larger of the input lists, but then outside code might need
8229 * to keep track of whether to free the input list or not */
8231 const UV* array_a; /* a's array */
8233 UV len_a; /* length of a's array */
8236 SV* u; /* the resulting union */
8240 UV i_a = 0; /* current index into a's array */
8244 /* running count, as explained in the algorithm source book; items are
8245 * stopped accumulating and are output when the count changes to/from 0.
8246 * The count is incremented when we start a range that's in the set, and
8247 * decremented when we start a range that's not in the set. So its range
8248 * is 0 to 2. Only when the count is zero is something not in the set.
8252 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8255 /* If either one is empty, the union is the other one */
8256 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8257 bool make_temp = FALSE; /* Should we mortalize the result? */
8261 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8267 *output = invlist_clone(b);
8269 _invlist_invert(*output);
8271 } /* else *output already = b; */
8274 sv_2mortal(*output);
8278 else if ((len_b = _invlist_len(b)) == 0) {
8279 bool make_temp = FALSE;
8281 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8286 /* The complement of an empty list is a list that has everything in it,
8287 * so the union with <a> includes everything too */
8290 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8294 *output = _new_invlist(1);
8295 _append_range_to_invlist(*output, 0, UV_MAX);
8297 else if (*output != a) {
8298 *output = invlist_clone(a);
8300 /* else *output already = a; */
8303 sv_2mortal(*output);
8308 /* Here both lists exist and are non-empty */
8309 array_a = invlist_array(a);
8310 array_b = invlist_array(b);
8312 /* If are to take the union of 'a' with the complement of b, set it
8313 * up so are looking at b's complement. */
8316 /* To complement, we invert: if the first element is 0, remove it. To
8317 * do this, we just pretend the array starts one later */
8318 if (array_b[0] == 0) {
8324 /* But if the first element is not zero, we pretend the list starts
8325 * at the 0 that is always stored immediately before the array. */
8331 /* Size the union for the worst case: that the sets are completely
8333 u = _new_invlist(len_a + len_b);
8335 /* Will contain U+0000 if either component does */
8336 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8337 || (len_b > 0 && array_b[0] == 0));
8339 /* Go through each list item by item, stopping when exhausted one of
8341 while (i_a < len_a && i_b < len_b) {
8342 UV cp; /* The element to potentially add to the union's array */
8343 bool cp_in_set; /* is it in the the input list's set or not */
8345 /* We need to take one or the other of the two inputs for the union.
8346 * Since we are merging two sorted lists, we take the smaller of the
8347 * next items. In case of a tie, we take the one that is in its set
8348 * first. If we took one not in the set first, it would decrement the
8349 * count, possibly to 0 which would cause it to be output as ending the
8350 * range, and the next time through we would take the same number, and
8351 * output it again as beginning the next range. By doing it the
8352 * opposite way, there is no possibility that the count will be
8353 * momentarily decremented to 0, and thus the two adjoining ranges will
8354 * be seamlessly merged. (In a tie and both are in the set or both not
8355 * in the set, it doesn't matter which we take first.) */
8356 if (array_a[i_a] < array_b[i_b]
8357 || (array_a[i_a] == array_b[i_b]
8358 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8360 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8364 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8365 cp = array_b[i_b++];
8368 /* Here, have chosen which of the two inputs to look at. Only output
8369 * if the running count changes to/from 0, which marks the
8370 * beginning/end of a range in that's in the set */
8373 array_u[i_u++] = cp;
8380 array_u[i_u++] = cp;
8385 /* Here, we are finished going through at least one of the lists, which
8386 * means there is something remaining in at most one. We check if the list
8387 * that hasn't been exhausted is positioned such that we are in the middle
8388 * of a range in its set or not. (i_a and i_b point to the element beyond
8389 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8390 * is potentially more to output.
8391 * There are four cases:
8392 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8393 * in the union is entirely from the non-exhausted set.
8394 * 2) Both were in their sets, count is 2. Nothing further should
8395 * be output, as everything that remains will be in the exhausted
8396 * list's set, hence in the union; decrementing to 1 but not 0 insures
8398 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8399 * Nothing further should be output because the union includes
8400 * everything from the exhausted set. Not decrementing ensures that.
8401 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8402 * decrementing to 0 insures that we look at the remainder of the
8403 * non-exhausted set */
8404 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8405 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8410 /* The final length is what we've output so far, plus what else is about to
8411 * be output. (If 'count' is non-zero, then the input list we exhausted
8412 * has everything remaining up to the machine's limit in its set, and hence
8413 * in the union, so there will be no further output. */
8416 /* At most one of the subexpressions will be non-zero */
8417 len_u += (len_a - i_a) + (len_b - i_b);
8420 /* Set result to final length, which can change the pointer to array_u, so
8422 if (len_u != _invlist_len(u)) {
8423 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8425 array_u = invlist_array(u);
8428 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8429 * the other) ended with everything above it not in its set. That means
8430 * that the remaining part of the union is precisely the same as the
8431 * non-exhausted list, so can just copy it unchanged. (If both list were
8432 * exhausted at the same time, then the operations below will be both 0.)
8435 IV copy_count; /* At most one will have a non-zero copy count */
8436 if ((copy_count = len_a - i_a) > 0) {
8437 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8439 else if ((copy_count = len_b - i_b) > 0) {
8440 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8444 /* We may be removing a reference to one of the inputs. If so, the output
8445 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8446 * count decremented) */
8447 if (a == *output || b == *output) {
8448 assert(! invlist_is_iterating(*output));
8449 if ((SvTEMP(*output))) {
8453 SvREFCNT_dec_NN(*output);
8463 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8464 const bool complement_b, SV** i)
8466 /* Take the intersection of two inversion lists and point <i> to it. *i
8467 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8468 * the reference count to that list will be decremented if not already a
8469 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8470 * The first list, <a>, may be NULL, in which case an empty list is
8471 * returned. If <complement_b> is TRUE, the result will be the
8472 * intersection of <a> and the complement (or inversion) of <b> instead of
8475 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8476 * Richard Gillam, published by Addison-Wesley, and explained at some
8477 * length there. The preface says to incorporate its examples into your
8478 * code at your own risk. In fact, it had bugs
8480 * The algorithm is like a merge sort, and is essentially the same as the
8484 const UV* array_a; /* a's array */
8486 UV len_a; /* length of a's array */
8489 SV* r; /* the resulting intersection */
8493 UV i_a = 0; /* current index into a's array */
8497 /* running count, as explained in the algorithm source book; items are
8498 * stopped accumulating and are output when the count changes to/from 2.
8499 * The count is incremented when we start a range that's in the set, and
8500 * decremented when we start a range that's not in the set. So its range
8501 * is 0 to 2. Only when the count is 2 is something in the intersection.
8505 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8508 /* Special case if either one is empty */
8509 len_a = (a == NULL) ? 0 : _invlist_len(a);
8510 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8511 bool make_temp = FALSE;
8513 if (len_a != 0 && complement_b) {
8515 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8516 * be empty. Here, also we are using 'b's complement, which hence
8517 * must be every possible code point. Thus the intersection is
8521 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8526 *i = invlist_clone(a);
8528 /* else *i is already 'a' */
8536 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8537 * intersection must be empty */
8539 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8544 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8548 *i = _new_invlist(0);
8556 /* Here both lists exist and are non-empty */
8557 array_a = invlist_array(a);
8558 array_b = invlist_array(b);
8560 /* If are to take the intersection of 'a' with the complement of b, set it
8561 * up so are looking at b's complement. */
8564 /* To complement, we invert: if the first element is 0, remove it. To
8565 * do this, we just pretend the array starts one later */
8566 if (array_b[0] == 0) {
8572 /* But if the first element is not zero, we pretend the list starts
8573 * at the 0 that is always stored immediately before the array. */
8579 /* Size the intersection for the worst case: that the intersection ends up
8580 * fragmenting everything to be completely disjoint */
8581 r= _new_invlist(len_a + len_b);
8583 /* Will contain U+0000 iff both components do */
8584 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8585 && len_b > 0 && array_b[0] == 0);
8587 /* Go through each list item by item, stopping when exhausted one of
8589 while (i_a < len_a && i_b < len_b) {
8590 UV cp; /* The element to potentially add to the intersection's
8592 bool cp_in_set; /* Is it in the input list's set or not */
8594 /* We need to take one or the other of the two inputs for the
8595 * intersection. Since we are merging two sorted lists, we take the
8596 * smaller of the next items. In case of a tie, we take the one that
8597 * is not in its set first (a difference from the union algorithm). If
8598 * we took one in the set first, it would increment the count, possibly
8599 * to 2 which would cause it to be output as starting a range in the
8600 * intersection, and the next time through we would take that same
8601 * number, and output it again as ending the set. By doing it the
8602 * opposite of this, there is no possibility that the count will be
8603 * momentarily incremented to 2. (In a tie and both are in the set or
8604 * both not in the set, it doesn't matter which we take first.) */
8605 if (array_a[i_a] < array_b[i_b]
8606 || (array_a[i_a] == array_b[i_b]
8607 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8609 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8613 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8617 /* Here, have chosen which of the two inputs to look at. Only output
8618 * if the running count changes to/from 2, which marks the
8619 * beginning/end of a range that's in the intersection */
8623 array_r[i_r++] = cp;
8628 array_r[i_r++] = cp;
8634 /* Here, we are finished going through at least one of the lists, which
8635 * means there is something remaining in at most one. We check if the list
8636 * that has been exhausted is positioned such that we are in the middle
8637 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8638 * the ones we care about.) There are four cases:
8639 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8640 * nothing left in the intersection.
8641 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8642 * above 2. What should be output is exactly that which is in the
8643 * non-exhausted set, as everything it has is also in the intersection
8644 * set, and everything it doesn't have can't be in the intersection
8645 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8646 * gets incremented to 2. Like the previous case, the intersection is
8647 * everything that remains in the non-exhausted set.
8648 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8649 * remains 1. And the intersection has nothing more. */
8650 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8651 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8656 /* The final length is what we've output so far plus what else is in the
8657 * intersection. At most one of the subexpressions below will be non-zero
8661 len_r += (len_a - i_a) + (len_b - i_b);
8664 /* Set result to final length, which can change the pointer to array_r, so
8666 if (len_r != _invlist_len(r)) {
8667 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8669 array_r = invlist_array(r);
8672 /* Finish outputting any remaining */
8673 if (count >= 2) { /* At most one will have a non-zero copy count */
8675 if ((copy_count = len_a - i_a) > 0) {
8676 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8678 else if ((copy_count = len_b - i_b) > 0) {
8679 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8683 /* We may be removing a reference to one of the inputs. If so, the output
8684 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8685 * count decremented) */
8686 if (a == *i || b == *i) {
8687 assert(! invlist_is_iterating(*i));
8692 SvREFCNT_dec_NN(*i);
8702 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8704 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8705 * set. A pointer to the inversion list is returned. This may actually be
8706 * a new list, in which case the passed in one has been destroyed. The
8707 * passed in inversion list can be NULL, in which case a new one is created
8708 * with just the one range in it */
8713 if (invlist == NULL) {
8714 invlist = _new_invlist(2);
8718 len = _invlist_len(invlist);
8721 /* If comes after the final entry actually in the list, can just append it
8724 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8725 && start >= invlist_array(invlist)[len - 1]))
8727 _append_range_to_invlist(invlist, start, end);
8731 /* Here, can't just append things, create and return a new inversion list
8732 * which is the union of this range and the existing inversion list */
8733 range_invlist = _new_invlist(2);
8734 _append_range_to_invlist(range_invlist, start, end);
8736 _invlist_union(invlist, range_invlist, &invlist);
8738 /* The temporary can be freed */
8739 SvREFCNT_dec_NN(range_invlist);
8745 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8746 UV** other_elements_ptr)
8748 /* Create and return an inversion list whose contents are to be populated
8749 * by the caller. The caller gives the number of elements (in 'size') and
8750 * the very first element ('element0'). This function will set
8751 * '*other_elements_ptr' to an array of UVs, where the remaining elements
8754 * Obviously there is some trust involved that the caller will properly
8755 * fill in the other elements of the array.
8757 * (The first element needs to be passed in, as the underlying code does
8758 * things differently depending on whether it is zero or non-zero) */
8760 SV* invlist = _new_invlist(size);
8763 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8765 _append_range_to_invlist(invlist, element0, element0);
8766 offset = *get_invlist_offset_addr(invlist);
8768 invlist_set_len(invlist, size, offset);
8769 *other_elements_ptr = invlist_array(invlist) + 1;
8775 PERL_STATIC_INLINE SV*
8776 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8777 return _add_range_to_invlist(invlist, cp, cp);
8780 #ifndef PERL_IN_XSUB_RE
8782 Perl__invlist_invert(pTHX_ SV* const invlist)
8784 /* Complement the input inversion list. This adds a 0 if the list didn't
8785 * have a zero; removes it otherwise. As described above, the data
8786 * structure is set up so that this is very efficient */
8788 PERL_ARGS_ASSERT__INVLIST_INVERT;
8790 assert(! invlist_is_iterating(invlist));
8792 /* The inverse of matching nothing is matching everything */
8793 if (_invlist_len(invlist) == 0) {
8794 _append_range_to_invlist(invlist, 0, UV_MAX);
8798 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8803 PERL_STATIC_INLINE SV*
8804 S_invlist_clone(pTHX_ SV* const invlist)
8807 /* Return a new inversion list that is a copy of the input one, which is
8808 * unchanged. The new list will not be mortal even if the old one was. */
8810 /* Need to allocate extra space to accommodate Perl's addition of a
8811 * trailing NUL to SvPV's, since it thinks they are always strings */
8812 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8813 STRLEN physical_length = SvCUR(invlist);
8814 bool offset = *(get_invlist_offset_addr(invlist));
8816 PERL_ARGS_ASSERT_INVLIST_CLONE;
8818 *(get_invlist_offset_addr(new_invlist)) = offset;
8819 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8820 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8825 PERL_STATIC_INLINE STRLEN*
8826 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8828 /* Return the address of the UV that contains the current iteration
8831 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8833 assert(SvTYPE(invlist) == SVt_INVLIST);
8835 return &(((XINVLIST*) SvANY(invlist))->iterator);
8838 PERL_STATIC_INLINE void
8839 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8841 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8843 *get_invlist_iter_addr(invlist) = 0;
8846 PERL_STATIC_INLINE void
8847 S_invlist_iterfinish(pTHX_ SV* invlist)
8849 /* Terminate iterator for invlist. This is to catch development errors.
8850 * Any iteration that is interrupted before completed should call this
8851 * function. Functions that add code points anywhere else but to the end
8852 * of an inversion list assert that they are not in the middle of an
8853 * iteration. If they were, the addition would make the iteration
8854 * problematical: if the iteration hadn't reached the place where things
8855 * were being added, it would be ok */
8857 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8859 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8863 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8865 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8866 * This call sets in <*start> and <*end>, the next range in <invlist>.
8867 * Returns <TRUE> if successful and the next call will return the next
8868 * range; <FALSE> if was already at the end of the list. If the latter,
8869 * <*start> and <*end> are unchanged, and the next call to this function
8870 * will start over at the beginning of the list */
8872 STRLEN* pos = get_invlist_iter_addr(invlist);
8873 UV len = _invlist_len(invlist);
8876 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8879 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8883 array = invlist_array(invlist);
8885 *start = array[(*pos)++];
8891 *end = array[(*pos)++] - 1;
8897 PERL_STATIC_INLINE bool
8898 S_invlist_is_iterating(pTHX_ SV* const invlist)
8900 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8902 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8905 PERL_STATIC_INLINE UV
8906 S_invlist_highest(pTHX_ SV* const invlist)
8908 /* Returns the highest code point that matches an inversion list. This API
8909 * has an ambiguity, as it returns 0 under either the highest is actually
8910 * 0, or if the list is empty. If this distinction matters to you, check
8911 * for emptiness before calling this function */
8913 UV len = _invlist_len(invlist);
8916 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8922 array = invlist_array(invlist);
8924 /* The last element in the array in the inversion list always starts a
8925 * range that goes to infinity. That range may be for code points that are
8926 * matched in the inversion list, or it may be for ones that aren't
8927 * matched. In the latter case, the highest code point in the set is one
8928 * less than the beginning of this range; otherwise it is the final element
8929 * of this range: infinity */
8930 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8932 : array[len - 1] - 1;
8935 #ifndef PERL_IN_XSUB_RE
8937 Perl__invlist_contents(pTHX_ SV* const invlist)
8939 /* Get the contents of an inversion list into a string SV so that they can
8940 * be printed out. It uses the format traditionally done for debug tracing
8944 SV* output = newSVpvs("\n");
8946 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8948 assert(! invlist_is_iterating(invlist));
8950 invlist_iterinit(invlist);
8951 while (invlist_iternext(invlist, &start, &end)) {
8952 if (end == UV_MAX) {
8953 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8955 else if (end != start) {
8956 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8960 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8968 #ifndef PERL_IN_XSUB_RE
8970 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
8971 const char * const indent, SV* const invlist)
8973 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8974 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8975 * the string 'indent'. The output looks like this:
8976 [0] 0x000A .. 0x000D
8978 [4] 0x2028 .. 0x2029
8979 [6] 0x3104 .. INFINITY
8980 * This means that the first range of code points matched by the list are
8981 * 0xA through 0xD; the second range contains only the single code point
8982 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8983 * are used to define each range (except if the final range extends to
8984 * infinity, only a single element is needed). The array index of the
8985 * first element for the corresponding range is given in brackets. */
8990 PERL_ARGS_ASSERT__INVLIST_DUMP;
8992 if (invlist_is_iterating(invlist)) {
8993 Perl_dump_indent(aTHX_ level, file,
8994 "%sCan't dump inversion list because is in middle of iterating\n",
8999 invlist_iterinit(invlist);
9000 while (invlist_iternext(invlist, &start, &end)) {
9001 if (end == UV_MAX) {
9002 Perl_dump_indent(aTHX_ level, file,
9003 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9004 indent, (UV)count, start);
9006 else if (end != start) {
9007 Perl_dump_indent(aTHX_ level, file,
9008 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9009 indent, (UV)count, start, end);
9012 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9013 indent, (UV)count, start);
9020 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9022 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9024 /* Return a boolean as to if the two passed in inversion lists are
9025 * identical. The final argument, if TRUE, says to take the complement of
9026 * the second inversion list before doing the comparison */
9028 const UV* array_a = invlist_array(a);
9029 const UV* array_b = invlist_array(b);
9030 UV len_a = _invlist_len(a);
9031 UV len_b = _invlist_len(b);
9033 UV i = 0; /* current index into the arrays */
9034 bool retval = TRUE; /* Assume are identical until proven otherwise */
9036 PERL_ARGS_ASSERT__INVLISTEQ;
9038 /* If are to compare 'a' with the complement of b, set it
9039 * up so are looking at b's complement. */
9042 /* The complement of nothing is everything, so <a> would have to have
9043 * just one element, starting at zero (ending at infinity) */
9045 return (len_a == 1 && array_a[0] == 0);
9047 else if (array_b[0] == 0) {
9049 /* Otherwise, to complement, we invert. Here, the first element is
9050 * 0, just remove it. To do this, we just pretend the array starts
9058 /* But if the first element is not zero, we pretend the list starts
9059 * at the 0 that is always stored immediately before the array. */
9065 /* Make sure that the lengths are the same, as well as the final element
9066 * before looping through the remainder. (Thus we test the length, final,
9067 * and first elements right off the bat) */
9068 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9071 else for (i = 0; i < len_a - 1; i++) {
9072 if (array_a[i] != array_b[i]) {
9082 #undef HEADER_LENGTH
9083 #undef TO_INTERNAL_SIZE
9084 #undef FROM_INTERNAL_SIZE
9085 #undef INVLIST_VERSION_ID
9087 /* End of inversion list object */
9090 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9092 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9093 * constructs, and updates RExC_flags with them. On input, RExC_parse
9094 * should point to the first flag; it is updated on output to point to the
9095 * final ')' or ':'. There needs to be at least one flag, or this will
9098 /* for (?g), (?gc), and (?o) warnings; warning
9099 about (?c) will warn about (?g) -- japhy */
9101 #define WASTED_O 0x01
9102 #define WASTED_G 0x02
9103 #define WASTED_C 0x04
9104 #define WASTED_GC (WASTED_G|WASTED_C)
9105 I32 wastedflags = 0x00;
9106 U32 posflags = 0, negflags = 0;
9107 U32 *flagsp = &posflags;
9108 char has_charset_modifier = '\0';
9110 bool has_use_defaults = FALSE;
9111 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9113 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9115 /* '^' as an initial flag sets certain defaults */
9116 if (UCHARAT(RExC_parse) == '^') {
9118 has_use_defaults = TRUE;
9119 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9120 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9121 ? REGEX_UNICODE_CHARSET
9122 : REGEX_DEPENDS_CHARSET);
9125 cs = get_regex_charset(RExC_flags);
9126 if (cs == REGEX_DEPENDS_CHARSET
9127 && (RExC_utf8 || RExC_uni_semantics))
9129 cs = REGEX_UNICODE_CHARSET;
9132 while (*RExC_parse) {
9133 /* && strchr("iogcmsx", *RExC_parse) */
9134 /* (?g), (?gc) and (?o) are useless here
9135 and must be globally applied -- japhy */
9136 switch (*RExC_parse) {
9138 /* Code for the imsx flags */
9139 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9141 case LOCALE_PAT_MOD:
9142 if (has_charset_modifier) {
9143 goto excess_modifier;
9145 else if (flagsp == &negflags) {
9148 cs = REGEX_LOCALE_CHARSET;
9149 has_charset_modifier = LOCALE_PAT_MOD;
9150 RExC_contains_locale = 1;
9152 case UNICODE_PAT_MOD:
9153 if (has_charset_modifier) {
9154 goto excess_modifier;
9156 else if (flagsp == &negflags) {
9159 cs = REGEX_UNICODE_CHARSET;
9160 has_charset_modifier = UNICODE_PAT_MOD;
9162 case ASCII_RESTRICT_PAT_MOD:
9163 if (flagsp == &negflags) {
9166 if (has_charset_modifier) {
9167 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9168 goto excess_modifier;
9170 /* Doubled modifier implies more restricted */
9171 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9174 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9176 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9178 case DEPENDS_PAT_MOD:
9179 if (has_use_defaults) {
9180 goto fail_modifiers;
9182 else if (flagsp == &negflags) {
9185 else if (has_charset_modifier) {
9186 goto excess_modifier;
9189 /* The dual charset means unicode semantics if the
9190 * pattern (or target, not known until runtime) are
9191 * utf8, or something in the pattern indicates unicode
9193 cs = (RExC_utf8 || RExC_uni_semantics)
9194 ? REGEX_UNICODE_CHARSET
9195 : REGEX_DEPENDS_CHARSET;
9196 has_charset_modifier = DEPENDS_PAT_MOD;
9200 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9201 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9203 else if (has_charset_modifier == *(RExC_parse - 1)) {
9204 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9208 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9213 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9216 case ONCE_PAT_MOD: /* 'o' */
9217 case GLOBAL_PAT_MOD: /* 'g' */
9218 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9219 const I32 wflagbit = *RExC_parse == 'o'
9222 if (! (wastedflags & wflagbit) ) {
9223 wastedflags |= wflagbit;
9224 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9227 "Useless (%s%c) - %suse /%c modifier",
9228 flagsp == &negflags ? "?-" : "?",
9230 flagsp == &negflags ? "don't " : "",
9237 case CONTINUE_PAT_MOD: /* 'c' */
9238 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9239 if (! (wastedflags & WASTED_C) ) {
9240 wastedflags |= WASTED_GC;
9241 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9244 "Useless (%sc) - %suse /gc modifier",
9245 flagsp == &negflags ? "?-" : "?",
9246 flagsp == &negflags ? "don't " : ""
9251 case KEEPCOPY_PAT_MOD: /* 'p' */
9252 if (flagsp == &negflags) {
9254 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9256 *flagsp |= RXf_PMf_KEEPCOPY;
9260 /* A flag is a default iff it is following a minus, so
9261 * if there is a minus, it means will be trying to
9262 * re-specify a default which is an error */
9263 if (has_use_defaults || flagsp == &negflags) {
9264 goto fail_modifiers;
9267 wastedflags = 0; /* reset so (?g-c) warns twice */
9271 RExC_flags |= posflags;
9272 RExC_flags &= ~negflags;
9273 set_regex_charset(&RExC_flags, cs);
9274 if (RExC_flags & RXf_PMf_FOLD) {
9275 RExC_contains_i = 1;
9281 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9282 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9283 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9284 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9293 - reg - regular expression, i.e. main body or parenthesized thing
9295 * Caller must absorb opening parenthesis.
9297 * Combining parenthesis handling with the base level of regular expression
9298 * is a trifle forced, but the need to tie the tails of the branches to what
9299 * follows makes it hard to avoid.
9301 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9303 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9305 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9308 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9309 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9310 needs to be restarted.
9311 Otherwise would only return NULL if regbranch() returns NULL, which
9314 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9315 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9316 * 2 is like 1, but indicates that nextchar() has been called to advance
9317 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9318 * this flag alerts us to the need to check for that */
9321 regnode *ret; /* Will be the head of the group. */
9324 regnode *ender = NULL;
9327 U32 oregflags = RExC_flags;
9328 bool have_branch = 0;
9330 I32 freeze_paren = 0;
9331 I32 after_freeze = 0;
9333 char * parse_start = RExC_parse; /* MJD */
9334 char * const oregcomp_parse = RExC_parse;
9336 GET_RE_DEBUG_FLAGS_DECL;
9338 PERL_ARGS_ASSERT_REG;
9339 DEBUG_PARSE("reg ");
9341 *flagp = 0; /* Tentatively. */
9344 /* Make an OPEN node, if parenthesized. */
9347 /* Under /x, space and comments can be gobbled up between the '(' and
9348 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9349 * intervening space, as the sequence is a token, and a token should be
9351 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9353 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9354 char *start_verb = RExC_parse;
9355 STRLEN verb_len = 0;
9356 char *start_arg = NULL;
9357 unsigned char op = 0;
9359 int internal_argval = 0; /* internal_argval is only useful if
9362 if (has_intervening_patws && SIZE_ONLY) {
9363 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9365 while ( *RExC_parse && *RExC_parse != ')' ) {
9366 if ( *RExC_parse == ':' ) {
9367 start_arg = RExC_parse + 1;
9373 verb_len = RExC_parse - start_verb;
9376 while ( *RExC_parse && *RExC_parse != ')' )
9378 if ( *RExC_parse != ')' )
9379 vFAIL("Unterminated verb pattern argument");
9380 if ( RExC_parse == start_arg )
9383 if ( *RExC_parse != ')' )
9384 vFAIL("Unterminated verb pattern");
9387 switch ( *start_verb ) {
9388 case 'A': /* (*ACCEPT) */
9389 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9391 internal_argval = RExC_nestroot;
9394 case 'C': /* (*COMMIT) */
9395 if ( memEQs(start_verb,verb_len,"COMMIT") )
9398 case 'F': /* (*FAIL) */
9399 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9404 case ':': /* (*:NAME) */
9405 case 'M': /* (*MARK:NAME) */
9406 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9411 case 'P': /* (*PRUNE) */
9412 if ( memEQs(start_verb,verb_len,"PRUNE") )
9415 case 'S': /* (*SKIP) */
9416 if ( memEQs(start_verb,verb_len,"SKIP") )
9419 case 'T': /* (*THEN) */
9420 /* [19:06] <TimToady> :: is then */
9421 if ( memEQs(start_verb,verb_len,"THEN") ) {
9423 RExC_seen |= REG_SEEN_CUTGROUP;
9428 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9430 "Unknown verb pattern '%"UTF8f"'",
9431 UTF8fARG(UTF, verb_len, start_verb));
9434 if ( start_arg && internal_argval ) {
9435 vFAIL3("Verb pattern '%.*s' may not have an argument",
9436 verb_len, start_verb);
9437 } else if ( argok < 0 && !start_arg ) {
9438 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9439 verb_len, start_verb);
9441 ret = reganode(pRExC_state, op, internal_argval);
9442 if ( ! internal_argval && ! SIZE_ONLY ) {
9444 SV *sv = newSVpvn( start_arg,
9445 RExC_parse - start_arg);
9446 ARG(ret) = add_data( pRExC_state,
9448 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9455 if (!internal_argval)
9456 RExC_seen |= REG_SEEN_VERBARG;
9457 } else if ( start_arg ) {
9458 vFAIL3("Verb pattern '%.*s' may not have an argument",
9459 verb_len, start_verb);
9461 ret = reg_node(pRExC_state, op);
9463 nextchar(pRExC_state);
9466 else if (*RExC_parse == '?') { /* (?...) */
9467 bool is_logical = 0;
9468 const char * const seqstart = RExC_parse;
9469 if (has_intervening_patws && SIZE_ONLY) {
9470 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9474 paren = *RExC_parse++;
9475 ret = NULL; /* For look-ahead/behind. */
9478 case 'P': /* (?P...) variants for those used to PCRE/Python */
9479 paren = *RExC_parse++;
9480 if ( paren == '<') /* (?P<...>) named capture */
9482 else if (paren == '>') { /* (?P>name) named recursion */
9483 goto named_recursion;
9485 else if (paren == '=') { /* (?P=...) named backref */
9486 /* this pretty much dupes the code for \k<NAME> in
9487 * regatom(), if you change this make sure you change that
9489 char* name_start = RExC_parse;
9491 SV *sv_dat = reg_scan_name(pRExC_state,
9492 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9493 if (RExC_parse == name_start || *RExC_parse != ')')
9494 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9495 vFAIL2("Sequence %.3s... not terminated",parse_start);
9498 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9499 RExC_rxi->data->data[num]=(void*)sv_dat;
9500 SvREFCNT_inc_simple_void(sv_dat);
9503 ret = reganode(pRExC_state,
9506 : (ASCII_FOLD_RESTRICTED)
9508 : (AT_LEAST_UNI_SEMANTICS)
9516 Set_Node_Offset(ret, parse_start+1);
9517 Set_Node_Cur_Length(ret, parse_start);
9519 nextchar(pRExC_state);
9523 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9524 vFAIL3("Sequence (%.*s...) not recognized",
9525 RExC_parse-seqstart, seqstart);
9527 case '<': /* (?<...) */
9528 if (*RExC_parse == '!')
9530 else if (*RExC_parse != '=')
9536 case '\'': /* (?'...') */
9537 name_start= RExC_parse;
9538 svname = reg_scan_name(pRExC_state,
9539 SIZE_ONLY /* reverse test from the others */
9540 ? REG_RSN_RETURN_NAME
9541 : REG_RSN_RETURN_NULL);
9542 if (RExC_parse == name_start || *RExC_parse != paren)
9543 vFAIL2("Sequence (?%c... not terminated",
9544 paren=='>' ? '<' : paren);
9548 if (!svname) /* shouldn't happen */
9550 "panic: reg_scan_name returned NULL");
9551 if (!RExC_paren_names) {
9552 RExC_paren_names= newHV();
9553 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9555 RExC_paren_name_list= newAV();
9556 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9559 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9561 sv_dat = HeVAL(he_str);
9563 /* croak baby croak */
9565 "panic: paren_name hash element allocation failed");
9566 } else if ( SvPOK(sv_dat) ) {
9567 /* (?|...) can mean we have dupes so scan to check
9568 its already been stored. Maybe a flag indicating
9569 we are inside such a construct would be useful,
9570 but the arrays are likely to be quite small, so
9571 for now we punt -- dmq */
9572 IV count = SvIV(sv_dat);
9573 I32 *pv = (I32*)SvPVX(sv_dat);
9575 for ( i = 0 ; i < count ; i++ ) {
9576 if ( pv[i] == RExC_npar ) {
9582 pv = (I32*)SvGROW(sv_dat,
9583 SvCUR(sv_dat) + sizeof(I32)+1);
9584 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9585 pv[count] = RExC_npar;
9586 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9589 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9590 sv_setpvn(sv_dat, (char *)&(RExC_npar),
9593 SvIV_set(sv_dat, 1);
9596 /* Yes this does cause a memory leak in debugging Perls
9598 if (!av_store(RExC_paren_name_list,
9599 RExC_npar, SvREFCNT_inc(svname)))
9600 SvREFCNT_dec_NN(svname);
9603 /*sv_dump(sv_dat);*/
9605 nextchar(pRExC_state);
9607 goto capturing_parens;
9609 RExC_seen |= REG_SEEN_LOOKBEHIND;
9610 RExC_in_lookbehind++;
9612 case '=': /* (?=...) */
9613 RExC_seen_zerolen++;
9615 case '!': /* (?!...) */
9616 RExC_seen_zerolen++;
9617 if (*RExC_parse == ')') {
9618 ret=reg_node(pRExC_state, OPFAIL);
9619 nextchar(pRExC_state);
9623 case '|': /* (?|...) */
9624 /* branch reset, behave like a (?:...) except that
9625 buffers in alternations share the same numbers */
9627 after_freeze = freeze_paren = RExC_npar;
9629 case ':': /* (?:...) */
9630 case '>': /* (?>...) */
9632 case '$': /* (?$...) */
9633 case '@': /* (?@...) */
9634 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9636 case '#': /* (?#...) */
9637 /* XXX As soon as we disallow separating the '?' and '*' (by
9638 * spaces or (?#...) comment), it is believed that this case
9639 * will be unreachable and can be removed. See
9641 while (*RExC_parse && *RExC_parse != ')')
9643 if (*RExC_parse != ')')
9644 FAIL("Sequence (?#... not terminated");
9645 nextchar(pRExC_state);
9648 case '0' : /* (?0) */
9649 case 'R' : /* (?R) */
9650 if (*RExC_parse != ')')
9651 FAIL("Sequence (?R) not terminated");
9652 ret = reg_node(pRExC_state, GOSTART);
9653 RExC_seen |= REG_SEEN_GOSTART;
9654 *flagp |= POSTPONED;
9655 nextchar(pRExC_state);
9658 { /* named and numeric backreferences */
9660 case '&': /* (?&NAME) */
9661 parse_start = RExC_parse - 1;
9664 SV *sv_dat = reg_scan_name(pRExC_state,
9665 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9666 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9668 if (RExC_parse == RExC_end || *RExC_parse != ')')
9669 vFAIL("Sequence (?&... not terminated");
9670 goto gen_recurse_regop;
9671 assert(0); /* NOT REACHED */
9673 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9675 vFAIL("Illegal pattern");
9677 goto parse_recursion;
9679 case '-': /* (?-1) */
9680 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9681 RExC_parse--; /* rewind to let it be handled later */
9685 case '1': case '2': case '3': case '4': /* (?1) */
9686 case '5': case '6': case '7': case '8': case '9':
9689 num = atoi(RExC_parse);
9690 parse_start = RExC_parse - 1; /* MJD */
9691 if (*RExC_parse == '-')
9693 while (isDIGIT(*RExC_parse))
9695 if (*RExC_parse!=')')
9696 vFAIL("Expecting close bracket");
9699 if ( paren == '-' ) {
9701 Diagram of capture buffer numbering.
9702 Top line is the normal capture buffer numbers
9703 Bottom line is the negative indexing as from
9707 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9711 num = RExC_npar + num;
9714 vFAIL("Reference to nonexistent group");
9716 } else if ( paren == '+' ) {
9717 num = RExC_npar + num - 1;
9720 ret = reganode(pRExC_state, GOSUB, num);
9722 if (num > (I32)RExC_rx->nparens) {
9724 vFAIL("Reference to nonexistent group");
9726 ARG2L_SET( ret, RExC_recurse_count++);
9728 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9729 "Recurse #%"UVuf" to %"IVdf"\n",
9730 (UV)ARG(ret), (IV)ARG2L(ret)));
9734 RExC_seen |= REG_SEEN_RECURSE;
9735 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9736 Set_Node_Offset(ret, parse_start); /* MJD */
9738 *flagp |= POSTPONED;
9739 nextchar(pRExC_state);
9741 } /* named and numeric backreferences */
9742 assert(0); /* NOT REACHED */
9744 case '?': /* (??...) */
9746 if (*RExC_parse != '{') {
9748 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9750 "Sequence (%"UTF8f"...) not recognized",
9751 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9754 *flagp |= POSTPONED;
9755 paren = *RExC_parse++;
9757 case '{': /* (?{...}) */
9760 struct reg_code_block *cb;
9762 RExC_seen_zerolen++;
9764 if ( !pRExC_state->num_code_blocks
9765 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9766 || pRExC_state->code_blocks[pRExC_state->code_index].start
9767 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9770 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9771 FAIL("panic: Sequence (?{...}): no code block found\n");
9772 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9774 /* this is a pre-compiled code block (?{...}) */
9775 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9776 RExC_parse = RExC_start + cb->end;
9779 if (cb->src_regex) {
9780 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9781 RExC_rxi->data->data[n] =
9782 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9783 RExC_rxi->data->data[n+1] = (void*)o;
9786 n = add_data(pRExC_state,
9787 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9788 RExC_rxi->data->data[n] = (void*)o;
9791 pRExC_state->code_index++;
9792 nextchar(pRExC_state);
9796 ret = reg_node(pRExC_state, LOGICAL);
9797 eval = reganode(pRExC_state, EVAL, n);
9800 /* for later propagation into (??{}) return value */
9801 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9803 REGTAIL(pRExC_state, ret, eval);
9804 /* deal with the length of this later - MJD */
9807 ret = reganode(pRExC_state, EVAL, n);
9808 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9809 Set_Node_Offset(ret, parse_start);
9812 case '(': /* (?(?{...})...) and (?(?=...)...) */
9815 if (RExC_parse[0] == '?') { /* (?(?...)) */
9816 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9817 || RExC_parse[1] == '<'
9818 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9822 ret = reg_node(pRExC_state, LOGICAL);
9826 tail = reg(pRExC_state, 1, &flag, depth+1);
9827 if (flag & RESTART_UTF8) {
9828 *flagp = RESTART_UTF8;
9831 REGTAIL(pRExC_state, ret, tail);
9835 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9836 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9838 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9839 char *name_start= RExC_parse++;
9841 SV *sv_dat=reg_scan_name(pRExC_state,
9842 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9843 if (RExC_parse == name_start || *RExC_parse != ch)
9844 vFAIL2("Sequence (?(%c... not terminated",
9845 (ch == '>' ? '<' : ch));
9848 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9849 RExC_rxi->data->data[num]=(void*)sv_dat;
9850 SvREFCNT_inc_simple_void(sv_dat);
9852 ret = reganode(pRExC_state,NGROUPP,num);
9853 goto insert_if_check_paren;
9855 else if (RExC_parse[0] == 'D' &&
9856 RExC_parse[1] == 'E' &&
9857 RExC_parse[2] == 'F' &&
9858 RExC_parse[3] == 'I' &&
9859 RExC_parse[4] == 'N' &&
9860 RExC_parse[5] == 'E')
9862 ret = reganode(pRExC_state,DEFINEP,0);
9865 goto insert_if_check_paren;
9867 else if (RExC_parse[0] == 'R') {
9870 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9871 parno = atoi(RExC_parse++);
9872 while (isDIGIT(*RExC_parse))
9874 } else if (RExC_parse[0] == '&') {
9877 sv_dat = reg_scan_name(pRExC_state,
9879 ? REG_RSN_RETURN_NULL
9880 : REG_RSN_RETURN_DATA);
9881 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9883 ret = reganode(pRExC_state,INSUBP,parno);
9884 goto insert_if_check_paren;
9886 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9890 parno = atoi(RExC_parse++);
9892 while (isDIGIT(*RExC_parse))
9894 ret = reganode(pRExC_state, GROUPP, parno);
9896 insert_if_check_paren:
9897 if (*(tmp = nextchar(pRExC_state)) != ')') {
9898 /* nextchar also skips comments, so undo its work
9899 * and skip over the the next character.
9902 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9903 vFAIL("Switch condition not recognized");
9906 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9907 br = regbranch(pRExC_state, &flags, 1,depth+1);
9909 if (flags & RESTART_UTF8) {
9910 *flagp = RESTART_UTF8;
9913 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9916 REGTAIL(pRExC_state, br, reganode(pRExC_state,
9918 c = *nextchar(pRExC_state);
9923 vFAIL("(?(DEFINE)....) does not allow branches");
9925 /* Fake one for optimizer. */
9926 lastbr = reganode(pRExC_state, IFTHEN, 0);
9928 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9929 if (flags & RESTART_UTF8) {
9930 *flagp = RESTART_UTF8;
9933 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9936 REGTAIL(pRExC_state, ret, lastbr);
9939 c = *nextchar(pRExC_state);
9944 vFAIL("Switch (?(condition)... contains too many branches");
9945 ender = reg_node(pRExC_state, TAIL);
9946 REGTAIL(pRExC_state, br, ender);
9948 REGTAIL(pRExC_state, lastbr, ender);
9949 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9952 REGTAIL(pRExC_state, ret, ender);
9953 RExC_size++; /* XXX WHY do we need this?!!
9954 For large programs it seems to be required
9955 but I can't figure out why. -- dmq*/
9959 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9960 vFAIL("Unknown switch condition (?(...))");
9963 case '[': /* (?[ ... ]) */
9964 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9967 RExC_parse--; /* for vFAIL to print correctly */
9968 vFAIL("Sequence (? incomplete");
9970 default: /* e.g., (?i) */
9973 parse_lparen_question_flags(pRExC_state);
9974 if (UCHARAT(RExC_parse) != ':') {
9975 nextchar(pRExC_state);
9980 nextchar(pRExC_state);
9990 ret = reganode(pRExC_state, OPEN, parno);
9993 RExC_nestroot = parno;
9994 if (RExC_seen & REG_SEEN_RECURSE
9995 && !RExC_open_parens[parno-1])
9997 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9998 "Setting open paren #%"IVdf" to %d\n",
9999 (IV)parno, REG_NODE_NUM(ret)));
10000 RExC_open_parens[parno-1]= ret;
10003 Set_Node_Length(ret, 1); /* MJD */
10004 Set_Node_Offset(ret, RExC_parse); /* MJD */
10012 /* Pick up the branches, linking them together. */
10013 parse_start = RExC_parse; /* MJD */
10014 br = regbranch(pRExC_state, &flags, 1,depth+1);
10016 /* branch_len = (paren != 0); */
10019 if (flags & RESTART_UTF8) {
10020 *flagp = RESTART_UTF8;
10023 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10025 if (*RExC_parse == '|') {
10026 if (!SIZE_ONLY && RExC_extralen) {
10027 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10030 reginsert(pRExC_state, BRANCH, br, depth+1);
10031 Set_Node_Length(br, paren != 0);
10032 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10036 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10038 else if (paren == ':') {
10039 *flagp |= flags&SIMPLE;
10041 if (is_open) { /* Starts with OPEN. */
10042 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10044 else if (paren != '?') /* Not Conditional */
10046 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10048 while (*RExC_parse == '|') {
10049 if (!SIZE_ONLY && RExC_extralen) {
10050 ender = reganode(pRExC_state, LONGJMP,0);
10052 /* Append to the previous. */
10053 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10056 RExC_extralen += 2; /* Account for LONGJMP. */
10057 nextchar(pRExC_state);
10058 if (freeze_paren) {
10059 if (RExC_npar > after_freeze)
10060 after_freeze = RExC_npar;
10061 RExC_npar = freeze_paren;
10063 br = regbranch(pRExC_state, &flags, 0, depth+1);
10066 if (flags & RESTART_UTF8) {
10067 *flagp = RESTART_UTF8;
10070 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10072 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10074 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10077 if (have_branch || paren != ':') {
10078 /* Make a closing node, and hook it on the end. */
10081 ender = reg_node(pRExC_state, TAIL);
10084 ender = reganode(pRExC_state, CLOSE, parno);
10085 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
10086 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10087 "Setting close paren #%"IVdf" to %d\n",
10088 (IV)parno, REG_NODE_NUM(ender)));
10089 RExC_close_parens[parno-1]= ender;
10090 if (RExC_nestroot == parno)
10093 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10094 Set_Node_Length(ender,1); /* MJD */
10100 *flagp &= ~HASWIDTH;
10103 ender = reg_node(pRExC_state, SUCCEED);
10106 ender = reg_node(pRExC_state, END);
10108 assert(!RExC_opend); /* there can only be one! */
10109 RExC_opend = ender;
10113 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10114 SV * const mysv_val1=sv_newmortal();
10115 SV * const mysv_val2=sv_newmortal();
10116 DEBUG_PARSE_MSG("lsbr");
10117 regprop(RExC_rx, mysv_val1, lastbr);
10118 regprop(RExC_rx, mysv_val2, ender);
10119 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10120 SvPV_nolen_const(mysv_val1),
10121 (IV)REG_NODE_NUM(lastbr),
10122 SvPV_nolen_const(mysv_val2),
10123 (IV)REG_NODE_NUM(ender),
10124 (IV)(ender - lastbr)
10127 REGTAIL(pRExC_state, lastbr, ender);
10129 if (have_branch && !SIZE_ONLY) {
10130 char is_nothing= 1;
10132 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
10134 /* Hook the tails of the branches to the closing node. */
10135 for (br = ret; br; br = regnext(br)) {
10136 const U8 op = PL_regkind[OP(br)];
10137 if (op == BRANCH) {
10138 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10139 if ( OP(NEXTOPER(br)) != NOTHING
10140 || regnext(NEXTOPER(br)) != ender)
10143 else if (op == BRANCHJ) {
10144 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10145 /* for now we always disable this optimisation * /
10146 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10147 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10153 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10154 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10155 SV * const mysv_val1=sv_newmortal();
10156 SV * const mysv_val2=sv_newmortal();
10157 DEBUG_PARSE_MSG("NADA");
10158 regprop(RExC_rx, mysv_val1, ret);
10159 regprop(RExC_rx, mysv_val2, ender);
10160 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10161 SvPV_nolen_const(mysv_val1),
10162 (IV)REG_NODE_NUM(ret),
10163 SvPV_nolen_const(mysv_val2),
10164 (IV)REG_NODE_NUM(ender),
10169 if (OP(ender) == TAIL) {
10174 for ( opt= br + 1; opt < ender ; opt++ )
10175 OP(opt)= OPTIMIZED;
10176 NEXT_OFF(br)= ender - br;
10184 static const char parens[] = "=!<,>";
10186 if (paren && (p = strchr(parens, paren))) {
10187 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10188 int flag = (p - parens) > 1;
10191 node = SUSPEND, flag = 0;
10192 reginsert(pRExC_state, node,ret, depth+1);
10193 Set_Node_Cur_Length(ret, parse_start);
10194 Set_Node_Offset(ret, parse_start + 1);
10196 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10200 /* Check for proper termination. */
10202 /* restore original flags, but keep (?p) */
10203 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10204 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10205 RExC_parse = oregcomp_parse;
10206 vFAIL("Unmatched (");
10209 else if (!paren && RExC_parse < RExC_end) {
10210 if (*RExC_parse == ')') {
10212 vFAIL("Unmatched )");
10215 FAIL("Junk on end of regexp"); /* "Can't happen". */
10216 assert(0); /* NOTREACHED */
10219 if (RExC_in_lookbehind) {
10220 RExC_in_lookbehind--;
10222 if (after_freeze > RExC_npar)
10223 RExC_npar = after_freeze;
10228 - regbranch - one alternative of an | operator
10230 * Implements the concatenation operator.
10232 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10236 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10240 regnode *chain = NULL;
10242 I32 flags = 0, c = 0;
10243 GET_RE_DEBUG_FLAGS_DECL;
10245 PERL_ARGS_ASSERT_REGBRANCH;
10247 DEBUG_PARSE("brnc");
10252 if (!SIZE_ONLY && RExC_extralen)
10253 ret = reganode(pRExC_state, BRANCHJ,0);
10255 ret = reg_node(pRExC_state, BRANCH);
10256 Set_Node_Length(ret, 1);
10260 if (!first && SIZE_ONLY)
10261 RExC_extralen += 1; /* BRANCHJ */
10263 *flagp = WORST; /* Tentatively. */
10266 nextchar(pRExC_state);
10267 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10268 flags &= ~TRYAGAIN;
10269 latest = regpiece(pRExC_state, &flags,depth+1);
10270 if (latest == NULL) {
10271 if (flags & TRYAGAIN)
10273 if (flags & RESTART_UTF8) {
10274 *flagp = RESTART_UTF8;
10277 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10279 else if (ret == NULL)
10281 *flagp |= flags&(HASWIDTH|POSTPONED);
10282 if (chain == NULL) /* First piece. */
10283 *flagp |= flags&SPSTART;
10286 REGTAIL(pRExC_state, chain, latest);
10291 if (chain == NULL) { /* Loop ran zero times. */
10292 chain = reg_node(pRExC_state, NOTHING);
10297 *flagp |= flags&SIMPLE;
10304 - regpiece - something followed by possible [*+?]
10306 * Note that the branching code sequences used for ? and the general cases
10307 * of * and + are somewhat optimized: they use the same NOTHING node as
10308 * both the endmarker for their branch list and the body of the last branch.
10309 * It might seem that this node could be dispensed with entirely, but the
10310 * endmarker role is not redundant.
10312 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10314 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10318 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10325 const char * const origparse = RExC_parse;
10327 I32 max = REG_INFTY;
10328 #ifdef RE_TRACK_PATTERN_OFFSETS
10331 const char *maxpos = NULL;
10333 /* Save the original in case we change the emitted regop to a FAIL. */
10334 regnode * const orig_emit = RExC_emit;
10336 GET_RE_DEBUG_FLAGS_DECL;
10338 PERL_ARGS_ASSERT_REGPIECE;
10340 DEBUG_PARSE("piec");
10342 ret = regatom(pRExC_state, &flags,depth+1);
10344 if (flags & (TRYAGAIN|RESTART_UTF8))
10345 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10347 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10353 if (op == '{' && regcurly(RExC_parse, FALSE)) {
10355 #ifdef RE_TRACK_PATTERN_OFFSETS
10356 parse_start = RExC_parse; /* MJD */
10358 next = RExC_parse + 1;
10359 while (isDIGIT(*next) || *next == ',') {
10360 if (*next == ',') {
10368 if (*next == '}') { /* got one */
10372 min = atoi(RExC_parse);
10373 if (*maxpos == ',')
10376 maxpos = RExC_parse;
10377 max = atoi(maxpos);
10378 if (!max && *maxpos != '0')
10379 max = REG_INFTY; /* meaning "infinity" */
10380 else if (max >= REG_INFTY)
10381 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10383 nextchar(pRExC_state);
10384 if (max < min) { /* If can't match, warn and optimize to fail
10387 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10389 /* We can't back off the size because we have to reserve
10390 * enough space for all the things we are about to throw
10391 * away, but we can shrink it by the ammount we are about
10392 * to re-use here */
10393 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10396 RExC_emit = orig_emit;
10398 ret = reg_node(pRExC_state, OPFAIL);
10401 else if (min == max
10402 && RExC_parse < RExC_end
10403 && (*RExC_parse == '?' || *RExC_parse == '+'))
10406 ckWARN2reg(RExC_parse + 1,
10407 "Useless use of greediness modifier '%c'",
10410 /* Absorb the modifier, so later code doesn't see nor use
10412 nextchar(pRExC_state);
10416 if ((flags&SIMPLE)) {
10417 RExC_naughty += 2 + RExC_naughty / 2;
10418 reginsert(pRExC_state, CURLY, ret, depth+1);
10419 Set_Node_Offset(ret, parse_start+1); /* MJD */
10420 Set_Node_Cur_Length(ret, parse_start);
10423 regnode * const w = reg_node(pRExC_state, WHILEM);
10426 REGTAIL(pRExC_state, ret, w);
10427 if (!SIZE_ONLY && RExC_extralen) {
10428 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10429 reginsert(pRExC_state, NOTHING,ret, depth+1);
10430 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10432 reginsert(pRExC_state, CURLYX,ret, depth+1);
10434 Set_Node_Offset(ret, parse_start+1);
10435 Set_Node_Length(ret,
10436 op == '{' ? (RExC_parse - parse_start) : 1);
10438 if (!SIZE_ONLY && RExC_extralen)
10439 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10440 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10442 RExC_whilem_seen++, RExC_extralen += 3;
10443 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10450 *flagp |= HASWIDTH;
10452 ARG1_SET(ret, (U16)min);
10453 ARG2_SET(ret, (U16)max);
10460 if (!ISMULT1(op)) {
10465 #if 0 /* Now runtime fix should be reliable. */
10467 /* if this is reinstated, don't forget to put this back into perldiag:
10469 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10471 (F) The part of the regexp subject to either the * or + quantifier
10472 could match an empty string. The {#} shows in the regular
10473 expression about where the problem was discovered.
10477 if (!(flags&HASWIDTH) && op != '?')
10478 vFAIL("Regexp *+ operand could be empty");
10481 #ifdef RE_TRACK_PATTERN_OFFSETS
10482 parse_start = RExC_parse;
10484 nextchar(pRExC_state);
10486 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10488 if (op == '*' && (flags&SIMPLE)) {
10489 reginsert(pRExC_state, STAR, ret, depth+1);
10493 else if (op == '*') {
10497 else if (op == '+' && (flags&SIMPLE)) {
10498 reginsert(pRExC_state, PLUS, ret, depth+1);
10502 else if (op == '+') {
10506 else if (op == '?') {
10511 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10512 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10513 ckWARN2reg(RExC_parse,
10514 "%"UTF8f" matches null string many times",
10515 UTF8fARG(UTF, (RExC_parse >= origparse
10516 ? RExC_parse - origparse
10519 (void)ReREFCNT_inc(RExC_rx_sv);
10522 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10523 nextchar(pRExC_state);
10524 reginsert(pRExC_state, MINMOD, ret, depth+1);
10525 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10528 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10530 nextchar(pRExC_state);
10531 ender = reg_node(pRExC_state, SUCCEED);
10532 REGTAIL(pRExC_state, ret, ender);
10533 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10535 ender = reg_node(pRExC_state, TAIL);
10536 REGTAIL(pRExC_state, ret, ender);
10539 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10541 vFAIL("Nested quantifiers");
10548 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10549 UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10550 const bool strict /* Apply stricter parsing rules? */
10554 /* This is expected to be called by a parser routine that has recognized '\N'
10555 and needs to handle the rest. RExC_parse is expected to point at the first
10556 char following the N at the time of the call. On successful return,
10557 RExC_parse has been updated to point to just after the sequence identified
10558 by this routine, and <*flagp> has been updated.
10560 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10563 \N may begin either a named sequence, or if outside a character class, mean
10564 to match a non-newline. For non single-quoted regexes, the tokenizer has
10565 attempted to decide which, and in the case of a named sequence, converted it
10566 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10567 where c1... are the characters in the sequence. For single-quoted regexes,
10568 the tokenizer passes the \N sequence through unchanged; this code will not
10569 attempt to determine this nor expand those, instead raising a syntax error.
10570 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10571 or there is no '}', it signals that this \N occurrence means to match a
10574 Only the \N{U+...} form should occur in a character class, for the same
10575 reason that '.' inside a character class means to just match a period: it
10576 just doesn't make sense.
10578 The function raises an error (via vFAIL), and doesn't return for various
10579 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10580 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10581 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10582 only possible if node_p is non-NULL.
10585 If <valuep> is non-null, it means the caller can accept an input sequence
10586 consisting of a just a single code point; <*valuep> is set to that value
10587 if the input is such.
10589 If <node_p> is non-null it signifies that the caller can accept any other
10590 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10592 1) \N means not-a-NL: points to a newly created REG_ANY node;
10593 2) \N{}: points to a new NOTHING node;
10594 3) otherwise: points to a new EXACT node containing the resolved
10596 Note that FALSE is returned for single code point sequences if <valuep> is
10600 char * endbrace; /* '}' following the name */
10602 char *endchar; /* Points to '.' or '}' ending cur char in the input
10604 bool has_multiple_chars; /* true if the input stream contains a sequence of
10605 more than one character */
10607 GET_RE_DEBUG_FLAGS_DECL;
10609 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10611 GET_RE_DEBUG_FLAGS;
10613 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10615 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10616 * modifier. The other meaning does not, so use a temporary until we find
10617 * out which we are being called with */
10618 p = (RExC_flags & RXf_PMf_EXTENDED)
10619 ? regwhite( pRExC_state, RExC_parse )
10622 /* Disambiguate between \N meaning a named character versus \N meaning
10623 * [^\n]. The former is assumed when it can't be the latter. */
10624 if (*p != '{' || regcurly(p, FALSE)) {
10627 /* no bare \N allowed in a charclass */
10628 if (in_char_class) {
10629 vFAIL("\\N in a character class must be a named character: \\N{...}");
10633 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10635 nextchar(pRExC_state);
10636 *node_p = reg_node(pRExC_state, REG_ANY);
10637 *flagp |= HASWIDTH|SIMPLE;
10639 Set_Node_Length(*node_p, 1); /* MJD */
10643 /* Here, we have decided it should be a named character or sequence */
10645 /* The test above made sure that the next real character is a '{', but
10646 * under the /x modifier, it could be separated by space (or a comment and
10647 * \n) and this is not allowed (for consistency with \x{...} and the
10648 * tokenizer handling of \N{NAME}). */
10649 if (*RExC_parse != '{') {
10650 vFAIL("Missing braces on \\N{}");
10653 RExC_parse++; /* Skip past the '{' */
10655 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10656 || ! (endbrace == RExC_parse /* nothing between the {} */
10657 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10659 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10662 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10663 vFAIL("\\N{NAME} must be resolved by the lexer");
10666 if (endbrace == RExC_parse) { /* empty: \N{} */
10669 *node_p = reg_node(pRExC_state,NOTHING);
10671 else if (in_char_class) {
10672 if (SIZE_ONLY && in_char_class) {
10674 RExC_parse++; /* Position after the "}" */
10675 vFAIL("Zero length \\N{}");
10678 ckWARNreg(RExC_parse,
10679 "Ignoring zero length \\N{} in character class");
10687 nextchar(pRExC_state);
10691 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10692 RExC_parse += 2; /* Skip past the 'U+' */
10694 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10696 /* Code points are separated by dots. If none, there is only one code
10697 * point, and is terminated by the brace */
10698 has_multiple_chars = (endchar < endbrace);
10700 if (valuep && (! has_multiple_chars || in_char_class)) {
10701 /* We only pay attention to the first char of
10702 multichar strings being returned in char classes. I kinda wonder
10703 if this makes sense as it does change the behaviour
10704 from earlier versions, OTOH that behaviour was broken
10705 as well. XXX Solution is to recharacterize as
10706 [rest-of-class]|multi1|multi2... */
10708 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10709 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10710 | PERL_SCAN_DISALLOW_PREFIX
10711 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10713 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10715 /* The tokenizer should have guaranteed validity, but it's possible to
10716 * bypass it by using single quoting, so check */
10717 if (length_of_hex == 0
10718 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10720 RExC_parse += length_of_hex; /* Includes all the valid */
10721 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10722 ? UTF8SKIP(RExC_parse)
10724 /* Guard against malformed utf8 */
10725 if (RExC_parse >= endchar) {
10726 RExC_parse = endchar;
10728 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10731 if (in_char_class && has_multiple_chars) {
10733 RExC_parse = endbrace;
10734 vFAIL("\\N{} in character class restricted to one character");
10737 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10741 RExC_parse = endbrace + 1;
10743 else if (! node_p || ! has_multiple_chars) {
10745 /* Here, the input is legal, but not according to the caller's
10746 * options. We fail without advancing the parse, so that the
10747 * caller can try again */
10753 /* What is done here is to convert this to a sub-pattern of the form
10754 * (?:\x{char1}\x{char2}...)
10755 * and then call reg recursively. That way, it retains its atomicness,
10756 * while not having to worry about special handling that some code
10757 * points may have. toke.c has converted the original Unicode values
10758 * to native, so that we can just pass on the hex values unchanged. We
10759 * do have to set a flag to keep recoding from happening in the
10762 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10764 char *orig_end = RExC_end;
10767 while (RExC_parse < endbrace) {
10769 /* Convert to notation the rest of the code understands */
10770 sv_catpv(substitute_parse, "\\x{");
10771 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10772 sv_catpv(substitute_parse, "}");
10774 /* Point to the beginning of the next character in the sequence. */
10775 RExC_parse = endchar + 1;
10776 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10778 sv_catpv(substitute_parse, ")");
10780 RExC_parse = SvPV(substitute_parse, len);
10782 /* Don't allow empty number */
10784 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10786 RExC_end = RExC_parse + len;
10788 /* The values are Unicode, and therefore not subject to recoding */
10789 RExC_override_recoding = 1;
10791 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10792 if (flags & RESTART_UTF8) {
10793 *flagp = RESTART_UTF8;
10796 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10799 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10801 RExC_parse = endbrace;
10802 RExC_end = orig_end;
10803 RExC_override_recoding = 0;
10805 nextchar(pRExC_state);
10815 * It returns the code point in utf8 for the value in *encp.
10816 * value: a code value in the source encoding
10817 * encp: a pointer to an Encode object
10819 * If the result from Encode is not a single character,
10820 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10823 S_reg_recode(pTHX_ const char value, SV **encp)
10826 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10827 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10828 const STRLEN newlen = SvCUR(sv);
10829 UV uv = UNICODE_REPLACEMENT;
10831 PERL_ARGS_ASSERT_REG_RECODE;
10835 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10838 if (!newlen || numlen != newlen) {
10839 uv = UNICODE_REPLACEMENT;
10845 PERL_STATIC_INLINE U8
10846 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10850 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10856 op = get_regex_charset(RExC_flags);
10857 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10858 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10859 been, so there is no hole */
10862 return op + EXACTF;
10865 PERL_STATIC_INLINE void
10866 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10867 regnode *node, I32* flagp, STRLEN len, UV code_point)
10869 /* This knows the details about sizing an EXACTish node, setting flags for
10870 * it (by setting <*flagp>, and potentially populating it with a single
10873 * If <len> (the length in bytes) is non-zero, this function assumes that
10874 * the node has already been populated, and just does the sizing. In this
10875 * case <code_point> should be the final code point that has already been
10876 * placed into the node. This value will be ignored except that under some
10877 * circumstances <*flagp> is set based on it.
10879 * If <len> is zero, the function assumes that the node is to contain only
10880 * the single character given by <code_point> and calculates what <len>
10881 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10882 * additionally will populate the node's STRING with <code_point> or its
10885 * In both cases <*flagp> is appropriately set
10887 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10888 * 255, must be folded (the former only when the rules indicate it can
10891 bool len_passed_in = cBOOL(len != 0);
10892 U8 character[UTF8_MAXBYTES_CASE+1];
10894 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10896 if (! len_passed_in) {
10898 if (UNI_IS_INVARIANT(code_point)) {
10899 if (LOC || ! FOLD) { /* /l defers folding until runtime */
10900 *character = (U8) code_point;
10902 else { /* Here is /i and not /l (toFOLD() is defined on just
10903 ASCII, which isn't the same thing as INVARIANT on
10904 EBCDIC, but it works there, as the extra invariants
10905 fold to themselves) */
10906 *character = toFOLD((U8) code_point);
10910 else if (FOLD && (! LOC
10911 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10912 { /* Folding, and ok to do so now */
10913 _to_uni_fold_flags(code_point,
10916 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
10917 ? FOLD_FLAGS_NOMIX_ASCII
10920 else if (code_point <= MAX_UTF8_TWO_BYTE) {
10922 /* Not folding this cp, and can output it directly */
10923 *character = UTF8_TWO_BYTE_HI(code_point);
10924 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
10928 uvchr_to_utf8( character, code_point);
10929 len = UTF8SKIP(character);
10931 } /* Else pattern isn't UTF8. We only fold the sharp s, when
10933 else if (UNLIKELY(code_point == LATIN_SMALL_LETTER_SHARP_S)
10935 && AT_LEAST_UNI_SEMANTICS
10936 && ! ASCII_FOLD_RESTRICTED)
10939 *(character + 1) = 's';
10943 *character = (U8) code_point;
10949 RExC_size += STR_SZ(len);
10952 RExC_emit += STR_SZ(len);
10953 STR_LEN(node) = len;
10954 if (! len_passed_in) {
10955 Copy((char *) character, STRING(node), len, char);
10959 *flagp |= HASWIDTH;
10961 /* A single character node is SIMPLE, except for the special-cased SHARP S
10963 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10964 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10965 || ! FOLD || ! DEPENDS_SEMANTICS))
10972 /* return atoi(p), unless it's too big to sensibly be a backref,
10973 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10976 S_backref_value(char *p)
10980 for (;isDIGIT(*q); q++); /* calculate length of num */
10981 if (q - p == 0 || q - p > 9)
10988 - regatom - the lowest level
10990 Try to identify anything special at the start of the pattern. If there
10991 is, then handle it as required. This may involve generating a single regop,
10992 such as for an assertion; or it may involve recursing, such as to
10993 handle a () structure.
10995 If the string doesn't start with something special then we gobble up
10996 as much literal text as we can.
10998 Once we have been able to handle whatever type of thing started the
10999 sequence, we return.
11001 Note: we have to be careful with escapes, as they can be both literal
11002 and special, and in the case of \10 and friends, context determines which.
11004 A summary of the code structure is:
11006 switch (first_byte) {
11007 cases for each special:
11008 handle this special;
11011 switch (2nd byte) {
11012 cases for each unambiguous special:
11013 handle this special;
11015 cases for each ambigous special/literal:
11017 if (special) handle here
11019 default: // unambiguously literal:
11022 default: // is a literal char
11025 create EXACTish node for literal;
11026 while (more input and node isn't full) {
11027 switch (input_byte) {
11028 cases for each special;
11029 make sure parse pointer is set so that the next call to
11030 regatom will see this special first
11031 goto loopdone; // EXACTish node terminated by prev. char
11033 append char to EXACTISH node;
11035 get next input byte;
11039 return the generated node;
11041 Specifically there are two separate switches for handling
11042 escape sequences, with the one for handling literal escapes requiring
11043 a dummy entry for all of the special escapes that are actually handled
11046 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11048 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11050 Otherwise does not return NULL.
11054 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11057 regnode *ret = NULL;
11059 char *parse_start = RExC_parse;
11063 GET_RE_DEBUG_FLAGS_DECL;
11065 *flagp = WORST; /* Tentatively. */
11067 DEBUG_PARSE("atom");
11069 PERL_ARGS_ASSERT_REGATOM;
11072 switch ((U8)*RExC_parse) {
11074 RExC_seen_zerolen++;
11075 nextchar(pRExC_state);
11076 if (RExC_flags & RXf_PMf_MULTILINE)
11077 ret = reg_node(pRExC_state, MBOL);
11078 else if (RExC_flags & RXf_PMf_SINGLELINE)
11079 ret = reg_node(pRExC_state, SBOL);
11081 ret = reg_node(pRExC_state, BOL);
11082 Set_Node_Length(ret, 1); /* MJD */
11085 nextchar(pRExC_state);
11087 RExC_seen_zerolen++;
11088 if (RExC_flags & RXf_PMf_MULTILINE)
11089 ret = reg_node(pRExC_state, MEOL);
11090 else if (RExC_flags & RXf_PMf_SINGLELINE)
11091 ret = reg_node(pRExC_state, SEOL);
11093 ret = reg_node(pRExC_state, EOL);
11094 Set_Node_Length(ret, 1); /* MJD */
11097 nextchar(pRExC_state);
11098 if (RExC_flags & RXf_PMf_SINGLELINE)
11099 ret = reg_node(pRExC_state, SANY);
11101 ret = reg_node(pRExC_state, REG_ANY);
11102 *flagp |= HASWIDTH|SIMPLE;
11104 Set_Node_Length(ret, 1); /* MJD */
11108 char * const oregcomp_parse = ++RExC_parse;
11109 ret = regclass(pRExC_state, flagp,depth+1,
11110 FALSE, /* means parse the whole char class */
11111 TRUE, /* allow multi-char folds */
11112 FALSE, /* don't silence non-portable warnings. */
11114 if (*RExC_parse != ']') {
11115 RExC_parse = oregcomp_parse;
11116 vFAIL("Unmatched [");
11119 if (*flagp & RESTART_UTF8)
11121 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11124 nextchar(pRExC_state);
11125 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11129 nextchar(pRExC_state);
11130 ret = reg(pRExC_state, 2, &flags,depth+1);
11132 if (flags & TRYAGAIN) {
11133 if (RExC_parse == RExC_end) {
11134 /* Make parent create an empty node if needed. */
11135 *flagp |= TRYAGAIN;
11140 if (flags & RESTART_UTF8) {
11141 *flagp = RESTART_UTF8;
11144 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11147 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11151 if (flags & TRYAGAIN) {
11152 *flagp |= TRYAGAIN;
11155 vFAIL("Internal urp");
11156 /* Supposed to be caught earlier. */
11159 if (!regcurly(RExC_parse, FALSE)) {
11168 vFAIL("Quantifier follows nothing");
11173 This switch handles escape sequences that resolve to some kind
11174 of special regop and not to literal text. Escape sequnces that
11175 resolve to literal text are handled below in the switch marked
11178 Every entry in this switch *must* have a corresponding entry
11179 in the literal escape switch. However, the opposite is not
11180 required, as the default for this switch is to jump to the
11181 literal text handling code.
11183 switch ((U8)*++RExC_parse) {
11185 /* Special Escapes */
11187 RExC_seen_zerolen++;
11188 ret = reg_node(pRExC_state, SBOL);
11190 goto finish_meta_pat;
11192 ret = reg_node(pRExC_state, GPOS);
11193 RExC_seen |= REG_SEEN_GPOS;
11195 goto finish_meta_pat;
11197 RExC_seen_zerolen++;
11198 ret = reg_node(pRExC_state, KEEPS);
11200 /* XXX:dmq : disabling in-place substitution seems to
11201 * be necessary here to avoid cases of memory corruption, as
11202 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11204 RExC_seen |= REG_SEEN_LOOKBEHIND;
11205 goto finish_meta_pat;
11207 ret = reg_node(pRExC_state, SEOL);
11209 RExC_seen_zerolen++; /* Do not optimize RE away */
11210 goto finish_meta_pat;
11212 ret = reg_node(pRExC_state, EOS);
11214 RExC_seen_zerolen++; /* Do not optimize RE away */
11215 goto finish_meta_pat;
11217 ret = reg_node(pRExC_state, CANY);
11218 RExC_seen |= REG_SEEN_CANY;
11219 *flagp |= HASWIDTH|SIMPLE;
11220 goto finish_meta_pat;
11222 ret = reg_node(pRExC_state, CLUMP);
11223 *flagp |= HASWIDTH;
11224 goto finish_meta_pat;
11230 arg = ANYOF_WORDCHAR;
11234 RExC_seen_zerolen++;
11235 RExC_seen |= REG_SEEN_LOOKBEHIND;
11236 op = BOUND + get_regex_charset(RExC_flags);
11237 if (op > BOUNDA) { /* /aa is same as /a */
11240 ret = reg_node(pRExC_state, op);
11241 FLAGS(ret) = get_regex_charset(RExC_flags);
11243 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11244 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
11246 goto finish_meta_pat;
11248 RExC_seen_zerolen++;
11249 RExC_seen |= REG_SEEN_LOOKBEHIND;
11250 op = NBOUND + get_regex_charset(RExC_flags);
11251 if (op > NBOUNDA) { /* /aa is same as /a */
11254 ret = reg_node(pRExC_state, op);
11255 FLAGS(ret) = get_regex_charset(RExC_flags);
11257 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11258 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
11260 goto finish_meta_pat;
11270 ret = reg_node(pRExC_state, LNBREAK);
11271 *flagp |= HASWIDTH|SIMPLE;
11272 goto finish_meta_pat;
11280 goto join_posix_op_known;
11286 arg = ANYOF_VERTWS;
11288 goto join_posix_op_known;
11298 op = POSIXD + get_regex_charset(RExC_flags);
11299 if (op > POSIXA) { /* /aa is same as /a */
11303 join_posix_op_known:
11306 op += NPOSIXD - POSIXD;
11309 ret = reg_node(pRExC_state, op);
11311 FLAGS(ret) = namedclass_to_classnum(arg);
11314 *flagp |= HASWIDTH|SIMPLE;
11318 nextchar(pRExC_state);
11319 Set_Node_Length(ret, 2); /* MJD */
11325 char* parse_start = RExC_parse - 2;
11330 ret = regclass(pRExC_state, flagp,depth+1,
11331 TRUE, /* means just parse this element */
11332 FALSE, /* don't allow multi-char folds */
11333 FALSE, /* don't silence non-portable warnings.
11334 It would be a bug if these returned
11337 /* regclass() can only return RESTART_UTF8 if multi-char folds
11340 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11345 Set_Node_Offset(ret, parse_start + 2);
11346 Set_Node_Cur_Length(ret, parse_start);
11347 nextchar(pRExC_state);
11351 /* Handle \N and \N{NAME} with multiple code points here and not
11352 * below because it can be multicharacter. join_exact() will join
11353 * them up later on. Also this makes sure that things like
11354 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11355 * The options to the grok function call causes it to fail if the
11356 * sequence is just a single code point. We then go treat it as
11357 * just another character in the current EXACT node, and hence it
11358 * gets uniform treatment with all the other characters. The
11359 * special treatment for quantifiers is not needed for such single
11360 * character sequences */
11362 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11363 FALSE /* not strict */ )) {
11364 if (*flagp & RESTART_UTF8)
11370 case 'k': /* Handle \k<NAME> and \k'NAME' */
11373 char ch= RExC_parse[1];
11374 if (ch != '<' && ch != '\'' && ch != '{') {
11376 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11377 vFAIL2("Sequence %.2s... not terminated",parse_start);
11379 /* this pretty much dupes the code for (?P=...) in reg(), if
11380 you change this make sure you change that */
11381 char* name_start = (RExC_parse += 2);
11383 SV *sv_dat = reg_scan_name(pRExC_state,
11384 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11385 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11386 if (RExC_parse == name_start || *RExC_parse != ch)
11387 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11388 vFAIL2("Sequence %.3s... not terminated",parse_start);
11391 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11392 RExC_rxi->data->data[num]=(void*)sv_dat;
11393 SvREFCNT_inc_simple_void(sv_dat);
11397 ret = reganode(pRExC_state,
11400 : (ASCII_FOLD_RESTRICTED)
11402 : (AT_LEAST_UNI_SEMANTICS)
11408 *flagp |= HASWIDTH;
11410 /* override incorrect value set in reganode MJD */
11411 Set_Node_Offset(ret, parse_start+1);
11412 Set_Node_Cur_Length(ret, parse_start);
11413 nextchar(pRExC_state);
11419 case '1': case '2': case '3': case '4':
11420 case '5': case '6': case '7': case '8': case '9':
11425 if (*RExC_parse == 'g') {
11429 if (*RExC_parse == '{') {
11433 if (*RExC_parse == '-') {
11437 if (hasbrace && !isDIGIT(*RExC_parse)) {
11438 if (isrel) RExC_parse--;
11440 goto parse_named_seq;
11443 num = S_backref_value(RExC_parse);
11445 vFAIL("Reference to invalid group 0");
11446 else if (num == I32_MAX) {
11447 if (isDIGIT(*RExC_parse))
11448 vFAIL("Reference to nonexistent group");
11450 vFAIL("Unterminated \\g... pattern");
11454 num = RExC_npar - num;
11456 vFAIL("Reference to nonexistent or unclosed group");
11460 num = S_backref_value(RExC_parse);
11461 /* bare \NNN might be backref or octal */
11462 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11463 && *RExC_parse != '8' && *RExC_parse != '9'))
11464 /* Probably a character specified in octal, e.g. \35 */
11468 /* at this point RExC_parse definitely points to a backref
11471 #ifdef RE_TRACK_PATTERN_OFFSETS
11472 char * const parse_start = RExC_parse - 1; /* MJD */
11474 while (isDIGIT(*RExC_parse))
11477 if (*RExC_parse != '}')
11478 vFAIL("Unterminated \\g{...} pattern");
11482 if (num > (I32)RExC_rx->nparens)
11483 vFAIL("Reference to nonexistent group");
11486 ret = reganode(pRExC_state,
11489 : (ASCII_FOLD_RESTRICTED)
11491 : (AT_LEAST_UNI_SEMANTICS)
11497 *flagp |= HASWIDTH;
11499 /* override incorrect value set in reganode MJD */
11500 Set_Node_Offset(ret, parse_start+1);
11501 Set_Node_Cur_Length(ret, parse_start);
11503 nextchar(pRExC_state);
11508 if (RExC_parse >= RExC_end)
11509 FAIL("Trailing \\");
11512 /* Do not generate "unrecognized" warnings here, we fall
11513 back into the quick-grab loop below */
11520 if (RExC_flags & RXf_PMf_EXTENDED) {
11521 if ( reg_skipcomment( pRExC_state ) )
11528 parse_start = RExC_parse - 1;
11537 #define MAX_NODE_STRING_SIZE 127
11538 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11540 U8 upper_parse = MAX_NODE_STRING_SIZE;
11541 U8 node_type = compute_EXACTish(pRExC_state);
11542 bool next_is_quantifier;
11543 char * oldp = NULL;
11545 /* We can convert EXACTF nodes to EXACTFU if they contain only
11546 * characters that match identically regardless of the target
11547 * string's UTF8ness. The reason to do this is that EXACTF is not
11548 * trie-able, EXACTFU is.
11550 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11551 * contain only above-Latin1 characters (hence must be in UTF8),
11552 * which don't participate in folds with Latin1-range characters,
11553 * as the latter's folds aren't known until runtime. (We don't
11554 * need to figure this out until pass 2) */
11555 bool maybe_exactfu = PASS2
11556 && (node_type == EXACTF || node_type == EXACTFL);
11558 /* If a folding node contains only code points that don't
11559 * participate in folds, it can be changed into an EXACT node,
11560 * which allows the optimizer more things to look for */
11563 ret = reg_node(pRExC_state, node_type);
11565 /* In pass1, folded, we use a temporary buffer instead of the
11566 * actual node, as the node doesn't exist yet */
11567 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11573 /* We do the EXACTFish to EXACT node only if folding. (And we
11574 * don't need to figure this out until pass 2) */
11575 maybe_exact = FOLD && PASS2;
11577 /* XXX The node can hold up to 255 bytes, yet this only goes to
11578 * 127. I (khw) do not know why. Keeping it somewhat less than
11579 * 255 allows us to not have to worry about overflow due to
11580 * converting to utf8 and fold expansion, but that value is
11581 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11582 * split up by this limit into a single one using the real max of
11583 * 255. Even at 127, this breaks under rare circumstances. If
11584 * folding, we do not want to split a node at a character that is a
11585 * non-final in a multi-char fold, as an input string could just
11586 * happen to want to match across the node boundary. The join
11587 * would solve that problem if the join actually happens. But a
11588 * series of more than two nodes in a row each of 127 would cause
11589 * the first join to succeed to get to 254, but then there wouldn't
11590 * be room for the next one, which could at be one of those split
11591 * multi-char folds. I don't know of any fool-proof solution. One
11592 * could back off to end with only a code point that isn't such a
11593 * non-final, but it is possible for there not to be any in the
11595 for (p = RExC_parse - 1;
11596 len < upper_parse && p < RExC_end;
11601 if (RExC_flags & RXf_PMf_EXTENDED)
11602 p = regwhite( pRExC_state, p );
11613 /* Literal Escapes Switch
11615 This switch is meant to handle escape sequences that
11616 resolve to a literal character.
11618 Every escape sequence that represents something
11619 else, like an assertion or a char class, is handled
11620 in the switch marked 'Special Escapes' above in this
11621 routine, but also has an entry here as anything that
11622 isn't explicitly mentioned here will be treated as
11623 an unescaped equivalent literal.
11626 switch ((U8)*++p) {
11627 /* These are all the special escapes. */
11628 case 'A': /* Start assertion */
11629 case 'b': case 'B': /* Word-boundary assertion*/
11630 case 'C': /* Single char !DANGEROUS! */
11631 case 'd': case 'D': /* digit class */
11632 case 'g': case 'G': /* generic-backref, pos assertion */
11633 case 'h': case 'H': /* HORIZWS */
11634 case 'k': case 'K': /* named backref, keep marker */
11635 case 'p': case 'P': /* Unicode property */
11636 case 'R': /* LNBREAK */
11637 case 's': case 'S': /* space class */
11638 case 'v': case 'V': /* VERTWS */
11639 case 'w': case 'W': /* word class */
11640 case 'X': /* eXtended Unicode "combining
11641 character sequence" */
11642 case 'z': case 'Z': /* End of line/string assertion */
11646 /* Anything after here is an escape that resolves to a
11647 literal. (Except digits, which may or may not)
11653 case 'N': /* Handle a single-code point named character. */
11654 /* The options cause it to fail if a multiple code
11655 * point sequence. Handle those in the switch() above
11657 RExC_parse = p + 1;
11658 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11659 flagp, depth, FALSE,
11660 FALSE /* not strict */ ))
11662 if (*flagp & RESTART_UTF8)
11663 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11664 RExC_parse = p = oldp;
11668 if (ender > 0xff) {
11685 ender = ASCII_TO_NATIVE('\033');
11695 const char* error_msg;
11697 bool valid = grok_bslash_o(&p,
11700 TRUE, /* out warnings */
11701 FALSE, /* not strict */
11702 TRUE, /* Output warnings
11707 RExC_parse = p; /* going to die anyway; point
11708 to exact spot of failure */
11712 if (PL_encoding && ender < 0x100) {
11713 goto recode_encoding;
11715 if (ender > 0xff) {
11722 UV result = UV_MAX; /* initialize to erroneous
11724 const char* error_msg;
11726 bool valid = grok_bslash_x(&p,
11729 TRUE, /* out warnings */
11730 FALSE, /* not strict */
11731 TRUE, /* Output warnings
11736 RExC_parse = p; /* going to die anyway; point
11737 to exact spot of failure */
11742 if (PL_encoding && ender < 0x100) {
11743 goto recode_encoding;
11745 if (ender > 0xff) {
11752 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11754 case '8': case '9': /* must be a backreference */
11757 case '1': case '2': case '3':case '4':
11758 case '5': case '6': case '7':
11759 /* When we parse backslash escapes there is ambiguity
11760 * between backreferences and octal escapes. Any escape
11761 * from \1 - \9 is a backreference, any multi-digit
11762 * escape which does not start with 0 and which when
11763 * evaluated as decimal could refer to an already
11764 * parsed capture buffer is a backslash. Anything else
11767 * Note this implies that \118 could be interpreted as
11768 * 118 OR as "\11" . "8" depending on whether there
11769 * were 118 capture buffers defined already in the
11771 if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11772 { /* Not to be treated as an octal constant, go
11779 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11781 ender = grok_oct(p, &numlen, &flags, NULL);
11782 if (ender > 0xff) {
11786 if (SIZE_ONLY /* like \08, \178 */
11789 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11791 reg_warn_non_literal_string(
11793 form_short_octal_warning(p, numlen));
11796 if (PL_encoding && ender < 0x100)
11797 goto recode_encoding;
11800 if (! RExC_override_recoding) {
11801 SV* enc = PL_encoding;
11802 ender = reg_recode((const char)(U8)ender, &enc);
11803 if (!enc && SIZE_ONLY)
11804 ckWARNreg(p, "Invalid escape in the specified encoding");
11810 FAIL("Trailing \\");
11813 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11814 /* Include any { following the alpha to emphasize
11815 * that it could be part of an escape at some point
11817 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11818 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11820 goto normal_default;
11821 } /* End of switch on '\' */
11823 default: /* A literal character */
11826 && RExC_flags & RXf_PMf_EXTENDED
11827 && ckWARN_d(WARN_DEPRECATED)
11828 && is_PATWS_non_low(p, UTF))
11830 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11831 "Escape literal pattern white space under /x");
11835 if (UTF8_IS_START(*p) && UTF) {
11837 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11838 &numlen, UTF8_ALLOW_DEFAULT);
11844 } /* End of switch on the literal */
11846 /* Here, have looked at the literal character and <ender>
11847 * contains its ordinal, <p> points to the character after it
11850 if ( RExC_flags & RXf_PMf_EXTENDED)
11851 p = regwhite( pRExC_state, p );
11853 /* If the next thing is a quantifier, it applies to this
11854 * character only, which means that this character has to be in
11855 * its own node and can't just be appended to the string in an
11856 * existing node, so if there are already other characters in
11857 * the node, close the node with just them, and set up to do
11858 * this character again next time through, when it will be the
11859 * only thing in its new node */
11860 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11866 if (! FOLD /* The simple case, just append the literal */
11867 || (LOC /* Also don't fold for tricky chars under /l */
11868 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
11871 const STRLEN unilen = reguni(pRExC_state, ender, s);
11877 /* The loop increments <len> each time, as all but this
11878 * path (and one other) through it add a single byte to
11879 * the EXACTish node. But this one has changed len to
11880 * be the correct final value, so subtract one to
11881 * cancel out the increment that follows */
11885 REGC((char)ender, s++);
11888 /* Can get here if folding only if is one of the /l
11889 * characters whose fold depends on the locale. The
11890 * occurrence of any of these indicate that we can't
11891 * simplify things */
11893 maybe_exact = FALSE;
11894 maybe_exactfu = FALSE;
11899 /* See comments for join_exact() as to why we fold this
11900 * non-UTF at compile time */
11901 || (node_type == EXACTFU
11902 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11904 /* Here, are folding and are not UTF-8 encoded; therefore
11905 * the character must be in the range 0-255, and is not /l
11906 * (Not /l because we already handled these under /l in
11907 * is_PROBLEMATIC_LOCALE_FOLD_cp */
11908 if (IS_IN_SOME_FOLD_L1(ender)) {
11909 maybe_exact = FALSE;
11911 /* See if the character's fold differs between /d and
11912 * /u. This includes the multi-char fold SHARP S to
11915 && (PL_fold[ender] != PL_fold_latin1[ender]
11916 || ender == LATIN_SMALL_LETTER_SHARP_S
11918 && isARG2_lower_or_UPPER_ARG1('s', ender)
11919 && isARG2_lower_or_UPPER_ARG1('s',
11922 maybe_exactfu = FALSE;
11926 /* Even when folding, we store just the input character, as
11927 * we have an array that finds its fold quickly */
11928 *(s++) = (char) ender;
11930 else { /* FOLD and UTF */
11931 /* Unlike the non-fold case, we do actually have to
11932 * calculate the results here in pass 1. This is for two
11933 * reasons, the folded length may be longer than the
11934 * unfolded, and we have to calculate how many EXACTish
11935 * nodes it will take; and we may run out of room in a node
11936 * in the middle of a potential multi-char fold, and have
11937 * to back off accordingly. (Hence we can't use REGC for
11938 * the simple case just below.) */
11941 if (isASCII(ender)) {
11942 folded = toFOLD(ender);
11943 *(s)++ = (U8) folded;
11948 folded = _to_uni_fold_flags(
11952 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11953 ? FOLD_FLAGS_NOMIX_ASCII
11957 /* The loop increments <len> each time, as all but this
11958 * path (and one other) through it add a single byte to
11959 * the EXACTish node. But this one has changed len to
11960 * be the correct final value, so subtract one to
11961 * cancel out the increment that follows */
11962 len += foldlen - 1;
11964 /* If this node only contains non-folding code points so
11965 * far, see if this new one is also non-folding */
11967 if (folded != ender) {
11968 maybe_exact = FALSE;
11971 /* Here the fold is the original; we have to check
11972 * further to see if anything folds to it */
11973 if (_invlist_contains_cp(PL_utf8_foldable,
11976 maybe_exact = FALSE;
11983 if (next_is_quantifier) {
11985 /* Here, the next input is a quantifier, and to get here,
11986 * the current character is the only one in the node.
11987 * Also, here <len> doesn't include the final byte for this
11993 } /* End of loop through literal characters */
11995 /* Here we have either exhausted the input or ran out of room in
11996 * the node. (If we encountered a character that can't be in the
11997 * node, transfer is made directly to <loopdone>, and so we
11998 * wouldn't have fallen off the end of the loop.) In the latter
11999 * case, we artificially have to split the node into two, because
12000 * we just don't have enough space to hold everything. This
12001 * creates a problem if the final character participates in a
12002 * multi-character fold in the non-final position, as a match that
12003 * should have occurred won't, due to the way nodes are matched,
12004 * and our artificial boundary. So back off until we find a non-
12005 * problematic character -- one that isn't at the beginning or
12006 * middle of such a fold. (Either it doesn't participate in any
12007 * folds, or appears only in the final position of all the folds it
12008 * does participate in.) A better solution with far fewer false
12009 * positives, and that would fill the nodes more completely, would
12010 * be to actually have available all the multi-character folds to
12011 * test against, and to back-off only far enough to be sure that
12012 * this node isn't ending with a partial one. <upper_parse> is set
12013 * further below (if we need to reparse the node) to include just
12014 * up through that final non-problematic character that this code
12015 * identifies, so when it is set to less than the full node, we can
12016 * skip the rest of this */
12017 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12019 const STRLEN full_len = len;
12021 assert(len >= MAX_NODE_STRING_SIZE);
12023 /* Here, <s> points to the final byte of the final character.
12024 * Look backwards through the string until find a non-
12025 * problematic character */
12029 /* This has no multi-char folds to non-UTF characters */
12030 if (ASCII_FOLD_RESTRICTED) {
12034 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12038 if (! PL_NonL1NonFinalFold) {
12039 PL_NonL1NonFinalFold = _new_invlist_C_array(
12040 NonL1_Perl_Non_Final_Folds_invlist);
12043 /* Point to the first byte of the final character */
12044 s = (char *) utf8_hop((U8 *) s, -1);
12046 while (s >= s0) { /* Search backwards until find
12047 non-problematic char */
12048 if (UTF8_IS_INVARIANT(*s)) {
12050 /* There are no ascii characters that participate
12051 * in multi-char folds under /aa. In EBCDIC, the
12052 * non-ascii invariants are all control characters,
12053 * so don't ever participate in any folds. */
12054 if (ASCII_FOLD_RESTRICTED
12055 || ! IS_NON_FINAL_FOLD(*s))
12060 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12061 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12067 else if (! _invlist_contains_cp(
12068 PL_NonL1NonFinalFold,
12069 valid_utf8_to_uvchr((U8 *) s, NULL)))
12074 /* Here, the current character is problematic in that
12075 * it does occur in the non-final position of some
12076 * fold, so try the character before it, but have to
12077 * special case the very first byte in the string, so
12078 * we don't read outside the string */
12079 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12080 } /* End of loop backwards through the string */
12082 /* If there were only problematic characters in the string,
12083 * <s> will point to before s0, in which case the length
12084 * should be 0, otherwise include the length of the
12085 * non-problematic character just found */
12086 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12089 /* Here, have found the final character, if any, that is
12090 * non-problematic as far as ending the node without splitting
12091 * it across a potential multi-char fold. <len> contains the
12092 * number of bytes in the node up-to and including that
12093 * character, or is 0 if there is no such character, meaning
12094 * the whole node contains only problematic characters. In
12095 * this case, give up and just take the node as-is. We can't
12100 /* If the node ends in an 's' we make sure it stays EXACTF,
12101 * as if it turns into an EXACTFU, it could later get
12102 * joined with another 's' that would then wrongly match
12104 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12106 maybe_exactfu = FALSE;
12110 /* Here, the node does contain some characters that aren't
12111 * problematic. If one such is the final character in the
12112 * node, we are done */
12113 if (len == full_len) {
12116 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12118 /* If the final character is problematic, but the
12119 * penultimate is not, back-off that last character to
12120 * later start a new node with it */
12125 /* Here, the final non-problematic character is earlier
12126 * in the input than the penultimate character. What we do
12127 * is reparse from the beginning, going up only as far as
12128 * this final ok one, thus guaranteeing that the node ends
12129 * in an acceptable character. The reason we reparse is
12130 * that we know how far in the character is, but we don't
12131 * know how to correlate its position with the input parse.
12132 * An alternate implementation would be to build that
12133 * correlation as we go along during the original parse,
12134 * but that would entail extra work for every node, whereas
12135 * this code gets executed only when the string is too
12136 * large for the node, and the final two characters are
12137 * problematic, an infrequent occurrence. Yet another
12138 * possible strategy would be to save the tail of the
12139 * string, and the next time regatom is called, initialize
12140 * with that. The problem with this is that unless you
12141 * back off one more character, you won't be guaranteed
12142 * regatom will get called again, unless regbranch,
12143 * regpiece ... are also changed. If you do back off that
12144 * extra character, so that there is input guaranteed to
12145 * force calling regatom, you can't handle the case where
12146 * just the first character in the node is acceptable. I
12147 * (khw) decided to try this method which doesn't have that
12148 * pitfall; if performance issues are found, we can do a
12149 * combination of the current approach plus that one */
12155 } /* End of verifying node ends with an appropriate char */
12157 loopdone: /* Jumped to when encounters something that shouldn't be in
12160 /* I (khw) don't know if you can get here with zero length, but the
12161 * old code handled this situation by creating a zero-length EXACT
12162 * node. Might as well be NOTHING instead */
12168 /* If 'maybe_exact' is still set here, means there are no
12169 * code points in the node that participate in folds;
12170 * similarly for 'maybe_exactfu' and code points that match
12171 * differently depending on UTF8ness of the target string
12172 * (for /u), or depending on locale for /l */
12176 else if (maybe_exactfu) {
12180 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
12183 RExC_parse = p - 1;
12184 Set_Node_Cur_Length(ret, parse_start);
12185 nextchar(pRExC_state);
12187 /* len is STRLEN which is unsigned, need to copy to signed */
12190 vFAIL("Internal disaster");
12193 } /* End of label 'defchar:' */
12195 } /* End of giant switch on input character */
12201 S_regwhite( RExC_state_t *pRExC_state, char *p )
12203 const char *e = RExC_end;
12205 PERL_ARGS_ASSERT_REGWHITE;
12210 else if (*p == '#') {
12213 if (*p++ == '\n') {
12219 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12228 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12230 /* Returns the next non-pattern-white space, non-comment character (the
12231 * latter only if 'recognize_comment is true) in the string p, which is
12232 * ended by RExC_end. If there is no line break ending a comment,
12233 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
12234 const char *e = RExC_end;
12236 PERL_ARGS_ASSERT_REGPATWS;
12240 if ((len = is_PATWS_safe(p, e, UTF))) {
12243 else if (recognize_comment && *p == '#') {
12247 if (is_LNBREAK_safe(p, e, UTF)) {
12253 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12262 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12264 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12265 * sets up the bitmap and any flags, removing those code points from the
12266 * inversion list, setting it to NULL should it become completely empty */
12268 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12269 assert(PL_regkind[OP(node)] == ANYOF);
12271 ANYOF_BITMAP_ZERO(node);
12272 if (*invlist_ptr) {
12274 /* This gets set if we actually need to modify things */
12275 bool change_invlist = FALSE;
12279 /* Start looking through *invlist_ptr */
12280 invlist_iterinit(*invlist_ptr);
12281 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12285 if (end == UV_MAX && start <= 256) {
12286 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12289 /* Quit if are above what we should change */
12294 change_invlist = TRUE;
12296 /* Set all the bits in the range, up to the max that we are doing */
12297 high = (end < 255) ? end : 255;
12298 for (i = start; i <= (int) high; i++) {
12299 if (! ANYOF_BITMAP_TEST(node, i)) {
12300 ANYOF_BITMAP_SET(node, i);
12304 invlist_iterfinish(*invlist_ptr);
12306 /* Done with loop; remove any code points that are in the bitmap from
12307 * *invlist_ptr; similarly for code points above latin1 if we have a
12308 * flag to match all of them anyways */
12309 if (change_invlist) {
12310 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12312 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12313 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12316 /* If have completely emptied it, remove it completely */
12317 if (_invlist_len(*invlist_ptr) == 0) {
12318 SvREFCNT_dec_NN(*invlist_ptr);
12319 *invlist_ptr = NULL;
12324 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12325 Character classes ([:foo:]) can also be negated ([:^foo:]).
12326 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12327 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12328 but trigger failures because they are currently unimplemented. */
12330 #define POSIXCC_DONE(c) ((c) == ':')
12331 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12332 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12334 PERL_STATIC_INLINE I32
12335 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12338 I32 namedclass = OOB_NAMEDCLASS;
12340 PERL_ARGS_ASSERT_REGPPOSIXCC;
12342 if (value == '[' && RExC_parse + 1 < RExC_end &&
12343 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12344 POSIXCC(UCHARAT(RExC_parse)))
12346 const char c = UCHARAT(RExC_parse);
12347 char* const s = RExC_parse++;
12349 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12351 if (RExC_parse == RExC_end) {
12354 /* Try to give a better location for the error (than the end of
12355 * the string) by looking for the matching ']' */
12357 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12360 vFAIL2("Unmatched '%c' in POSIX class", c);
12362 /* Grandfather lone [:, [=, [. */
12366 const char* const t = RExC_parse++; /* skip over the c */
12369 if (UCHARAT(RExC_parse) == ']') {
12370 const char *posixcc = s + 1;
12371 RExC_parse++; /* skip over the ending ] */
12374 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12375 const I32 skip = t - posixcc;
12377 /* Initially switch on the length of the name. */
12380 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12381 this is the Perl \w
12383 namedclass = ANYOF_WORDCHAR;
12386 /* Names all of length 5. */
12387 /* alnum alpha ascii blank cntrl digit graph lower
12388 print punct space upper */
12389 /* Offset 4 gives the best switch position. */
12390 switch (posixcc[4]) {
12392 if (memEQ(posixcc, "alph", 4)) /* alpha */
12393 namedclass = ANYOF_ALPHA;
12396 if (memEQ(posixcc, "spac", 4)) /* space */
12397 namedclass = ANYOF_PSXSPC;
12400 if (memEQ(posixcc, "grap", 4)) /* graph */
12401 namedclass = ANYOF_GRAPH;
12404 if (memEQ(posixcc, "asci", 4)) /* ascii */
12405 namedclass = ANYOF_ASCII;
12408 if (memEQ(posixcc, "blan", 4)) /* blank */
12409 namedclass = ANYOF_BLANK;
12412 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12413 namedclass = ANYOF_CNTRL;
12416 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12417 namedclass = ANYOF_ALPHANUMERIC;
12420 if (memEQ(posixcc, "lowe", 4)) /* lower */
12421 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12422 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12423 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12426 if (memEQ(posixcc, "digi", 4)) /* digit */
12427 namedclass = ANYOF_DIGIT;
12428 else if (memEQ(posixcc, "prin", 4)) /* print */
12429 namedclass = ANYOF_PRINT;
12430 else if (memEQ(posixcc, "punc", 4)) /* punct */
12431 namedclass = ANYOF_PUNCT;
12436 if (memEQ(posixcc, "xdigit", 6))
12437 namedclass = ANYOF_XDIGIT;
12441 if (namedclass == OOB_NAMEDCLASS)
12443 "POSIX class [:%"UTF8f":] unknown",
12444 UTF8fARG(UTF, t - s - 1, s + 1));
12446 /* The #defines are structured so each complement is +1 to
12447 * the normal one */
12451 assert (posixcc[skip] == ':');
12452 assert (posixcc[skip+1] == ']');
12453 } else if (!SIZE_ONLY) {
12454 /* [[=foo=]] and [[.foo.]] are still future. */
12456 /* adjust RExC_parse so the warning shows after
12457 the class closes */
12458 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12460 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12463 /* Maternal grandfather:
12464 * "[:" ending in ":" but not in ":]" */
12466 vFAIL("Unmatched '[' in POSIX class");
12469 /* Grandfather lone [:, [=, [. */
12479 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12481 /* This applies some heuristics at the current parse position (which should
12482 * be at a '[') to see if what follows might be intended to be a [:posix:]
12483 * class. It returns true if it really is a posix class, of course, but it
12484 * also can return true if it thinks that what was intended was a posix
12485 * class that didn't quite make it.
12487 * It will return true for
12489 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12490 * ')' indicating the end of the (?[
12491 * [:any garbage including %^&$ punctuation:]
12493 * This is designed to be called only from S_handle_regex_sets; it could be
12494 * easily adapted to be called from the spot at the beginning of regclass()
12495 * that checks to see in a normal bracketed class if the surrounding []
12496 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12497 * change long-standing behavior, so I (khw) didn't do that */
12498 char* p = RExC_parse + 1;
12499 char first_char = *p;
12501 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12503 assert(*(p - 1) == '[');
12505 if (! POSIXCC(first_char)) {
12510 while (p < RExC_end && isWORDCHAR(*p)) p++;
12512 if (p >= RExC_end) {
12516 if (p - RExC_parse > 2 /* Got at least 1 word character */
12517 && (*p == first_char
12518 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12523 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12526 && p - RExC_parse > 2 /* [:] evaluates to colon;
12527 [::] is a bad posix class. */
12528 && first_char == *(p - 1));
12532 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12533 I32 *flagp, U32 depth,
12534 char * const oregcomp_parse)
12536 /* Handle the (?[...]) construct to do set operations */
12539 UV start, end; /* End points of code point ranges */
12541 char *save_end, *save_parse;
12546 const bool save_fold = FOLD;
12548 GET_RE_DEBUG_FLAGS_DECL;
12550 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12553 vFAIL("(?[...]) not valid in locale");
12555 RExC_uni_semantics = 1;
12557 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12558 * (such as EXACT). Thus we can skip most everything if just sizing. We
12559 * call regclass to handle '[]' so as to not have to reinvent its parsing
12560 * rules here (throwing away the size it computes each time). And, we exit
12561 * upon an unescaped ']' that isn't one ending a regclass. To do both
12562 * these things, we need to realize that something preceded by a backslash
12563 * is escaped, so we have to keep track of backslashes */
12565 UV depth = 0; /* how many nested (?[...]) constructs */
12567 Perl_ck_warner_d(aTHX_
12568 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12569 "The regex_sets feature is experimental" REPORT_LOCATION,
12570 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12572 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12573 RExC_precomp + (RExC_parse - RExC_precomp)));
12575 while (RExC_parse < RExC_end) {
12576 SV* current = NULL;
12577 RExC_parse = regpatws(pRExC_state, RExC_parse,
12578 TRUE); /* means recognize comments */
12579 switch (*RExC_parse) {
12581 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12586 /* Skip the next byte (which could cause us to end up in
12587 * the middle of a UTF-8 character, but since none of those
12588 * are confusable with anything we currently handle in this
12589 * switch (invariants all), it's safe. We'll just hit the
12590 * default: case next time and keep on incrementing until
12591 * we find one of the invariants we do handle. */
12596 /* If this looks like it is a [:posix:] class, leave the
12597 * parse pointer at the '[' to fool regclass() into
12598 * thinking it is part of a '[[:posix:]]'. That function
12599 * will use strict checking to force a syntax error if it
12600 * doesn't work out to a legitimate class */
12601 bool is_posix_class
12602 = could_it_be_a_POSIX_class(pRExC_state);
12603 if (! is_posix_class) {
12607 /* regclass() can only return RESTART_UTF8 if multi-char
12608 folds are allowed. */
12609 if (!regclass(pRExC_state, flagp,depth+1,
12610 is_posix_class, /* parse the whole char
12611 class only if not a
12613 FALSE, /* don't allow multi-char folds */
12614 TRUE, /* silence non-portable warnings. */
12616 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12619 /* function call leaves parse pointing to the ']', except
12620 * if we faked it */
12621 if (is_posix_class) {
12625 SvREFCNT_dec(current); /* In case it returned something */
12630 if (depth--) break;
12632 if (RExC_parse < RExC_end
12633 && *RExC_parse == ')')
12635 node = reganode(pRExC_state, ANYOF, 0);
12636 RExC_size += ANYOF_SKIP;
12637 nextchar(pRExC_state);
12638 Set_Node_Length(node,
12639 RExC_parse - oregcomp_parse + 1); /* MJD */
12648 FAIL("Syntax error in (?[...])");
12651 /* Pass 2 only after this. Everything in this construct is a
12652 * metacharacter. Operands begin with either a '\' (for an escape
12653 * sequence), or a '[' for a bracketed character class. Any other
12654 * character should be an operator, or parenthesis for grouping. Both
12655 * types of operands are handled by calling regclass() to parse them. It
12656 * is called with a parameter to indicate to return the computed inversion
12657 * list. The parsing here is implemented via a stack. Each entry on the
12658 * stack is a single character representing one of the operators, or the
12659 * '('; or else a pointer to an operand inversion list. */
12661 #define IS_OPERAND(a) (! SvIOK(a))
12663 /* The stack starts empty. It is a syntax error if the first thing parsed
12664 * is a binary operator; everything else is pushed on the stack. When an
12665 * operand is parsed, the top of the stack is examined. If it is a binary
12666 * operator, the item before it should be an operand, and both are replaced
12667 * by the result of doing that operation on the new operand and the one on
12668 * the stack. Thus a sequence of binary operands is reduced to a single
12669 * one before the next one is parsed.
12671 * A unary operator may immediately follow a binary in the input, for
12674 * When an operand is parsed and the top of the stack is a unary operator,
12675 * the operation is performed, and then the stack is rechecked to see if
12676 * this new operand is part of a binary operation; if so, it is handled as
12679 * A '(' is simply pushed on the stack; it is valid only if the stack is
12680 * empty, or the top element of the stack is an operator or another '('
12681 * (for which the parenthesized expression will become an operand). By the
12682 * time the corresponding ')' is parsed everything in between should have
12683 * been parsed and evaluated to a single operand (or else is a syntax
12684 * error), and is handled as a regular operand */
12686 sv_2mortal((SV *)(stack = newAV()));
12688 while (RExC_parse < RExC_end) {
12689 I32 top_index = av_tindex(stack);
12691 SV* current = NULL;
12693 /* Skip white space */
12694 RExC_parse = regpatws(pRExC_state, RExC_parse,
12695 TRUE); /* means recognize comments */
12696 if (RExC_parse >= RExC_end) {
12697 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12699 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12706 if (av_tindex(stack) >= 0 /* This makes sure that we can
12707 safely subtract 1 from
12708 RExC_parse in the next clause.
12709 If we have something on the
12710 stack, we have parsed something
12712 && UCHARAT(RExC_parse - 1) == '('
12713 && RExC_parse < RExC_end)
12715 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12716 * This happens when we have some thing like
12718 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12720 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12722 * Here we would be handling the interpolated
12723 * '$thai_or_lao'. We handle this by a recursive call to
12724 * ourselves which returns the inversion list the
12725 * interpolated expression evaluates to. We use the flags
12726 * from the interpolated pattern. */
12727 U32 save_flags = RExC_flags;
12728 const char * const save_parse = ++RExC_parse;
12730 parse_lparen_question_flags(pRExC_state);
12732 if (RExC_parse == save_parse /* Makes sure there was at
12733 least one flag (or this
12734 embedding wasn't compiled)
12736 || RExC_parse >= RExC_end - 4
12737 || UCHARAT(RExC_parse) != ':'
12738 || UCHARAT(++RExC_parse) != '('
12739 || UCHARAT(++RExC_parse) != '?'
12740 || UCHARAT(++RExC_parse) != '[')
12743 /* In combination with the above, this moves the
12744 * pointer to the point just after the first erroneous
12745 * character (or if there are no flags, to where they
12746 * should have been) */
12747 if (RExC_parse >= RExC_end - 4) {
12748 RExC_parse = RExC_end;
12750 else if (RExC_parse != save_parse) {
12751 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12753 vFAIL("Expecting '(?flags:(?[...'");
12756 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12757 depth+1, oregcomp_parse);
12759 /* Here, 'current' contains the embedded expression's
12760 * inversion list, and RExC_parse points to the trailing
12761 * ']'; the next character should be the ')' which will be
12762 * paired with the '(' that has been put on the stack, so
12763 * the whole embedded expression reduces to '(operand)' */
12766 RExC_flags = save_flags;
12767 goto handle_operand;
12772 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12773 vFAIL("Unexpected character");
12776 /* regclass() can only return RESTART_UTF8 if multi-char
12777 folds are allowed. */
12778 if (!regclass(pRExC_state, flagp,depth+1,
12779 TRUE, /* means parse just the next thing */
12780 FALSE, /* don't allow multi-char folds */
12781 FALSE, /* don't silence non-portable warnings. */
12783 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12785 /* regclass() will return with parsing just the \ sequence,
12786 * leaving the parse pointer at the next thing to parse */
12788 goto handle_operand;
12790 case '[': /* Is a bracketed character class */
12792 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12794 if (! is_posix_class) {
12798 /* regclass() can only return RESTART_UTF8 if multi-char
12799 folds are allowed. */
12800 if(!regclass(pRExC_state, flagp,depth+1,
12801 is_posix_class, /* parse the whole char class
12802 only if not a posix class */
12803 FALSE, /* don't allow multi-char folds */
12804 FALSE, /* don't silence non-portable warnings. */
12806 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12808 /* function call leaves parse pointing to the ']', except if we
12810 if (is_posix_class) {
12814 goto handle_operand;
12823 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12824 || ! IS_OPERAND(*top_ptr))
12827 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12829 av_push(stack, newSVuv(curchar));
12833 av_push(stack, newSVuv(curchar));
12837 if (top_index >= 0) {
12838 top_ptr = av_fetch(stack, top_index, FALSE);
12840 if (IS_OPERAND(*top_ptr)) {
12842 vFAIL("Unexpected '(' with no preceding operator");
12845 av_push(stack, newSVuv(curchar));
12852 || ! (current = av_pop(stack))
12853 || ! IS_OPERAND(current)
12854 || ! (lparen = av_pop(stack))
12855 || IS_OPERAND(lparen)
12856 || SvUV(lparen) != '(')
12858 SvREFCNT_dec(current);
12860 vFAIL("Unexpected ')'");
12863 SvREFCNT_dec_NN(lparen);
12870 /* Here, we have an operand to process, in 'current' */
12872 if (top_index < 0) { /* Just push if stack is empty */
12873 av_push(stack, current);
12876 SV* top = av_pop(stack);
12878 char current_operator;
12880 if (IS_OPERAND(top)) {
12881 SvREFCNT_dec_NN(top);
12882 SvREFCNT_dec_NN(current);
12883 vFAIL("Operand with no preceding operator");
12885 current_operator = (char) SvUV(top);
12886 switch (current_operator) {
12887 case '(': /* Push the '(' back on followed by the new
12889 av_push(stack, top);
12890 av_push(stack, current);
12891 SvREFCNT_inc(top); /* Counters the '_dec' done
12892 just after the 'break', so
12893 it doesn't get wrongly freed
12898 _invlist_invert(current);
12900 /* Unlike binary operators, the top of the stack,
12901 * now that this unary one has been popped off, may
12902 * legally be an operator, and we now have operand
12905 SvREFCNT_dec_NN(top);
12906 goto handle_operand;
12909 prev = av_pop(stack);
12910 _invlist_intersection(prev,
12913 av_push(stack, current);
12918 prev = av_pop(stack);
12919 _invlist_union(prev, current, ¤t);
12920 av_push(stack, current);
12924 prev = av_pop(stack);;
12925 _invlist_subtract(prev, current, ¤t);
12926 av_push(stack, current);
12929 case '^': /* The union minus the intersection */
12935 prev = av_pop(stack);
12936 _invlist_union(prev, current, &u);
12937 _invlist_intersection(prev, current, &i);
12938 /* _invlist_subtract will overwrite current
12939 without freeing what it already contains */
12941 _invlist_subtract(u, i, ¤t);
12942 av_push(stack, current);
12943 SvREFCNT_dec_NN(i);
12944 SvREFCNT_dec_NN(u);
12945 SvREFCNT_dec_NN(element);
12950 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12952 SvREFCNT_dec_NN(top);
12953 SvREFCNT_dec(prev);
12957 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12960 if (av_tindex(stack) < 0 /* Was empty */
12961 || ((final = av_pop(stack)) == NULL)
12962 || ! IS_OPERAND(final)
12963 || av_tindex(stack) >= 0) /* More left on stack */
12965 vFAIL("Incomplete expression within '(?[ ])'");
12968 /* Here, 'final' is the resultant inversion list from evaluating the
12969 * expression. Return it if so requested */
12970 if (return_invlist) {
12971 *return_invlist = final;
12975 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12976 * expecting a string of ranges and individual code points */
12977 invlist_iterinit(final);
12978 result_string = newSVpvs("");
12979 while (invlist_iternext(final, &start, &end)) {
12980 if (start == end) {
12981 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12984 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12989 save_parse = RExC_parse;
12990 RExC_parse = SvPV(result_string, len);
12991 save_end = RExC_end;
12992 RExC_end = RExC_parse + len;
12994 /* We turn off folding around the call, as the class we have constructed
12995 * already has all folding taken into consideration, and we don't want
12996 * regclass() to add to that */
12997 RExC_flags &= ~RXf_PMf_FOLD;
12998 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13000 node = regclass(pRExC_state, flagp,depth+1,
13001 FALSE, /* means parse the whole char class */
13002 FALSE, /* don't allow multi-char folds */
13003 TRUE, /* silence non-portable warnings. The above may very
13004 well have generated non-portable code points, but
13005 they're valid on this machine */
13008 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13011 RExC_flags |= RXf_PMf_FOLD;
13013 RExC_parse = save_parse + 1;
13014 RExC_end = save_end;
13015 SvREFCNT_dec_NN(final);
13016 SvREFCNT_dec_NN(result_string);
13018 nextchar(pRExC_state);
13019 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13024 /* The names of properties whose definitions are not known at compile time are
13025 * stored in this SV, after a constant heading. So if the length has been
13026 * changed since initialization, then there is a run-time definition. */
13027 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13028 (SvCUR(listsv) != initial_listsv_len)
13031 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13032 const bool stop_at_1, /* Just parse the next thing, don't
13033 look for a full character class */
13034 bool allow_multi_folds,
13035 const bool silence_non_portable, /* Don't output warnings
13038 SV** ret_invlist) /* Return an inversion list, not a node */
13040 /* parse a bracketed class specification. Most of these will produce an
13041 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13042 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13043 * under /i with multi-character folds: it will be rewritten following the
13044 * paradigm of this example, where the <multi-fold>s are characters which
13045 * fold to multiple character sequences:
13046 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13047 * gets effectively rewritten as:
13048 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13049 * reg() gets called (recursively) on the rewritten version, and this
13050 * function will return what it constructs. (Actually the <multi-fold>s
13051 * aren't physically removed from the [abcdefghi], it's just that they are
13052 * ignored in the recursion by means of a flag:
13053 * <RExC_in_multi_char_class>.)
13055 * ANYOF nodes contain a bit map for the first 256 characters, with the
13056 * corresponding bit set if that character is in the list. For characters
13057 * above 255, a range list or swash is used. There are extra bits for \w,
13058 * etc. in locale ANYOFs, as what these match is not determinable at
13061 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13062 * to be restarted. This can only happen if ret_invlist is non-NULL.
13066 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13068 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13071 IV namedclass = OOB_NAMEDCLASS;
13072 char *rangebegin = NULL;
13073 bool need_class = 0;
13075 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13076 than just initialized. */
13077 SV* properties = NULL; /* Code points that match \p{} \P{} */
13078 SV* posixes = NULL; /* Code points that match classes like [:word:],
13079 extended beyond the Latin1 range. These have to
13080 be kept separate from other code points for much
13081 of this function because their handling is
13082 different under /i, and for most classes under
13084 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13085 separate for a while from the non-complemented
13086 versions because of complications with /d
13088 UV element_count = 0; /* Number of distinct elements in the class.
13089 Optimizations may be possible if this is tiny */
13090 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13091 character; used under /i */
13093 char * stop_ptr = RExC_end; /* where to stop parsing */
13094 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13096 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13098 /* Unicode properties are stored in a swash; this holds the current one
13099 * being parsed. If this swash is the only above-latin1 component of the
13100 * character class, an optimization is to pass it directly on to the
13101 * execution engine. Otherwise, it is set to NULL to indicate that there
13102 * are other things in the class that have to be dealt with at execution
13104 SV* swash = NULL; /* Code points that match \p{} \P{} */
13106 /* Set if a component of this character class is user-defined; just passed
13107 * on to the engine */
13108 bool has_user_defined_property = FALSE;
13110 /* inversion list of code points this node matches only when the target
13111 * string is in UTF-8. (Because is under /d) */
13112 SV* depends_list = NULL;
13114 /* Inversion list of code points this node matches regardless of things
13115 * like locale, folding, utf8ness of the target string */
13116 SV* cp_list = NULL;
13118 /* Like cp_list, but code points on this list need to be checked for things
13119 * that fold to/from them under /i */
13120 SV* cp_foldable_list = NULL;
13123 /* In a range, counts how many 0-2 of the ends of it came from literals,
13124 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13125 UV literal_endpoint = 0;
13127 bool invert = FALSE; /* Is this class to be complemented */
13129 bool warn_super = ALWAYS_WARN_SUPER;
13131 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13132 case we need to change the emitted regop to an EXACT. */
13133 const char * orig_parse = RExC_parse;
13134 const SSize_t orig_size = RExC_size;
13135 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13136 GET_RE_DEBUG_FLAGS_DECL;
13138 PERL_ARGS_ASSERT_REGCLASS;
13140 PERL_UNUSED_ARG(depth);
13143 DEBUG_PARSE("clas");
13145 /* Assume we are going to generate an ANYOF node. */
13146 ret = reganode(pRExC_state, ANYOF, 0);
13149 RExC_size += ANYOF_SKIP;
13150 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13153 ANYOF_FLAGS(ret) = 0;
13155 RExC_emit += ANYOF_SKIP;
13157 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
13159 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13160 initial_listsv_len = SvCUR(listsv);
13161 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13165 RExC_parse = regpatws(pRExC_state, RExC_parse,
13166 FALSE /* means don't recognize comments */);
13169 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13172 allow_multi_folds = FALSE;
13175 RExC_parse = regpatws(pRExC_state, RExC_parse,
13176 FALSE /* means don't recognize comments */);
13180 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13181 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13182 const char *s = RExC_parse;
13183 const char c = *s++;
13185 while (isWORDCHAR(*s))
13187 if (*s && c == *s && s[1] == ']') {
13188 SAVEFREESV(RExC_rx_sv);
13190 "POSIX syntax [%c %c] belongs inside character classes",
13192 (void)ReREFCNT_inc(RExC_rx_sv);
13196 /* If the caller wants us to just parse a single element, accomplish this
13197 * by faking the loop ending condition */
13198 if (stop_at_1 && RExC_end > RExC_parse) {
13199 stop_ptr = RExC_parse + 1;
13202 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13203 if (UCHARAT(RExC_parse) == ']')
13204 goto charclassloop;
13208 if (RExC_parse >= stop_ptr) {
13213 RExC_parse = regpatws(pRExC_state, RExC_parse,
13214 FALSE /* means don't recognize comments */);
13217 if (UCHARAT(RExC_parse) == ']') {
13223 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13224 save_value = value;
13225 save_prevvalue = prevvalue;
13228 rangebegin = RExC_parse;
13232 value = utf8n_to_uvchr((U8*)RExC_parse,
13233 RExC_end - RExC_parse,
13234 &numlen, UTF8_ALLOW_DEFAULT);
13235 RExC_parse += numlen;
13238 value = UCHARAT(RExC_parse++);
13241 && RExC_parse < RExC_end
13242 && POSIXCC(UCHARAT(RExC_parse)))
13244 namedclass = regpposixcc(pRExC_state, value, strict);
13246 else if (value == '\\') {
13248 value = utf8n_to_uvchr((U8*)RExC_parse,
13249 RExC_end - RExC_parse,
13250 &numlen, UTF8_ALLOW_DEFAULT);
13251 RExC_parse += numlen;
13254 value = UCHARAT(RExC_parse++);
13256 /* Some compilers cannot handle switching on 64-bit integer
13257 * values, therefore value cannot be an UV. Yes, this will
13258 * be a problem later if we want switch on Unicode.
13259 * A similar issue a little bit later when switching on
13260 * namedclass. --jhi */
13262 /* If the \ is escaping white space when white space is being
13263 * skipped, it means that that white space is wanted literally, and
13264 * is already in 'value'. Otherwise, need to translate the escape
13265 * into what it signifies. */
13266 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13268 case 'w': namedclass = ANYOF_WORDCHAR; break;
13269 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13270 case 's': namedclass = ANYOF_SPACE; break;
13271 case 'S': namedclass = ANYOF_NSPACE; break;
13272 case 'd': namedclass = ANYOF_DIGIT; break;
13273 case 'D': namedclass = ANYOF_NDIGIT; break;
13274 case 'v': namedclass = ANYOF_VERTWS; break;
13275 case 'V': namedclass = ANYOF_NVERTWS; break;
13276 case 'h': namedclass = ANYOF_HORIZWS; break;
13277 case 'H': namedclass = ANYOF_NHORIZWS; break;
13278 case 'N': /* Handle \N{NAME} in class */
13280 /* We only pay attention to the first char of
13281 multichar strings being returned. I kinda wonder
13282 if this makes sense as it does change the behaviour
13283 from earlier versions, OTOH that behaviour was broken
13285 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13286 TRUE, /* => charclass */
13289 if (*flagp & RESTART_UTF8)
13290 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13300 /* We will handle any undefined properties ourselves */
13301 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13302 /* And we actually would prefer to get
13303 * the straight inversion list of the
13304 * swash, since we will be accessing it
13305 * anyway, to save a little time */
13306 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13308 if (RExC_parse >= RExC_end)
13309 vFAIL2("Empty \\%c{}", (U8)value);
13310 if (*RExC_parse == '{') {
13311 const U8 c = (U8)value;
13312 e = strchr(RExC_parse++, '}');
13314 vFAIL2("Missing right brace on \\%c{}", c);
13315 while (isSPACE(UCHARAT(RExC_parse)))
13317 if (e == RExC_parse)
13318 vFAIL2("Empty \\%c{}", c);
13319 n = e - RExC_parse;
13320 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13332 if (UCHARAT(RExC_parse) == '^') {
13335 /* toggle. (The rhs xor gets the single bit that
13336 * differs between P and p; the other xor inverts just
13338 value ^= 'P' ^ 'p';
13340 while (isSPACE(UCHARAT(RExC_parse))) {
13345 /* Try to get the definition of the property into
13346 * <invlist>. If /i is in effect, the effective property
13347 * will have its name be <__NAME_i>. The design is
13348 * discussed in commit
13349 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13350 formatted = Perl_form(aTHX_
13352 (FOLD) ? "__" : "",
13357 name = savepvn(formatted, strlen(formatted));
13359 /* Look up the property name, and get its swash and
13360 * inversion list, if the property is found */
13362 SvREFCNT_dec_NN(swash);
13364 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13367 NULL, /* No inversion list */
13370 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13372 SvREFCNT_dec_NN(swash);
13376 /* Here didn't find it. It could be a user-defined
13377 * property that will be available at run-time. If we
13378 * accept only compile-time properties, is an error;
13379 * otherwise add it to the list for run-time look up */
13381 RExC_parse = e + 1;
13383 "Property '%"UTF8f"' is unknown",
13384 UTF8fARG(UTF, n, name));
13386 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13387 (value == 'p' ? '+' : '!'),
13388 UTF8fARG(UTF, n, name));
13389 has_user_defined_property = TRUE;
13391 /* We don't know yet, so have to assume that the
13392 * property could match something in the Latin1 range,
13393 * hence something that isn't utf8. Note that this
13394 * would cause things in <depends_list> to match
13395 * inappropriately, except that any \p{}, including
13396 * this one forces Unicode semantics, which means there
13397 * is no <depends_list> */
13398 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13402 /* Here, did get the swash and its inversion list. If
13403 * the swash is from a user-defined property, then this
13404 * whole character class should be regarded as such */
13405 if (swash_init_flags
13406 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13408 has_user_defined_property = TRUE;
13411 /* We warn on matching an above-Unicode code point
13412 * if the match would return true, except don't
13413 * warn for \p{All}, which has exactly one element
13415 (_invlist_contains_cp(invlist, 0x110000)
13416 && (! (_invlist_len(invlist) == 1
13417 && *invlist_array(invlist) == 0)))
13423 /* Invert if asking for the complement */
13424 if (value == 'P') {
13425 _invlist_union_complement_2nd(properties,
13429 /* The swash can't be used as-is, because we've
13430 * inverted things; delay removing it to here after
13431 * have copied its invlist above */
13432 SvREFCNT_dec_NN(swash);
13436 _invlist_union(properties, invlist, &properties);
13441 RExC_parse = e + 1;
13442 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13445 /* \p means they want Unicode semantics */
13446 RExC_uni_semantics = 1;
13449 case 'n': value = '\n'; break;
13450 case 'r': value = '\r'; break;
13451 case 't': value = '\t'; break;
13452 case 'f': value = '\f'; break;
13453 case 'b': value = '\b'; break;
13454 case 'e': value = ASCII_TO_NATIVE('\033');break;
13455 case 'a': value = '\a'; break;
13457 RExC_parse--; /* function expects to be pointed at the 'o' */
13459 const char* error_msg;
13460 bool valid = grok_bslash_o(&RExC_parse,
13463 SIZE_ONLY, /* warnings in pass
13466 silence_non_portable,
13472 if (PL_encoding && value < 0x100) {
13473 goto recode_encoding;
13477 RExC_parse--; /* function expects to be pointed at the 'x' */
13479 const char* error_msg;
13480 bool valid = grok_bslash_x(&RExC_parse,
13483 TRUE, /* Output warnings */
13485 silence_non_portable,
13491 if (PL_encoding && value < 0x100)
13492 goto recode_encoding;
13495 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13497 case '0': case '1': case '2': case '3': case '4':
13498 case '5': case '6': case '7':
13500 /* Take 1-3 octal digits */
13501 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13502 numlen = (strict) ? 4 : 3;
13503 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13504 RExC_parse += numlen;
13507 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13508 vFAIL("Need exactly 3 octal digits");
13510 else if (! SIZE_ONLY /* like \08, \178 */
13512 && RExC_parse < RExC_end
13513 && isDIGIT(*RExC_parse)
13514 && ckWARN(WARN_REGEXP))
13516 SAVEFREESV(RExC_rx_sv);
13517 reg_warn_non_literal_string(
13519 form_short_octal_warning(RExC_parse, numlen));
13520 (void)ReREFCNT_inc(RExC_rx_sv);
13523 if (PL_encoding && value < 0x100)
13524 goto recode_encoding;
13528 if (! RExC_override_recoding) {
13529 SV* enc = PL_encoding;
13530 value = reg_recode((const char)(U8)value, &enc);
13533 vFAIL("Invalid escape in the specified encoding");
13535 else if (SIZE_ONLY) {
13536 ckWARNreg(RExC_parse,
13537 "Invalid escape in the specified encoding");
13543 /* Allow \_ to not give an error */
13544 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13546 vFAIL2("Unrecognized escape \\%c in character class",
13550 SAVEFREESV(RExC_rx_sv);
13551 ckWARN2reg(RExC_parse,
13552 "Unrecognized escape \\%c in character class passed through",
13554 (void)ReREFCNT_inc(RExC_rx_sv);
13558 } /* End of switch on char following backslash */
13559 } /* end of handling backslash escape sequences */
13562 literal_endpoint++;
13565 /* Here, we have the current token in 'value' */
13567 /* What matches in a locale is not known until runtime. This includes
13568 * what the Posix classes (like \w, [:space:]) match. Room must be
13569 * reserved (one time per outer bracketed class) to store such classes,
13570 * either if Perl is compiled so that locale nodes always should have
13571 * this space, or if there is such posix class info to be stored. The
13572 * space will contain a bit for each named class that is to be matched
13573 * against. This isn't needed for \p{} and pseudo-classes, as they are
13574 * not affected by locale, and hence are dealt with separately */
13576 if (FOLD && ! need_class) {
13579 RExC_size += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP;
13582 RExC_emit += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP;
13585 if (ANYOF_LOCALE == ANYOF_POSIXL
13586 || (namedclass > OOB_NAMEDCLASS
13587 && namedclass < ANYOF_POSIXL_MAX))
13589 if (! need_class) {
13592 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13595 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13598 ANYOF_POSIXL_ZERO(ret);
13599 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13603 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13606 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13607 * literal, as is the character that began the false range, i.e.
13608 * the 'a' in the examples */
13611 const int w = (RExC_parse >= rangebegin)
13612 ? RExC_parse - rangebegin
13616 "False [] range \"%"UTF8f"\"",
13617 UTF8fARG(UTF, w, rangebegin));
13620 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13621 ckWARN2reg(RExC_parse,
13622 "False [] range \"%"UTF8f"\"",
13623 UTF8fARG(UTF, w, rangebegin));
13624 (void)ReREFCNT_inc(RExC_rx_sv);
13625 cp_list = add_cp_to_invlist(cp_list, '-');
13626 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13631 range = 0; /* this was not a true range */
13632 element_count += 2; /* So counts for three values */
13635 classnum = namedclass_to_classnum(namedclass);
13637 if (LOC && namedclass < ANYOF_POSIXL_MAX
13638 #ifndef HAS_ISASCII
13639 && classnum != _CC_ASCII
13641 #ifndef HAS_ISBLANK
13642 && classnum != _CC_BLANK
13646 /* See if it already matches the complement of this POSIX
13648 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13649 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13653 posixl_matches_all = TRUE;
13654 break; /* No need to continue. Since it matches both
13655 e.g., \w and \W, it matches everything, and the
13656 bracketed class can be optimized into qr/./s */
13659 /* Add this class to those that should be checked at runtime */
13660 ANYOF_POSIXL_SET(ret, namedclass);
13662 /* The above-Latin1 characters are not subject to locale rules.
13663 * Just add them, in the second pass, to the
13664 * unconditionally-matched list */
13666 SV* scratch_list = NULL;
13668 /* Get the list of the above-Latin1 code points this
13670 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13671 PL_XPosix_ptrs[classnum],
13673 /* Odd numbers are complements, like
13674 * NDIGIT, NASCII, ... */
13675 namedclass % 2 != 0,
13677 /* Checking if 'cp_list' is NULL first saves an extra
13678 * clone. Its reference count will be decremented at the
13679 * next union, etc, or if this is the only instance, at the
13680 * end of the routine */
13682 cp_list = scratch_list;
13685 _invlist_union(cp_list, scratch_list, &cp_list);
13686 SvREFCNT_dec_NN(scratch_list);
13688 continue; /* Go get next character */
13691 else if (! SIZE_ONLY) {
13693 /* Here, not in pass1 (in that pass we skip calculating the
13694 * contents of this class), and is /l, or is a POSIX class for
13695 * which /l doesn't matter (or is a Unicode property, which is
13696 * skipped here). */
13697 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13698 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13700 /* Here, should be \h, \H, \v, or \V. None of /d, /i
13701 * nor /l make a difference in what these match,
13702 * therefore we just add what they match to cp_list. */
13703 if (classnum != _CC_VERTSPACE) {
13704 assert( namedclass == ANYOF_HORIZWS
13705 || namedclass == ANYOF_NHORIZWS);
13707 /* It turns out that \h is just a synonym for
13709 classnum = _CC_BLANK;
13712 _invlist_union_maybe_complement_2nd(
13714 PL_XPosix_ptrs[classnum],
13715 namedclass % 2 != 0, /* Complement if odd
13716 (NHORIZWS, NVERTWS)
13721 else { /* Garden variety class. If is NASCII, NDIGIT, ...
13722 complement and use nposixes */
13723 SV** posixes_ptr = namedclass % 2 == 0
13726 SV** source_ptr = &PL_XPosix_ptrs[classnum];
13727 #ifndef HAS_ISBLANK
13728 /* If the platform doesn't have isblank(), we handle locale
13729 * with the hardcoded ASII values. */
13730 if (LOC && classnum == _CC_BLANK) {
13731 _invlist_subtract(*source_ptr,
13737 _invlist_union_maybe_complement_2nd(
13740 namedclass % 2 != 0,
13743 continue; /* Go get next character */
13745 } /* end of namedclass \blah */
13747 /* Here, we have a single value. If 'range' is set, it is the ending
13748 * of a range--check its validity. Later, we will handle each
13749 * individual code point in the range. If 'range' isn't set, this
13750 * could be the beginning of a range, so check for that by looking
13751 * ahead to see if the next real character to be processed is the range
13752 * indicator--the minus sign */
13755 RExC_parse = regpatws(pRExC_state, RExC_parse,
13756 FALSE /* means don't recognize comments */);
13760 if (prevvalue > value) /* b-a */ {
13761 const int w = RExC_parse - rangebegin;
13763 "Invalid [] range \"%"UTF8f"\"",
13764 UTF8fARG(UTF, w, rangebegin));
13765 range = 0; /* not a valid range */
13769 prevvalue = value; /* save the beginning of the potential range */
13770 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13771 && *RExC_parse == '-')
13773 char* next_char_ptr = RExC_parse + 1;
13774 if (skip_white) { /* Get the next real char after the '-' */
13775 next_char_ptr = regpatws(pRExC_state,
13777 FALSE); /* means don't recognize
13781 /* If the '-' is at the end of the class (just before the ']',
13782 * it is a literal minus; otherwise it is a range */
13783 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13784 RExC_parse = next_char_ptr;
13786 /* a bad range like \w-, [:word:]- ? */
13787 if (namedclass > OOB_NAMEDCLASS) {
13788 if (strict || ckWARN(WARN_REGEXP)) {
13790 RExC_parse >= rangebegin ?
13791 RExC_parse - rangebegin : 0;
13793 vFAIL4("False [] range \"%*.*s\"",
13798 "False [] range \"%*.*s\"",
13803 cp_list = add_cp_to_invlist(cp_list, '-');
13807 range = 1; /* yeah, it's a range! */
13808 continue; /* but do it the next time */
13813 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13816 /* non-Latin1 code point implies unicode semantics. Must be set in
13817 * pass1 so is there for the whole of pass 2 */
13819 RExC_uni_semantics = 1;
13822 /* Ready to process either the single value, or the completed range.
13823 * For single-valued non-inverted ranges, we consider the possibility
13824 * of multi-char folds. (We made a conscious decision to not do this
13825 * for the other cases because it can often lead to non-intuitive
13826 * results. For example, you have the peculiar case that:
13827 * "s s" =~ /^[^\xDF]+$/i => Y
13828 * "ss" =~ /^[^\xDF]+$/i => N
13830 * See [perl #89750] */
13831 if (FOLD && allow_multi_folds && value == prevvalue) {
13832 if (value == LATIN_SMALL_LETTER_SHARP_S
13833 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13836 /* Here <value> is indeed a multi-char fold. Get what it is */
13838 U8 foldbuf[UTF8_MAXBYTES_CASE];
13841 UV folded = _to_uni_fold_flags(
13845 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13846 ? FOLD_FLAGS_NOMIX_ASCII
13850 /* Here, <folded> should be the first character of the
13851 * multi-char fold of <value>, with <foldbuf> containing the
13852 * whole thing. But, if this fold is not allowed (because of
13853 * the flags), <fold> will be the same as <value>, and should
13854 * be processed like any other character, so skip the special
13856 if (folded != value) {
13858 /* Skip if we are recursed, currently parsing the class
13859 * again. Otherwise add this character to the list of
13860 * multi-char folds. */
13861 if (! RExC_in_multi_char_class) {
13862 AV** this_array_ptr;
13864 STRLEN cp_count = utf8_length(foldbuf,
13865 foldbuf + foldlen);
13866 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13868 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13871 if (! multi_char_matches) {
13872 multi_char_matches = newAV();
13875 /* <multi_char_matches> is actually an array of arrays.
13876 * There will be one or two top-level elements: [2],
13877 * and/or [3]. The [2] element is an array, each
13878 * element thereof is a character which folds to TWO
13879 * characters; [3] is for folds to THREE characters.
13880 * (Unicode guarantees a maximum of 3 characters in any
13881 * fold.) When we rewrite the character class below,
13882 * we will do so such that the longest folds are
13883 * written first, so that it prefers the longest
13884 * matching strings first. This is done even if it
13885 * turns out that any quantifier is non-greedy, out of
13886 * programmer laziness. Tom Christiansen has agreed
13887 * that this is ok. This makes the test for the
13888 * ligature 'ffi' come before the test for 'ff' */
13889 if (av_exists(multi_char_matches, cp_count)) {
13890 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13892 this_array = *this_array_ptr;
13895 this_array = newAV();
13896 av_store(multi_char_matches, cp_count,
13899 av_push(this_array, multi_fold);
13902 /* This element should not be processed further in this
13905 value = save_value;
13906 prevvalue = save_prevvalue;
13912 /* Deal with this element of the class */
13915 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
13918 SV* this_range = _new_invlist(1);
13919 _append_range_to_invlist(this_range, prevvalue, value);
13921 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13922 * If this range was specified using something like 'i-j', we want
13923 * to include only the 'i' and the 'j', and not anything in
13924 * between, so exclude non-ASCII, non-alphabetics from it.
13925 * However, if the range was specified with something like
13926 * [\x89-\x91] or [\x89-j], all code points within it should be
13927 * included. literal_endpoint==2 means both ends of the range used
13928 * a literal character, not \x{foo} */
13929 if (literal_endpoint == 2
13930 && ((prevvalue >= 'a' && value <= 'z')
13931 || (prevvalue >= 'A' && value <= 'Z')))
13933 _invlist_intersection(this_range, PL_ASCII,
13936 /* Since this above only contains ascii, the intersection of it
13937 * with anything will still yield only ascii */
13938 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
13941 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
13942 literal_endpoint = 0;
13946 range = 0; /* this range (if it was one) is done now */
13947 } /* End of loop through all the text within the brackets */
13949 /* If anything in the class expands to more than one character, we have to
13950 * deal with them by building up a substitute parse string, and recursively
13951 * calling reg() on it, instead of proceeding */
13952 if (multi_char_matches) {
13953 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13956 char *save_end = RExC_end;
13957 char *save_parse = RExC_parse;
13958 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13963 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13964 because too confusing */
13966 sv_catpv(substitute_parse, "(?:");
13970 /* Look at the longest folds first */
13971 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13973 if (av_exists(multi_char_matches, cp_count)) {
13974 AV** this_array_ptr;
13977 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13979 while ((this_sequence = av_pop(*this_array_ptr)) !=
13982 if (! first_time) {
13983 sv_catpv(substitute_parse, "|");
13985 first_time = FALSE;
13987 sv_catpv(substitute_parse, SvPVX(this_sequence));
13992 /* If the character class contains anything else besides these
13993 * multi-character folds, have to include it in recursive parsing */
13994 if (element_count) {
13995 sv_catpv(substitute_parse, "|[");
13996 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13997 sv_catpv(substitute_parse, "]");
14000 sv_catpv(substitute_parse, ")");
14003 /* This is a way to get the parse to skip forward a whole named
14004 * sequence instead of matching the 2nd character when it fails the
14006 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14010 RExC_parse = SvPV(substitute_parse, len);
14011 RExC_end = RExC_parse + len;
14012 RExC_in_multi_char_class = 1;
14013 RExC_emit = (regnode *)orig_emit;
14015 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14017 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14019 RExC_parse = save_parse;
14020 RExC_end = save_end;
14021 RExC_in_multi_char_class = 0;
14022 SvREFCNT_dec_NN(multi_char_matches);
14026 /* Here, we've gone through the entire class and dealt with multi-char
14027 * folds. We are now in a position that we can do some checks to see if we
14028 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14029 * Currently we only do two checks:
14030 * 1) is in the unlikely event that the user has specified both, eg. \w and
14031 * \W under /l, then the class matches everything. (This optimization
14032 * is done only to make the optimizer code run later work.)
14033 * 2) if the character class contains only a single element (including a
14034 * single range), we see if there is an equivalent node for it.
14035 * Other checks are possible */
14036 if (! ret_invlist /* Can't optimize if returning the constructed
14038 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14043 if (UNLIKELY(posixl_matches_all)) {
14046 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14047 \w or [:digit:] or \p{foo}
14050 /* All named classes are mapped into POSIXish nodes, with its FLAG
14051 * argument giving which class it is */
14052 switch ((I32)namedclass) {
14053 case ANYOF_UNIPROP:
14056 /* These don't depend on the charset modifiers. They always
14057 * match under /u rules */
14058 case ANYOF_NHORIZWS:
14059 case ANYOF_HORIZWS:
14060 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14063 case ANYOF_NVERTWS:
14068 /* The actual POSIXish node for all the rest depends on the
14069 * charset modifier. The ones in the first set depend only on
14070 * ASCII or, if available on this platform, locale */
14074 op = (LOC) ? POSIXL : POSIXA;
14085 /* under /a could be alpha */
14087 if (ASCII_RESTRICTED) {
14088 namedclass = ANYOF_ALPHA + (namedclass % 2);
14096 /* The rest have more possibilities depending on the charset.
14097 * We take advantage of the enum ordering of the charset
14098 * modifiers to get the exact node type, */
14100 op = POSIXD + get_regex_charset(RExC_flags);
14101 if (op > POSIXA) { /* /aa is same as /a */
14104 #ifndef HAS_ISBLANK
14106 && (namedclass == ANYOF_BLANK
14107 || namedclass == ANYOF_NBLANK))
14114 /* The odd numbered ones are the complements of the
14115 * next-lower even number one */
14116 if (namedclass % 2 == 1) {
14120 arg = namedclass_to_classnum(namedclass);
14124 else if (value == prevvalue) {
14126 /* Here, the class consists of just a single code point */
14129 if (! LOC && value == '\n') {
14130 op = REG_ANY; /* Optimize [^\n] */
14131 *flagp |= HASWIDTH|SIMPLE;
14135 else if (value < 256 || UTF) {
14137 /* Optimize a single value into an EXACTish node, but not if it
14138 * would require converting the pattern to UTF-8. */
14139 op = compute_EXACTish(pRExC_state);
14141 } /* Otherwise is a range */
14142 else if (! LOC) { /* locale could vary these */
14143 if (prevvalue == '0') {
14144 if (value == '9') {
14151 /* Here, we have changed <op> away from its initial value iff we found
14152 * an optimization */
14155 /* Throw away this ANYOF regnode, and emit the calculated one,
14156 * which should correspond to the beginning, not current, state of
14158 const char * cur_parse = RExC_parse;
14159 RExC_parse = (char *)orig_parse;
14163 /* To get locale nodes to not use the full ANYOF size would
14164 * require moving the code above that writes the portions
14165 * of it that aren't in other nodes to after this point.
14166 * e.g. ANYOF_POSIXL_SET */
14167 RExC_size = orig_size;
14171 RExC_emit = (regnode *)orig_emit;
14172 if (PL_regkind[op] == POSIXD) {
14174 op += NPOSIXD - POSIXD;
14179 ret = reg_node(pRExC_state, op);
14181 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14185 *flagp |= HASWIDTH|SIMPLE;
14187 else if (PL_regkind[op] == EXACT) {
14188 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14191 RExC_parse = (char *) cur_parse;
14193 SvREFCNT_dec(posixes);
14194 SvREFCNT_dec(nposixes);
14195 SvREFCNT_dec(cp_list);
14196 SvREFCNT_dec(cp_foldable_list);
14203 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14205 /* If folding, we calculate all characters that could fold to or from the
14206 * ones already on the list */
14207 if (cp_foldable_list) {
14209 UV start, end; /* End points of code point ranges */
14211 SV* fold_intersection = NULL;
14214 /* Our calculated list will be for Unicode rules. For locale
14215 * matching, we have to keep a separate list that is consulted at
14216 * runtime only when the locale indicates Unicode rules. For
14217 * non-locale, we just use to the general list */
14219 use_list = &ANYOF_UTF8_LOCALE_INVLIST(ret);
14223 use_list = &cp_list;
14226 /* Only the characters in this class that participate in folds need
14227 * be checked. Get the intersection of this class and all the
14228 * possible characters that are foldable. This can quickly narrow
14229 * down a large class */
14230 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14231 &fold_intersection);
14233 /* The folds for all the Latin1 characters are hard-coded into this
14234 * program, but we have to go out to disk to get the others. */
14235 if (invlist_highest(cp_foldable_list) >= 256) {
14237 /* This is a hash that for a particular fold gives all
14238 * characters that are involved in it */
14239 if (! PL_utf8_foldclosures) {
14241 /* If the folds haven't been read in, call a fold function
14243 if (! PL_utf8_tofold) {
14244 U8 dummy[UTF8_MAXBYTES_CASE+1];
14246 /* This string is just a short named one above \xff */
14247 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14248 assert(PL_utf8_tofold); /* Verify that worked */
14250 PL_utf8_foldclosures
14251 = _swash_inversion_hash(PL_utf8_tofold);
14255 /* Now look at the foldable characters in this class individually */
14256 invlist_iterinit(fold_intersection);
14257 while (invlist_iternext(fold_intersection, &start, &end)) {
14260 /* Look at every character in the range */
14261 for (j = start; j <= end; j++) {
14262 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14268 /* We have the latin1 folding rules hard-coded here so
14269 * that an innocent-looking character class, like
14270 * /[ks]/i won't have to go out to disk to find the
14271 * possible matches. XXX It would be better to
14272 * generate these via regen, in case a new version of
14273 * the Unicode standard adds new mappings, though that
14274 * is not really likely, and may be caught by the
14275 * default: case of the switch below. */
14277 if (IS_IN_SOME_FOLD_L1(j)) {
14279 /* ASCII is always matched; non-ASCII is matched
14280 * only under Unicode rules (which could happen
14281 * under /l if the locale is a UTF-8 one */
14282 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14283 *use_list = add_cp_to_invlist(*use_list,
14284 PL_fold_latin1[j]);
14288 add_cp_to_invlist(depends_list,
14289 PL_fold_latin1[j]);
14293 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14294 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14296 /* Certain Latin1 characters have matches outside
14297 * Latin1. To get here, <j> is one of those
14298 * characters. None of these matches is valid for
14299 * ASCII characters under /aa, which is why the 'if'
14300 * just above excludes those. These matches only
14301 * happen when the target string is utf8. The code
14302 * below adds the single fold closures for <j> to the
14303 * inversion list. */
14309 add_cp_to_invlist(*use_list, KELVIN_SIGN);
14313 *use_list = add_cp_to_invlist(*use_list,
14314 LATIN_SMALL_LETTER_LONG_S);
14317 *use_list = add_cp_to_invlist(*use_list,
14318 GREEK_CAPITAL_LETTER_MU);
14319 *use_list = add_cp_to_invlist(*use_list,
14320 GREEK_SMALL_LETTER_MU);
14322 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14323 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14325 add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14327 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14328 *use_list = add_cp_to_invlist(*use_list,
14329 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14331 case LATIN_SMALL_LETTER_SHARP_S:
14332 *use_list = add_cp_to_invlist(*use_list,
14333 LATIN_CAPITAL_LETTER_SHARP_S);
14335 case 'F': case 'f':
14336 case 'I': case 'i':
14337 case 'L': case 'l':
14338 case 'T': case 't':
14339 case 'A': case 'a':
14340 case 'H': case 'h':
14341 case 'J': case 'j':
14342 case 'N': case 'n':
14343 case 'W': case 'w':
14344 case 'Y': case 'y':
14345 /* These all are targets of multi-character
14346 * folds from code points that require UTF8
14347 * to express, so they can't match unless
14348 * the target string is in UTF-8, so no
14349 * action here is necessary, as regexec.c
14350 * properly handles the general case for
14351 * UTF-8 matching and multi-char folds */
14354 /* Use deprecated warning to increase the
14355 * chances of this being output */
14356 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14363 /* Here is an above Latin1 character. We don't have the
14364 * rules hard-coded for it. First, get its fold. This is
14365 * the simple fold, as the multi-character folds have been
14366 * handled earlier and separated out */
14367 _to_uni_fold_flags(j, foldbuf, &foldlen,
14368 (ASCII_FOLD_RESTRICTED)
14369 ? FOLD_FLAGS_NOMIX_ASCII
14372 /* Single character fold of above Latin1. Add everything in
14373 * its fold closure to the list that this node should match.
14374 * The fold closures data structure is a hash with the keys
14375 * being the UTF-8 of every character that is folded to, like
14376 * 'k', and the values each an array of all code points that
14377 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14378 * Multi-character folds are not included */
14379 if ((listp = hv_fetch(PL_utf8_foldclosures,
14380 (char *) foldbuf, foldlen, FALSE)))
14382 AV* list = (AV*) *listp;
14384 for (k = 0; k <= av_len(list); k++) {
14385 SV** c_p = av_fetch(list, k, FALSE);
14388 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14392 /* /aa doesn't allow folds between ASCII and non- */
14393 if ((ASCII_FOLD_RESTRICTED
14394 && (isASCII(c) != isASCII(j))))
14399 /* Folds under /l which cross the 255/256 boundary
14400 * are added to a separate list. (These are valid
14401 * only when the locale is UTF-8.) */
14402 if (c < 256 && LOC) {
14403 *use_list = add_cp_to_invlist(*use_list, c);
14407 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14409 cp_list = add_cp_to_invlist(cp_list, c);
14412 /* Similarly folds involving non-ascii Latin1
14413 * characters under /d are added to their list */
14414 depends_list = add_cp_to_invlist(depends_list,
14421 SvREFCNT_dec_NN(fold_intersection);
14424 /* Now that we have finished adding all the folds, there is no reason
14425 * to keep the foldable list separate */
14426 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14427 SvREFCNT_dec_NN(cp_foldable_list);
14430 /* And combine the result (if any) with any inversion list from posix
14431 * classes. The lists are kept separate up to now because we don't want to
14432 * fold the classes (folding of those is automatically handled by the swash
14433 * fetching code) */
14434 if (posixes || nposixes) {
14435 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14436 /* Under /a and /aa, nothing above ASCII matches these */
14437 _invlist_intersection(posixes,
14438 PL_XPosix_ptrs[_CC_ASCII],
14442 if (DEPENDS_SEMANTICS) {
14443 /* Under /d, everything in the upper half of the Latin1 range
14444 * matches these complements */
14445 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14447 else if (AT_LEAST_ASCII_RESTRICTED) {
14448 /* Under /a and /aa, everything above ASCII matches these
14450 _invlist_union_complement_2nd(nposixes,
14451 PL_XPosix_ptrs[_CC_ASCII],
14455 _invlist_union(posixes, nposixes, &posixes);
14456 SvREFCNT_dec_NN(nposixes);
14459 posixes = nposixes;
14462 if (! DEPENDS_SEMANTICS) {
14464 _invlist_union(cp_list, posixes, &cp_list);
14465 SvREFCNT_dec_NN(posixes);
14472 /* Under /d, we put into a separate list the Latin1 things that
14473 * match only when the target string is utf8 */
14474 SV* nonascii_but_latin1_properties = NULL;
14475 _invlist_intersection(posixes, PL_UpperLatin1,
14476 &nonascii_but_latin1_properties);
14477 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14480 _invlist_union(cp_list, posixes, &cp_list);
14481 SvREFCNT_dec_NN(posixes);
14487 if (depends_list) {
14488 _invlist_union(depends_list, nonascii_but_latin1_properties,
14490 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14493 depends_list = nonascii_but_latin1_properties;
14498 /* And combine the result (if any) with any inversion list from properties.
14499 * The lists are kept separate up to now so that we can distinguish the two
14500 * in regards to matching above-Unicode. A run-time warning is generated
14501 * if a Unicode property is matched against a non-Unicode code point. But,
14502 * we allow user-defined properties to match anything, without any warning,
14503 * and we also suppress the warning if there is a portion of the character
14504 * class that isn't a Unicode property, and which matches above Unicode, \W
14505 * or [\x{110000}] for example.
14506 * (Note that in this case, unlike the Posix one above, there is no
14507 * <depends_list>, because having a Unicode property forces Unicode
14512 /* If it matters to the final outcome, see if a non-property
14513 * component of the class matches above Unicode. If so, the
14514 * warning gets suppressed. This is true even if just a single
14515 * such code point is specified, as though not strictly correct if
14516 * another such code point is matched against, the fact that they
14517 * are using above-Unicode code points indicates they should know
14518 * the issues involved */
14520 warn_super = ! (invert
14521 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14524 _invlist_union(properties, cp_list, &cp_list);
14525 SvREFCNT_dec_NN(properties);
14528 cp_list = properties;
14532 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14536 /* Here, we have calculated what code points should be in the character
14539 * Now we can see about various optimizations. Fold calculation (which we
14540 * did above) needs to take place before inversion. Otherwise /[^k]/i
14541 * would invert to include K, which under /i would match k, which it
14542 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14543 * folded until runtime */
14545 /* If we didn't do folding, it's because some information isn't available
14546 * until runtime; set the run-time fold flag for these. (We don't have to
14547 * worry about properties folding, as that is taken care of by the swash
14548 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
14549 * locales, or the class matches at least one 0-255 range code point */
14551 if (ANYOF_UTF8_LOCALE_INVLIST(ret)) {
14552 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14554 else if (cp_list) { /* Look to see if there a 0-255 code point is in
14557 invlist_iterinit(cp_list);
14558 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14559 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14561 invlist_iterfinish(cp_list);
14565 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14566 * at compile time. Besides not inverting folded locale now, we can't
14567 * invert if there are things such as \w, which aren't known until runtime
14570 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL))
14572 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14574 _invlist_invert(cp_list);
14576 /* Any swash can't be used as-is, because we've inverted things */
14578 SvREFCNT_dec_NN(swash);
14582 /* Clear the invert flag since have just done it here */
14587 *ret_invlist = cp_list;
14588 SvREFCNT_dec(swash);
14590 /* Discard the generated node */
14592 RExC_size = orig_size;
14595 RExC_emit = orig_emit;
14600 /* Some character classes are equivalent to other nodes. Such nodes take
14601 * up less room and generally fewer operations to execute than ANYOF nodes.
14602 * Above, we checked for and optimized into some such equivalents for
14603 * certain common classes that are easy to test. Getting to this point in
14604 * the code means that the class didn't get optimized there. Since this
14605 * code is only executed in Pass 2, it is too late to save space--it has
14606 * been allocated in Pass 1, and currently isn't given back. But turning
14607 * things into an EXACTish node can allow the optimizer to join it to any
14608 * adjacent such nodes. And if the class is equivalent to things like /./,
14609 * expensive run-time swashes can be avoided. Now that we have more
14610 * complete information, we can find things necessarily missed by the
14611 * earlier code. I (khw) am not sure how much to look for here. It would
14612 * be easy, but perhaps too slow, to check any candidates against all the
14613 * node types they could possibly match using _invlistEQ(). */
14618 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL))
14619 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14621 /* We don't optimize if we are supposed to make sure all non-Unicode
14622 * code points raise a warning, as only ANYOF nodes have this check.
14624 && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14627 U8 op = END; /* The optimzation node-type */
14628 const char * cur_parse= RExC_parse;
14630 invlist_iterinit(cp_list);
14631 if (! invlist_iternext(cp_list, &start, &end)) {
14633 /* Here, the list is empty. This happens, for example, when a
14634 * Unicode property is the only thing in the character class, and
14635 * it doesn't match anything. (perluniprops.pod notes such
14638 *flagp |= HASWIDTH|SIMPLE;
14640 else if (start == end) { /* The range is a single code point */
14641 if (! invlist_iternext(cp_list, &start, &end)
14643 /* Don't do this optimization if it would require changing
14644 * the pattern to UTF-8 */
14645 && (start < 256 || UTF))
14647 /* Here, the list contains a single code point. Can optimize
14648 * into an EXACTish node */
14657 /* A locale node under folding with one code point can be
14658 * an EXACTFL, as its fold won't be calculated until
14664 /* Here, we are generally folding, but there is only one
14665 * code point to match. If we have to, we use an EXACT
14666 * node, but it would be better for joining with adjacent
14667 * nodes in the optimization pass if we used the same
14668 * EXACTFish node that any such are likely to be. We can
14669 * do this iff the code point doesn't participate in any
14670 * folds. For example, an EXACTF of a colon is the same as
14671 * an EXACT one, since nothing folds to or from a colon. */
14673 if (IS_IN_SOME_FOLD_L1(value)) {
14678 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14683 /* If we haven't found the node type, above, it means we
14684 * can use the prevailing one */
14686 op = compute_EXACTish(pRExC_state);
14691 else if (start == 0) {
14692 if (end == UV_MAX) {
14694 *flagp |= HASWIDTH|SIMPLE;
14697 else if (end == '\n' - 1
14698 && invlist_iternext(cp_list, &start, &end)
14699 && start == '\n' + 1 && end == UV_MAX)
14702 *flagp |= HASWIDTH|SIMPLE;
14706 invlist_iterfinish(cp_list);
14709 RExC_parse = (char *)orig_parse;
14710 RExC_emit = (regnode *)orig_emit;
14712 ret = reg_node(pRExC_state, op);
14714 RExC_parse = (char *)cur_parse;
14716 if (PL_regkind[op] == EXACT) {
14717 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14720 SvREFCNT_dec_NN(cp_list);
14725 /* Here, <cp_list> contains all the code points we can determine at
14726 * compile time that match under all conditions. Go through it, and
14727 * for things that belong in the bitmap, put them there, and delete from
14728 * <cp_list>. While we are at it, see if everything above 255 is in the
14729 * list, and if so, set a flag to speed up execution */
14731 populate_ANYOF_from_invlist(ret, &cp_list);
14734 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14737 /* Here, the bitmap has been populated with all the Latin1 code points that
14738 * always match. Can now add to the overall list those that match only
14739 * when the target string is UTF-8 (<depends_list>). */
14740 if (depends_list) {
14742 _invlist_union(cp_list, depends_list, &cp_list);
14743 SvREFCNT_dec_NN(depends_list);
14746 cp_list = depends_list;
14750 /* If there is a swash and more than one element, we can't use the swash in
14751 * the optimization below. */
14752 if (swash && element_count > 1) {
14753 SvREFCNT_dec_NN(swash);
14757 set_ANYOF_arg(pRExC_state, ret, cp_list,
14758 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14760 swash, has_user_defined_property);
14762 *flagp |= HASWIDTH|SIMPLE;
14766 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14769 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14770 regnode* const node,
14772 SV* const runtime_defns,
14774 const bool has_user_defined_property)
14776 /* Sets the arg field of an ANYOF-type node 'node', using information about
14777 * the node passed-in. If there is nothing outside the node's bitmap, the
14778 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14779 * the count returned by add_data(), having allocated and stored an array,
14780 * av, that that count references, as follows:
14781 * av[0] stores the character class description in its textual form.
14782 * This is used later (regexec.c:Perl_regclass_swash()) to
14783 * initialize the appropriate swash, and is also useful for dumping
14784 * the regnode. This is set to &PL_sv_undef if the textual
14785 * description is not needed at run-time (as happens if the other
14786 * elements completely define the class)
14787 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14788 * computed from av[0]. But if no further computation need be done,
14789 * the swash is stored here now (and av[0] is &PL_sv_undef).
14790 * av[2] stores the cp_list inversion list for use in addition or instead
14791 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14792 * (Otherwise everything needed is already in av[0] and av[1])
14793 * av[3] is set if any component of the class is from a user-defined
14794 * property; used only if av[2] exists */
14798 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14800 if (! cp_list && ! runtime_defns) {
14801 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14804 AV * const av = newAV();
14807 av_store(av, 0, (runtime_defns)
14808 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14810 av_store(av, 1, swash);
14811 SvREFCNT_dec_NN(cp_list);
14814 av_store(av, 1, &PL_sv_undef);
14816 av_store(av, 2, cp_list);
14817 av_store(av, 3, newSVuv(has_user_defined_property));
14821 rv = newRV_noinc(MUTABLE_SV(av));
14822 n = add_data(pRExC_state, STR_WITH_LEN("s"));
14823 RExC_rxi->data->data[n] = (void*)rv;
14829 /* reg_skipcomment()
14831 Absorbs an /x style # comments from the input stream.
14832 Returns true if there is more text remaining in the stream.
14833 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14834 terminates the pattern without including a newline.
14836 Note its the callers responsibility to ensure that we are
14837 actually in /x mode
14842 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14846 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14848 while (RExC_parse < RExC_end)
14849 if (*RExC_parse++ == '\n') {
14854 /* we ran off the end of the pattern without ending
14855 the comment, so we have to add an \n when wrapping */
14856 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14864 Advances the parse position, and optionally absorbs
14865 "whitespace" from the inputstream.
14867 Without /x "whitespace" means (?#...) style comments only,
14868 with /x this means (?#...) and # comments and whitespace proper.
14870 Returns the RExC_parse point from BEFORE the scan occurs.
14872 This is the /x friendly way of saying RExC_parse++.
14876 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14878 char* const retval = RExC_parse++;
14880 PERL_ARGS_ASSERT_NEXTCHAR;
14883 if (RExC_end - RExC_parse >= 3
14884 && *RExC_parse == '('
14885 && RExC_parse[1] == '?'
14886 && RExC_parse[2] == '#')
14888 while (*RExC_parse != ')') {
14889 if (RExC_parse == RExC_end)
14890 FAIL("Sequence (?#... not terminated");
14896 if (RExC_flags & RXf_PMf_EXTENDED) {
14897 if (isSPACE(*RExC_parse)) {
14901 else if (*RExC_parse == '#') {
14902 if ( reg_skipcomment( pRExC_state ) )
14911 - reg_node - emit a node
14913 STATIC regnode * /* Location. */
14914 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14918 regnode * const ret = RExC_emit;
14919 GET_RE_DEBUG_FLAGS_DECL;
14921 PERL_ARGS_ASSERT_REG_NODE;
14924 SIZE_ALIGN(RExC_size);
14928 if (RExC_emit >= RExC_emit_bound)
14929 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14930 op, RExC_emit, RExC_emit_bound);
14932 NODE_ALIGN_FILL(ret);
14934 FILL_ADVANCE_NODE(ptr, op);
14935 #ifdef RE_TRACK_PATTERN_OFFSETS
14936 if (RExC_offsets) { /* MJD */
14938 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14939 "reg_node", __LINE__,
14941 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14942 ? "Overwriting end of array!\n" : "OK",
14943 (UV)(RExC_emit - RExC_emit_start),
14944 (UV)(RExC_parse - RExC_start),
14945 (UV)RExC_offsets[0]));
14946 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14954 - reganode - emit a node with an argument
14956 STATIC regnode * /* Location. */
14957 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14961 regnode * const ret = RExC_emit;
14962 GET_RE_DEBUG_FLAGS_DECL;
14964 PERL_ARGS_ASSERT_REGANODE;
14967 SIZE_ALIGN(RExC_size);
14972 assert(2==regarglen[op]+1);
14974 Anything larger than this has to allocate the extra amount.
14975 If we changed this to be:
14977 RExC_size += (1 + regarglen[op]);
14979 then it wouldn't matter. Its not clear what side effect
14980 might come from that so its not done so far.
14985 if (RExC_emit >= RExC_emit_bound)
14986 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14987 op, RExC_emit, RExC_emit_bound);
14989 NODE_ALIGN_FILL(ret);
14991 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14992 #ifdef RE_TRACK_PATTERN_OFFSETS
14993 if (RExC_offsets) { /* MJD */
14995 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14999 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15000 "Overwriting end of array!\n" : "OK",
15001 (UV)(RExC_emit - RExC_emit_start),
15002 (UV)(RExC_parse - RExC_start),
15003 (UV)RExC_offsets[0]));
15004 Set_Cur_Node_Offset;
15012 - reguni - emit (if appropriate) a Unicode character
15014 PERL_STATIC_INLINE STRLEN
15015 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15019 PERL_ARGS_ASSERT_REGUNI;
15021 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15025 - reginsert - insert an operator in front of already-emitted operand
15027 * Means relocating the operand.
15030 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15036 const int offset = regarglen[(U8)op];
15037 const int size = NODE_STEP_REGNODE + offset;
15038 GET_RE_DEBUG_FLAGS_DECL;
15040 PERL_ARGS_ASSERT_REGINSERT;
15041 PERL_UNUSED_ARG(depth);
15042 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15043 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15052 if (RExC_open_parens) {
15054 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15055 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15056 if ( RExC_open_parens[paren] >= opnd ) {
15057 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15058 RExC_open_parens[paren] += size;
15060 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15062 if ( RExC_close_parens[paren] >= opnd ) {
15063 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15064 RExC_close_parens[paren] += size;
15066 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15071 while (src > opnd) {
15072 StructCopy(--src, --dst, regnode);
15073 #ifdef RE_TRACK_PATTERN_OFFSETS
15074 if (RExC_offsets) { /* MJD 20010112 */
15076 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15080 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15081 ? "Overwriting end of array!\n" : "OK",
15082 (UV)(src - RExC_emit_start),
15083 (UV)(dst - RExC_emit_start),
15084 (UV)RExC_offsets[0]));
15085 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15086 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15092 place = opnd; /* Op node, where operand used to be. */
15093 #ifdef RE_TRACK_PATTERN_OFFSETS
15094 if (RExC_offsets) { /* MJD */
15096 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15100 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15101 ? "Overwriting end of array!\n" : "OK",
15102 (UV)(place - RExC_emit_start),
15103 (UV)(RExC_parse - RExC_start),
15104 (UV)RExC_offsets[0]));
15105 Set_Node_Offset(place, RExC_parse);
15106 Set_Node_Length(place, 1);
15109 src = NEXTOPER(place);
15110 FILL_ADVANCE_NODE(place, op);
15111 Zero(src, offset, regnode);
15115 - regtail - set the next-pointer at the end of a node chain of p to val.
15116 - SEE ALSO: regtail_study
15118 /* TODO: All three parms should be const */
15120 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15121 const regnode *val,U32 depth)
15125 GET_RE_DEBUG_FLAGS_DECL;
15127 PERL_ARGS_ASSERT_REGTAIL;
15129 PERL_UNUSED_ARG(depth);
15135 /* Find last node. */
15138 regnode * const temp = regnext(scan);
15140 SV * const mysv=sv_newmortal();
15141 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15142 regprop(RExC_rx, mysv, scan);
15143 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15144 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15145 (temp == NULL ? "->" : ""),
15146 (temp == NULL ? PL_reg_name[OP(val)] : "")
15154 if (reg_off_by_arg[OP(scan)]) {
15155 ARG_SET(scan, val - scan);
15158 NEXT_OFF(scan) = val - scan;
15164 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15165 - Look for optimizable sequences at the same time.
15166 - currently only looks for EXACT chains.
15168 This is experimental code. The idea is to use this routine to perform
15169 in place optimizations on branches and groups as they are constructed,
15170 with the long term intention of removing optimization from study_chunk so
15171 that it is purely analytical.
15173 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15174 to control which is which.
15177 /* TODO: All four parms should be const */
15180 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15181 const regnode *val,U32 depth)
15186 #ifdef EXPERIMENTAL_INPLACESCAN
15189 GET_RE_DEBUG_FLAGS_DECL;
15191 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15197 /* Find last node. */
15201 regnode * const temp = regnext(scan);
15202 #ifdef EXPERIMENTAL_INPLACESCAN
15203 if (PL_regkind[OP(scan)] == EXACT) {
15204 bool unfolded_multi_char; /* Unexamined in this routine */
15205 if (join_exact(pRExC_state, scan, &min,
15206 &unfolded_multi_char, 1, val, depth+1))
15211 switch (OP(scan)) {
15214 case EXACTFA_NO_TRIE:
15219 if( exact == PSEUDO )
15221 else if ( exact != OP(scan) )
15230 SV * const mysv=sv_newmortal();
15231 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15232 regprop(RExC_rx, mysv, scan);
15233 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15234 SvPV_nolen_const(mysv),
15235 REG_NODE_NUM(scan),
15236 PL_reg_name[exact]);
15243 SV * const mysv_val=sv_newmortal();
15244 DEBUG_PARSE_MSG("");
15245 regprop(RExC_rx, mysv_val, val);
15246 PerlIO_printf(Perl_debug_log,
15247 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15248 SvPV_nolen_const(mysv_val),
15249 (IV)REG_NODE_NUM(val),
15253 if (reg_off_by_arg[OP(scan)]) {
15254 ARG_SET(scan, val - scan);
15257 NEXT_OFF(scan) = val - scan;
15265 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15270 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15275 for (bit=0; bit<32; bit++) {
15276 if (flags & (1<<bit)) {
15277 if (!set++ && lead)
15278 PerlIO_printf(Perl_debug_log, "%s",lead);
15279 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15284 PerlIO_printf(Perl_debug_log, "\n");
15286 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15291 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15297 for (bit=0; bit<32; bit++) {
15298 if (flags & (1<<bit)) {
15299 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15302 if (!set++ && lead)
15303 PerlIO_printf(Perl_debug_log, "%s",lead);
15304 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15307 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15308 if (!set++ && lead) {
15309 PerlIO_printf(Perl_debug_log, "%s",lead);
15312 case REGEX_UNICODE_CHARSET:
15313 PerlIO_printf(Perl_debug_log, "UNICODE");
15315 case REGEX_LOCALE_CHARSET:
15316 PerlIO_printf(Perl_debug_log, "LOCALE");
15318 case REGEX_ASCII_RESTRICTED_CHARSET:
15319 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15321 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15322 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15325 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15331 PerlIO_printf(Perl_debug_log, "\n");
15333 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15339 Perl_regdump(pTHX_ const regexp *r)
15343 SV * const sv = sv_newmortal();
15344 SV *dsv= sv_newmortal();
15345 RXi_GET_DECL(r,ri);
15346 GET_RE_DEBUG_FLAGS_DECL;
15348 PERL_ARGS_ASSERT_REGDUMP;
15350 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15352 /* Header fields of interest. */
15353 if (r->anchored_substr) {
15354 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15355 RE_SV_DUMPLEN(r->anchored_substr), 30);
15356 PerlIO_printf(Perl_debug_log,
15357 "anchored %s%s at %"IVdf" ",
15358 s, RE_SV_TAIL(r->anchored_substr),
15359 (IV)r->anchored_offset);
15360 } else if (r->anchored_utf8) {
15361 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15362 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15363 PerlIO_printf(Perl_debug_log,
15364 "anchored utf8 %s%s at %"IVdf" ",
15365 s, RE_SV_TAIL(r->anchored_utf8),
15366 (IV)r->anchored_offset);
15368 if (r->float_substr) {
15369 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15370 RE_SV_DUMPLEN(r->float_substr), 30);
15371 PerlIO_printf(Perl_debug_log,
15372 "floating %s%s at %"IVdf"..%"UVuf" ",
15373 s, RE_SV_TAIL(r->float_substr),
15374 (IV)r->float_min_offset, (UV)r->float_max_offset);
15375 } else if (r->float_utf8) {
15376 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15377 RE_SV_DUMPLEN(r->float_utf8), 30);
15378 PerlIO_printf(Perl_debug_log,
15379 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15380 s, RE_SV_TAIL(r->float_utf8),
15381 (IV)r->float_min_offset, (UV)r->float_max_offset);
15383 if (r->check_substr || r->check_utf8)
15384 PerlIO_printf(Perl_debug_log,
15386 (r->check_substr == r->float_substr
15387 && r->check_utf8 == r->float_utf8
15388 ? "(checking floating" : "(checking anchored"));
15389 if (r->extflags & RXf_NOSCAN)
15390 PerlIO_printf(Perl_debug_log, " noscan");
15391 if (r->extflags & RXf_CHECK_ALL)
15392 PerlIO_printf(Perl_debug_log, " isall");
15393 if (r->check_substr || r->check_utf8)
15394 PerlIO_printf(Perl_debug_log, ") ");
15396 if (ri->regstclass) {
15397 regprop(r, sv, ri->regstclass);
15398 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15400 if (r->extflags & RXf_ANCH) {
15401 PerlIO_printf(Perl_debug_log, "anchored");
15402 if (r->extflags & RXf_ANCH_BOL)
15403 PerlIO_printf(Perl_debug_log, "(BOL)");
15404 if (r->extflags & RXf_ANCH_MBOL)
15405 PerlIO_printf(Perl_debug_log, "(MBOL)");
15406 if (r->extflags & RXf_ANCH_SBOL)
15407 PerlIO_printf(Perl_debug_log, "(SBOL)");
15408 if (r->extflags & RXf_ANCH_GPOS)
15409 PerlIO_printf(Perl_debug_log, "(GPOS)");
15410 PerlIO_putc(Perl_debug_log, ' ');
15412 if (r->extflags & RXf_GPOS_SEEN)
15413 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15414 if (r->intflags & PREGf_SKIP)
15415 PerlIO_printf(Perl_debug_log, "plus ");
15416 if (r->intflags & PREGf_IMPLICIT)
15417 PerlIO_printf(Perl_debug_log, "implicit ");
15418 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15419 if (r->extflags & RXf_EVAL_SEEN)
15420 PerlIO_printf(Perl_debug_log, "with eval ");
15421 PerlIO_printf(Perl_debug_log, "\n");
15423 regdump_extflags("r->extflags: ",r->extflags);
15424 regdump_intflags("r->intflags: ",r->intflags);
15427 PERL_ARGS_ASSERT_REGDUMP;
15428 PERL_UNUSED_CONTEXT;
15429 PERL_UNUSED_ARG(r);
15430 #endif /* DEBUGGING */
15434 - regprop - printable representation of opcode
15438 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15444 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15445 static const char * const anyofs[] = {
15446 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15447 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15448 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15449 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15450 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15451 || _CC_VERTSPACE != 16
15452 #error Need to adjust order of anyofs[]
15489 RXi_GET_DECL(prog,progi);
15490 GET_RE_DEBUG_FLAGS_DECL;
15492 PERL_ARGS_ASSERT_REGPROP;
15496 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15497 /* It would be nice to FAIL() here, but this may be called from
15498 regexec.c, and it would be hard to supply pRExC_state. */
15499 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15500 (int)OP(o), (int)REGNODE_MAX);
15501 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15503 k = PL_regkind[OP(o)];
15506 sv_catpvs(sv, " ");
15507 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15508 * is a crude hack but it may be the best for now since
15509 * we have no flag "this EXACTish node was UTF-8"
15511 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15512 PERL_PV_ESCAPE_UNI_DETECT |
15513 PERL_PV_ESCAPE_NONASCII |
15514 PERL_PV_PRETTY_ELLIPSES |
15515 PERL_PV_PRETTY_LTGT |
15516 PERL_PV_PRETTY_NOCLEAR
15518 } else if (k == TRIE) {
15519 /* print the details of the trie in dumpuntil instead, as
15520 * progi->data isn't available here */
15521 const char op = OP(o);
15522 const U32 n = ARG(o);
15523 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15524 (reg_ac_data *)progi->data->data[n] :
15526 const reg_trie_data * const trie
15527 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15529 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15530 DEBUG_TRIE_COMPILE_r(
15531 Perl_sv_catpvf(aTHX_ sv,
15532 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15533 (UV)trie->startstate,
15534 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15535 (UV)trie->wordcount,
15538 (UV)TRIE_CHARCOUNT(trie),
15539 (UV)trie->uniquecharcount
15542 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15543 sv_catpvs(sv, "[");
15544 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15546 : TRIE_BITMAP(trie));
15547 sv_catpvs(sv, "]");
15550 } else if (k == CURLY) {
15551 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15552 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15553 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15555 else if (k == WHILEM && o->flags) /* Ordinal/of */
15556 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15557 else if (k == REF || k == OPEN || k == CLOSE
15558 || k == GROUPP || OP(o)==ACCEPT)
15560 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15561 if ( RXp_PAREN_NAMES(prog) ) {
15562 if ( k != REF || (OP(o) < NREF)) {
15563 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15564 SV **name= av_fetch(list, ARG(o), 0 );
15566 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15569 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15570 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15571 I32 *nums=(I32*)SvPVX(sv_dat);
15572 SV **name= av_fetch(list, nums[0], 0 );
15575 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15576 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15577 (n ? "," : ""), (IV)nums[n]);
15579 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15583 } else if (k == GOSUB)
15584 /* Paren and offset */
15585 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15586 else if (k == VERB) {
15588 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15589 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15590 } else if (k == LOGICAL)
15591 /* 2: embedded, otherwise 1 */
15592 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15593 else if (k == ANYOF) {
15594 const U8 flags = ANYOF_FLAGS(o);
15598 if (flags & ANYOF_LOCALE)
15599 sv_catpvs(sv, "{loc}");
15600 if (flags & ANYOF_LOC_FOLD)
15601 sv_catpvs(sv, "{i}");
15602 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15603 if (flags & ANYOF_INVERT)
15604 sv_catpvs(sv, "^");
15606 /* output what the standard cp 0-255 bitmap matches */
15607 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15609 /* output any special charclass tests (used entirely under use
15611 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15613 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15614 if (ANYOF_POSIXL_TEST(o,i)) {
15615 sv_catpv(sv, anyofs[i]);
15621 if ((flags & ANYOF_ABOVE_LATIN1_ALL)
15622 || ANYOF_UTF8_LOCALE_INVLIST(o) || ANYOF_NONBITMAP(o))
15625 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15626 if (flags & ANYOF_INVERT)
15627 /*make sure the invert info is in each */
15628 sv_catpvs(sv, "^");
15631 if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15632 sv_catpvs(sv, "{non-utf8-latin1-all}");
15635 /* output information about the unicode matching */
15636 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15637 sv_catpvs(sv, "{unicode_all}");
15638 else if (ANYOF_NONBITMAP(o)) {
15639 SV *lv; /* Set if there is something outside the bit map. */
15640 bool byte_output = FALSE; /* If something in the bitmap has
15643 /* Get the stuff that wasn't in the bitmap */
15644 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15645 if (lv && lv != &PL_sv_undef) {
15646 char *s = savesvpv(lv);
15647 char * const origs = s;
15649 while (*s && *s != '\n')
15653 const char * const t = ++s;
15655 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15656 sv_catpvs(sv, "{outside bitmap}");
15659 sv_catpvs(sv, "{utf8}");
15663 sv_catpvs(sv, " ");
15669 /* Truncate very long output */
15670 if (s - origs > 256) {
15671 Perl_sv_catpvf(aTHX_ sv,
15673 (int) (s - origs - 1),
15679 else if (*s == '\t') {
15693 SvREFCNT_dec_NN(lv);
15697 /* Output any UTF-8 locale code points */
15698 if (flags & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(o)) {
15700 int max_entries = 256;
15702 sv_catpvs(sv, "{utf8 locale}");
15703 invlist_iterinit(ANYOF_UTF8_LOCALE_INVLIST(o));
15704 while (invlist_iternext(ANYOF_UTF8_LOCALE_INVLIST(o),
15706 put_range(sv, start, end);
15708 if (max_entries < 0) {
15709 sv_catpvs(sv, "...");
15713 invlist_iterfinish(ANYOF_UTF8_LOCALE_INVLIST(o));
15717 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15719 else if (k == POSIXD || k == NPOSIXD) {
15720 U8 index = FLAGS(o) * 2;
15721 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15722 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15725 if (*anyofs[index] != '[') {
15728 sv_catpv(sv, anyofs[index]);
15729 if (*anyofs[index] != '[') {
15734 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15735 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15737 PERL_UNUSED_CONTEXT;
15738 PERL_UNUSED_ARG(sv);
15739 PERL_UNUSED_ARG(o);
15740 PERL_UNUSED_ARG(prog);
15741 #endif /* DEBUGGING */
15745 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15746 { /* Assume that RE_INTUIT is set */
15748 struct regexp *const prog = ReANY(r);
15749 GET_RE_DEBUG_FLAGS_DECL;
15751 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15752 PERL_UNUSED_CONTEXT;
15756 const char * const s = SvPV_nolen_const(prog->check_substr
15757 ? prog->check_substr : prog->check_utf8);
15759 if (!PL_colorset) reginitcolors();
15760 PerlIO_printf(Perl_debug_log,
15761 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15763 prog->check_substr ? "" : "utf8 ",
15764 PL_colors[5],PL_colors[0],
15767 (strlen(s) > 60 ? "..." : ""));
15770 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15776 handles refcounting and freeing the perl core regexp structure. When
15777 it is necessary to actually free the structure the first thing it
15778 does is call the 'free' method of the regexp_engine associated to
15779 the regexp, allowing the handling of the void *pprivate; member
15780 first. (This routine is not overridable by extensions, which is why
15781 the extensions free is called first.)
15783 See regdupe and regdupe_internal if you change anything here.
15785 #ifndef PERL_IN_XSUB_RE
15787 Perl_pregfree(pTHX_ REGEXP *r)
15793 Perl_pregfree2(pTHX_ REGEXP *rx)
15796 struct regexp *const r = ReANY(rx);
15797 GET_RE_DEBUG_FLAGS_DECL;
15799 PERL_ARGS_ASSERT_PREGFREE2;
15801 if (r->mother_re) {
15802 ReREFCNT_dec(r->mother_re);
15804 CALLREGFREE_PVT(rx); /* free the private data */
15805 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15806 Safefree(r->xpv_len_u.xpvlenu_pv);
15809 SvREFCNT_dec(r->anchored_substr);
15810 SvREFCNT_dec(r->anchored_utf8);
15811 SvREFCNT_dec(r->float_substr);
15812 SvREFCNT_dec(r->float_utf8);
15813 Safefree(r->substrs);
15815 RX_MATCH_COPY_FREE(rx);
15816 #ifdef PERL_ANY_COW
15817 SvREFCNT_dec(r->saved_copy);
15820 SvREFCNT_dec(r->qr_anoncv);
15821 rx->sv_u.svu_rx = 0;
15826 This is a hacky workaround to the structural issue of match results
15827 being stored in the regexp structure which is in turn stored in
15828 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15829 could be PL_curpm in multiple contexts, and could require multiple
15830 result sets being associated with the pattern simultaneously, such
15831 as when doing a recursive match with (??{$qr})
15833 The solution is to make a lightweight copy of the regexp structure
15834 when a qr// is returned from the code executed by (??{$qr}) this
15835 lightweight copy doesn't actually own any of its data except for
15836 the starp/end and the actual regexp structure itself.
15842 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15844 struct regexp *ret;
15845 struct regexp *const r = ReANY(rx);
15846 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15848 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15851 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15853 SvOK_off((SV *)ret_x);
15855 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15856 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15857 made both spots point to the same regexp body.) */
15858 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15859 assert(!SvPVX(ret_x));
15860 ret_x->sv_u.svu_rx = temp->sv_any;
15861 temp->sv_any = NULL;
15862 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15863 SvREFCNT_dec_NN(temp);
15864 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15865 ing below will not set it. */
15866 SvCUR_set(ret_x, SvCUR(rx));
15869 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15870 sv_force_normal(sv) is called. */
15872 ret = ReANY(ret_x);
15874 SvFLAGS(ret_x) |= SvUTF8(rx);
15875 /* We share the same string buffer as the original regexp, on which we
15876 hold a reference count, incremented when mother_re is set below.
15877 The string pointer is copied here, being part of the regexp struct.
15879 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15880 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15882 const I32 npar = r->nparens+1;
15883 Newx(ret->offs, npar, regexp_paren_pair);
15884 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15887 Newx(ret->substrs, 1, struct reg_substr_data);
15888 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15890 SvREFCNT_inc_void(ret->anchored_substr);
15891 SvREFCNT_inc_void(ret->anchored_utf8);
15892 SvREFCNT_inc_void(ret->float_substr);
15893 SvREFCNT_inc_void(ret->float_utf8);
15895 /* check_substr and check_utf8, if non-NULL, point to either their
15896 anchored or float namesakes, and don't hold a second reference. */
15898 RX_MATCH_COPIED_off(ret_x);
15899 #ifdef PERL_ANY_COW
15900 ret->saved_copy = NULL;
15902 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15903 SvREFCNT_inc_void(ret->qr_anoncv);
15909 /* regfree_internal()
15911 Free the private data in a regexp. This is overloadable by
15912 extensions. Perl takes care of the regexp structure in pregfree(),
15913 this covers the *pprivate pointer which technically perl doesn't
15914 know about, however of course we have to handle the
15915 regexp_internal structure when no extension is in use.
15917 Note this is called before freeing anything in the regexp
15922 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15925 struct regexp *const r = ReANY(rx);
15926 RXi_GET_DECL(r,ri);
15927 GET_RE_DEBUG_FLAGS_DECL;
15929 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15935 SV *dsv= sv_newmortal();
15936 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15937 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15938 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15939 PL_colors[4],PL_colors[5],s);
15942 #ifdef RE_TRACK_PATTERN_OFFSETS
15944 Safefree(ri->u.offsets); /* 20010421 MJD */
15946 if (ri->code_blocks) {
15948 for (n = 0; n < ri->num_code_blocks; n++)
15949 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15950 Safefree(ri->code_blocks);
15954 int n = ri->data->count;
15957 /* If you add a ->what type here, update the comment in regcomp.h */
15958 switch (ri->data->what[n]) {
15964 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15967 Safefree(ri->data->data[n]);
15973 { /* Aho Corasick add-on structure for a trie node.
15974 Used in stclass optimization only */
15976 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15978 refcount = --aho->refcount;
15981 PerlMemShared_free(aho->states);
15982 PerlMemShared_free(aho->fail);
15983 /* do this last!!!! */
15984 PerlMemShared_free(ri->data->data[n]);
15985 PerlMemShared_free(ri->regstclass);
15991 /* trie structure. */
15993 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15995 refcount = --trie->refcount;
15998 PerlMemShared_free(trie->charmap);
15999 PerlMemShared_free(trie->states);
16000 PerlMemShared_free(trie->trans);
16002 PerlMemShared_free(trie->bitmap);
16004 PerlMemShared_free(trie->jump);
16005 PerlMemShared_free(trie->wordinfo);
16006 /* do this last!!!! */
16007 PerlMemShared_free(ri->data->data[n]);
16012 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16013 ri->data->what[n]);
16016 Safefree(ri->data->what);
16017 Safefree(ri->data);
16023 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16024 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16025 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16028 re_dup - duplicate a regexp.
16030 This routine is expected to clone a given regexp structure. It is only
16031 compiled under USE_ITHREADS.
16033 After all of the core data stored in struct regexp is duplicated
16034 the regexp_engine.dupe method is used to copy any private data
16035 stored in the *pprivate pointer. This allows extensions to handle
16036 any duplication it needs to do.
16038 See pregfree() and regfree_internal() if you change anything here.
16040 #if defined(USE_ITHREADS)
16041 #ifndef PERL_IN_XSUB_RE
16043 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16047 const struct regexp *r = ReANY(sstr);
16048 struct regexp *ret = ReANY(dstr);
16050 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16052 npar = r->nparens+1;
16053 Newx(ret->offs, npar, regexp_paren_pair);
16054 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16056 if (ret->substrs) {
16057 /* Do it this way to avoid reading from *r after the StructCopy().
16058 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16059 cache, it doesn't matter. */
16060 const bool anchored = r->check_substr
16061 ? r->check_substr == r->anchored_substr
16062 : r->check_utf8 == r->anchored_utf8;
16063 Newx(ret->substrs, 1, struct reg_substr_data);
16064 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16066 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16067 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16068 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16069 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16071 /* check_substr and check_utf8, if non-NULL, point to either their
16072 anchored or float namesakes, and don't hold a second reference. */
16074 if (ret->check_substr) {
16076 assert(r->check_utf8 == r->anchored_utf8);
16077 ret->check_substr = ret->anchored_substr;
16078 ret->check_utf8 = ret->anchored_utf8;
16080 assert(r->check_substr == r->float_substr);
16081 assert(r->check_utf8 == r->float_utf8);
16082 ret->check_substr = ret->float_substr;
16083 ret->check_utf8 = ret->float_utf8;
16085 } else if (ret->check_utf8) {
16087 ret->check_utf8 = ret->anchored_utf8;
16089 ret->check_utf8 = ret->float_utf8;
16094 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16095 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16098 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16100 if (RX_MATCH_COPIED(dstr))
16101 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16103 ret->subbeg = NULL;
16104 #ifdef PERL_ANY_COW
16105 ret->saved_copy = NULL;
16108 /* Whether mother_re be set or no, we need to copy the string. We
16109 cannot refrain from copying it when the storage points directly to
16110 our mother regexp, because that's
16111 1: a buffer in a different thread
16112 2: something we no longer hold a reference on
16113 so we need to copy it locally. */
16114 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16115 ret->mother_re = NULL;
16117 #endif /* PERL_IN_XSUB_RE */
16122 This is the internal complement to regdupe() which is used to copy
16123 the structure pointed to by the *pprivate pointer in the regexp.
16124 This is the core version of the extension overridable cloning hook.
16125 The regexp structure being duplicated will be copied by perl prior
16126 to this and will be provided as the regexp *r argument, however
16127 with the /old/ structures pprivate pointer value. Thus this routine
16128 may override any copying normally done by perl.
16130 It returns a pointer to the new regexp_internal structure.
16134 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16137 struct regexp *const r = ReANY(rx);
16138 regexp_internal *reti;
16140 RXi_GET_DECL(r,ri);
16142 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16146 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16147 char, regexp_internal);
16148 Copy(ri->program, reti->program, len+1, regnode);
16150 reti->num_code_blocks = ri->num_code_blocks;
16151 if (ri->code_blocks) {
16153 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16154 struct reg_code_block);
16155 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16156 struct reg_code_block);
16157 for (n = 0; n < ri->num_code_blocks; n++)
16158 reti->code_blocks[n].src_regex = (REGEXP*)
16159 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16162 reti->code_blocks = NULL;
16164 reti->regstclass = NULL;
16167 struct reg_data *d;
16168 const int count = ri->data->count;
16171 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16172 char, struct reg_data);
16173 Newx(d->what, count, U8);
16176 for (i = 0; i < count; i++) {
16177 d->what[i] = ri->data->what[i];
16178 switch (d->what[i]) {
16179 /* see also regcomp.h and regfree_internal() */
16180 case 'a': /* actually an AV, but the dup function is identical. */
16184 case 'u': /* actually an HV, but the dup function is identical. */
16185 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16188 /* This is cheating. */
16189 Newx(d->data[i], 1, regnode_ssc);
16190 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16191 reti->regstclass = (regnode*)d->data[i];
16194 /* Trie stclasses are readonly and can thus be shared
16195 * without duplication. We free the stclass in pregfree
16196 * when the corresponding reg_ac_data struct is freed.
16198 reti->regstclass= ri->regstclass;
16202 ((reg_trie_data*)ri->data->data[i])->refcount++;
16207 d->data[i] = ri->data->data[i];
16210 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16211 ri->data->what[i]);
16220 reti->name_list_idx = ri->name_list_idx;
16222 #ifdef RE_TRACK_PATTERN_OFFSETS
16223 if (ri->u.offsets) {
16224 Newx(reti->u.offsets, 2*len+1, U32);
16225 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16228 SetProgLen(reti,len);
16231 return (void*)reti;
16234 #endif /* USE_ITHREADS */
16236 #ifndef PERL_IN_XSUB_RE
16239 - regnext - dig the "next" pointer out of a node
16242 Perl_regnext(pTHX_ regnode *p)
16250 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16251 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16252 (int)OP(p), (int)REGNODE_MAX);
16255 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16264 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16267 STRLEN l1 = strlen(pat1);
16268 STRLEN l2 = strlen(pat2);
16271 const char *message;
16273 PERL_ARGS_ASSERT_RE_CROAK2;
16279 Copy(pat1, buf, l1 , char);
16280 Copy(pat2, buf + l1, l2 , char);
16281 buf[l1 + l2] = '\n';
16282 buf[l1 + l2 + 1] = '\0';
16283 va_start(args, pat2);
16284 msv = vmess(buf, &args);
16286 message = SvPV_const(msv,l1);
16289 Copy(message, buf, l1 , char);
16290 /* l1-1 to avoid \n */
16291 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16294 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16296 #ifndef PERL_IN_XSUB_RE
16298 Perl_save_re_context(pTHX)
16302 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16304 const REGEXP * const rx = PM_GETRE(PL_curpm);
16307 for (i = 1; i <= RX_NPARENS(rx); i++) {
16308 char digits[TYPE_CHARS(long)];
16309 const STRLEN len = my_snprintf(digits, sizeof(digits),
16311 GV *const *const gvp
16312 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16315 GV * const gv = *gvp;
16316 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16328 S_put_byte(pTHX_ SV *sv, int c)
16330 PERL_ARGS_ASSERT_PUT_BYTE;
16334 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16335 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16336 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16337 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16338 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16341 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16346 const char string = c;
16347 if (c == '-' || c == ']' || c == '\\' || c == '^')
16348 sv_catpvs(sv, "\\");
16349 sv_catpvn(sv, &string, 1);
16354 S_put_range(pTHX_ SV *sv, UV start, UV end)
16357 /* Appends to 'sv' a displayable version of the range of code points from
16358 * 'start' to 'end' */
16360 assert(start <= end);
16362 PERL_ARGS_ASSERT_PUT_RANGE;
16364 if (end - start < 3) { /* Individual chars in short ranges */
16365 for (; start <= end; start++)
16366 put_byte(sv, start);
16368 else if ( end > 255
16369 || ! isALPHANUMERIC(start)
16370 || ! isALPHANUMERIC(end)
16371 || isDIGIT(start) != isDIGIT(end)
16372 || isUPPER(start) != isUPPER(end)
16373 || isLOWER(start) != isLOWER(end)
16375 /* This final test should get optimized out except on EBCDIC
16376 * platforms, where it causes ranges that cross discontinuities
16377 * like i/j to be shown as hex instead of the misleading,
16378 * e.g. H-K (since that range includes more than H, I, J, K).
16380 || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16382 Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16384 (end < 256) ? end : 255);
16386 else { /* Here, the ends of the range are both digits, or both uppercase,
16387 or both lowercase; and there's no discontinuity in the range
16388 (which could happen on EBCDIC platforms) */
16389 put_byte(sv, start);
16390 sv_catpvs(sv, "-");
16396 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16398 /* Appends to 'sv' a displayable version of the innards of the bracketed
16399 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16400 * output anything */
16403 bool has_output_anything = FALSE;
16405 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16407 for (i = 0; i < 256; i++) {
16408 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16410 /* The character at index i should be output. Find the next
16411 * character that should NOT be output */
16413 for (j = i + 1; j <= 256; j++) {
16414 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16419 /* Everything between them is a single range that should be output
16421 put_range(sv, i, j - 1);
16422 has_output_anything = TRUE;
16427 return has_output_anything;
16430 #define CLEAR_OPTSTART \
16431 if (optstart) STMT_START { \
16432 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
16433 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16437 #define DUMPUNTIL(b,e) \
16439 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16441 STATIC const regnode *
16442 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16443 const regnode *last, const regnode *plast,
16444 SV* sv, I32 indent, U32 depth)
16447 U8 op = PSEUDO; /* Arbitrary non-END op. */
16448 const regnode *next;
16449 const regnode *optstart= NULL;
16451 RXi_GET_DECL(r,ri);
16452 GET_RE_DEBUG_FLAGS_DECL;
16454 PERL_ARGS_ASSERT_DUMPUNTIL;
16456 #ifdef DEBUG_DUMPUNTIL
16457 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16458 last ? last-start : 0,plast ? plast-start : 0);
16461 if (plast && plast < last)
16464 while (PL_regkind[op] != END && (!last || node < last)) {
16465 /* While that wasn't END last time... */
16468 if (op == CLOSE || op == WHILEM)
16470 next = regnext((regnode *)node);
16473 if (OP(node) == OPTIMIZED) {
16474 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16481 regprop(r, sv, node);
16482 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16483 (int)(2*indent + 1), "", SvPVX_const(sv));
16485 if (OP(node) != OPTIMIZED) {
16486 if (next == NULL) /* Next ptr. */
16487 PerlIO_printf(Perl_debug_log, " (0)");
16488 else if (PL_regkind[(U8)op] == BRANCH
16489 && PL_regkind[OP(next)] != BRANCH )
16490 PerlIO_printf(Perl_debug_log, " (FAIL)");
16492 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16493 (void)PerlIO_putc(Perl_debug_log, '\n');
16497 if (PL_regkind[(U8)op] == BRANCHJ) {
16500 const regnode *nnode = (OP(next) == LONGJMP
16501 ? regnext((regnode *)next)
16503 if (last && nnode > last)
16505 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16508 else if (PL_regkind[(U8)op] == BRANCH) {
16510 DUMPUNTIL(NEXTOPER(node), next);
16512 else if ( PL_regkind[(U8)op] == TRIE ) {
16513 const regnode *this_trie = node;
16514 const char op = OP(node);
16515 const U32 n = ARG(node);
16516 const reg_ac_data * const ac = op>=AHOCORASICK ?
16517 (reg_ac_data *)ri->data->data[n] :
16519 const reg_trie_data * const trie =
16520 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16522 AV *const trie_words
16523 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16525 const regnode *nextbranch= NULL;
16528 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16529 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16531 PerlIO_printf(Perl_debug_log, "%*s%s ",
16532 (int)(2*(indent+3)), "",
16534 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16535 SvCUR(*elem_ptr), 60,
16536 PL_colors[0], PL_colors[1],
16538 ? PERL_PV_ESCAPE_UNI
16540 | PERL_PV_PRETTY_ELLIPSES
16541 | PERL_PV_PRETTY_LTGT
16546 U16 dist= trie->jump[word_idx+1];
16547 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16548 (UV)((dist ? this_trie + dist : next) - start));
16551 nextbranch= this_trie + trie->jump[0];
16552 DUMPUNTIL(this_trie + dist, nextbranch);
16554 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16555 nextbranch= regnext((regnode *)nextbranch);
16557 PerlIO_printf(Perl_debug_log, "\n");
16560 if (last && next > last)
16565 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16566 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16567 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16569 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16571 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16573 else if ( op == PLUS || op == STAR) {
16574 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16576 else if (PL_regkind[(U8)op] == ANYOF) {
16577 /* arglen 1 + class block */
16578 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
16579 ? ANYOF_POSIXL_FOLD_SKIP
16580 : (ANYOF_FLAGS(node) & ANYOF_POSIXL)
16581 ? ANYOF_POSIXL_SKIP
16583 node = NEXTOPER(node);
16585 else if (PL_regkind[(U8)op] == EXACT) {
16586 /* Literal string, where present. */
16587 node += NODE_SZ_STR(node) - 1;
16588 node = NEXTOPER(node);
16591 node = NEXTOPER(node);
16592 node += regarglen[(U8)op];
16594 if (op == CURLYX || op == OPEN)
16598 #ifdef DEBUG_DUMPUNTIL
16599 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16604 #endif /* DEBUGGING */
16608 * c-indentation-style: bsd
16609 * c-basic-offset: 4
16610 * indent-tabs-mode: nil
16613 * ex: set ts=8 sts=4 sw=4 et: