5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 struct RExC_state_t {
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
106 char *precomp; /* uncompiled string. */
107 REGEXP *rx_sv; /* The SV that is the regexp. */
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
110 char *start; /* Start of input for compile */
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
113 SSize_t whilem_seen; /* number of WHILEM in this expr */
114 regnode *emit_start; /* Start of emitted-code area */
115 regnode *emit_bound; /* First regnode outside of the allocated space */
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
125 SSize_t size; /* Code size. */
126 I32 npar; /* Capture buffer count, (OPEN). */
127 I32 cpar; /* Capture buffer count, (CLOSE). */
128 I32 nestroot; /* root parens we are in - used by accept */
131 regnode **open_parens; /* pointers to open parens */
132 regnode **close_parens; /* pointers to close parens */
133 regnode *opend; /* END node in program */
134 I32 utf8; /* whether the pattern is utf8 or not */
135 I32 orig_utf8; /* whether the pattern was originally in utf8 */
136 /* XXX use this for future optimisation of case
137 * where pattern must be upgraded to utf8. */
138 I32 uni_semantics; /* If a d charset modifier should use unicode
139 rules, even if the pattern is not in
141 HV *paren_names; /* Paren names */
143 regnode **recurse; /* Recurse regops */
144 I32 recurse_count; /* Number of recurse regops */
148 I32 override_recoding;
149 I32 in_multi_char_class;
150 struct reg_code_block *code_blocks; /* positions of literal (?{})
152 int num_code_blocks; /* size of code_blocks[] */
153 int code_index; /* next code_blocks[] slot */
155 char *starttry; /* -Dr: where regtry was called. */
156 #define RExC_starttry (pRExC_state->starttry)
158 SV *runtime_code_qr; /* qr with the runtime code blocks */
160 const char *lastparse;
162 AV *paren_name_list; /* idx -> name */
163 #define RExC_lastparse (pRExC_state->lastparse)
164 #define RExC_lastnum (pRExC_state->lastnum)
165 #define RExC_paren_name_list (pRExC_state->paren_name_list)
169 #define RExC_flags (pRExC_state->flags)
170 #define RExC_pm_flags (pRExC_state->pm_flags)
171 #define RExC_precomp (pRExC_state->precomp)
172 #define RExC_rx_sv (pRExC_state->rx_sv)
173 #define RExC_rx (pRExC_state->rx)
174 #define RExC_rxi (pRExC_state->rxi)
175 #define RExC_start (pRExC_state->start)
176 #define RExC_end (pRExC_state->end)
177 #define RExC_parse (pRExC_state->parse)
178 #define RExC_whilem_seen (pRExC_state->whilem_seen)
179 #ifdef RE_TRACK_PATTERN_OFFSETS
180 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
182 #define RExC_emit (pRExC_state->emit)
183 #define RExC_emit_dummy (pRExC_state->emit_dummy)
184 #define RExC_emit_start (pRExC_state->emit_start)
185 #define RExC_emit_bound (pRExC_state->emit_bound)
186 #define RExC_naughty (pRExC_state->naughty)
187 #define RExC_sawback (pRExC_state->sawback)
188 #define RExC_seen (pRExC_state->seen)
189 #define RExC_size (pRExC_state->size)
190 #define RExC_npar (pRExC_state->npar)
191 #define RExC_nestroot (pRExC_state->nestroot)
192 #define RExC_extralen (pRExC_state->extralen)
193 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
194 #define RExC_utf8 (pRExC_state->utf8)
195 #define RExC_uni_semantics (pRExC_state->uni_semantics)
196 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
197 #define RExC_open_parens (pRExC_state->open_parens)
198 #define RExC_close_parens (pRExC_state->close_parens)
199 #define RExC_opend (pRExC_state->opend)
200 #define RExC_paren_names (pRExC_state->paren_names)
201 #define RExC_recurse (pRExC_state->recurse)
202 #define RExC_recurse_count (pRExC_state->recurse_count)
203 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
204 #define RExC_contains_locale (pRExC_state->contains_locale)
205 #define RExC_contains_i (pRExC_state->contains_i)
206 #define RExC_override_recoding (pRExC_state->override_recoding)
207 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
210 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
212 ((*s) == '{' && regcurly(s, FALSE)))
215 * Flags to be passed up and down.
217 #define WORST 0 /* Worst case. */
218 #define HASWIDTH 0x01 /* Known to match non-null strings. */
220 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
221 * character. (There needs to be a case: in the switch statement in regexec.c
222 * for any node marked SIMPLE.) Note that this is not the same thing as
225 #define SPSTART 0x04 /* Starts with * or + */
226 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
227 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
228 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
230 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
232 /* whether trie related optimizations are enabled */
233 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
234 #define TRIE_STUDY_OPT
235 #define FULL_TRIE_STUDY
241 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
242 #define PBITVAL(paren) (1 << ((paren) & 7))
243 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
244 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
245 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
247 #define REQUIRE_UTF8 STMT_START { \
249 *flagp = RESTART_UTF8; \
254 /* This converts the named class defined in regcomp.h to its equivalent class
255 * number defined in handy.h. */
256 #define namedclass_to_classnum(class) ((int) ((class) / 2))
257 #define classnum_to_namedclass(classnum) ((classnum) * 2)
259 #define _invlist_union_complement_2nd(a, b, output) \
260 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
261 #define _invlist_intersection_complement_2nd(a, b, output) \
262 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
264 /* About scan_data_t.
266 During optimisation we recurse through the regexp program performing
267 various inplace (keyhole style) optimisations. In addition study_chunk
268 and scan_commit populate this data structure with information about
269 what strings MUST appear in the pattern. We look for the longest
270 string that must appear at a fixed location, and we look for the
271 longest string that may appear at a floating location. So for instance
276 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
277 strings (because they follow a .* construct). study_chunk will identify
278 both FOO and BAR as being the longest fixed and floating strings respectively.
280 The strings can be composites, for instance
284 will result in a composite fixed substring 'foo'.
286 For each string some basic information is maintained:
288 - offset or min_offset
289 This is the position the string must appear at, or not before.
290 It also implicitly (when combined with minlenp) tells us how many
291 characters must match before the string we are searching for.
292 Likewise when combined with minlenp and the length of the string it
293 tells us how many characters must appear after the string we have
297 Only used for floating strings. This is the rightmost point that
298 the string can appear at. If set to SSize_t_MAX it indicates that the
299 string can occur infinitely far to the right.
302 A pointer to the minimum number of characters of the pattern that the
303 string was found inside. This is important as in the case of positive
304 lookahead or positive lookbehind we can have multiple patterns
309 The minimum length of the pattern overall is 3, the minimum length
310 of the lookahead part is 3, but the minimum length of the part that
311 will actually match is 1. So 'FOO's minimum length is 3, but the
312 minimum length for the F is 1. This is important as the minimum length
313 is used to determine offsets in front of and behind the string being
314 looked for. Since strings can be composites this is the length of the
315 pattern at the time it was committed with a scan_commit. Note that
316 the length is calculated by study_chunk, so that the minimum lengths
317 are not known until the full pattern has been compiled, thus the
318 pointer to the value.
322 In the case of lookbehind the string being searched for can be
323 offset past the start point of the final matching string.
324 If this value was just blithely removed from the min_offset it would
325 invalidate some of the calculations for how many chars must match
326 before or after (as they are derived from min_offset and minlen and
327 the length of the string being searched for).
328 When the final pattern is compiled and the data is moved from the
329 scan_data_t structure into the regexp structure the information
330 about lookbehind is factored in, with the information that would
331 have been lost precalculated in the end_shift field for the
334 The fields pos_min and pos_delta are used to store the minimum offset
335 and the delta to the maximum offset at the current point in the pattern.
339 typedef struct scan_data_t {
340 /*I32 len_min; unused */
341 /*I32 len_delta; unused */
345 SSize_t last_end; /* min value, <0 unless valid. */
346 SSize_t last_start_min;
347 SSize_t last_start_max;
348 SV **longest; /* Either &l_fixed, or &l_float. */
349 SV *longest_fixed; /* longest fixed string found in pattern */
350 SSize_t offset_fixed; /* offset where it starts */
351 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
352 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
353 SV *longest_float; /* longest floating string found in pattern */
354 SSize_t offset_float_min; /* earliest point in string it can appear */
355 SSize_t offset_float_max; /* latest point in string it can appear */
356 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
357 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
360 SSize_t *last_closep;
361 regnode_ssc *start_class;
364 /* The below is perhaps overboard, but this allows us to save a test at the
365 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
366 * and 'a' differ by a single bit; the same with the upper and lower case of
367 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
368 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
369 * then inverts it to form a mask, with just a single 0, in the bit position
370 * where the upper- and lowercase differ. XXX There are about 40 other
371 * instances in the Perl core where this micro-optimization could be used.
372 * Should decide if maintenance cost is worse, before changing those
374 * Returns a boolean as to whether or not 'v' is either a lowercase or
375 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
376 * compile-time constant, the generated code is better than some optimizing
377 * compilers figure out, amounting to a mask and test. The results are
378 * meaningless if 'c' is not one of [A-Za-z] */
379 #define isARG2_lower_or_UPPER_ARG1(c, v) \
380 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
383 * Forward declarations for pregcomp()'s friends.
386 static const scan_data_t zero_scan_data =
387 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
389 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
390 #define SF_BEFORE_SEOL 0x0001
391 #define SF_BEFORE_MEOL 0x0002
392 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
393 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
395 #define SF_FIX_SHIFT_EOL (+2)
396 #define SF_FL_SHIFT_EOL (+4)
398 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
399 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
401 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
402 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
403 #define SF_IS_INF 0x0040
404 #define SF_HAS_PAR 0x0080
405 #define SF_IN_PAR 0x0100
406 #define SF_HAS_EVAL 0x0200
407 #define SCF_DO_SUBSTR 0x0400
408 #define SCF_DO_STCLASS_AND 0x0800
409 #define SCF_DO_STCLASS_OR 0x1000
410 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
411 #define SCF_WHILEM_VISITED_POS 0x2000
413 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
414 #define SCF_SEEN_ACCEPT 0x8000
415 #define SCF_TRIE_DOING_RESTUDY 0x10000
417 #define UTF cBOOL(RExC_utf8)
419 /* The enums for all these are ordered so things work out correctly */
420 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
421 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
422 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
423 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
424 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
425 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
426 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
428 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
430 #define OOB_NAMEDCLASS -1
432 /* There is no code point that is out-of-bounds, so this is problematic. But
433 * its only current use is to initialize a variable that is always set before
435 #define OOB_UNICODE 0xDEADBEEF
437 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
438 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
441 /* length of regex to show in messages that don't mark a position within */
442 #define RegexLengthToShowInErrorMessages 127
445 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
446 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
447 * op/pragma/warn/regcomp.
449 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
450 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
452 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
454 #define REPORT_LOCATION_ARGS(offset) \
455 UTF8fARG(UTF, offset, RExC_precomp), \
456 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
459 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
460 * arg. Show regex, up to a maximum length. If it's too long, chop and add
463 #define _FAIL(code) STMT_START { \
464 const char *ellipses = ""; \
465 IV len = RExC_end - RExC_precomp; \
468 SAVEFREESV(RExC_rx_sv); \
469 if (len > RegexLengthToShowInErrorMessages) { \
470 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
471 len = RegexLengthToShowInErrorMessages - 10; \
477 #define FAIL(msg) _FAIL( \
478 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
479 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
481 #define FAIL2(msg,arg) _FAIL( \
482 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
483 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
486 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
488 #define Simple_vFAIL(m) STMT_START { \
489 const IV offset = RExC_parse - RExC_precomp; \
490 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
491 m, REPORT_LOCATION_ARGS(offset)); \
495 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
497 #define vFAIL(m) STMT_START { \
499 SAVEFREESV(RExC_rx_sv); \
504 * Like Simple_vFAIL(), but accepts two arguments.
506 #define Simple_vFAIL2(m,a1) STMT_START { \
507 const IV offset = RExC_parse - RExC_precomp; \
508 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
509 REPORT_LOCATION_ARGS(offset)); \
513 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
515 #define vFAIL2(m,a1) STMT_START { \
517 SAVEFREESV(RExC_rx_sv); \
518 Simple_vFAIL2(m, a1); \
523 * Like Simple_vFAIL(), but accepts three arguments.
525 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
526 const IV offset = RExC_parse - RExC_precomp; \
527 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
528 REPORT_LOCATION_ARGS(offset)); \
532 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
534 #define vFAIL3(m,a1,a2) STMT_START { \
536 SAVEFREESV(RExC_rx_sv); \
537 Simple_vFAIL3(m, a1, a2); \
541 * Like Simple_vFAIL(), but accepts four arguments.
543 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
544 const IV offset = RExC_parse - RExC_precomp; \
545 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
546 REPORT_LOCATION_ARGS(offset)); \
549 #define vFAIL4(m,a1,a2,a3) STMT_START { \
551 SAVEFREESV(RExC_rx_sv); \
552 Simple_vFAIL4(m, a1, a2, a3); \
555 /* A specialized version of vFAIL2 that works with UTF8f */
556 #define vFAIL2utf8f(m, a1) STMT_START { \
557 const IV offset = RExC_parse - RExC_precomp; \
559 SAVEFREESV(RExC_rx_sv); \
560 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
561 REPORT_LOCATION_ARGS(offset)); \
565 /* m is not necessarily a "literal string", in this macro */
566 #define reg_warn_non_literal_string(loc, m) STMT_START { \
567 const IV offset = loc - RExC_precomp; \
568 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
569 m, REPORT_LOCATION_ARGS(offset)); \
572 #define ckWARNreg(loc,m) STMT_START { \
573 const IV offset = loc - RExC_precomp; \
574 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
575 REPORT_LOCATION_ARGS(offset)); \
578 #define vWARN_dep(loc, m) STMT_START { \
579 const IV offset = loc - RExC_precomp; \
580 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
581 REPORT_LOCATION_ARGS(offset)); \
584 #define ckWARNdep(loc,m) STMT_START { \
585 const IV offset = loc - RExC_precomp; \
586 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
588 REPORT_LOCATION_ARGS(offset)); \
591 #define ckWARNregdep(loc,m) STMT_START { \
592 const IV offset = loc - RExC_precomp; \
593 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
595 REPORT_LOCATION_ARGS(offset)); \
598 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
599 const IV offset = loc - RExC_precomp; \
600 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
602 a1, REPORT_LOCATION_ARGS(offset)); \
605 #define ckWARN2reg(loc, m, a1) STMT_START { \
606 const IV offset = loc - RExC_precomp; \
607 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
608 a1, REPORT_LOCATION_ARGS(offset)); \
611 #define vWARN3(loc, m, a1, a2) STMT_START { \
612 const IV offset = loc - RExC_precomp; \
613 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
614 a1, a2, REPORT_LOCATION_ARGS(offset)); \
617 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
618 const IV offset = loc - RExC_precomp; \
619 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
620 a1, a2, REPORT_LOCATION_ARGS(offset)); \
623 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
624 const IV offset = loc - RExC_precomp; \
625 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
626 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
629 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
630 const IV offset = loc - RExC_precomp; \
631 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
632 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
635 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
636 const IV offset = loc - RExC_precomp; \
637 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
638 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
642 /* Allow for side effects in s */
643 #define REGC(c,s) STMT_START { \
644 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
647 /* Macros for recording node offsets. 20001227 mjd@plover.com
648 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
649 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
650 * Element 0 holds the number n.
651 * Position is 1 indexed.
653 #ifndef RE_TRACK_PATTERN_OFFSETS
654 #define Set_Node_Offset_To_R(node,byte)
655 #define Set_Node_Offset(node,byte)
656 #define Set_Cur_Node_Offset
657 #define Set_Node_Length_To_R(node,len)
658 #define Set_Node_Length(node,len)
659 #define Set_Node_Cur_Length(node,start)
660 #define Node_Offset(n)
661 #define Node_Length(n)
662 #define Set_Node_Offset_Length(node,offset,len)
663 #define ProgLen(ri) ri->u.proglen
664 #define SetProgLen(ri,x) ri->u.proglen = x
666 #define ProgLen(ri) ri->u.offsets[0]
667 #define SetProgLen(ri,x) ri->u.offsets[0] = x
668 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
670 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
671 __LINE__, (int)(node), (int)(byte))); \
673 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
675 RExC_offsets[2*(node)-1] = (byte); \
680 #define Set_Node_Offset(node,byte) \
681 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
682 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
684 #define Set_Node_Length_To_R(node,len) STMT_START { \
686 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
687 __LINE__, (int)(node), (int)(len))); \
689 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
691 RExC_offsets[2*(node)] = (len); \
696 #define Set_Node_Length(node,len) \
697 Set_Node_Length_To_R((node)-RExC_emit_start, len)
698 #define Set_Node_Cur_Length(node, start) \
699 Set_Node_Length(node, RExC_parse - start)
701 /* Get offsets and lengths */
702 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
703 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
705 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
706 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
707 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
711 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
712 #define EXPERIMENTAL_INPLACESCAN
713 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
715 #define DEBUG_STUDYDATA(str,data,depth) \
716 DEBUG_OPTIMISE_MORE_r(if(data){ \
717 PerlIO_printf(Perl_debug_log, \
718 "%*s" str "Pos:%"IVdf"/%"IVdf \
719 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
720 (int)(depth)*2, "", \
721 (IV)((data)->pos_min), \
722 (IV)((data)->pos_delta), \
723 (UV)((data)->flags), \
724 (IV)((data)->whilem_c), \
725 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
726 is_inf ? "INF " : "" \
728 if ((data)->last_found) \
729 PerlIO_printf(Perl_debug_log, \
730 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
731 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
732 SvPVX_const((data)->last_found), \
733 (IV)((data)->last_end), \
734 (IV)((data)->last_start_min), \
735 (IV)((data)->last_start_max), \
736 ((data)->longest && \
737 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
738 SvPVX_const((data)->longest_fixed), \
739 (IV)((data)->offset_fixed), \
740 ((data)->longest && \
741 (data)->longest==&((data)->longest_float)) ? "*" : "", \
742 SvPVX_const((data)->longest_float), \
743 (IV)((data)->offset_float_min), \
744 (IV)((data)->offset_float_max) \
746 PerlIO_printf(Perl_debug_log,"\n"); \
749 /* Mark that we cannot extend a found fixed substring at this point.
750 Update the longest found anchored substring and the longest found
751 floating substrings if needed. */
754 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
755 SSize_t *minlenp, int is_inf)
757 const STRLEN l = CHR_SVLEN(data->last_found);
758 const STRLEN old_l = CHR_SVLEN(*data->longest);
759 GET_RE_DEBUG_FLAGS_DECL;
761 PERL_ARGS_ASSERT_SCAN_COMMIT;
763 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
764 SvSetMagicSV(*data->longest, data->last_found);
765 if (*data->longest == data->longest_fixed) {
766 data->offset_fixed = l ? data->last_start_min : data->pos_min;
767 if (data->flags & SF_BEFORE_EOL)
769 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
771 data->flags &= ~SF_FIX_BEFORE_EOL;
772 data->minlen_fixed=minlenp;
773 data->lookbehind_fixed=0;
775 else { /* *data->longest == data->longest_float */
776 data->offset_float_min = l ? data->last_start_min : data->pos_min;
777 data->offset_float_max = (l
778 ? data->last_start_max
779 : (data->pos_delta == SSize_t_MAX
781 : data->pos_min + data->pos_delta));
783 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
784 data->offset_float_max = SSize_t_MAX;
785 if (data->flags & SF_BEFORE_EOL)
787 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
789 data->flags &= ~SF_FL_BEFORE_EOL;
790 data->minlen_float=minlenp;
791 data->lookbehind_float=0;
794 SvCUR_set(data->last_found, 0);
796 SV * const sv = data->last_found;
797 if (SvUTF8(sv) && SvMAGICAL(sv)) {
798 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
804 data->flags &= ~SF_BEFORE_EOL;
805 DEBUG_STUDYDATA("commit: ",data,0);
808 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
809 * list that describes which code points it matches */
812 S_ssc_anything(pTHX_ regnode_ssc *ssc)
814 /* Set the SSC 'ssc' to match an empty string or any code point */
816 PERL_ARGS_ASSERT_SSC_ANYTHING;
818 assert(OP(ssc) == ANYOF_SYNTHETIC);
820 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
821 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
822 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
826 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
828 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
829 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
830 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
831 * in any way, so there's no point in using it */
836 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
838 assert(OP(ssc) == ANYOF_SYNTHETIC);
840 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
844 /* See if the list consists solely of the range 0 - Infinity */
845 invlist_iterinit(ssc->invlist);
846 ret = invlist_iternext(ssc->invlist, &start, &end)
850 invlist_iterfinish(ssc->invlist);
856 /* If e.g., both \w and \W are set, matches everything */
857 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
859 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
860 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
870 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
872 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
873 * string, any code point, or any posix class under locale */
875 PERL_ARGS_ASSERT_SSC_INIT;
877 Zero(ssc, 1, regnode_ssc);
878 OP(ssc) = ANYOF_SYNTHETIC;
879 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
882 /* If any portion of the regex is to operate under locale rules,
883 * initialization includes it. The reason this isn't done for all regexes
884 * is that the optimizer was written under the assumption that locale was
885 * all-or-nothing. Given the complexity and lack of documentation in the
886 * optimizer, and that there are inadequate test cases for locale, many
887 * parts of it may not work properly, it is safest to avoid locale unless
889 if (RExC_contains_locale) {
890 ANYOF_POSIXL_SETALL(ssc);
891 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
892 if (RExC_contains_i) {
893 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
897 ANYOF_POSIXL_ZERO(ssc);
902 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
903 const regnode_ssc *ssc)
905 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
906 * to the list of code points matched, and locale posix classes; hence does
907 * not check its flags) */
912 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
914 assert(OP(ssc) == ANYOF_SYNTHETIC);
916 invlist_iterinit(ssc->invlist);
917 ret = invlist_iternext(ssc->invlist, &start, &end)
921 invlist_iterfinish(ssc->invlist);
927 if (RExC_contains_locale) {
928 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
929 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
930 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
934 if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
943 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
944 const regnode_charclass_posixl* const node)
946 /* Returns a mortal inversion list defining which code points are matched
947 * by 'node', which is of type ANYOF. Handles complementing the result if
948 * appropriate. If some code points aren't knowable at this time, the
949 * returned list must, and will, contain every possible code point. */
951 SV* invlist = sv_2mortal(_new_invlist(0));
953 const U32 n = ARG(node);
955 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
957 /* Look at the data structure created by S_set_ANYOF_arg() */
958 if (n != ANYOF_NONBITMAP_EMPTY) {
959 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
960 AV * const av = MUTABLE_AV(SvRV(rv));
961 SV **const ary = AvARRAY(av);
962 assert(RExC_rxi->data->what[n] == 's');
964 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
965 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
967 else if (ary[0] && ary[0] != &PL_sv_undef) {
969 /* Here, no compile-time swash, and there are things that won't be
970 * known until runtime -- we have to assume it could be anything */
971 return _add_range_to_invlist(invlist, 0, UV_MAX);
975 /* Here no compile-time swash, and no run-time only data. Use the
976 * node's inversion list */
977 invlist = sv_2mortal(invlist_clone(ary[2]));
981 /* An ANYOF node contains a bitmap for the first 256 code points, and an
982 * inversion list for the others, but if there are code points that should
983 * match only conditionally on the target string being UTF-8, those are
984 * placed in the inversion list, and not the bitmap. Since there are
985 * circumstances under which they could match, they are included in the
986 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
987 * here, so that when we invert below, the end result actually does include
988 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
989 * before we add the unconditionally matched code points */
990 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
991 _invlist_intersection_complement_2nd(invlist,
996 /* Add in the points from the bit map */
997 for (i = 0; i < 256; i++) {
998 if (ANYOF_BITMAP_TEST(node, i)) {
999 invlist = add_cp_to_invlist(invlist, i);
1003 /* If this can match all upper Latin1 code points, have to add them
1005 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1006 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1009 /* Similarly for these */
1010 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1011 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1014 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1015 _invlist_invert(invlist);
1021 /* These two functions currently do the exact same thing */
1022 #define ssc_init_zero ssc_init
1024 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1025 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1028 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1030 /* Take the flags 'and_with' and accumulate them anded into the flags for
1031 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1032 * The flags 'and_with' should not come from another SSC (otherwise the
1033 * EMPTY_STRING flag won't work) */
1035 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
1037 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1039 /* Use just the SSC-related flags from 'and_with' */
1040 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
1041 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1044 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1045 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1046 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1049 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1050 const regnode_ssc *and_with)
1052 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1053 * another SSC or a regular ANYOF class. Can create false positives. */
1058 PERL_ARGS_ASSERT_SSC_AND;
1060 assert(OP(ssc) == ANYOF_SYNTHETIC);
1062 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1063 * the code point inversion list and just the relevant flags */
1064 if (OP(and_with) == ANYOF_SYNTHETIC) {
1065 anded_cp_list = and_with->invlist;
1066 anded_flags = ANYOF_FLAGS(and_with);
1069 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1070 (regnode_charclass_posixl*) and_with);
1071 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
1074 ANYOF_FLAGS(ssc) &= anded_flags;
1076 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1077 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1078 * 'and_with' may be inverted. When not inverted, we have the situation of
1080 * (C1 | P1) & (C2 | P2)
1081 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1082 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1083 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1084 * <= ((C1 & C2) | P1 | P2)
1085 * Alternatively, the last few steps could be:
1086 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1087 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1088 * <= (C1 | C2 | (P1 & P2))
1089 * We favor the second approach if either P1 or P2 is non-empty. This is
1090 * because these components are a barrier to doing optimizations, as what
1091 * they match cannot be known until the moment of matching as they are
1092 * dependent on the current locale, 'AND"ing them likely will reduce or
1094 * But we can do better if we know that C1,P1 are in their initial state (a
1095 * frequent occurrence), each matching everything:
1096 * (<everything>) & (C2 | P2) = C2 | P2
1097 * Similarly, if C2,P2 are in their initial state (again a frequent
1098 * occurrence), the result is a no-op
1099 * (C1 | P1) & (<everything>) = C1 | P1
1102 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1103 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1104 * <= (C1 & ~C2) | (P1 & ~P2)
1107 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1108 && OP(and_with) != ANYOF_SYNTHETIC)
1112 ssc_intersection(ssc,
1114 FALSE /* Has already been inverted */
1117 /* If either P1 or P2 is empty, the intersection will be also; can skip
1119 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1120 ANYOF_POSIXL_ZERO(ssc);
1122 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1124 /* Note that the Posix class component P from 'and_with' actually
1126 * P = Pa | Pb | ... | Pn
1127 * where each component is one posix class, such as in [\w\s].
1129 * ~P = ~(Pa | Pb | ... | Pn)
1130 * = ~Pa & ~Pb & ... & ~Pn
1131 * <= ~Pa | ~Pb | ... | ~Pn
1132 * The last is something we can easily calculate, but unfortunately
1133 * is likely to have many false positives. We could do better
1134 * in some (but certainly not all) instances if two classes in
1135 * P have known relationships. For example
1136 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1138 * :lower: & :print: = :lower:
1139 * And similarly for classes that must be disjoint. For example,
1140 * since \s and \w can have no elements in common based on rules in
1141 * the POSIX standard,
1142 * \w & ^\S = nothing
1143 * Unfortunately, some vendor locales do not meet the Posix
1144 * standard, in particular almost everything by Microsoft.
1145 * The loop below just changes e.g., \w into \W and vice versa */
1147 regnode_charclass_posixl temp;
1148 int add = 1; /* To calculate the index of the complement */
1150 ANYOF_POSIXL_ZERO(&temp);
1151 for (i = 0; i < ANYOF_MAX; i++) {
1153 || ! ANYOF_POSIXL_TEST(and_with, i)
1154 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1156 if (ANYOF_POSIXL_TEST(and_with, i)) {
1157 ANYOF_POSIXL_SET(&temp, i + add);
1159 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1161 ANYOF_POSIXL_AND(&temp, ssc);
1163 } /* else ssc already has no posixes */
1164 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1165 in its initial state */
1166 else if (OP(and_with) != ANYOF_SYNTHETIC
1167 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1169 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1170 * copy it over 'ssc' */
1171 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1172 if (OP(and_with) == ANYOF_SYNTHETIC) {
1173 StructCopy(and_with, ssc, regnode_ssc);
1176 ssc->invlist = anded_cp_list;
1177 ANYOF_POSIXL_ZERO(ssc);
1178 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1179 ANYOF_POSIXL_OR(and_with, ssc);
1183 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1184 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1186 /* One or the other of P1, P2 is non-empty. */
1187 ANYOF_POSIXL_AND(and_with, ssc);
1188 ssc_union(ssc, anded_cp_list, FALSE);
1190 else { /* P1 = P2 = empty */
1191 ssc_intersection(ssc, anded_cp_list, FALSE);
1197 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1198 const regnode_ssc *or_with)
1200 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1201 * another SSC or a regular ANYOF class. Can create false positives if
1202 * 'or_with' is to be inverted. */
1207 PERL_ARGS_ASSERT_SSC_OR;
1209 assert(OP(ssc) == ANYOF_SYNTHETIC);
1211 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1212 * the code point inversion list and just the relevant flags */
1213 if (OP(or_with) == ANYOF_SYNTHETIC) {
1214 ored_cp_list = or_with->invlist;
1215 ored_flags = ANYOF_FLAGS(or_with);
1218 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1219 (regnode_charclass_posixl*) or_with);
1220 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
1223 ANYOF_FLAGS(ssc) |= ored_flags;
1225 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1226 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1227 * 'or_with' may be inverted. When not inverted, we have the simple
1228 * situation of computing:
1229 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1230 * If P1|P2 yields a situation with both a class and its complement are
1231 * set, like having both \w and \W, this matches all code points, and we
1232 * can delete these from the P component of the ssc going forward. XXX We
1233 * might be able to delete all the P components, but I (khw) am not certain
1234 * about this, and it is better to be safe.
1237 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1238 * <= (C1 | P1) | ~C2
1239 * <= (C1 | ~C2) | P1
1240 * (which results in actually simpler code than the non-inverted case)
1243 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1244 && OP(or_with) != ANYOF_SYNTHETIC)
1246 /* We ignore P2, leaving P1 going forward */
1248 else { /* Not inverted */
1249 ANYOF_POSIXL_OR(or_with, ssc);
1250 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1252 for (i = 0; i < ANYOF_MAX; i += 2) {
1253 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1255 ssc_match_all_cp(ssc);
1256 ANYOF_POSIXL_CLEAR(ssc, i);
1257 ANYOF_POSIXL_CLEAR(ssc, i+1);
1258 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1259 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1268 FALSE /* Already has been inverted */
1272 PERL_STATIC_INLINE void
1273 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1275 PERL_ARGS_ASSERT_SSC_UNION;
1277 assert(OP(ssc) == ANYOF_SYNTHETIC);
1279 _invlist_union_maybe_complement_2nd(ssc->invlist,
1285 PERL_STATIC_INLINE void
1286 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1288 const bool invert2nd)
1290 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1292 assert(OP(ssc) == ANYOF_SYNTHETIC);
1294 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1300 PERL_STATIC_INLINE void
1301 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1303 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1305 assert(OP(ssc) == ANYOF_SYNTHETIC);
1307 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1310 PERL_STATIC_INLINE void
1311 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1313 /* AND just the single code point 'cp' into the SSC 'ssc' */
1315 SV* cp_list = _new_invlist(2);
1317 PERL_ARGS_ASSERT_SSC_CP_AND;
1319 assert(OP(ssc) == ANYOF_SYNTHETIC);
1321 cp_list = add_cp_to_invlist(cp_list, cp);
1322 ssc_intersection(ssc, cp_list,
1323 FALSE /* Not inverted */
1325 SvREFCNT_dec_NN(cp_list);
1328 PERL_STATIC_INLINE void
1329 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1331 /* Set the SSC 'ssc' to not match any locale things */
1333 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1335 assert(OP(ssc) == ANYOF_SYNTHETIC);
1337 ANYOF_POSIXL_ZERO(ssc);
1338 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1342 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1344 /* The inversion list in the SSC is marked mortal; now we need a more
1345 * permanent copy, which is stored the same way that is done in a regular
1346 * ANYOF node, with the first 256 code points in a bit map */
1348 SV* invlist = invlist_clone(ssc->invlist);
1350 PERL_ARGS_ASSERT_SSC_FINALIZE;
1352 assert(OP(ssc) == ANYOF_SYNTHETIC);
1354 /* The code in this file assumes that all but these flags aren't relevant
1355 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1356 * time we reach here */
1357 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
1359 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1361 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1363 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1366 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1367 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1368 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1369 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1374 dump_trie(trie,widecharmap,revcharmap)
1375 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1376 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1378 These routines dump out a trie in a somewhat readable format.
1379 The _interim_ variants are used for debugging the interim
1380 tables that are used to generate the final compressed
1381 representation which is what dump_trie expects.
1383 Part of the reason for their existence is to provide a form
1384 of documentation as to how the different representations function.
1389 Dumps the final compressed table form of the trie to Perl_debug_log.
1390 Used for debugging make_trie().
1394 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1395 AV *revcharmap, U32 depth)
1398 SV *sv=sv_newmortal();
1399 int colwidth= widecharmap ? 6 : 4;
1401 GET_RE_DEBUG_FLAGS_DECL;
1403 PERL_ARGS_ASSERT_DUMP_TRIE;
1405 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1406 (int)depth * 2 + 2,"",
1407 "Match","Base","Ofs" );
1409 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1410 SV ** const tmp = av_fetch( revcharmap, state, 0);
1412 PerlIO_printf( Perl_debug_log, "%*s",
1414 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1415 PL_colors[0], PL_colors[1],
1416 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1417 PERL_PV_ESCAPE_FIRSTCHAR
1422 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1423 (int)depth * 2 + 2,"");
1425 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1426 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1427 PerlIO_printf( Perl_debug_log, "\n");
1429 for( state = 1 ; state < trie->statecount ; state++ ) {
1430 const U32 base = trie->states[ state ].trans.base;
1432 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1434 if ( trie->states[ state ].wordnum ) {
1435 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1437 PerlIO_printf( Perl_debug_log, "%6s", "" );
1440 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1445 while( ( base + ofs < trie->uniquecharcount ) ||
1446 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1447 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1450 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1452 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1453 if ( ( base + ofs >= trie->uniquecharcount ) &&
1454 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1455 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1457 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1459 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1461 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1465 PerlIO_printf( Perl_debug_log, "]");
1468 PerlIO_printf( Perl_debug_log, "\n" );
1470 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1471 for (word=1; word <= trie->wordcount; word++) {
1472 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1473 (int)word, (int)(trie->wordinfo[word].prev),
1474 (int)(trie->wordinfo[word].len));
1476 PerlIO_printf(Perl_debug_log, "\n" );
1479 Dumps a fully constructed but uncompressed trie in list form.
1480 List tries normally only are used for construction when the number of
1481 possible chars (trie->uniquecharcount) is very high.
1482 Used for debugging make_trie().
1485 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1486 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1490 SV *sv=sv_newmortal();
1491 int colwidth= widecharmap ? 6 : 4;
1492 GET_RE_DEBUG_FLAGS_DECL;
1494 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1496 /* print out the table precompression. */
1497 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1498 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1499 "------:-----+-----------------\n" );
1501 for( state=1 ; state < next_alloc ; state ++ ) {
1504 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1505 (int)depth * 2 + 2,"", (UV)state );
1506 if ( ! trie->states[ state ].wordnum ) {
1507 PerlIO_printf( Perl_debug_log, "%5s| ","");
1509 PerlIO_printf( Perl_debug_log, "W%4x| ",
1510 trie->states[ state ].wordnum
1513 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1514 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1516 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1518 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1519 PL_colors[0], PL_colors[1],
1520 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1521 PERL_PV_ESCAPE_FIRSTCHAR
1523 TRIE_LIST_ITEM(state,charid).forid,
1524 (UV)TRIE_LIST_ITEM(state,charid).newstate
1527 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1528 (int)((depth * 2) + 14), "");
1531 PerlIO_printf( Perl_debug_log, "\n");
1536 Dumps a fully constructed but uncompressed trie in table form.
1537 This is the normal DFA style state transition table, with a few
1538 twists to facilitate compression later.
1539 Used for debugging make_trie().
1542 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1543 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1548 SV *sv=sv_newmortal();
1549 int colwidth= widecharmap ? 6 : 4;
1550 GET_RE_DEBUG_FLAGS_DECL;
1552 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1555 print out the table precompression so that we can do a visual check
1556 that they are identical.
1559 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1561 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1562 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1564 PerlIO_printf( Perl_debug_log, "%*s",
1566 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1567 PL_colors[0], PL_colors[1],
1568 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1569 PERL_PV_ESCAPE_FIRSTCHAR
1575 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1577 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1578 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1581 PerlIO_printf( Perl_debug_log, "\n" );
1583 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1585 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1586 (int)depth * 2 + 2,"",
1587 (UV)TRIE_NODENUM( state ) );
1589 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1590 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1592 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1594 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1596 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1597 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1599 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1600 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1608 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1609 startbranch: the first branch in the whole branch sequence
1610 first : start branch of sequence of branch-exact nodes.
1611 May be the same as startbranch
1612 last : Thing following the last branch.
1613 May be the same as tail.
1614 tail : item following the branch sequence
1615 count : words in the sequence
1616 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1617 depth : indent depth
1619 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1621 A trie is an N'ary tree where the branches are determined by digital
1622 decomposition of the key. IE, at the root node you look up the 1st character and
1623 follow that branch repeat until you find the end of the branches. Nodes can be
1624 marked as "accepting" meaning they represent a complete word. Eg:
1628 would convert into the following structure. Numbers represent states, letters
1629 following numbers represent valid transitions on the letter from that state, if
1630 the number is in square brackets it represents an accepting state, otherwise it
1631 will be in parenthesis.
1633 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1637 (1) +-i->(6)-+-s->[7]
1639 +-s->(3)-+-h->(4)-+-e->[5]
1641 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1643 This shows that when matching against the string 'hers' we will begin at state 1
1644 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1645 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1646 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1647 single traverse. We store a mapping from accepting to state to which word was
1648 matched, and then when we have multiple possibilities we try to complete the
1649 rest of the regex in the order in which they occured in the alternation.
1651 The only prior NFA like behaviour that would be changed by the TRIE support is
1652 the silent ignoring of duplicate alternations which are of the form:
1654 / (DUPE|DUPE) X? (?{ ... }) Y /x
1656 Thus EVAL blocks following a trie may be called a different number of times with
1657 and without the optimisation. With the optimisations dupes will be silently
1658 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1659 the following demonstrates:
1661 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1663 which prints out 'word' three times, but
1665 'words'=~/(word|word|word)(?{ print $1 })S/
1667 which doesnt print it out at all. This is due to other optimisations kicking in.
1669 Example of what happens on a structural level:
1671 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1673 1: CURLYM[1] {1,32767}(18)
1684 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1685 and should turn into:
1687 1: CURLYM[1] {1,32767}(18)
1689 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1697 Cases where tail != last would be like /(?foo|bar)baz/:
1707 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1708 and would end up looking like:
1711 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1718 d = uvchr_to_utf8_flags(d, uv, 0);
1720 is the recommended Unicode-aware way of saying
1725 #define TRIE_STORE_REVCHAR(val) \
1728 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1729 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1730 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1731 SvCUR_set(zlopp, kapow - flrbbbbb); \
1734 av_push(revcharmap, zlopp); \
1736 char ooooff = (char)val; \
1737 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1741 /* This gets the next character from the input, folding it if not already
1743 #define TRIE_READ_CHAR STMT_START { \
1746 /* if it is UTF then it is either already folded, or does not need \
1748 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1750 else if (folder == PL_fold_latin1) { \
1751 /* This folder implies Unicode rules, which in the range expressible \
1752 * by not UTF is the lower case, with the two exceptions, one of \
1753 * which should have been taken care of before calling this */ \
1754 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1755 uvc = toLOWER_L1(*uc); \
1756 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1759 /* raw data, will be folded later if needed */ \
1767 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1768 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1769 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1770 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1772 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1773 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1774 TRIE_LIST_CUR( state )++; \
1777 #define TRIE_LIST_NEW(state) STMT_START { \
1778 Newxz( trie->states[ state ].trans.list, \
1779 4, reg_trie_trans_le ); \
1780 TRIE_LIST_CUR( state ) = 1; \
1781 TRIE_LIST_LEN( state ) = 4; \
1784 #define TRIE_HANDLE_WORD(state) STMT_START { \
1785 U16 dupe= trie->states[ state ].wordnum; \
1786 regnode * const noper_next = regnext( noper ); \
1789 /* store the word for dumping */ \
1791 if (OP(noper) != NOTHING) \
1792 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1794 tmp = newSVpvn_utf8( "", 0, UTF ); \
1795 av_push( trie_words, tmp ); \
1799 trie->wordinfo[curword].prev = 0; \
1800 trie->wordinfo[curword].len = wordlen; \
1801 trie->wordinfo[curword].accept = state; \
1803 if ( noper_next < tail ) { \
1805 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1806 trie->jump[curword] = (U16)(noper_next - convert); \
1808 jumper = noper_next; \
1810 nextbranch= regnext(cur); \
1814 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1815 /* chain, so that when the bits of chain are later */\
1816 /* linked together, the dups appear in the chain */\
1817 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1818 trie->wordinfo[dupe].prev = curword; \
1820 /* we haven't inserted this word yet. */ \
1821 trie->states[ state ].wordnum = curword; \
1826 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1827 ( ( base + charid >= ucharcount \
1828 && base + charid < ubound \
1829 && state == trie->trans[ base - ucharcount + charid ].check \
1830 && trie->trans[ base - ucharcount + charid ].next ) \
1831 ? trie->trans[ base - ucharcount + charid ].next \
1832 : ( state==1 ? special : 0 ) \
1836 #define MADE_JUMP_TRIE 2
1837 #define MADE_EXACT_TRIE 4
1840 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1843 /* first pass, loop through and scan words */
1844 reg_trie_data *trie;
1845 HV *widecharmap = NULL;
1846 AV *revcharmap = newAV();
1852 regnode *jumper = NULL;
1853 regnode *nextbranch = NULL;
1854 regnode *convert = NULL;
1855 U32 *prev_states; /* temp array mapping each state to previous one */
1856 /* we just use folder as a flag in utf8 */
1857 const U8 * folder = NULL;
1860 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1861 AV *trie_words = NULL;
1862 /* along with revcharmap, this only used during construction but both are
1863 * useful during debugging so we store them in the struct when debugging.
1866 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1867 STRLEN trie_charcount=0;
1869 SV *re_trie_maxbuff;
1870 GET_RE_DEBUG_FLAGS_DECL;
1872 PERL_ARGS_ASSERT_MAKE_TRIE;
1874 PERL_UNUSED_ARG(depth);
1881 case EXACTFU: folder = PL_fold_latin1; break;
1882 case EXACTF: folder = PL_fold; break;
1883 case EXACTFL: folder = PL_fold_locale; break;
1884 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1887 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1889 trie->startstate = 1;
1890 trie->wordcount = word_count;
1891 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1892 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1894 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1895 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1896 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1899 trie_words = newAV();
1902 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1903 if (!SvIOK(re_trie_maxbuff)) {
1904 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1906 DEBUG_TRIE_COMPILE_r({
1907 PerlIO_printf( Perl_debug_log,
1908 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1909 (int)depth * 2 + 2, "",
1910 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1911 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1915 /* Find the node we are going to overwrite */
1916 if ( first == startbranch && OP( last ) != BRANCH ) {
1917 /* whole branch chain */
1920 /* branch sub-chain */
1921 convert = NEXTOPER( first );
1924 /* -- First loop and Setup --
1926 We first traverse the branches and scan each word to determine if it
1927 contains widechars, and how many unique chars there are, this is
1928 important as we have to build a table with at least as many columns as we
1931 We use an array of integers to represent the character codes 0..255
1932 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1933 native representation of the character value as the key and IV's for the
1936 *TODO* If we keep track of how many times each character is used we can
1937 remap the columns so that the table compression later on is more
1938 efficient in terms of memory by ensuring the most common value is in the
1939 middle and the least common are on the outside. IMO this would be better
1940 than a most to least common mapping as theres a decent chance the most
1941 common letter will share a node with the least common, meaning the node
1942 will not be compressible. With a middle is most common approach the worst
1943 case is when we have the least common nodes twice.
1947 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1948 regnode *noper = NEXTOPER( cur );
1949 const U8 *uc = (U8*)STRING( noper );
1950 const U8 *e = uc + STR_LEN( noper );
1952 U32 wordlen = 0; /* required init */
1953 STRLEN minbytes = 0;
1954 STRLEN maxbytes = 0;
1955 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1957 if (OP(noper) == NOTHING) {
1958 regnode *noper_next= regnext(noper);
1959 if (noper_next != tail && OP(noper_next) == flags) {
1961 uc= (U8*)STRING(noper);
1962 e= uc + STR_LEN(noper);
1963 trie->minlen= STR_LEN(noper);
1970 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1971 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1972 regardless of encoding */
1973 if (OP( noper ) == EXACTFU_SS) {
1974 /* false positives are ok, so just set this */
1975 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1978 for ( ; uc < e ; uc += len ) {
1979 TRIE_CHARCOUNT(trie)++;
1982 /* Acummulate to the current values, the range in the number of
1983 * bytes that this character could match. The max is presumed to
1984 * be the same as the folded input (which TRIE_READ_CHAR returns),
1985 * except that when this is not in UTF-8, it could be matched
1986 * against a string which is UTF-8, and the variant characters
1987 * could be 2 bytes instead of the 1 here. Likewise, for the
1988 * minimum number of bytes when not folded. When folding, the min
1989 * is assumed to be 1 byte could fold to match the single character
1990 * here, or in the case of a multi-char fold, 1 byte can fold to
1991 * the whole sequence. 'foldlen' is used to denote whether we are
1992 * in such a sequence, skipping the min setting if so. XXX TODO
1993 * Use the exact list of what folds to each character, from
1994 * PL_utf8_foldclosures */
1996 maxbytes += UTF8SKIP(uc);
1998 /* A non-UTF-8 string could be 1 byte to match our 2 */
1999 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2005 foldlen -= UTF8SKIP(uc);
2008 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2014 maxbytes += (UNI_IS_INVARIANT(*uc))
2025 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2032 U8 folded= folder[ (U8) uvc ];
2033 if ( !trie->charmap[ folded ] ) {
2034 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2035 TRIE_STORE_REVCHAR( folded );
2038 if ( !trie->charmap[ uvc ] ) {
2039 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2040 TRIE_STORE_REVCHAR( uvc );
2043 /* store the codepoint in the bitmap, and its folded
2045 TRIE_BITMAP_SET(trie, uvc);
2047 /* store the folded codepoint */
2048 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2051 /* store first byte of utf8 representation of
2052 variant codepoints */
2053 if (! UVCHR_IS_INVARIANT(uvc)) {
2054 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2057 set_bit = 0; /* We've done our bit :-) */
2062 widecharmap = newHV();
2064 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2067 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2069 if ( !SvTRUE( *svpp ) ) {
2070 sv_setiv( *svpp, ++trie->uniquecharcount );
2071 TRIE_STORE_REVCHAR(uvc);
2075 if( cur == first ) {
2076 trie->minlen = minbytes;
2077 trie->maxlen = maxbytes;
2078 } else if (minbytes < trie->minlen) {
2079 trie->minlen = minbytes;
2080 } else if (maxbytes > trie->maxlen) {
2081 trie->maxlen = maxbytes;
2083 } /* end first pass */
2084 DEBUG_TRIE_COMPILE_r(
2085 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2086 (int)depth * 2 + 2,"",
2087 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2088 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2089 (int)trie->minlen, (int)trie->maxlen )
2093 We now know what we are dealing with in terms of unique chars and
2094 string sizes so we can calculate how much memory a naive
2095 representation using a flat table will take. If it's over a reasonable
2096 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2097 conservative but potentially much slower representation using an array
2100 At the end we convert both representations into the same compressed
2101 form that will be used in regexec.c for matching with. The latter
2102 is a form that cannot be used to construct with but has memory
2103 properties similar to the list form and access properties similar
2104 to the table form making it both suitable for fast searches and
2105 small enough that its feasable to store for the duration of a program.
2107 See the comment in the code where the compressed table is produced
2108 inplace from the flat tabe representation for an explanation of how
2109 the compression works.
2114 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2117 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2119 Second Pass -- Array Of Lists Representation
2121 Each state will be represented by a list of charid:state records
2122 (reg_trie_trans_le) the first such element holds the CUR and LEN
2123 points of the allocated array. (See defines above).
2125 We build the initial structure using the lists, and then convert
2126 it into the compressed table form which allows faster lookups
2127 (but cant be modified once converted).
2130 STRLEN transcount = 1;
2132 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2133 "%*sCompiling trie using list compiler\n",
2134 (int)depth * 2 + 2, ""));
2136 trie->states = (reg_trie_state *)
2137 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2138 sizeof(reg_trie_state) );
2142 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2144 regnode *noper = NEXTOPER( cur );
2145 U8 *uc = (U8*)STRING( noper );
2146 const U8 *e = uc + STR_LEN( noper );
2147 U32 state = 1; /* required init */
2148 U16 charid = 0; /* sanity init */
2149 U32 wordlen = 0; /* required init */
2151 if (OP(noper) == NOTHING) {
2152 regnode *noper_next= regnext(noper);
2153 if (noper_next != tail && OP(noper_next) == flags) {
2155 uc= (U8*)STRING(noper);
2156 e= uc + STR_LEN(noper);
2160 if (OP(noper) != NOTHING) {
2161 for ( ; uc < e ; uc += len ) {
2166 charid = trie->charmap[ uvc ];
2168 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2172 charid=(U16)SvIV( *svpp );
2175 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2182 if ( !trie->states[ state ].trans.list ) {
2183 TRIE_LIST_NEW( state );
2185 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2186 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2187 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2192 newstate = next_alloc++;
2193 prev_states[newstate] = state;
2194 TRIE_LIST_PUSH( state, charid, newstate );
2199 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2203 TRIE_HANDLE_WORD(state);
2205 } /* end second pass */
2207 /* next alloc is the NEXT state to be allocated */
2208 trie->statecount = next_alloc;
2209 trie->states = (reg_trie_state *)
2210 PerlMemShared_realloc( trie->states,
2212 * sizeof(reg_trie_state) );
2214 /* and now dump it out before we compress it */
2215 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2216 revcharmap, next_alloc,
2220 trie->trans = (reg_trie_trans *)
2221 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2228 for( state=1 ; state < next_alloc ; state ++ ) {
2232 DEBUG_TRIE_COMPILE_MORE_r(
2233 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2237 if (trie->states[state].trans.list) {
2238 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2242 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2243 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2244 if ( forid < minid ) {
2246 } else if ( forid > maxid ) {
2250 if ( transcount < tp + maxid - minid + 1) {
2252 trie->trans = (reg_trie_trans *)
2253 PerlMemShared_realloc( trie->trans,
2255 * sizeof(reg_trie_trans) );
2256 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2258 base = trie->uniquecharcount + tp - minid;
2259 if ( maxid == minid ) {
2261 for ( ; zp < tp ; zp++ ) {
2262 if ( ! trie->trans[ zp ].next ) {
2263 base = trie->uniquecharcount + zp - minid;
2264 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2265 trie->trans[ zp ].check = state;
2271 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2272 trie->trans[ tp ].check = state;
2277 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2278 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2279 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2280 trie->trans[ tid ].check = state;
2282 tp += ( maxid - minid + 1 );
2284 Safefree(trie->states[ state ].trans.list);
2287 DEBUG_TRIE_COMPILE_MORE_r(
2288 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2291 trie->states[ state ].trans.base=base;
2293 trie->lasttrans = tp + 1;
2297 Second Pass -- Flat Table Representation.
2299 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2300 each. We know that we will need Charcount+1 trans at most to store
2301 the data (one row per char at worst case) So we preallocate both
2302 structures assuming worst case.
2304 We then construct the trie using only the .next slots of the entry
2307 We use the .check field of the first entry of the node temporarily
2308 to make compression both faster and easier by keeping track of how
2309 many non zero fields are in the node.
2311 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2314 There are two terms at use here: state as a TRIE_NODEIDX() which is
2315 a number representing the first entry of the node, and state as a
2316 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2317 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2318 if there are 2 entrys per node. eg:
2326 The table is internally in the right hand, idx form. However as we
2327 also have to deal with the states array which is indexed by nodenum
2328 we have to use TRIE_NODENUM() to convert.
2331 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2332 "%*sCompiling trie using table compiler\n",
2333 (int)depth * 2 + 2, ""));
2335 trie->trans = (reg_trie_trans *)
2336 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2337 * trie->uniquecharcount + 1,
2338 sizeof(reg_trie_trans) );
2339 trie->states = (reg_trie_state *)
2340 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2341 sizeof(reg_trie_state) );
2342 next_alloc = trie->uniquecharcount + 1;
2345 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2347 regnode *noper = NEXTOPER( cur );
2348 const U8 *uc = (U8*)STRING( noper );
2349 const U8 *e = uc + STR_LEN( noper );
2351 U32 state = 1; /* required init */
2353 U16 charid = 0; /* sanity init */
2354 U32 accept_state = 0; /* sanity init */
2356 U32 wordlen = 0; /* required init */
2358 if (OP(noper) == NOTHING) {
2359 regnode *noper_next= regnext(noper);
2360 if (noper_next != tail && OP(noper_next) == flags) {
2362 uc= (U8*)STRING(noper);
2363 e= uc + STR_LEN(noper);
2367 if ( OP(noper) != NOTHING ) {
2368 for ( ; uc < e ; uc += len ) {
2373 charid = trie->charmap[ uvc ];
2375 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2376 charid = svpp ? (U16)SvIV(*svpp) : 0;
2380 if ( !trie->trans[ state + charid ].next ) {
2381 trie->trans[ state + charid ].next = next_alloc;
2382 trie->trans[ state ].check++;
2383 prev_states[TRIE_NODENUM(next_alloc)]
2384 = TRIE_NODENUM(state);
2385 next_alloc += trie->uniquecharcount;
2387 state = trie->trans[ state + charid ].next;
2389 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2391 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2394 accept_state = TRIE_NODENUM( state );
2395 TRIE_HANDLE_WORD(accept_state);
2397 } /* end second pass */
2399 /* and now dump it out before we compress it */
2400 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2402 next_alloc, depth+1));
2406 * Inplace compress the table.*
2408 For sparse data sets the table constructed by the trie algorithm will
2409 be mostly 0/FAIL transitions or to put it another way mostly empty.
2410 (Note that leaf nodes will not contain any transitions.)
2412 This algorithm compresses the tables by eliminating most such
2413 transitions, at the cost of a modest bit of extra work during lookup:
2415 - Each states[] entry contains a .base field which indicates the
2416 index in the state[] array wheres its transition data is stored.
2418 - If .base is 0 there are no valid transitions from that node.
2420 - If .base is nonzero then charid is added to it to find an entry in
2423 -If trans[states[state].base+charid].check!=state then the
2424 transition is taken to be a 0/Fail transition. Thus if there are fail
2425 transitions at the front of the node then the .base offset will point
2426 somewhere inside the previous nodes data (or maybe even into a node
2427 even earlier), but the .check field determines if the transition is
2431 The following process inplace converts the table to the compressed
2432 table: We first do not compress the root node 1,and mark all its
2433 .check pointers as 1 and set its .base pointer as 1 as well. This
2434 allows us to do a DFA construction from the compressed table later,
2435 and ensures that any .base pointers we calculate later are greater
2438 - We set 'pos' to indicate the first entry of the second node.
2440 - We then iterate over the columns of the node, finding the first and
2441 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2442 and set the .check pointers accordingly, and advance pos
2443 appropriately and repreat for the next node. Note that when we copy
2444 the next pointers we have to convert them from the original
2445 NODEIDX form to NODENUM form as the former is not valid post
2448 - If a node has no transitions used we mark its base as 0 and do not
2449 advance the pos pointer.
2451 - If a node only has one transition we use a second pointer into the
2452 structure to fill in allocated fail transitions from other states.
2453 This pointer is independent of the main pointer and scans forward
2454 looking for null transitions that are allocated to a state. When it
2455 finds one it writes the single transition into the "hole". If the
2456 pointer doesnt find one the single transition is appended as normal.
2458 - Once compressed we can Renew/realloc the structures to release the
2461 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2462 specifically Fig 3.47 and the associated pseudocode.
2466 const U32 laststate = TRIE_NODENUM( next_alloc );
2469 trie->statecount = laststate;
2471 for ( state = 1 ; state < laststate ; state++ ) {
2473 const U32 stateidx = TRIE_NODEIDX( state );
2474 const U32 o_used = trie->trans[ stateidx ].check;
2475 U32 used = trie->trans[ stateidx ].check;
2476 trie->trans[ stateidx ].check = 0;
2478 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2479 if ( flag || trie->trans[ stateidx + charid ].next ) {
2480 if ( trie->trans[ stateidx + charid ].next ) {
2482 for ( ; zp < pos ; zp++ ) {
2483 if ( ! trie->trans[ zp ].next ) {
2487 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2488 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2489 trie->trans[ zp ].check = state;
2490 if ( ++zp > pos ) pos = zp;
2497 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2499 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2500 trie->trans[ pos ].check = state;
2505 trie->lasttrans = pos + 1;
2506 trie->states = (reg_trie_state *)
2507 PerlMemShared_realloc( trie->states, laststate
2508 * sizeof(reg_trie_state) );
2509 DEBUG_TRIE_COMPILE_MORE_r(
2510 PerlIO_printf( Perl_debug_log,
2511 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2512 (int)depth * 2 + 2,"",
2513 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2516 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2519 } /* end table compress */
2521 DEBUG_TRIE_COMPILE_MORE_r(
2522 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2523 (int)depth * 2 + 2, "",
2524 (UV)trie->statecount,
2525 (UV)trie->lasttrans)
2527 /* resize the trans array to remove unused space */
2528 trie->trans = (reg_trie_trans *)
2529 PerlMemShared_realloc( trie->trans, trie->lasttrans
2530 * sizeof(reg_trie_trans) );
2532 { /* Modify the program and insert the new TRIE node */
2533 U8 nodetype =(U8)(flags & 0xFF);
2537 regnode *optimize = NULL;
2538 #ifdef RE_TRACK_PATTERN_OFFSETS
2541 U32 mjd_nodelen = 0;
2542 #endif /* RE_TRACK_PATTERN_OFFSETS */
2543 #endif /* DEBUGGING */
2545 This means we convert either the first branch or the first Exact,
2546 depending on whether the thing following (in 'last') is a branch
2547 or not and whther first is the startbranch (ie is it a sub part of
2548 the alternation or is it the whole thing.)
2549 Assuming its a sub part we convert the EXACT otherwise we convert
2550 the whole branch sequence, including the first.
2552 /* Find the node we are going to overwrite */
2553 if ( first != startbranch || OP( last ) == BRANCH ) {
2554 /* branch sub-chain */
2555 NEXT_OFF( first ) = (U16)(last - first);
2556 #ifdef RE_TRACK_PATTERN_OFFSETS
2558 mjd_offset= Node_Offset((convert));
2559 mjd_nodelen= Node_Length((convert));
2562 /* whole branch chain */
2564 #ifdef RE_TRACK_PATTERN_OFFSETS
2567 const regnode *nop = NEXTOPER( convert );
2568 mjd_offset= Node_Offset((nop));
2569 mjd_nodelen= Node_Length((nop));
2573 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2574 (int)depth * 2 + 2, "",
2575 (UV)mjd_offset, (UV)mjd_nodelen)
2578 /* But first we check to see if there is a common prefix we can
2579 split out as an EXACT and put in front of the TRIE node. */
2580 trie->startstate= 1;
2581 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2583 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2587 const U32 base = trie->states[ state ].trans.base;
2589 if ( trie->states[state].wordnum )
2592 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2593 if ( ( base + ofs >= trie->uniquecharcount ) &&
2594 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2595 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2597 if ( ++count > 1 ) {
2598 SV **tmp = av_fetch( revcharmap, ofs, 0);
2599 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2600 if ( state == 1 ) break;
2602 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2604 PerlIO_printf(Perl_debug_log,
2605 "%*sNew Start State=%"UVuf" Class: [",
2606 (int)depth * 2 + 2, "",
2609 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2610 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2612 TRIE_BITMAP_SET(trie,*ch);
2614 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2616 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2620 TRIE_BITMAP_SET(trie,*ch);
2622 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2623 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2629 SV **tmp = av_fetch( revcharmap, idx, 0);
2631 char *ch = SvPV( *tmp, len );
2633 SV *sv=sv_newmortal();
2634 PerlIO_printf( Perl_debug_log,
2635 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2636 (int)depth * 2 + 2, "",
2638 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2639 PL_colors[0], PL_colors[1],
2640 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2641 PERL_PV_ESCAPE_FIRSTCHAR
2646 OP( convert ) = nodetype;
2647 str=STRING(convert);
2650 STR_LEN(convert) += len;
2656 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2661 trie->prefixlen = (state-1);
2663 regnode *n = convert+NODE_SZ_STR(convert);
2664 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2665 trie->startstate = state;
2666 trie->minlen -= (state - 1);
2667 trie->maxlen -= (state - 1);
2669 /* At least the UNICOS C compiler choked on this
2670 * being argument to DEBUG_r(), so let's just have
2673 #ifdef PERL_EXT_RE_BUILD
2679 regnode *fix = convert;
2680 U32 word = trie->wordcount;
2682 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2683 while( ++fix < n ) {
2684 Set_Node_Offset_Length(fix, 0, 0);
2687 SV ** const tmp = av_fetch( trie_words, word, 0 );
2689 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2690 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2692 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2700 NEXT_OFF(convert) = (U16)(tail - convert);
2701 DEBUG_r(optimize= n);
2707 if ( trie->maxlen ) {
2708 NEXT_OFF( convert ) = (U16)(tail - convert);
2709 ARG_SET( convert, data_slot );
2710 /* Store the offset to the first unabsorbed branch in
2711 jump[0], which is otherwise unused by the jump logic.
2712 We use this when dumping a trie and during optimisation. */
2714 trie->jump[0] = (U16)(nextbranch - convert);
2716 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2717 * and there is a bitmap
2718 * and the first "jump target" node we found leaves enough room
2719 * then convert the TRIE node into a TRIEC node, with the bitmap
2720 * embedded inline in the opcode - this is hypothetically faster.
2722 if ( !trie->states[trie->startstate].wordnum
2724 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2726 OP( convert ) = TRIEC;
2727 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2728 PerlMemShared_free(trie->bitmap);
2731 OP( convert ) = TRIE;
2733 /* store the type in the flags */
2734 convert->flags = nodetype;
2738 + regarglen[ OP( convert ) ];
2740 /* XXX We really should free up the resource in trie now,
2741 as we won't use them - (which resources?) dmq */
2743 /* needed for dumping*/
2744 DEBUG_r(if (optimize) {
2745 regnode *opt = convert;
2747 while ( ++opt < optimize) {
2748 Set_Node_Offset_Length(opt,0,0);
2751 Try to clean up some of the debris left after the
2754 while( optimize < jumper ) {
2755 mjd_nodelen += Node_Length((optimize));
2756 OP( optimize ) = OPTIMIZED;
2757 Set_Node_Offset_Length(optimize,0,0);
2760 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2762 } /* end node insert */
2764 /* Finish populating the prev field of the wordinfo array. Walk back
2765 * from each accept state until we find another accept state, and if
2766 * so, point the first word's .prev field at the second word. If the
2767 * second already has a .prev field set, stop now. This will be the
2768 * case either if we've already processed that word's accept state,
2769 * or that state had multiple words, and the overspill words were
2770 * already linked up earlier.
2777 for (word=1; word <= trie->wordcount; word++) {
2779 if (trie->wordinfo[word].prev)
2781 state = trie->wordinfo[word].accept;
2783 state = prev_states[state];
2786 prev = trie->states[state].wordnum;
2790 trie->wordinfo[word].prev = prev;
2792 Safefree(prev_states);
2796 /* and now dump out the compressed format */
2797 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2799 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2801 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2802 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2804 SvREFCNT_dec_NN(revcharmap);
2808 : trie->startstate>1
2814 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2816 /* The Trie is constructed and compressed now so we can build a fail array if
2819 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2821 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2825 We find the fail state for each state in the trie, this state is the longest
2826 proper suffix of the current state's 'word' that is also a proper prefix of
2827 another word in our trie. State 1 represents the word '' and is thus the
2828 default fail state. This allows the DFA not to have to restart after its
2829 tried and failed a word at a given point, it simply continues as though it
2830 had been matching the other word in the first place.
2832 'abcdgu'=~/abcdefg|cdgu/
2833 When we get to 'd' we are still matching the first word, we would encounter
2834 'g' which would fail, which would bring us to the state representing 'd' in
2835 the second word where we would try 'g' and succeed, proceeding to match
2838 /* add a fail transition */
2839 const U32 trie_offset = ARG(source);
2840 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2842 const U32 ucharcount = trie->uniquecharcount;
2843 const U32 numstates = trie->statecount;
2844 const U32 ubound = trie->lasttrans + ucharcount;
2848 U32 base = trie->states[ 1 ].trans.base;
2851 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2852 GET_RE_DEBUG_FLAGS_DECL;
2854 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2856 PERL_UNUSED_ARG(depth);
2860 ARG_SET( stclass, data_slot );
2861 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2862 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2863 aho->trie=trie_offset;
2864 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2865 Copy( trie->states, aho->states, numstates, reg_trie_state );
2866 Newxz( q, numstates, U32);
2867 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2870 /* initialize fail[0..1] to be 1 so that we always have
2871 a valid final fail state */
2872 fail[ 0 ] = fail[ 1 ] = 1;
2874 for ( charid = 0; charid < ucharcount ; charid++ ) {
2875 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2877 q[ q_write ] = newstate;
2878 /* set to point at the root */
2879 fail[ q[ q_write++ ] ]=1;
2882 while ( q_read < q_write) {
2883 const U32 cur = q[ q_read++ % numstates ];
2884 base = trie->states[ cur ].trans.base;
2886 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2887 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2889 U32 fail_state = cur;
2892 fail_state = fail[ fail_state ];
2893 fail_base = aho->states[ fail_state ].trans.base;
2894 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2896 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2897 fail[ ch_state ] = fail_state;
2898 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2900 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2902 q[ q_write++ % numstates] = ch_state;
2906 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2907 when we fail in state 1, this allows us to use the
2908 charclass scan to find a valid start char. This is based on the principle
2909 that theres a good chance the string being searched contains lots of stuff
2910 that cant be a start char.
2912 fail[ 0 ] = fail[ 1 ] = 0;
2913 DEBUG_TRIE_COMPILE_r({
2914 PerlIO_printf(Perl_debug_log,
2915 "%*sStclass Failtable (%"UVuf" states): 0",
2916 (int)(depth * 2), "", (UV)numstates
2918 for( q_read=1; q_read<numstates; q_read++ ) {
2919 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2921 PerlIO_printf(Perl_debug_log, "\n");
2924 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2928 #define DEBUG_PEEP(str,scan,depth) \
2929 DEBUG_OPTIMISE_r({if (scan){ \
2930 SV * const mysv=sv_newmortal(); \
2931 regnode *Next = regnext(scan); \
2932 regprop(RExC_rx, mysv, scan); \
2933 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2934 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2935 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2939 /* The below joins as many adjacent EXACTish nodes as possible into a single
2940 * one. The regop may be changed if the node(s) contain certain sequences that
2941 * require special handling. The joining is only done if:
2942 * 1) there is room in the current conglomerated node to entirely contain the
2944 * 2) they are the exact same node type
2946 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2947 * these get optimized out
2949 * If a node is to match under /i (folded), the number of characters it matches
2950 * can be different than its character length if it contains a multi-character
2951 * fold. *min_subtract is set to the total delta of the input nodes.
2953 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2954 * and contains LATIN SMALL LETTER SHARP S
2956 * This is as good a place as any to discuss the design of handling these
2957 * multi-character fold sequences. It's been wrong in Perl for a very long
2958 * time. There are three code points in Unicode whose multi-character folds
2959 * were long ago discovered to mess things up. The previous designs for
2960 * dealing with these involved assigning a special node for them. This
2961 * approach doesn't work, as evidenced by this example:
2962 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2963 * Both these fold to "sss", but if the pattern is parsed to create a node that
2964 * would match just the \xDF, it won't be able to handle the case where a
2965 * successful match would have to cross the node's boundary. The new approach
2966 * that hopefully generally solves the problem generates an EXACTFU_SS node
2969 * It turns out that there are problems with all multi-character folds, and not
2970 * just these three. Now the code is general, for all such cases. The
2971 * approach taken is:
2972 * 1) This routine examines each EXACTFish node that could contain multi-
2973 * character fold sequences. It returns in *min_subtract how much to
2974 * subtract from the the actual length of the string to get a real minimum
2975 * match length; it is 0 if there are no multi-char folds. This delta is
2976 * used by the caller to adjust the min length of the match, and the delta
2977 * between min and max, so that the optimizer doesn't reject these
2978 * possibilities based on size constraints.
2979 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2980 * is used for an EXACTFU node that contains at least one "ss" sequence in
2981 * it. For non-UTF-8 patterns and strings, this is the only case where
2982 * there is a possible fold length change. That means that a regular
2983 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2984 * with length changes, and so can be processed faster. regexec.c takes
2985 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2986 * pre-folded by regcomp.c. This saves effort in regex matching.
2987 * However, the pre-folding isn't done for non-UTF8 patterns because the
2988 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2989 * down by forcing the pattern into UTF8 unless necessary. Also what
2990 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2991 * possibilities for the non-UTF8 patterns are quite simple, except for
2992 * the sharp s. All the ones that don't involve a UTF-8 target string are
2993 * members of a fold-pair, and arrays are set up for all of them so that
2994 * the other member of the pair can be found quickly. Code elsewhere in
2995 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2996 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2997 * described in the next item.
2998 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2999 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3000 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
3001 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
3002 * character in the pattern corresponds to at most a single character in
3003 * the target string. (And I do mean character, and not byte here, unlike
3004 * other parts of the documentation that have never been updated to
3005 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
3006 * two character string 'ss'; in EXACTFA nodes it can match
3007 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
3008 * instances where it is violated. I'm reluctant to try to change the
3009 * assumption, as the code involved is impenetrable to me (khw), so
3010 * instead the code here punts. This routine examines (when the pattern
3011 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3012 * boolean indicating whether or not the node contains a sharp s. When it
3013 * is true, the caller sets a flag that later causes the optimizer in this
3014 * file to not set values for the floating and fixed string lengths, and
3015 * thus avoids the optimizer code in regexec.c that makes the invalid
3016 * assumption. Thus, there is no optimization based on string lengths for
3017 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3018 * (The reason the assumption is wrong only in these two cases is that all
3019 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3020 * other folds to their expanded versions. We can't prefold sharp s to
3021 * 'ss' in EXACTF nodes because we don't know at compile time if it
3022 * actually matches 'ss' or not. It will match iff the target string is
3023 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3024 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3025 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3026 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3027 * require the pattern to be forced into UTF-8, the overhead of which we
3030 * Similarly, the code that generates tries doesn't currently handle
3031 * not-already-folded multi-char folds, and it looks like a pain to change
3032 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3033 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3034 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3035 * using /iaa matching will be doing so almost entirely with ASCII
3036 * strings, so this should rarely be encountered in practice */
3038 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3039 if (PL_regkind[OP(scan)] == EXACT) \
3040 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3043 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3044 /* Merge several consecutive EXACTish nodes into one. */
3045 regnode *n = regnext(scan);
3047 regnode *next = scan + NODE_SZ_STR(scan);
3051 regnode *stop = scan;
3052 GET_RE_DEBUG_FLAGS_DECL;
3054 PERL_UNUSED_ARG(depth);
3057 PERL_ARGS_ASSERT_JOIN_EXACT;
3058 #ifndef EXPERIMENTAL_INPLACESCAN
3059 PERL_UNUSED_ARG(flags);
3060 PERL_UNUSED_ARG(val);
3062 DEBUG_PEEP("join",scan,depth);
3064 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3065 * EXACT ones that are mergeable to the current one. */
3067 && (PL_regkind[OP(n)] == NOTHING
3068 || (stringok && OP(n) == OP(scan)))
3070 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3073 if (OP(n) == TAIL || n > next)
3075 if (PL_regkind[OP(n)] == NOTHING) {
3076 DEBUG_PEEP("skip:",n,depth);
3077 NEXT_OFF(scan) += NEXT_OFF(n);
3078 next = n + NODE_STEP_REGNODE;
3085 else if (stringok) {
3086 const unsigned int oldl = STR_LEN(scan);
3087 regnode * const nnext = regnext(n);
3089 /* XXX I (khw) kind of doubt that this works on platforms where
3090 * U8_MAX is above 255 because of lots of other assumptions */
3091 /* Don't join if the sum can't fit into a single node */
3092 if (oldl + STR_LEN(n) > U8_MAX)
3095 DEBUG_PEEP("merg",n,depth);
3098 NEXT_OFF(scan) += NEXT_OFF(n);
3099 STR_LEN(scan) += STR_LEN(n);
3100 next = n + NODE_SZ_STR(n);
3101 /* Now we can overwrite *n : */
3102 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3110 #ifdef EXPERIMENTAL_INPLACESCAN
3111 if (flags && !NEXT_OFF(n)) {
3112 DEBUG_PEEP("atch", val, depth);
3113 if (reg_off_by_arg[OP(n)]) {
3114 ARG_SET(n, val - n);
3117 NEXT_OFF(n) = val - n;
3125 *has_exactf_sharp_s = FALSE;
3127 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3128 * can now analyze for sequences of problematic code points. (Prior to
3129 * this final joining, sequences could have been split over boundaries, and
3130 * hence missed). The sequences only happen in folding, hence for any
3131 * non-EXACT EXACTish node */
3132 if (OP(scan) != EXACT) {
3133 const U8 * const s0 = (U8*) STRING(scan);
3135 const U8 * const s_end = s0 + STR_LEN(scan);
3137 /* One pass is made over the node's string looking for all the
3138 * possibilities. to avoid some tests in the loop, there are two main
3139 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3143 /* Examine the string for a multi-character fold sequence. UTF-8
3144 * patterns have all characters pre-folded by the time this code is
3146 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3147 length sequence we are looking for is 2 */
3150 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3151 if (! len) { /* Not a multi-char fold: get next char */
3156 /* Nodes with 'ss' require special handling, except for EXACTFL
3157 * and EXACTFA-ish for which there is no multi-char fold to
3159 if (len == 2 && *s == 's' && *(s+1) == 's'
3160 && OP(scan) != EXACTFL
3161 && OP(scan) != EXACTFA
3162 && OP(scan) != EXACTFA_NO_TRIE)
3165 OP(scan) = EXACTFU_SS;
3168 else { /* Here is a generic multi-char fold. */
3169 const U8* multi_end = s + len;
3171 /* Count how many characters in it. In the case of /l and
3172 * /aa, no folds which contain ASCII code points are
3173 * allowed, so check for those, and skip if found. (In
3174 * EXACTFL, no folds are allowed to any Latin1 code point,
3175 * not just ASCII. But there aren't any of these
3176 * currently, nor ever likely, so don't take the time to
3177 * test for them. The code that generates the
3178 * is_MULTI_foo() macros croaks should one actually get put
3179 * into Unicode .) */
3180 if (OP(scan) != EXACTFL
3181 && OP(scan) != EXACTFA
3182 && OP(scan) != EXACTFA_NO_TRIE)
3184 count = utf8_length(s, multi_end);
3188 while (s < multi_end) {
3191 goto next_iteration;
3201 /* The delta is how long the sequence is minus 1 (1 is how long
3202 * the character that folds to the sequence is) */
3203 *min_subtract += count - 1;
3207 else if (OP(scan) == EXACTFA) {
3209 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3210 * fold to the ASCII range (and there are no existing ones in the
3211 * upper latin1 range). But, as outlined in the comments preceding
3212 * this function, we need to flag any occurrences of the sharp s.
3213 * This character forbids trie formation (because of added
3216 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3217 OP(scan) = EXACTFA_NO_TRIE;
3218 *has_exactf_sharp_s = TRUE;
3225 else if (OP(scan) != EXACTFL) {
3227 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3228 * multi-char folds that are all Latin1. (This code knows that
3229 * there are no current multi-char folds possible with EXACTFL,
3230 * relying on fold_grind.t to catch any errors if the very unlikely
3231 * event happens that some get added in future Unicode versions.)
3232 * As explained in the comments preceding this function, we look
3233 * also for the sharp s in EXACTF nodes; it can be in the final
3234 * position. Otherwise we can stop looking 1 byte earlier because
3235 * have to find at least two characters for a multi-fold */
3236 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3239 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3240 if (! len) { /* Not a multi-char fold. */
3241 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3243 *has_exactf_sharp_s = TRUE;
3250 && isARG2_lower_or_UPPER_ARG1('s', *s)
3251 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3254 /* EXACTF nodes need to know that the minimum length
3255 * changed so that a sharp s in the string can match this
3256 * ss in the pattern, but they remain EXACTF nodes, as they
3257 * won't match this unless the target string is is UTF-8,
3258 * which we don't know until runtime */
3259 if (OP(scan) != EXACTF) {
3260 OP(scan) = EXACTFU_SS;
3264 *min_subtract += len - 1;
3271 /* Allow dumping but overwriting the collection of skipped
3272 * ops and/or strings with fake optimized ops */
3273 n = scan + NODE_SZ_STR(scan);
3281 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3285 /* REx optimizer. Converts nodes into quicker variants "in place".
3286 Finds fixed substrings. */
3288 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3289 to the position after last scanned or to NULL. */
3291 #define INIT_AND_WITHP \
3292 assert(!and_withp); \
3293 Newx(and_withp,1, regnode_ssc); \
3294 SAVEFREEPV(and_withp)
3296 /* this is a chain of data about sub patterns we are processing that
3297 need to be handled separately/specially in study_chunk. Its so
3298 we can simulate recursion without losing state. */
3300 typedef struct scan_frame {
3301 regnode *last; /* last node to process in this frame */
3302 regnode *next; /* next node to process when last is reached */
3303 struct scan_frame *prev; /*previous frame*/
3304 I32 stop; /* what stopparen do we use */
3308 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3311 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3312 SSize_t *minlenp, SSize_t *deltap,
3317 regnode_ssc *and_withp,
3318 U32 flags, U32 depth)
3319 /* scanp: Start here (read-write). */
3320 /* deltap: Write maxlen-minlen here. */
3321 /* last: Stop before this one. */
3322 /* data: string data about the pattern */
3323 /* stopparen: treat close N as END */
3324 /* recursed: which subroutines have we recursed into */
3325 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3328 /* There must be at least this number of characters to match */
3331 regnode *scan = *scanp, *next;
3333 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3334 int is_inf_internal = 0; /* The studied chunk is infinite */
3335 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3336 scan_data_t data_fake;
3337 SV *re_trie_maxbuff = NULL;
3338 regnode *first_non_open = scan;
3339 SSize_t stopmin = SSize_t_MAX;
3340 scan_frame *frame = NULL;
3341 GET_RE_DEBUG_FLAGS_DECL;
3343 PERL_ARGS_ASSERT_STUDY_CHUNK;
3346 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3350 while (first_non_open && OP(first_non_open) == OPEN)
3351 first_non_open=regnext(first_non_open);
3356 while ( scan && OP(scan) != END && scan < last ){
3357 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3358 node length to get a real minimum (because
3359 the folded version may be shorter) */
3360 bool has_exactf_sharp_s = FALSE;
3361 /* Peephole optimizer: */
3362 DEBUG_STUDYDATA("Peep:", data,depth);
3363 DEBUG_PEEP("Peep",scan,depth);
3365 /* Its not clear to khw or hv why this is done here, and not in the
3366 * clauses that deal with EXACT nodes. khw's guess is that it's
3367 * because of a previous design */
3368 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3370 /* Follow the next-chain of the current node and optimize
3371 away all the NOTHINGs from it. */
3372 if (OP(scan) != CURLYX) {
3373 const int max = (reg_off_by_arg[OP(scan)]
3375 /* I32 may be smaller than U16 on CRAYs! */
3376 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3377 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3381 /* Skip NOTHING and LONGJMP. */
3382 while ((n = regnext(n))
3383 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3384 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3385 && off + noff < max)
3387 if (reg_off_by_arg[OP(scan)])
3390 NEXT_OFF(scan) = off;
3395 /* The principal pseudo-switch. Cannot be a switch, since we
3396 look into several different things. */
3397 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3398 || OP(scan) == IFTHEN) {
3399 next = regnext(scan);
3401 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3403 if (OP(next) == code || code == IFTHEN) {
3404 /* NOTE - There is similar code to this block below for
3405 * handling TRIE nodes on a re-study. If you change stuff here
3406 * check there too. */
3407 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3409 regnode * const startbranch=scan;
3411 if (flags & SCF_DO_SUBSTR)
3412 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3413 if (flags & SCF_DO_STCLASS)
3414 ssc_init_zero(pRExC_state, &accum);
3416 while (OP(scan) == code) {
3417 SSize_t deltanext, minnext, fake;
3419 regnode_ssc this_class;
3422 data_fake.flags = 0;
3424 data_fake.whilem_c = data->whilem_c;
3425 data_fake.last_closep = data->last_closep;
3428 data_fake.last_closep = &fake;
3430 data_fake.pos_delta = delta;
3431 next = regnext(scan);
3432 scan = NEXTOPER(scan);
3434 scan = NEXTOPER(scan);
3435 if (flags & SCF_DO_STCLASS) {
3436 ssc_init(pRExC_state, &this_class);
3437 data_fake.start_class = &this_class;
3438 f = SCF_DO_STCLASS_AND;
3440 if (flags & SCF_WHILEM_VISITED_POS)
3441 f |= SCF_WHILEM_VISITED_POS;
3443 /* we suppose the run is continuous, last=next...*/
3444 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3446 stopparen, recursed, NULL, f,depth+1);
3449 if (deltanext == SSize_t_MAX) {
3450 is_inf = is_inf_internal = 1;
3452 } else if (max1 < minnext + deltanext)
3453 max1 = minnext + deltanext;
3455 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3457 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3458 if ( stopmin > minnext)
3459 stopmin = min + min1;
3460 flags &= ~SCF_DO_SUBSTR;
3462 data->flags |= SCF_SEEN_ACCEPT;
3465 if (data_fake.flags & SF_HAS_EVAL)
3466 data->flags |= SF_HAS_EVAL;
3467 data->whilem_c = data_fake.whilem_c;
3469 if (flags & SCF_DO_STCLASS)
3470 ssc_or(pRExC_state, &accum, &this_class);
3472 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3474 if (flags & SCF_DO_SUBSTR) {
3475 data->pos_min += min1;
3476 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3477 data->pos_delta = SSize_t_MAX;
3479 data->pos_delta += max1 - min1;
3480 if (max1 != min1 || is_inf)
3481 data->longest = &(data->longest_float);
3484 if (delta == SSize_t_MAX
3485 || SSize_t_MAX - delta - (max1 - min1) < 0)
3486 delta = SSize_t_MAX;
3488 delta += max1 - min1;
3489 if (flags & SCF_DO_STCLASS_OR) {
3490 ssc_or(pRExC_state, data->start_class, &accum);
3492 ssc_and(pRExC_state, data->start_class, and_withp);
3493 flags &= ~SCF_DO_STCLASS;
3496 else if (flags & SCF_DO_STCLASS_AND) {
3498 ssc_and(pRExC_state, data->start_class, &accum);
3499 flags &= ~SCF_DO_STCLASS;
3502 /* Switch to OR mode: cache the old value of
3503 * data->start_class */
3505 StructCopy(data->start_class, and_withp, regnode_ssc);
3506 flags &= ~SCF_DO_STCLASS_AND;
3507 StructCopy(&accum, data->start_class, regnode_ssc);
3508 flags |= SCF_DO_STCLASS_OR;
3512 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3515 Assuming this was/is a branch we are dealing with: 'scan'
3516 now points at the item that follows the branch sequence,
3517 whatever it is. We now start at the beginning of the
3518 sequence and look for subsequences of
3524 which would be constructed from a pattern like
3527 If we can find such a subsequence we need to turn the first
3528 element into a trie and then add the subsequent branch exact
3529 strings to the trie.
3533 1. patterns where the whole set of branches can be
3536 2. patterns where only a subset can be converted.
3538 In case 1 we can replace the whole set with a single regop
3539 for the trie. In case 2 we need to keep the start and end
3542 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3543 becomes BRANCH TRIE; BRANCH X;
3545 There is an additional case, that being where there is a
3546 common prefix, which gets split out into an EXACT like node
3547 preceding the TRIE node.
3549 If x(1..n)==tail then we can do a simple trie, if not we make
3550 a "jump" trie, such that when we match the appropriate word
3551 we "jump" to the appropriate tail node. Essentially we turn
3552 a nested if into a case structure of sorts.
3557 if (!re_trie_maxbuff) {
3558 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3559 if (!SvIOK(re_trie_maxbuff))
3560 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3562 if ( SvIV(re_trie_maxbuff)>=0 ) {
3564 regnode *first = (regnode *)NULL;
3565 regnode *last = (regnode *)NULL;
3566 regnode *tail = scan;
3571 SV * const mysv = sv_newmortal(); /* for dumping */
3573 /* var tail is used because there may be a TAIL
3574 regop in the way. Ie, the exacts will point to the
3575 thing following the TAIL, but the last branch will
3576 point at the TAIL. So we advance tail. If we
3577 have nested (?:) we may have to move through several
3581 while ( OP( tail ) == TAIL ) {
3582 /* this is the TAIL generated by (?:) */
3583 tail = regnext( tail );
3587 DEBUG_TRIE_COMPILE_r({
3588 regprop(RExC_rx, mysv, tail );
3589 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3590 (int)depth * 2 + 2, "",
3591 "Looking for TRIE'able sequences. Tail node is: ",
3592 SvPV_nolen_const( mysv )
3598 Step through the branches
3599 cur represents each branch,
3600 noper is the first thing to be matched as part
3602 noper_next is the regnext() of that node.
3604 We normally handle a case like this
3605 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3606 support building with NOJUMPTRIE, which restricts
3607 the trie logic to structures like /FOO|BAR/.
3609 If noper is a trieable nodetype then the branch is
3610 a possible optimization target. If we are building
3611 under NOJUMPTRIE then we require that noper_next is
3612 the same as scan (our current position in the regex
3615 Once we have two or more consecutive such branches
3616 we can create a trie of the EXACT's contents and
3617 stitch it in place into the program.
3619 If the sequence represents all of the branches in
3620 the alternation we replace the entire thing with a
3623 Otherwise when it is a subsequence we need to
3624 stitch it in place and replace only the relevant
3625 branches. This means the first branch has to remain
3626 as it is used by the alternation logic, and its
3627 next pointer, and needs to be repointed at the item
3628 on the branch chain following the last branch we
3629 have optimized away.
3631 This could be either a BRANCH, in which case the
3632 subsequence is internal, or it could be the item
3633 following the branch sequence in which case the
3634 subsequence is at the end (which does not
3635 necessarily mean the first node is the start of the
3638 TRIE_TYPE(X) is a define which maps the optype to a
3642 ----------------+-----------
3646 EXACTFU_SS | EXACTFU
3651 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3652 ( EXACT == (X) ) ? EXACT : \
3653 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3654 ( EXACTFA == (X) ) ? EXACTFA : \
3657 /* dont use tail as the end marker for this traverse */
3658 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3659 regnode * const noper = NEXTOPER( cur );
3660 U8 noper_type = OP( noper );
3661 U8 noper_trietype = TRIE_TYPE( noper_type );
3662 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3663 regnode * const noper_next = regnext( noper );
3664 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3665 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3668 DEBUG_TRIE_COMPILE_r({
3669 regprop(RExC_rx, mysv, cur);
3670 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3671 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3673 regprop(RExC_rx, mysv, noper);
3674 PerlIO_printf( Perl_debug_log, " -> %s",
3675 SvPV_nolen_const(mysv));
3678 regprop(RExC_rx, mysv, noper_next );
3679 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3680 SvPV_nolen_const(mysv));
3682 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3683 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3684 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3688 /* Is noper a trieable nodetype that can be merged
3689 * with the current trie (if there is one)? */
3693 ( noper_trietype == NOTHING)
3694 || ( trietype == NOTHING )
3695 || ( trietype == noper_trietype )
3698 && noper_next == tail
3702 /* Handle mergable triable node Either we are
3703 * the first node in a new trieable sequence,
3704 * in which case we do some bookkeeping,
3705 * otherwise we update the end pointer. */
3708 if ( noper_trietype == NOTHING ) {
3709 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3710 regnode * const noper_next = regnext( noper );
3711 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3712 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3715 if ( noper_next_trietype ) {
3716 trietype = noper_next_trietype;
3717 } else if (noper_next_type) {
3718 /* a NOTHING regop is 1 regop wide.
3719 * We need at least two for a trie
3720 * so we can't merge this in */
3724 trietype = noper_trietype;
3727 if ( trietype == NOTHING )
3728 trietype = noper_trietype;
3733 } /* end handle mergable triable node */
3735 /* handle unmergable node -
3736 * noper may either be a triable node which can
3737 * not be tried together with the current trie,
3738 * or a non triable node */
3740 /* If last is set and trietype is not
3741 * NOTHING then we have found at least two
3742 * triable branch sequences in a row of a
3743 * similar trietype so we can turn them
3744 * into a trie. If/when we allow NOTHING to
3745 * start a trie sequence this condition
3746 * will be required, and it isn't expensive
3747 * so we leave it in for now. */
3748 if ( trietype && trietype != NOTHING )
3749 make_trie( pRExC_state,
3750 startbranch, first, cur, tail, count,
3751 trietype, depth+1 );
3752 last = NULL; /* note: we clear/update
3753 first, trietype etc below,
3754 so we dont do it here */
3758 && noper_next == tail
3761 /* noper is triable, so we can start a new
3765 trietype = noper_trietype;
3767 /* if we already saw a first but the
3768 * current node is not triable then we have
3769 * to reset the first information. */
3774 } /* end handle unmergable node */
3775 } /* loop over branches */
3776 DEBUG_TRIE_COMPILE_r({
3777 regprop(RExC_rx, mysv, cur);
3778 PerlIO_printf( Perl_debug_log,
3779 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3780 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3783 if ( last && trietype ) {
3784 if ( trietype != NOTHING ) {
3785 /* the last branch of the sequence was part of
3786 * a trie, so we have to construct it here
3787 * outside of the loop */
3788 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3789 #ifdef TRIE_STUDY_OPT
3790 if ( ((made == MADE_EXACT_TRIE &&
3791 startbranch == first)
3792 || ( first_non_open == first )) &&
3794 flags |= SCF_TRIE_RESTUDY;
3795 if ( startbranch == first
3798 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3803 /* at this point we know whatever we have is a
3804 * NOTHING sequence/branch AND if 'startbranch'
3805 * is 'first' then we can turn the whole thing
3808 if ( startbranch == first ) {
3810 /* the entire thing is a NOTHING sequence,
3811 * something like this: (?:|) So we can
3812 * turn it into a plain NOTHING op. */
3813 DEBUG_TRIE_COMPILE_r({
3814 regprop(RExC_rx, mysv, cur);
3815 PerlIO_printf( Perl_debug_log,
3816 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3817 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3820 OP(startbranch)= NOTHING;
3821 NEXT_OFF(startbranch)= tail - startbranch;
3822 for ( opt= startbranch + 1; opt < tail ; opt++ )
3826 } /* end if ( last) */
3827 } /* TRIE_MAXBUF is non zero */
3832 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3833 scan = NEXTOPER(NEXTOPER(scan));
3834 } else /* single branch is optimized. */
3835 scan = NEXTOPER(scan);
3837 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3838 scan_frame *newframe = NULL;
3843 if (OP(scan) != SUSPEND) {
3844 /* set the pointer */
3845 if (OP(scan) == GOSUB) {
3847 RExC_recurse[ARG2L(scan)] = scan;
3848 start = RExC_open_parens[paren-1];
3849 end = RExC_close_parens[paren-1];
3852 start = RExC_rxi->program + 1;
3856 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3857 SAVEFREEPV(recursed);
3859 if (!PAREN_TEST(recursed,paren+1)) {
3860 PAREN_SET(recursed,paren+1);
3861 Newx(newframe,1,scan_frame);
3863 if (flags & SCF_DO_SUBSTR) {
3864 SCAN_COMMIT(pRExC_state,data,minlenp);
3865 data->longest = &(data->longest_float);
3867 is_inf = is_inf_internal = 1;
3868 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3869 ssc_anything(data->start_class);
3870 flags &= ~SCF_DO_STCLASS;
3873 Newx(newframe,1,scan_frame);
3876 end = regnext(scan);
3881 SAVEFREEPV(newframe);
3882 newframe->next = regnext(scan);
3883 newframe->last = last;
3884 newframe->stop = stopparen;
3885 newframe->prev = frame;
3895 else if (OP(scan) == EXACT) {
3896 SSize_t l = STR_LEN(scan);
3899 const U8 * const s = (U8*)STRING(scan);
3900 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3901 l = utf8_length(s, s + l);
3903 uc = *((U8*)STRING(scan));
3906 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3907 /* The code below prefers earlier match for fixed
3908 offset, later match for variable offset. */
3909 if (data->last_end == -1) { /* Update the start info. */
3910 data->last_start_min = data->pos_min;
3911 data->last_start_max = is_inf
3912 ? SSize_t_MAX : data->pos_min + data->pos_delta;
3914 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3916 SvUTF8_on(data->last_found);
3918 SV * const sv = data->last_found;
3919 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3920 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3921 if (mg && mg->mg_len >= 0)
3922 mg->mg_len += utf8_length((U8*)STRING(scan),
3923 (U8*)STRING(scan)+STR_LEN(scan));
3925 data->last_end = data->pos_min + l;
3926 data->pos_min += l; /* As in the first entry. */
3927 data->flags &= ~SF_BEFORE_EOL;
3930 /* ANDing the code point leaves at most it, and not in locale, and
3931 * can't match null string */
3932 if (flags & SCF_DO_STCLASS_AND) {
3933 ssc_cp_and(data->start_class, uc);
3934 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
3935 ssc_clear_locale(data->start_class);
3937 else if (flags & SCF_DO_STCLASS_OR) {
3938 ssc_add_cp(data->start_class, uc);
3939 ssc_and(pRExC_state, data->start_class, and_withp);
3941 flags &= ~SCF_DO_STCLASS;
3943 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3944 SSize_t l = STR_LEN(scan);
3945 UV uc = *((U8*)STRING(scan));
3946 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
3947 separate code points */
3949 /* Search for fixed substrings supports EXACT only. */
3950 if (flags & SCF_DO_SUBSTR) {
3952 SCAN_COMMIT(pRExC_state, data, minlenp);
3955 const U8 * const s = (U8 *)STRING(scan);
3956 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3957 l = utf8_length(s, s + l);
3959 if (has_exactf_sharp_s) {
3960 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3962 min += l - min_subtract;
3964 delta += min_subtract;
3965 if (flags & SCF_DO_SUBSTR) {
3966 data->pos_min += l - min_subtract;
3967 if (data->pos_min < 0) {
3970 data->pos_delta += min_subtract;
3972 data->longest = &(data->longest_float);
3975 if (OP(scan) == EXACTFL) {
3976 if (flags & SCF_DO_STCLASS_AND) {
3977 ssc_flags_and(data->start_class,
3978 ANYOF_LOCALE|ANYOF_LOC_FOLD);
3980 else if (flags & SCF_DO_STCLASS_OR) {
3981 ANYOF_FLAGS(data->start_class)
3982 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3985 /* We don't know what the folds are; it could be anything. XXX
3986 * Actually, we only support UTF-8 encoding for code points
3987 * above Latin1, so we could know what those folds are. */
3988 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
3992 else { /* Non-locale EXACTFish */
3993 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
3994 if (flags & SCF_DO_STCLASS_AND) {
3995 ssc_clear_locale(data->start_class);
3997 if (uc < 256) { /* We know what the Latin1 folds are ... */
3998 if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
3999 know if anything folds
4001 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4002 PL_fold_latin1[uc]);
4003 if (OP(scan) != EXACTFA) { /* The folds below aren't
4005 if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4007 = add_cp_to_invlist(EXACTF_invlist,
4008 LATIN_SMALL_LETTER_SHARP_S);
4010 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4012 = add_cp_to_invlist(EXACTF_invlist, 's');
4014 = add_cp_to_invlist(EXACTF_invlist, 'S');
4018 /* We also know if there are above-Latin1 code points
4019 * that fold to this (none legal for ASCII and /iaa) */
4020 if ((! isASCII(uc) || OP(scan) != EXACTFA)
4021 && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4023 /* XXX We could know exactly what does fold to this
4024 * if the reverse folds are loaded, as currently in
4026 _invlist_union(EXACTF_invlist,
4032 else { /* Non-locale, above Latin1. XXX We don't currently
4033 know what participates in folds with this, so have
4034 to assume anything could */
4036 /* XXX We could know exactly what does fold to this if the
4037 * reverse folds are loaded, as currently in S_regclass().
4038 * But we do know that under /iaa nothing in the ASCII
4039 * range can participate */
4040 if (OP(scan) == EXACTFA) {
4041 _invlist_union_complement_2nd(EXACTF_invlist,
4042 PL_Posix_ptrs[_CC_ASCII],
4046 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4051 if (flags & SCF_DO_STCLASS_AND) {
4052 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4053 ANYOF_POSIXL_ZERO(data->start_class);
4054 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4056 else if (flags & SCF_DO_STCLASS_OR) {
4057 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4058 ssc_and(pRExC_state, data->start_class, and_withp);
4060 flags &= ~SCF_DO_STCLASS;
4061 SvREFCNT_dec(EXACTF_invlist);
4063 else if (REGNODE_VARIES(OP(scan))) {
4064 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4065 I32 fl = 0, f = flags;
4066 regnode * const oscan = scan;
4067 regnode_ssc this_class;
4068 regnode_ssc *oclass = NULL;
4069 I32 next_is_eval = 0;
4071 switch (PL_regkind[OP(scan)]) {
4072 case WHILEM: /* End of (?:...)* . */
4073 scan = NEXTOPER(scan);
4076 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4077 next = NEXTOPER(scan);
4078 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4080 maxcount = REG_INFTY;
4081 next = regnext(scan);
4082 scan = NEXTOPER(scan);
4086 if (flags & SCF_DO_SUBSTR)
4091 if (flags & SCF_DO_STCLASS) {
4093 maxcount = REG_INFTY;
4094 next = regnext(scan);
4095 scan = NEXTOPER(scan);
4098 is_inf = is_inf_internal = 1;
4099 scan = regnext(scan);
4100 if (flags & SCF_DO_SUBSTR) {
4101 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4102 data->longest = &(data->longest_float);
4104 goto optimize_curly_tail;
4106 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4107 && (scan->flags == stopparen))
4112 mincount = ARG1(scan);
4113 maxcount = ARG2(scan);
4115 next = regnext(scan);
4116 if (OP(scan) == CURLYX) {
4117 I32 lp = (data ? *(data->last_closep) : 0);
4118 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4120 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4121 next_is_eval = (OP(scan) == EVAL);
4123 if (flags & SCF_DO_SUBSTR) {
4124 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4125 pos_before = data->pos_min;
4129 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4131 data->flags |= SF_IS_INF;
4133 if (flags & SCF_DO_STCLASS) {
4134 ssc_init(pRExC_state, &this_class);
4135 oclass = data->start_class;
4136 data->start_class = &this_class;
4137 f |= SCF_DO_STCLASS_AND;
4138 f &= ~SCF_DO_STCLASS_OR;
4140 /* Exclude from super-linear cache processing any {n,m}
4141 regops for which the combination of input pos and regex
4142 pos is not enough information to determine if a match
4145 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4146 regex pos at the \s*, the prospects for a match depend not
4147 only on the input position but also on how many (bar\s*)
4148 repeats into the {4,8} we are. */
4149 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4150 f &= ~SCF_WHILEM_VISITED_POS;
4152 /* This will finish on WHILEM, setting scan, or on NULL: */
4153 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4154 last, data, stopparen, recursed, NULL,
4156 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4158 if (flags & SCF_DO_STCLASS)
4159 data->start_class = oclass;
4160 if (mincount == 0 || minnext == 0) {
4161 if (flags & SCF_DO_STCLASS_OR) {
4162 ssc_or(pRExC_state, data->start_class, &this_class);
4164 else if (flags & SCF_DO_STCLASS_AND) {
4165 /* Switch to OR mode: cache the old value of
4166 * data->start_class */
4168 StructCopy(data->start_class, and_withp, regnode_ssc);
4169 flags &= ~SCF_DO_STCLASS_AND;
4170 StructCopy(&this_class, data->start_class, regnode_ssc);
4171 flags |= SCF_DO_STCLASS_OR;
4172 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4174 } else { /* Non-zero len */
4175 if (flags & SCF_DO_STCLASS_OR) {
4176 ssc_or(pRExC_state, data->start_class, &this_class);
4177 ssc_and(pRExC_state, data->start_class, and_withp);
4179 else if (flags & SCF_DO_STCLASS_AND)
4180 ssc_and(pRExC_state, data->start_class, &this_class);
4181 flags &= ~SCF_DO_STCLASS;
4183 if (!scan) /* It was not CURLYX, but CURLY. */
4185 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4186 /* ? quantifier ok, except for (?{ ... }) */
4187 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4188 && (minnext == 0) && (deltanext == 0)
4189 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4190 && maxcount <= REG_INFTY/3) /* Complement check for big count */
4192 /* Fatal warnings may leak the regexp without this: */
4193 SAVEFREESV(RExC_rx_sv);
4194 ckWARNreg(RExC_parse,
4195 "Quantifier unexpected on zero-length expression");
4196 (void)ReREFCNT_inc(RExC_rx_sv);
4199 min += minnext * mincount;
4200 is_inf_internal |= deltanext == SSize_t_MAX
4201 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4202 is_inf |= is_inf_internal;
4204 delta = SSize_t_MAX;
4206 delta += (minnext + deltanext) * maxcount - minnext * mincount;
4208 /* Try powerful optimization CURLYX => CURLYN. */
4209 if ( OP(oscan) == CURLYX && data
4210 && data->flags & SF_IN_PAR
4211 && !(data->flags & SF_HAS_EVAL)
4212 && !deltanext && minnext == 1 ) {
4213 /* Try to optimize to CURLYN. */
4214 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4215 regnode * const nxt1 = nxt;
4222 if (!REGNODE_SIMPLE(OP(nxt))
4223 && !(PL_regkind[OP(nxt)] == EXACT
4224 && STR_LEN(nxt) == 1))
4230 if (OP(nxt) != CLOSE)
4232 if (RExC_open_parens) {
4233 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4234 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4236 /* Now we know that nxt2 is the only contents: */
4237 oscan->flags = (U8)ARG(nxt);
4239 OP(nxt1) = NOTHING; /* was OPEN. */
4242 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4243 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4244 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4245 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4246 OP(nxt + 1) = OPTIMIZED; /* was count. */
4247 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4252 /* Try optimization CURLYX => CURLYM. */
4253 if ( OP(oscan) == CURLYX && data
4254 && !(data->flags & SF_HAS_PAR)
4255 && !(data->flags & SF_HAS_EVAL)
4256 && !deltanext /* atom is fixed width */
4257 && minnext != 0 /* CURLYM can't handle zero width */
4258 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4260 /* XXXX How to optimize if data == 0? */
4261 /* Optimize to a simpler form. */
4262 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4266 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4267 && (OP(nxt2) != WHILEM))
4269 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4270 /* Need to optimize away parenths. */
4271 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4272 /* Set the parenth number. */
4273 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4275 oscan->flags = (U8)ARG(nxt);
4276 if (RExC_open_parens) {
4277 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4278 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4280 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4281 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4284 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4285 OP(nxt + 1) = OPTIMIZED; /* was count. */
4286 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4287 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4290 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4291 regnode *nnxt = regnext(nxt1);
4293 if (reg_off_by_arg[OP(nxt1)])
4294 ARG_SET(nxt1, nxt2 - nxt1);
4295 else if (nxt2 - nxt1 < U16_MAX)
4296 NEXT_OFF(nxt1) = nxt2 - nxt1;
4298 OP(nxt) = NOTHING; /* Cannot beautify */
4303 /* Optimize again: */
4304 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4305 NULL, stopparen, recursed, NULL, 0,depth+1);
4310 else if ((OP(oscan) == CURLYX)
4311 && (flags & SCF_WHILEM_VISITED_POS)
4312 /* See the comment on a similar expression above.
4313 However, this time it's not a subexpression
4314 we care about, but the expression itself. */
4315 && (maxcount == REG_INFTY)
4316 && data && ++data->whilem_c < 16) {
4317 /* This stays as CURLYX, we can put the count/of pair. */
4318 /* Find WHILEM (as in regexec.c) */
4319 regnode *nxt = oscan + NEXT_OFF(oscan);
4321 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4323 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4324 | (RExC_whilem_seen << 4)); /* On WHILEM */
4326 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4328 if (flags & SCF_DO_SUBSTR) {
4329 SV *last_str = NULL;
4330 int counted = mincount != 0;
4332 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4333 SSize_t b = pos_before >= data->last_start_min
4334 ? pos_before : data->last_start_min;
4336 const char * const s = SvPV_const(data->last_found, l);
4337 SSize_t old = b - data->last_start_min;
4340 old = utf8_hop((U8*)s, old) - (U8*)s;
4342 /* Get the added string: */
4343 last_str = newSVpvn_utf8(s + old, l, UTF);
4344 if (deltanext == 0 && pos_before == b) {
4345 /* What was added is a constant string */
4347 SvGROW(last_str, (mincount * l) + 1);
4348 repeatcpy(SvPVX(last_str) + l,
4349 SvPVX_const(last_str), l, mincount - 1);
4350 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4351 /* Add additional parts. */
4352 SvCUR_set(data->last_found,
4353 SvCUR(data->last_found) - l);
4354 sv_catsv(data->last_found, last_str);
4356 SV * sv = data->last_found;
4358 SvUTF8(sv) && SvMAGICAL(sv) ?
4359 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4360 if (mg && mg->mg_len >= 0)
4361 mg->mg_len += CHR_SVLEN(last_str) - l;
4363 data->last_end += l * (mincount - 1);
4366 /* start offset must point into the last copy */
4367 data->last_start_min += minnext * (mincount - 1);
4368 data->last_start_max += is_inf ? SSize_t_MAX
4369 : (maxcount - 1) * (minnext + data->pos_delta);
4372 /* It is counted once already... */
4373 data->pos_min += minnext * (mincount - counted);
4375 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4376 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4377 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4378 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4380 if (deltanext != SSize_t_MAX)
4381 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4382 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4383 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4385 if (deltanext == SSize_t_MAX ||
4386 -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4387 data->pos_delta = SSize_t_MAX;
4389 data->pos_delta += - counted * deltanext +
4390 (minnext + deltanext) * maxcount - minnext * mincount;
4391 if (mincount != maxcount) {
4392 /* Cannot extend fixed substrings found inside
4394 SCAN_COMMIT(pRExC_state,data,minlenp);
4395 if (mincount && last_str) {
4396 SV * const sv = data->last_found;
4397 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4398 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4402 sv_setsv(sv, last_str);
4403 data->last_end = data->pos_min;
4404 data->last_start_min =
4405 data->pos_min - CHR_SVLEN(last_str);
4406 data->last_start_max = is_inf
4408 : data->pos_min + data->pos_delta
4409 - CHR_SVLEN(last_str);
4411 data->longest = &(data->longest_float);
4413 SvREFCNT_dec(last_str);
4415 if (data && (fl & SF_HAS_EVAL))
4416 data->flags |= SF_HAS_EVAL;
4417 optimize_curly_tail:
4418 if (OP(oscan) != CURLYX) {
4419 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4421 NEXT_OFF(oscan) += NEXT_OFF(next);
4427 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4432 if (flags & SCF_DO_SUBSTR) {
4433 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4434 data->longest = &(data->longest_float);
4436 is_inf = is_inf_internal = 1;
4437 if (flags & SCF_DO_STCLASS_OR) {
4438 if (OP(scan) == CLUMP) {
4439 /* Actually is any start char, but very few code points
4440 * aren't start characters */
4441 ssc_match_all_cp(data->start_class);
4444 ssc_anything(data->start_class);
4447 flags &= ~SCF_DO_STCLASS;
4451 else if (OP(scan) == LNBREAK) {
4452 if (flags & SCF_DO_STCLASS) {
4453 if (flags & SCF_DO_STCLASS_AND) {
4454 ssc_intersection(data->start_class,
4455 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4456 ssc_clear_locale(data->start_class);
4457 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4459 else if (flags & SCF_DO_STCLASS_OR) {
4460 ssc_union(data->start_class,
4461 PL_XPosix_ptrs[_CC_VERTSPACE],
4463 ssc_and(pRExC_state, data->start_class, and_withp);
4465 flags &= ~SCF_DO_STCLASS;
4468 delta++; /* Because of the 2 char string cr-lf */
4469 if (flags & SCF_DO_SUBSTR) {
4470 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4472 data->pos_delta += 1;
4473 data->longest = &(data->longest_float);
4476 else if (REGNODE_SIMPLE(OP(scan))) {
4478 if (flags & SCF_DO_SUBSTR) {
4479 SCAN_COMMIT(pRExC_state,data,minlenp);
4483 if (flags & SCF_DO_STCLASS) {
4485 SV* my_invlist = sv_2mortal(_new_invlist(0));
4489 if (flags & SCF_DO_STCLASS_AND) {
4490 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4493 /* Some of the logic below assumes that switching
4494 locale on will only add false positives. */
4499 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4503 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4504 ssc_match_all_cp(data->start_class);
4509 SV* REG_ANY_invlist = _new_invlist(2);
4510 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4512 if (flags & SCF_DO_STCLASS_OR) {
4513 ssc_union(data->start_class,
4515 TRUE /* TRUE => invert, hence all but \n
4519 else if (flags & SCF_DO_STCLASS_AND) {
4520 ssc_intersection(data->start_class,
4522 TRUE /* TRUE => invert */
4524 ssc_clear_locale(data->start_class);
4526 SvREFCNT_dec_NN(REG_ANY_invlist);
4530 case ANYOF_WARN_SUPER:
4532 if (flags & SCF_DO_STCLASS_AND)
4533 ssc_and(pRExC_state, data->start_class,
4534 (regnode_ssc*) scan);
4536 ssc_or(pRExC_state, data->start_class,
4537 (regnode_ssc*)scan);
4545 classnum = FLAGS(scan);
4546 namedclass = classnum_to_namedclass(classnum) + invert;
4547 if (flags & SCF_DO_STCLASS_AND) {
4548 bool was_there = cBOOL(
4549 ANYOF_POSIXL_TEST(data->start_class,
4551 ANYOF_POSIXL_ZERO(data->start_class);
4552 if (was_there) { /* Do an AND */
4553 ANYOF_POSIXL_SET(data->start_class, namedclass);
4555 /* No individual code points can now match */
4556 data->start_class->invlist
4557 = sv_2mortal(_new_invlist(0));
4560 int complement = namedclass + ((invert) ? -1 : 1);
4562 assert(flags & SCF_DO_STCLASS_OR);
4564 /* If the complement of this class was already there,
4565 * the result is that they match all code points,
4566 * (\d + \D == everything). Remove the classes from
4567 * future consideration. Locale is not relevant in
4569 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4570 ssc_match_all_cp(data->start_class);
4571 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4572 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4573 if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4575 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4578 else { /* The usual case; just add this class to the
4580 ANYOF_POSIXL_SET(data->start_class, namedclass);
4581 ANYOF_FLAGS(data->start_class)
4582 |= ANYOF_LOCALE|ANYOF_POSIXL;
4587 case NPOSIXA: /* For these, we always know the exact set of
4592 classnum = FLAGS(scan);
4593 my_invlist = PL_Posix_ptrs[classnum];
4602 classnum = FLAGS(scan);
4604 /* If we know all the code points that match the class, use
4605 * that; otherwise use the Latin1 code points, plus we have
4606 * to assume that it could match anything above Latin1 */
4607 if (PL_XPosix_ptrs[classnum]) {
4608 my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4611 _invlist_union(PL_L1Posix_ptrs[classnum],
4612 PL_AboveLatin1, &my_invlist);
4615 /* NPOSIXD matches all upper Latin1 code points unless the
4616 * target string being matched is UTF-8, which is
4617 * unknowable until match time */
4618 if (PL_regkind[OP(scan)] == NPOSIXD) {
4619 _invlist_union_complement_2nd(my_invlist,
4620 PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4625 if (flags & SCF_DO_STCLASS_AND) {
4626 ssc_intersection(data->start_class, my_invlist, invert);
4627 ssc_clear_locale(data->start_class);
4630 assert(flags & SCF_DO_STCLASS_OR);
4631 ssc_union(data->start_class, my_invlist, invert);
4634 if (flags & SCF_DO_STCLASS_OR)
4635 ssc_and(pRExC_state, data->start_class, and_withp);
4636 flags &= ~SCF_DO_STCLASS;
4639 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4640 data->flags |= (OP(scan) == MEOL
4643 SCAN_COMMIT(pRExC_state, data, minlenp);
4646 else if ( PL_regkind[OP(scan)] == BRANCHJ
4647 /* Lookbehind, or need to calculate parens/evals/stclass: */
4648 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4649 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4650 if ( OP(scan) == UNLESSM &&
4652 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4653 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4656 regnode *upto= regnext(scan);
4658 SV * const mysv_val=sv_newmortal();
4659 DEBUG_STUDYDATA("OPFAIL",data,depth);
4661 /*DEBUG_PARSE_MSG("opfail");*/
4662 regprop(RExC_rx, mysv_val, upto);
4663 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4664 SvPV_nolen_const(mysv_val),
4665 (IV)REG_NODE_NUM(upto),
4670 NEXT_OFF(scan) = upto - scan;
4671 for (opt= scan + 1; opt < upto ; opt++)
4672 OP(opt) = OPTIMIZED;
4676 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4677 || OP(scan) == UNLESSM )
4679 /* Negative Lookahead/lookbehind
4680 In this case we can't do fixed string optimisation.
4683 SSize_t deltanext, minnext, fake = 0;
4688 data_fake.flags = 0;
4690 data_fake.whilem_c = data->whilem_c;
4691 data_fake.last_closep = data->last_closep;
4694 data_fake.last_closep = &fake;
4695 data_fake.pos_delta = delta;
4696 if ( flags & SCF_DO_STCLASS && !scan->flags
4697 && OP(scan) == IFMATCH ) { /* Lookahead */
4698 ssc_init(pRExC_state, &intrnl);
4699 data_fake.start_class = &intrnl;
4700 f |= SCF_DO_STCLASS_AND;
4702 if (flags & SCF_WHILEM_VISITED_POS)
4703 f |= SCF_WHILEM_VISITED_POS;
4704 next = regnext(scan);
4705 nscan = NEXTOPER(NEXTOPER(scan));
4706 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4707 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4710 FAIL("Variable length lookbehind not implemented");
4712 else if (minnext > (I32)U8_MAX) {
4713 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4715 scan->flags = (U8)minnext;
4718 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4720 if (data_fake.flags & SF_HAS_EVAL)
4721 data->flags |= SF_HAS_EVAL;
4722 data->whilem_c = data_fake.whilem_c;
4724 if (f & SCF_DO_STCLASS_AND) {
4725 if (flags & SCF_DO_STCLASS_OR) {
4726 /* OR before, AND after: ideally we would recurse with
4727 * data_fake to get the AND applied by study of the
4728 * remainder of the pattern, and then derecurse;
4729 * *** HACK *** for now just treat as "no information".
4730 * See [perl #56690].
4732 ssc_init(pRExC_state, data->start_class);
4734 /* AND before and after: combine and continue */
4735 ssc_and(pRExC_state, data->start_class, &intrnl);
4739 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4741 /* Positive Lookahead/lookbehind
4742 In this case we can do fixed string optimisation,
4743 but we must be careful about it. Note in the case of
4744 lookbehind the positions will be offset by the minimum
4745 length of the pattern, something we won't know about
4746 until after the recurse.
4748 SSize_t deltanext, fake = 0;
4752 /* We use SAVEFREEPV so that when the full compile
4753 is finished perl will clean up the allocated
4754 minlens when it's all done. This way we don't
4755 have to worry about freeing them when we know
4756 they wont be used, which would be a pain.
4759 Newx( minnextp, 1, SSize_t );
4760 SAVEFREEPV(minnextp);
4763 StructCopy(data, &data_fake, scan_data_t);
4764 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4767 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4768 data_fake.last_found=newSVsv(data->last_found);
4772 data_fake.last_closep = &fake;
4773 data_fake.flags = 0;
4774 data_fake.pos_delta = delta;
4776 data_fake.flags |= SF_IS_INF;
4777 if ( flags & SCF_DO_STCLASS && !scan->flags
4778 && OP(scan) == IFMATCH ) { /* Lookahead */
4779 ssc_init(pRExC_state, &intrnl);
4780 data_fake.start_class = &intrnl;
4781 f |= SCF_DO_STCLASS_AND;
4783 if (flags & SCF_WHILEM_VISITED_POS)
4784 f |= SCF_WHILEM_VISITED_POS;
4785 next = regnext(scan);
4786 nscan = NEXTOPER(NEXTOPER(scan));
4788 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4789 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4792 FAIL("Variable length lookbehind not implemented");
4794 else if (*minnextp > (I32)U8_MAX) {
4795 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4797 scan->flags = (U8)*minnextp;
4802 if (f & SCF_DO_STCLASS_AND) {
4803 ssc_and(pRExC_state, data->start_class, &intrnl);
4806 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4808 if (data_fake.flags & SF_HAS_EVAL)
4809 data->flags |= SF_HAS_EVAL;
4810 data->whilem_c = data_fake.whilem_c;
4811 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4812 if (RExC_rx->minlen<*minnextp)
4813 RExC_rx->minlen=*minnextp;
4814 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4815 SvREFCNT_dec_NN(data_fake.last_found);
4817 if ( data_fake.minlen_fixed != minlenp )
4819 data->offset_fixed= data_fake.offset_fixed;
4820 data->minlen_fixed= data_fake.minlen_fixed;
4821 data->lookbehind_fixed+= scan->flags;
4823 if ( data_fake.minlen_float != minlenp )
4825 data->minlen_float= data_fake.minlen_float;
4826 data->offset_float_min=data_fake.offset_float_min;
4827 data->offset_float_max=data_fake.offset_float_max;
4828 data->lookbehind_float+= scan->flags;
4835 else if (OP(scan) == OPEN) {
4836 if (stopparen != (I32)ARG(scan))
4839 else if (OP(scan) == CLOSE) {
4840 if (stopparen == (I32)ARG(scan)) {
4843 if ((I32)ARG(scan) == is_par) {
4844 next = regnext(scan);
4846 if ( next && (OP(next) != WHILEM) && next < last)
4847 is_par = 0; /* Disable optimization */
4850 *(data->last_closep) = ARG(scan);
4852 else if (OP(scan) == EVAL) {
4854 data->flags |= SF_HAS_EVAL;
4856 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4857 if (flags & SCF_DO_SUBSTR) {
4858 SCAN_COMMIT(pRExC_state,data,minlenp);
4859 flags &= ~SCF_DO_SUBSTR;
4861 if (data && OP(scan)==ACCEPT) {
4862 data->flags |= SCF_SEEN_ACCEPT;
4867 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4869 if (flags & SCF_DO_SUBSTR) {
4870 SCAN_COMMIT(pRExC_state,data,minlenp);
4871 data->longest = &(data->longest_float);
4873 is_inf = is_inf_internal = 1;
4874 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4875 ssc_anything(data->start_class);
4876 flags &= ~SCF_DO_STCLASS;
4878 else if (OP(scan) == GPOS) {
4879 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4880 !(delta || is_inf || (data && data->pos_delta)))
4882 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4883 RExC_rx->extflags |= RXf_ANCH_GPOS;
4884 if (RExC_rx->gofs < (STRLEN)min)
4885 RExC_rx->gofs = min;
4887 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4891 #ifdef TRIE_STUDY_OPT
4892 #ifdef FULL_TRIE_STUDY
4893 else if (PL_regkind[OP(scan)] == TRIE) {
4894 /* NOTE - There is similar code to this block above for handling
4895 BRANCH nodes on the initial study. If you change stuff here
4897 regnode *trie_node= scan;
4898 regnode *tail= regnext(scan);
4899 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4900 SSize_t max1 = 0, min1 = SSize_t_MAX;
4903 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4904 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4905 if (flags & SCF_DO_STCLASS)
4906 ssc_init_zero(pRExC_state, &accum);
4912 const regnode *nextbranch= NULL;
4915 for ( word=1 ; word <= trie->wordcount ; word++)
4917 SSize_t deltanext=0, minnext=0, f = 0, fake;
4918 regnode_ssc this_class;
4920 data_fake.flags = 0;
4922 data_fake.whilem_c = data->whilem_c;
4923 data_fake.last_closep = data->last_closep;
4926 data_fake.last_closep = &fake;
4927 data_fake.pos_delta = delta;
4928 if (flags & SCF_DO_STCLASS) {
4929 ssc_init(pRExC_state, &this_class);
4930 data_fake.start_class = &this_class;
4931 f = SCF_DO_STCLASS_AND;
4933 if (flags & SCF_WHILEM_VISITED_POS)
4934 f |= SCF_WHILEM_VISITED_POS;
4936 if (trie->jump[word]) {
4938 nextbranch = trie_node + trie->jump[0];
4939 scan= trie_node + trie->jump[word];
4940 /* We go from the jump point to the branch that follows
4941 it. Note this means we need the vestigal unused branches
4942 even though they arent otherwise used.
4944 minnext = study_chunk(pRExC_state, &scan, minlenp,
4945 &deltanext, (regnode *)nextbranch, &data_fake,
4946 stopparen, recursed, NULL, f,depth+1);
4948 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4949 nextbranch= regnext((regnode*)nextbranch);
4951 if (min1 > (SSize_t)(minnext + trie->minlen))
4952 min1 = minnext + trie->minlen;
4953 if (deltanext == SSize_t_MAX) {
4954 is_inf = is_inf_internal = 1;
4956 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
4957 max1 = minnext + deltanext + trie->maxlen;
4959 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4961 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4962 if ( stopmin > min + min1)
4963 stopmin = min + min1;
4964 flags &= ~SCF_DO_SUBSTR;
4966 data->flags |= SCF_SEEN_ACCEPT;
4969 if (data_fake.flags & SF_HAS_EVAL)
4970 data->flags |= SF_HAS_EVAL;
4971 data->whilem_c = data_fake.whilem_c;
4973 if (flags & SCF_DO_STCLASS)
4974 ssc_or(pRExC_state, &accum, &this_class);
4977 if (flags & SCF_DO_SUBSTR) {
4978 data->pos_min += min1;
4979 data->pos_delta += max1 - min1;
4980 if (max1 != min1 || is_inf)
4981 data->longest = &(data->longest_float);
4984 delta += max1 - min1;
4985 if (flags & SCF_DO_STCLASS_OR) {
4986 ssc_or(pRExC_state, data->start_class, &accum);
4988 ssc_and(pRExC_state, data->start_class, and_withp);
4989 flags &= ~SCF_DO_STCLASS;
4992 else if (flags & SCF_DO_STCLASS_AND) {
4994 ssc_and(pRExC_state, data->start_class, &accum);
4995 flags &= ~SCF_DO_STCLASS;
4998 /* Switch to OR mode: cache the old value of
4999 * data->start_class */
5001 StructCopy(data->start_class, and_withp, regnode_ssc);
5002 flags &= ~SCF_DO_STCLASS_AND;
5003 StructCopy(&accum, data->start_class, regnode_ssc);
5004 flags |= SCF_DO_STCLASS_OR;
5011 else if (PL_regkind[OP(scan)] == TRIE) {
5012 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5015 min += trie->minlen;
5016 delta += (trie->maxlen - trie->minlen);
5017 flags &= ~SCF_DO_STCLASS; /* xxx */
5018 if (flags & SCF_DO_SUBSTR) {
5019 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
5020 data->pos_min += trie->minlen;
5021 data->pos_delta += (trie->maxlen - trie->minlen);
5022 if (trie->maxlen != trie->minlen)
5023 data->longest = &(data->longest_float);
5025 if (trie->jump) /* no more substrings -- for now /grr*/
5026 flags &= ~SCF_DO_SUBSTR;
5028 #endif /* old or new */
5029 #endif /* TRIE_STUDY_OPT */
5031 /* Else: zero-length, ignore. */
5032 scan = regnext(scan);
5037 stopparen = frame->stop;
5038 frame = frame->prev;
5039 goto fake_study_recurse;
5044 DEBUG_STUDYDATA("pre-fin:",data,depth);
5047 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5048 if (flags & SCF_DO_SUBSTR && is_inf)
5049 data->pos_delta = SSize_t_MAX - data->pos_min;
5050 if (is_par > (I32)U8_MAX)
5052 if (is_par && pars==1 && data) {
5053 data->flags |= SF_IN_PAR;
5054 data->flags &= ~SF_HAS_PAR;
5056 else if (pars && data) {
5057 data->flags |= SF_HAS_PAR;
5058 data->flags &= ~SF_IN_PAR;
5060 if (flags & SCF_DO_STCLASS_OR)
5061 ssc_and(pRExC_state, data->start_class, and_withp);
5062 if (flags & SCF_TRIE_RESTUDY)
5063 data->flags |= SCF_TRIE_RESTUDY;
5065 DEBUG_STUDYDATA("post-fin:",data,depth);
5067 return min < stopmin ? min : stopmin;
5071 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5073 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5075 PERL_ARGS_ASSERT_ADD_DATA;
5077 Renewc(RExC_rxi->data,
5078 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5079 char, struct reg_data);
5081 Renew(RExC_rxi->data->what, count + n, U8);
5083 Newx(RExC_rxi->data->what, n, U8);
5084 RExC_rxi->data->count = count + n;
5085 Copy(s, RExC_rxi->data->what + count, n, U8);
5089 /*XXX: todo make this not included in a non debugging perl */
5090 #ifndef PERL_IN_XSUB_RE
5092 Perl_reginitcolors(pTHX)
5095 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5097 char *t = savepv(s);
5101 t = strchr(t, '\t');
5107 PL_colors[i] = t = (char *)"";
5112 PL_colors[i++] = (char *)"";
5119 #ifdef TRIE_STUDY_OPT
5120 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5123 (data.flags & SCF_TRIE_RESTUDY) \
5131 #define CHECK_RESTUDY_GOTO_butfirst
5135 * pregcomp - compile a regular expression into internal code
5137 * Decides which engine's compiler to call based on the hint currently in
5141 #ifndef PERL_IN_XSUB_RE
5143 /* return the currently in-scope regex engine (or the default if none) */
5145 regexp_engine const *
5146 Perl_current_re_engine(pTHX)
5150 if (IN_PERL_COMPILETIME) {
5151 HV * const table = GvHV(PL_hintgv);
5154 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5155 return &PL_core_reg_engine;
5156 ptr = hv_fetchs(table, "regcomp", FALSE);
5157 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5158 return &PL_core_reg_engine;
5159 return INT2PTR(regexp_engine*,SvIV(*ptr));
5163 if (!PL_curcop->cop_hints_hash)
5164 return &PL_core_reg_engine;
5165 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5166 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5167 return &PL_core_reg_engine;
5168 return INT2PTR(regexp_engine*,SvIV(ptr));
5174 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5177 regexp_engine const *eng = current_re_engine();
5178 GET_RE_DEBUG_FLAGS_DECL;
5180 PERL_ARGS_ASSERT_PREGCOMP;
5182 /* Dispatch a request to compile a regexp to correct regexp engine. */
5184 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5187 return CALLREGCOMP_ENG(eng, pattern, flags);
5191 /* public(ish) entry point for the perl core's own regex compiling code.
5192 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5193 * pattern rather than a list of OPs, and uses the internal engine rather
5194 * than the current one */
5197 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5199 SV *pat = pattern; /* defeat constness! */
5200 PERL_ARGS_ASSERT_RE_COMPILE;
5201 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5202 #ifdef PERL_IN_XSUB_RE
5205 &PL_core_reg_engine,
5207 NULL, NULL, rx_flags, 0);
5211 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5212 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5213 * point to the realloced string and length.
5215 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5219 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5220 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5222 U8 *const src = (U8*)*pat_p;
5225 STRLEN s = 0, d = 0;
5227 GET_RE_DEBUG_FLAGS_DECL;
5229 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5230 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5232 Newx(dst, *plen_p * 2 + 1, U8);
5234 while (s < *plen_p) {
5235 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5238 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5239 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5241 if (n < num_code_blocks) {
5242 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5243 pRExC_state->code_blocks[n].start = d;
5244 assert(dst[d] == '(');
5247 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5248 pRExC_state->code_blocks[n].end = d;
5249 assert(dst[d] == ')');
5259 *pat_p = (char*) dst;
5261 RExC_orig_utf8 = RExC_utf8 = 1;
5266 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5267 * while recording any code block indices, and handling overloading,
5268 * nested qr// objects etc. If pat is null, it will allocate a new
5269 * string, or just return the first arg, if there's only one.
5271 * Returns the malloced/updated pat.
5272 * patternp and pat_count is the array of SVs to be concatted;
5273 * oplist is the optional list of ops that generated the SVs;
5274 * recompile_p is a pointer to a boolean that will be set if
5275 * the regex will need to be recompiled.
5276 * delim, if non-null is an SV that will be inserted between each element
5280 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5281 SV *pat, SV ** const patternp, int pat_count,
5282 OP *oplist, bool *recompile_p, SV *delim)
5286 bool use_delim = FALSE;
5287 bool alloced = FALSE;
5289 /* if we know we have at least two args, create an empty string,
5290 * then concatenate args to that. For no args, return an empty string */
5291 if (!pat && pat_count != 1) {
5292 pat = newSVpvn("", 0);
5297 for (svp = patternp; svp < patternp + pat_count; svp++) {
5300 STRLEN orig_patlen = 0;
5302 SV *msv = use_delim ? delim : *svp;
5303 if (!msv) msv = &PL_sv_undef;
5305 /* if we've got a delimiter, we go round the loop twice for each
5306 * svp slot (except the last), using the delimiter the second
5315 if (SvTYPE(msv) == SVt_PVAV) {
5316 /* we've encountered an interpolated array within
5317 * the pattern, e.g. /...@a..../. Expand the list of elements,
5318 * then recursively append elements.
5319 * The code in this block is based on S_pushav() */
5321 AV *const av = (AV*)msv;
5322 const SSize_t maxarg = AvFILL(av) + 1;
5326 assert(oplist->op_type == OP_PADAV
5327 || oplist->op_type == OP_RV2AV);
5328 oplist = oplist->op_sibling;;
5331 if (SvRMAGICAL(av)) {
5334 Newx(array, maxarg, SV*);
5336 for (i=0; i < maxarg; i++) {
5337 SV ** const svp = av_fetch(av, i, FALSE);
5338 array[i] = svp ? *svp : &PL_sv_undef;
5342 array = AvARRAY(av);
5344 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5345 array, maxarg, NULL, recompile_p,
5347 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5353 /* we make the assumption here that each op in the list of
5354 * op_siblings maps to one SV pushed onto the stack,
5355 * except for code blocks, with have both an OP_NULL and
5357 * This allows us to match up the list of SVs against the
5358 * list of OPs to find the next code block.
5360 * Note that PUSHMARK PADSV PADSV ..
5362 * PADRANGE PADSV PADSV ..
5363 * so the alignment still works. */
5366 if (oplist->op_type == OP_NULL
5367 && (oplist->op_flags & OPf_SPECIAL))
5369 assert(n < pRExC_state->num_code_blocks);
5370 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5371 pRExC_state->code_blocks[n].block = oplist;
5372 pRExC_state->code_blocks[n].src_regex = NULL;
5375 oplist = oplist->op_sibling; /* skip CONST */
5378 oplist = oplist->op_sibling;;
5381 /* apply magic and QR overloading to arg */
5384 if (SvROK(msv) && SvAMAGIC(msv)) {
5385 SV *sv = AMG_CALLunary(msv, regexp_amg);
5389 if (SvTYPE(sv) != SVt_REGEXP)
5390 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5395 /* try concatenation overload ... */
5396 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5397 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5400 /* overloading involved: all bets are off over literal
5401 * code. Pretend we haven't seen it */
5402 pRExC_state->num_code_blocks -= n;
5406 /* ... or failing that, try "" overload */
5407 while (SvAMAGIC(msv)
5408 && (sv = AMG_CALLunary(msv, string_amg))
5412 && SvRV(msv) == SvRV(sv))
5417 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5421 /* this is a partially unrolled
5422 * sv_catsv_nomg(pat, msv);
5423 * that allows us to adjust code block indices if
5426 char *dst = SvPV_force_nomg(pat, dlen);
5428 if (SvUTF8(msv) && !SvUTF8(pat)) {
5429 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5430 sv_setpvn(pat, dst, dlen);
5433 sv_catsv_nomg(pat, msv);
5440 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5443 /* extract any code blocks within any embedded qr//'s */
5444 if (rx && SvTYPE(rx) == SVt_REGEXP
5445 && RX_ENGINE((REGEXP*)rx)->op_comp)
5448 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5449 if (ri->num_code_blocks) {
5451 /* the presence of an embedded qr// with code means
5452 * we should always recompile: the text of the
5453 * qr// may not have changed, but it may be a
5454 * different closure than last time */
5456 Renew(pRExC_state->code_blocks,
5457 pRExC_state->num_code_blocks + ri->num_code_blocks,
5458 struct reg_code_block);
5459 pRExC_state->num_code_blocks += ri->num_code_blocks;
5461 for (i=0; i < ri->num_code_blocks; i++) {
5462 struct reg_code_block *src, *dst;
5463 STRLEN offset = orig_patlen
5464 + ReANY((REGEXP *)rx)->pre_prefix;
5465 assert(n < pRExC_state->num_code_blocks);
5466 src = &ri->code_blocks[i];
5467 dst = &pRExC_state->code_blocks[n];
5468 dst->start = src->start + offset;
5469 dst->end = src->end + offset;
5470 dst->block = src->block;
5471 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5480 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5489 /* see if there are any run-time code blocks in the pattern.
5490 * False positives are allowed */
5493 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5494 char *pat, STRLEN plen)
5499 for (s = 0; s < plen; s++) {
5500 if (n < pRExC_state->num_code_blocks
5501 && s == pRExC_state->code_blocks[n].start)
5503 s = pRExC_state->code_blocks[n].end;
5507 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5509 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5511 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5518 /* Handle run-time code blocks. We will already have compiled any direct
5519 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5520 * copy of it, but with any literal code blocks blanked out and
5521 * appropriate chars escaped; then feed it into
5523 * eval "qr'modified_pattern'"
5527 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5531 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5533 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5534 * and merge them with any code blocks of the original regexp.
5536 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5537 * instead, just save the qr and return FALSE; this tells our caller that
5538 * the original pattern needs upgrading to utf8.
5542 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5543 char *pat, STRLEN plen)
5547 GET_RE_DEBUG_FLAGS_DECL;
5549 if (pRExC_state->runtime_code_qr) {
5550 /* this is the second time we've been called; this should
5551 * only happen if the main pattern got upgraded to utf8
5552 * during compilation; re-use the qr we compiled first time
5553 * round (which should be utf8 too)
5555 qr = pRExC_state->runtime_code_qr;
5556 pRExC_state->runtime_code_qr = NULL;
5557 assert(RExC_utf8 && SvUTF8(qr));
5563 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5567 /* determine how many extra chars we need for ' and \ escaping */
5568 for (s = 0; s < plen; s++) {
5569 if (pat[s] == '\'' || pat[s] == '\\')
5573 Newx(newpat, newlen, char);
5575 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5577 for (s = 0; s < plen; s++) {
5578 if (n < pRExC_state->num_code_blocks
5579 && s == pRExC_state->code_blocks[n].start)
5581 /* blank out literal code block */
5582 assert(pat[s] == '(');
5583 while (s <= pRExC_state->code_blocks[n].end) {
5591 if (pat[s] == '\'' || pat[s] == '\\')
5596 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5600 PerlIO_printf(Perl_debug_log,
5601 "%sre-parsing pattern for runtime code:%s %s\n",
5602 PL_colors[4],PL_colors[5],newpat);
5605 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5611 PUSHSTACKi(PERLSI_REQUIRE);
5612 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5613 * parsing qr''; normally only q'' does this. It also alters
5615 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5616 SvREFCNT_dec_NN(sv);
5621 SV * const errsv = ERRSV;
5622 if (SvTRUE_NN(errsv))
5624 Safefree(pRExC_state->code_blocks);
5625 /* use croak_sv ? */
5626 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5629 assert(SvROK(qr_ref));
5631 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5632 /* the leaving below frees the tmp qr_ref.
5633 * Give qr a life of its own */
5641 if (!RExC_utf8 && SvUTF8(qr)) {
5642 /* first time through; the pattern got upgraded; save the
5643 * qr for the next time through */
5644 assert(!pRExC_state->runtime_code_qr);
5645 pRExC_state->runtime_code_qr = qr;
5650 /* extract any code blocks within the returned qr// */
5653 /* merge the main (r1) and run-time (r2) code blocks into one */
5655 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5656 struct reg_code_block *new_block, *dst;
5657 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5660 if (!r2->num_code_blocks) /* we guessed wrong */
5662 SvREFCNT_dec_NN(qr);
5667 r1->num_code_blocks + r2->num_code_blocks,
5668 struct reg_code_block);
5671 while ( i1 < r1->num_code_blocks
5672 || i2 < r2->num_code_blocks)
5674 struct reg_code_block *src;
5677 if (i1 == r1->num_code_blocks) {
5678 src = &r2->code_blocks[i2++];
5681 else if (i2 == r2->num_code_blocks)
5682 src = &r1->code_blocks[i1++];
5683 else if ( r1->code_blocks[i1].start
5684 < r2->code_blocks[i2].start)
5686 src = &r1->code_blocks[i1++];
5687 assert(src->end < r2->code_blocks[i2].start);
5690 assert( r1->code_blocks[i1].start
5691 > r2->code_blocks[i2].start);
5692 src = &r2->code_blocks[i2++];
5694 assert(src->end < r1->code_blocks[i1].start);
5697 assert(pat[src->start] == '(');
5698 assert(pat[src->end] == ')');
5699 dst->start = src->start;
5700 dst->end = src->end;
5701 dst->block = src->block;
5702 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5706 r1->num_code_blocks += r2->num_code_blocks;
5707 Safefree(r1->code_blocks);
5708 r1->code_blocks = new_block;
5711 SvREFCNT_dec_NN(qr);
5717 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5718 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5720 /* This is the common code for setting up the floating and fixed length
5721 * string data extracted from Perl_re_op_compile() below. Returns a boolean
5722 * as to whether succeeded or not */
5727 if (! (longest_length
5728 || (eol /* Can't have SEOL and MULTI */
5729 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5731 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5732 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5737 /* copy the information about the longest from the reg_scan_data
5738 over to the program. */
5739 if (SvUTF8(sv_longest)) {
5740 *rx_utf8 = sv_longest;
5743 *rx_substr = sv_longest;
5746 /* end_shift is how many chars that must be matched that
5747 follow this item. We calculate it ahead of time as once the
5748 lookbehind offset is added in we lose the ability to correctly
5750 ml = minlen ? *(minlen) : (SSize_t)longest_length;
5751 *rx_end_shift = ml - offset
5752 - longest_length + (SvTAIL(sv_longest) != 0)
5755 t = (eol/* Can't have SEOL and MULTI */
5756 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5757 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5763 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5764 * regular expression into internal code.
5765 * The pattern may be passed either as:
5766 * a list of SVs (patternp plus pat_count)
5767 * a list of OPs (expr)
5768 * If both are passed, the SV list is used, but the OP list indicates
5769 * which SVs are actually pre-compiled code blocks
5771 * The SVs in the list have magic and qr overloading applied to them (and
5772 * the list may be modified in-place with replacement SVs in the latter
5775 * If the pattern hasn't changed from old_re, then old_re will be
5778 * eng is the current engine. If that engine has an op_comp method, then
5779 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5780 * do the initial concatenation of arguments and pass on to the external
5783 * If is_bare_re is not null, set it to a boolean indicating whether the
5784 * arg list reduced (after overloading) to a single bare regex which has
5785 * been returned (i.e. /$qr/).
5787 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5789 * pm_flags contains the PMf_* flags, typically based on those from the
5790 * pm_flags field of the related PMOP. Currently we're only interested in
5791 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5793 * We can't allocate space until we know how big the compiled form will be,
5794 * but we can't compile it (and thus know how big it is) until we've got a
5795 * place to put the code. So we cheat: we compile it twice, once with code
5796 * generation turned off and size counting turned on, and once "for real".
5797 * This also means that we don't allocate space until we are sure that the
5798 * thing really will compile successfully, and we never have to move the
5799 * code and thus invalidate pointers into it. (Note that it has to be in
5800 * one piece because free() must be able to free it all.) [NB: not true in perl]
5802 * Beware that the optimization-preparation code in here knows about some
5803 * of the structure of the compiled regexp. [I'll say.]
5807 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5808 OP *expr, const regexp_engine* eng, REGEXP *old_re,
5809 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5814 regexp_internal *ri;
5822 SV *code_blocksv = NULL;
5823 SV** new_patternp = patternp;
5825 /* these are all flags - maybe they should be turned
5826 * into a single int with different bit masks */
5827 I32 sawlookahead = 0;
5832 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5834 bool runtime_code = 0;
5836 RExC_state_t RExC_state;
5837 RExC_state_t * const pRExC_state = &RExC_state;
5838 #ifdef TRIE_STUDY_OPT
5840 RExC_state_t copyRExC_state;
5842 GET_RE_DEBUG_FLAGS_DECL;
5844 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5846 DEBUG_r(if (!PL_colorset) reginitcolors());
5848 #ifndef PERL_IN_XSUB_RE
5849 /* Initialize these here instead of as-needed, as is quick and avoids
5850 * having to test them each time otherwise */
5851 if (! PL_AboveLatin1) {
5852 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5853 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5854 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5856 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5857 PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5858 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5860 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5861 = _new_invlist_C_array(L1PosixAlnum_invlist);
5862 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5863 = _new_invlist_C_array(PosixAlnum_invlist);
5865 PL_L1Posix_ptrs[_CC_ALPHA]
5866 = _new_invlist_C_array(L1PosixAlpha_invlist);
5867 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5869 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5870 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5872 /* Cased is the same as Alpha in the ASCII range */
5873 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5874 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5876 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5877 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5879 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5880 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5882 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5883 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5885 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5886 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5888 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5889 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5891 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5892 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5894 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5895 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5896 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5897 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5899 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5900 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5902 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5904 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5905 PL_L1Posix_ptrs[_CC_WORDCHAR]
5906 = _new_invlist_C_array(L1PosixWord_invlist);
5908 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5909 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5911 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5915 pRExC_state->code_blocks = NULL;
5916 pRExC_state->num_code_blocks = 0;
5919 *is_bare_re = FALSE;
5921 if (expr && (expr->op_type == OP_LIST ||
5922 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5923 /* allocate code_blocks if needed */
5927 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5928 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5929 ncode++; /* count of DO blocks */
5931 pRExC_state->num_code_blocks = ncode;
5932 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5937 /* compile-time pattern with just OP_CONSTs and DO blocks */
5942 /* find how many CONSTs there are */
5945 if (expr->op_type == OP_CONST)
5948 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5949 if (o->op_type == OP_CONST)
5953 /* fake up an SV array */
5955 assert(!new_patternp);
5956 Newx(new_patternp, n, SV*);
5957 SAVEFREEPV(new_patternp);
5961 if (expr->op_type == OP_CONST)
5962 new_patternp[n] = cSVOPx_sv(expr);
5964 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5965 if (o->op_type == OP_CONST)
5966 new_patternp[n++] = cSVOPo_sv;
5971 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5972 "Assembling pattern from %d elements%s\n", pat_count,
5973 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5975 /* set expr to the first arg op */
5977 if (pRExC_state->num_code_blocks
5978 && expr->op_type != OP_CONST)
5980 expr = cLISTOPx(expr)->op_first;
5981 assert( expr->op_type == OP_PUSHMARK
5982 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5983 || expr->op_type == OP_PADRANGE);
5984 expr = expr->op_sibling;
5987 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5988 expr, &recompile, NULL);
5990 /* handle bare (possibly after overloading) regex: foo =~ $re */
5995 if (SvTYPE(re) == SVt_REGEXP) {
5999 Safefree(pRExC_state->code_blocks);
6000 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6001 "Precompiled pattern%s\n",
6002 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6008 exp = SvPV_nomg(pat, plen);
6010 if (!eng->op_comp) {
6011 if ((SvUTF8(pat) && IN_BYTES)
6012 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6014 /* make a temporary copy; either to convert to bytes,
6015 * or to avoid repeating get-magic / overloaded stringify */
6016 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6017 (IN_BYTES ? 0 : SvUTF8(pat)));
6019 Safefree(pRExC_state->code_blocks);
6020 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6023 /* ignore the utf8ness if the pattern is 0 length */
6024 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6025 RExC_uni_semantics = 0;
6026 RExC_contains_locale = 0;
6027 RExC_contains_i = 0;
6028 pRExC_state->runtime_code_qr = NULL;
6031 SV *dsv= sv_newmortal();
6032 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6033 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6034 PL_colors[4],PL_colors[5],s);
6038 /* we jump here if we upgrade the pattern to utf8 and have to
6041 if ((pm_flags & PMf_USE_RE_EVAL)
6042 /* this second condition covers the non-regex literal case,
6043 * i.e. $foo =~ '(?{})'. */
6044 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6046 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6048 /* return old regex if pattern hasn't changed */
6049 /* XXX: note in the below we have to check the flags as well as the pattern.
6051 * Things get a touch tricky as we have to compare the utf8 flag independently
6052 * from the compile flags.
6057 && !!RX_UTF8(old_re) == !!RExC_utf8
6058 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6059 && RX_PRECOMP(old_re)
6060 && RX_PRELEN(old_re) == plen
6061 && memEQ(RX_PRECOMP(old_re), exp, plen)
6062 && !runtime_code /* with runtime code, always recompile */ )
6064 Safefree(pRExC_state->code_blocks);
6068 rx_flags = orig_rx_flags;
6070 if (rx_flags & PMf_FOLD) {
6071 RExC_contains_i = 1;
6073 if (initial_charset == REGEX_LOCALE_CHARSET) {
6074 RExC_contains_locale = 1;
6076 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6078 /* Set to use unicode semantics if the pattern is in utf8 and has the
6079 * 'depends' charset specified, as it means unicode when utf8 */
6080 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6084 RExC_flags = rx_flags;
6085 RExC_pm_flags = pm_flags;
6088 if (TAINTING_get && TAINT_get)
6089 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6091 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6092 /* whoops, we have a non-utf8 pattern, whilst run-time code
6093 * got compiled as utf8. Try again with a utf8 pattern */
6094 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6095 pRExC_state->num_code_blocks);
6096 goto redo_first_pass;
6099 assert(!pRExC_state->runtime_code_qr);
6104 RExC_in_lookbehind = 0;
6105 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6107 RExC_override_recoding = 0;
6108 RExC_in_multi_char_class = 0;
6110 /* First pass: determine size, legality. */
6113 RExC_end = exp + plen;
6118 RExC_emit = (regnode *) &RExC_emit_dummy;
6119 RExC_whilem_seen = 0;
6120 RExC_open_parens = NULL;
6121 RExC_close_parens = NULL;
6123 RExC_paren_names = NULL;
6125 RExC_paren_name_list = NULL;
6127 RExC_recurse = NULL;
6128 RExC_recurse_count = 0;
6129 pRExC_state->code_index = 0;
6131 #if 0 /* REGC() is (currently) a NOP at the first pass.
6132 * Clever compilers notice this and complain. --jhi */
6133 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6136 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6138 RExC_lastparse=NULL;
6140 /* reg may croak on us, not giving us a chance to free
6141 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6142 need it to survive as long as the regexp (qr/(?{})/).
6143 We must check that code_blocksv is not already set, because we may
6144 have jumped back to restart the sizing pass. */
6145 if (pRExC_state->code_blocks && !code_blocksv) {
6146 code_blocksv = newSV_type(SVt_PV);
6147 SAVEFREESV(code_blocksv);
6148 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6149 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6151 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6152 /* It's possible to write a regexp in ascii that represents Unicode
6153 codepoints outside of the byte range, such as via \x{100}. If we
6154 detect such a sequence we have to convert the entire pattern to utf8
6155 and then recompile, as our sizing calculation will have been based
6156 on 1 byte == 1 character, but we will need to use utf8 to encode
6157 at least some part of the pattern, and therefore must convert the whole
6160 if (flags & RESTART_UTF8) {
6161 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6162 pRExC_state->num_code_blocks);
6163 goto redo_first_pass;
6165 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6168 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6171 PerlIO_printf(Perl_debug_log,
6172 "Required size %"IVdf" nodes\n"
6173 "Starting second pass (creation)\n",
6176 RExC_lastparse=NULL;
6179 /* The first pass could have found things that force Unicode semantics */
6180 if ((RExC_utf8 || RExC_uni_semantics)
6181 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6183 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6186 /* Small enough for pointer-storage convention?
6187 If extralen==0, this means that we will not need long jumps. */
6188 if (RExC_size >= 0x10000L && RExC_extralen)
6189 RExC_size += RExC_extralen;
6192 if (RExC_whilem_seen > 15)
6193 RExC_whilem_seen = 15;
6195 /* Allocate space and zero-initialize. Note, the two step process
6196 of zeroing when in debug mode, thus anything assigned has to
6197 happen after that */
6198 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6200 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6201 char, regexp_internal);
6202 if ( r == NULL || ri == NULL )
6203 FAIL("Regexp out of space");
6205 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6206 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6208 /* bulk initialize base fields with 0. */
6209 Zero(ri, sizeof(regexp_internal), char);
6212 /* non-zero initialization begins here */
6215 r->extflags = rx_flags;
6216 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6218 if (pm_flags & PMf_IS_QR) {
6219 ri->code_blocks = pRExC_state->code_blocks;
6220 ri->num_code_blocks = pRExC_state->num_code_blocks;
6225 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6226 if (pRExC_state->code_blocks[n].src_regex)
6227 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6228 SAVEFREEPV(pRExC_state->code_blocks);
6232 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6233 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6235 /* The caret is output if there are any defaults: if not all the STD
6236 * flags are set, or if no character set specifier is needed */
6238 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6240 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6241 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6242 >> RXf_PMf_STD_PMMOD_SHIFT);
6243 const char *fptr = STD_PAT_MODS; /*"msix"*/
6245 /* Allocate for the worst case, which is all the std flags are turned
6246 * on. If more precision is desired, we could do a population count of
6247 * the flags set. This could be done with a small lookup table, or by
6248 * shifting, masking and adding, or even, when available, assembly
6249 * language for a machine-language population count.
6250 * We never output a minus, as all those are defaults, so are
6251 * covered by the caret */
6252 const STRLEN wraplen = plen + has_p + has_runon
6253 + has_default /* If needs a caret */
6255 /* If needs a character set specifier */
6256 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6257 + (sizeof(STD_PAT_MODS) - 1)
6258 + (sizeof("(?:)") - 1);
6260 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6261 r->xpv_len_u.xpvlenu_pv = p;
6263 SvFLAGS(rx) |= SVf_UTF8;
6266 /* If a default, cover it using the caret */
6268 *p++= DEFAULT_PAT_MOD;
6272 const char* const name = get_regex_charset_name(r->extflags, &len);
6273 Copy(name, p, len, char);
6277 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6280 while((ch = *fptr++)) {
6288 Copy(RExC_precomp, p, plen, char);
6289 assert ((RX_WRAPPED(rx) - p) < 16);
6290 r->pre_prefix = p - RX_WRAPPED(rx);
6296 SvCUR_set(rx, p - RX_WRAPPED(rx));
6300 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6302 if (RExC_seen & REG_SEEN_RECURSE) {
6303 Newxz(RExC_open_parens, RExC_npar,regnode *);
6304 SAVEFREEPV(RExC_open_parens);
6305 Newxz(RExC_close_parens,RExC_npar,regnode *);
6306 SAVEFREEPV(RExC_close_parens);
6309 /* Useful during FAIL. */
6310 #ifdef RE_TRACK_PATTERN_OFFSETS
6311 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6312 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6313 "%s %"UVuf" bytes for offset annotations.\n",
6314 ri->u.offsets ? "Got" : "Couldn't get",
6315 (UV)((2*RExC_size+1) * sizeof(U32))));
6317 SetProgLen(ri,RExC_size);
6322 /* Second pass: emit code. */
6323 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6324 RExC_pm_flags = pm_flags;
6326 RExC_end = exp + plen;
6329 RExC_emit_start = ri->program;
6330 RExC_emit = ri->program;
6331 RExC_emit_bound = ri->program + RExC_size + 1;
6332 pRExC_state->code_index = 0;
6334 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6335 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6337 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6339 /* XXXX To minimize changes to RE engine we always allocate
6340 3-units-long substrs field. */
6341 Newx(r->substrs, 1, struct reg_substr_data);
6342 if (RExC_recurse_count) {
6343 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6344 SAVEFREEPV(RExC_recurse);
6348 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6349 Zero(r->substrs, 1, struct reg_substr_data);
6351 #ifdef TRIE_STUDY_OPT
6353 StructCopy(&zero_scan_data, &data, scan_data_t);
6354 copyRExC_state = RExC_state;
6357 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6359 RExC_state = copyRExC_state;
6360 if (seen & REG_TOP_LEVEL_BRANCHES)
6361 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6363 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6364 StructCopy(&zero_scan_data, &data, scan_data_t);
6367 StructCopy(&zero_scan_data, &data, scan_data_t);
6370 /* Dig out information for optimizations. */
6371 r->extflags = RExC_flags; /* was pm_op */
6372 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6375 SvUTF8_on(rx); /* Unicode in it? */
6376 ri->regstclass = NULL;
6377 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6378 r->intflags |= PREGf_NAUGHTY;
6379 scan = ri->program + 1; /* First BRANCH. */
6381 /* testing for BRANCH here tells us whether there is "must appear"
6382 data in the pattern. If there is then we can use it for optimisations */
6383 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6385 STRLEN longest_float_length, longest_fixed_length;
6386 regnode_ssc ch_class; /* pointed to by data */
6388 SSize_t last_close = 0; /* pointed to by data */
6389 regnode *first= scan;
6390 regnode *first_next= regnext(first);
6392 * Skip introductions and multiplicators >= 1
6393 * so that we can extract the 'meat' of the pattern that must
6394 * match in the large if() sequence following.
6395 * NOTE that EXACT is NOT covered here, as it is normally
6396 * picked up by the optimiser separately.
6398 * This is unfortunate as the optimiser isnt handling lookahead
6399 * properly currently.
6402 while ((OP(first) == OPEN && (sawopen = 1)) ||
6403 /* An OR of *one* alternative - should not happen now. */
6404 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6405 /* for now we can't handle lookbehind IFMATCH*/
6406 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6407 (OP(first) == PLUS) ||
6408 (OP(first) == MINMOD) ||
6409 /* An {n,m} with n>0 */
6410 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6411 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6414 * the only op that could be a regnode is PLUS, all the rest
6415 * will be regnode_1 or regnode_2.
6417 * (yves doesn't think this is true)
6419 if (OP(first) == PLUS)
6422 if (OP(first) == MINMOD)
6424 first += regarglen[OP(first)];
6426 first = NEXTOPER(first);
6427 first_next= regnext(first);
6430 /* Starting-point info. */
6432 DEBUG_PEEP("first:",first,0);
6433 /* Ignore EXACT as we deal with it later. */
6434 if (PL_regkind[OP(first)] == EXACT) {
6435 if (OP(first) == EXACT)
6436 NOOP; /* Empty, get anchored substr later. */
6438 ri->regstclass = first;
6441 else if (PL_regkind[OP(first)] == TRIE &&
6442 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6445 /* this can happen only on restudy */
6446 if ( OP(first) == TRIE ) {
6447 struct regnode_1 *trieop = (struct regnode_1 *)
6448 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6449 StructCopy(first,trieop,struct regnode_1);
6450 trie_op=(regnode *)trieop;
6452 struct regnode_charclass *trieop = (struct regnode_charclass *)
6453 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6454 StructCopy(first,trieop,struct regnode_charclass);
6455 trie_op=(regnode *)trieop;
6458 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6459 ri->regstclass = trie_op;
6462 else if (REGNODE_SIMPLE(OP(first)))
6463 ri->regstclass = first;
6464 else if (PL_regkind[OP(first)] == BOUND ||
6465 PL_regkind[OP(first)] == NBOUND)
6466 ri->regstclass = first;
6467 else if (PL_regkind[OP(first)] == BOL) {
6468 r->extflags |= (OP(first) == MBOL
6470 : (OP(first) == SBOL
6473 first = NEXTOPER(first);
6476 else if (OP(first) == GPOS) {
6477 r->extflags |= RXf_ANCH_GPOS;
6478 first = NEXTOPER(first);
6481 else if ((!sawopen || !RExC_sawback) &&
6482 (OP(first) == STAR &&
6483 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6484 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6486 /* turn .* into ^.* with an implied $*=1 */
6488 (OP(NEXTOPER(first)) == REG_ANY)
6491 r->extflags |= type;
6492 r->intflags |= PREGf_IMPLICIT;
6493 first = NEXTOPER(first);
6496 if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6497 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6498 /* x+ must match at the 1st pos of run of x's */
6499 r->intflags |= PREGf_SKIP;
6501 /* Scan is after the zeroth branch, first is atomic matcher. */
6502 #ifdef TRIE_STUDY_OPT
6505 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6506 (IV)(first - scan + 1))
6510 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6511 (IV)(first - scan + 1))
6517 * If there's something expensive in the r.e., find the
6518 * longest literal string that must appear and make it the
6519 * regmust. Resolve ties in favor of later strings, since
6520 * the regstart check works with the beginning of the r.e.
6521 * and avoiding duplication strengthens checking. Not a
6522 * strong reason, but sufficient in the absence of others.
6523 * [Now we resolve ties in favor of the earlier string if
6524 * it happens that c_offset_min has been invalidated, since the
6525 * earlier string may buy us something the later one won't.]
6528 data.longest_fixed = newSVpvs("");
6529 data.longest_float = newSVpvs("");
6530 data.last_found = newSVpvs("");
6531 data.longest = &(data.longest_fixed);
6532 ENTER_with_name("study_chunk");
6533 SAVEFREESV(data.longest_fixed);
6534 SAVEFREESV(data.longest_float);
6535 SAVEFREESV(data.last_found);
6537 if (!ri->regstclass) {
6538 ssc_init(pRExC_state, &ch_class);
6539 data.start_class = &ch_class;
6540 stclass_flag = SCF_DO_STCLASS_AND;
6541 } else /* XXXX Check for BOUND? */
6543 data.last_closep = &last_close;
6545 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6546 &data, -1, NULL, NULL,
6547 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6548 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6552 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6555 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6556 && data.last_start_min == 0 && data.last_end > 0
6557 && !RExC_seen_zerolen
6558 && !(RExC_seen & REG_SEEN_VERBARG)
6559 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6560 r->extflags |= RXf_CHECK_ALL;
6561 scan_commit(pRExC_state, &data,&minlen,0);
6563 longest_float_length = CHR_SVLEN(data.longest_float);
6565 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6566 && data.offset_fixed == data.offset_float_min
6567 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6568 && S_setup_longest (aTHX_ pRExC_state,
6572 &(r->float_end_shift),
6573 data.lookbehind_float,
6574 data.offset_float_min,
6576 longest_float_length,
6577 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6578 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6580 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6581 r->float_max_offset = data.offset_float_max;
6582 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6583 r->float_max_offset -= data.lookbehind_float;
6584 SvREFCNT_inc_simple_void_NN(data.longest_float);
6587 r->float_substr = r->float_utf8 = NULL;
6588 longest_float_length = 0;
6591 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6593 if (S_setup_longest (aTHX_ pRExC_state,
6595 &(r->anchored_utf8),
6596 &(r->anchored_substr),
6597 &(r->anchored_end_shift),
6598 data.lookbehind_fixed,
6601 longest_fixed_length,
6602 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6603 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6605 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6606 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6609 r->anchored_substr = r->anchored_utf8 = NULL;
6610 longest_fixed_length = 0;
6612 LEAVE_with_name("study_chunk");
6615 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6616 ri->regstclass = NULL;
6618 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6620 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6621 && !ssc_is_anything(data.start_class))
6623 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6625 ssc_finalize(pRExC_state, data.start_class);
6627 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6628 StructCopy(data.start_class,
6629 (regnode_ssc*)RExC_rxi->data->data[n],
6631 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6632 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6633 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6634 regprop(r, sv, (regnode*)data.start_class);
6635 PerlIO_printf(Perl_debug_log,
6636 "synthetic stclass \"%s\".\n",
6637 SvPVX_const(sv));});
6638 data.start_class = NULL;
6641 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6642 if (longest_fixed_length > longest_float_length) {
6643 r->check_end_shift = r->anchored_end_shift;
6644 r->check_substr = r->anchored_substr;
6645 r->check_utf8 = r->anchored_utf8;
6646 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6647 if (r->extflags & RXf_ANCH_SINGLE)
6648 r->extflags |= RXf_NOSCAN;
6651 r->check_end_shift = r->float_end_shift;
6652 r->check_substr = r->float_substr;
6653 r->check_utf8 = r->float_utf8;
6654 r->check_offset_min = r->float_min_offset;
6655 r->check_offset_max = r->float_max_offset;
6657 if ((r->check_substr || r->check_utf8) ) {
6658 r->extflags |= RXf_USE_INTUIT;
6659 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6660 r->extflags |= RXf_INTUIT_TAIL;
6662 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6663 if ( (STRLEN)minlen < longest_float_length )
6664 minlen= longest_float_length;
6665 if ( (STRLEN)minlen < longest_fixed_length )
6666 minlen= longest_fixed_length;
6670 /* Several toplevels. Best we can is to set minlen. */
6672 regnode_ssc ch_class;
6673 SSize_t last_close = 0;
6675 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6677 scan = ri->program + 1;
6678 ssc_init(pRExC_state, &ch_class);
6679 data.start_class = &ch_class;
6680 data.last_closep = &last_close;
6683 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6684 &data, -1, NULL, NULL,
6685 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6686 |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6689 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6691 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6692 = r->float_substr = r->float_utf8 = NULL;
6694 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6695 && ! ssc_is_anything(data.start_class))
6697 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6699 ssc_finalize(pRExC_state, data.start_class);
6701 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6702 StructCopy(data.start_class,
6703 (regnode_ssc*)RExC_rxi->data->data[n],
6705 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6706 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6707 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6708 regprop(r, sv, (regnode*)data.start_class);
6709 PerlIO_printf(Perl_debug_log,
6710 "synthetic stclass \"%s\".\n",
6711 SvPVX_const(sv));});
6712 data.start_class = NULL;
6716 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6717 the "real" pattern. */
6719 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6720 (IV)minlen, (IV)r->minlen);
6722 r->minlenret = minlen;
6723 if (r->minlen < minlen)
6726 if (RExC_seen & REG_SEEN_GPOS)
6727 r->extflags |= RXf_GPOS_SEEN;
6728 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6729 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6730 if (pRExC_state->num_code_blocks)
6731 r->extflags |= RXf_EVAL_SEEN;
6732 if (RExC_seen & REG_SEEN_CANY)
6733 r->extflags |= RXf_CANY_SEEN;
6734 if (RExC_seen & REG_SEEN_VERBARG)
6736 r->intflags |= PREGf_VERBARG_SEEN;
6737 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6739 if (RExC_seen & REG_SEEN_CUTGROUP)
6740 r->intflags |= PREGf_CUTGROUP_SEEN;
6741 if (pm_flags & PMf_USE_RE_EVAL)
6742 r->intflags |= PREGf_USE_RE_EVAL;
6743 if (RExC_paren_names)
6744 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6746 RXp_PAREN_NAMES(r) = NULL;
6749 regnode *first = ri->program + 1;
6751 regnode *next = NEXTOPER(first);
6754 if (PL_regkind[fop] == NOTHING && nop == END)
6755 r->extflags |= RXf_NULL;
6756 else if (PL_regkind[fop] == BOL && nop == END)
6757 r->extflags |= RXf_START_ONLY;
6758 else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6759 r->extflags |= RXf_WHITE;
6760 else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6761 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6765 if (RExC_paren_names) {
6766 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6767 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6770 ri->name_list_idx = 0;
6772 if (RExC_recurse_count) {
6773 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6774 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6775 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6778 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6779 /* assume we don't need to swap parens around before we match */
6782 PerlIO_printf(Perl_debug_log,"Final program:\n");
6785 #ifdef RE_TRACK_PATTERN_OFFSETS
6786 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6787 const STRLEN len = ri->u.offsets[0];
6789 GET_RE_DEBUG_FLAGS_DECL;
6790 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6791 for (i = 1; i <= len; i++) {
6792 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6793 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6794 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6796 PerlIO_printf(Perl_debug_log, "\n");
6801 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6802 * by setting the regexp SV to readonly-only instead. If the
6803 * pattern's been recompiled, the USEDness should remain. */
6804 if (old_re && SvREADONLY(old_re))
6812 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6815 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6817 PERL_UNUSED_ARG(value);
6819 if (flags & RXapif_FETCH) {
6820 return reg_named_buff_fetch(rx, key, flags);
6821 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6822 Perl_croak_no_modify();
6824 } else if (flags & RXapif_EXISTS) {
6825 return reg_named_buff_exists(rx, key, flags)
6828 } else if (flags & RXapif_REGNAMES) {
6829 return reg_named_buff_all(rx, flags);
6830 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6831 return reg_named_buff_scalar(rx, flags);
6833 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6839 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6842 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6843 PERL_UNUSED_ARG(lastkey);
6845 if (flags & RXapif_FIRSTKEY)
6846 return reg_named_buff_firstkey(rx, flags);
6847 else if (flags & RXapif_NEXTKEY)
6848 return reg_named_buff_nextkey(rx, flags);
6850 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6856 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6859 AV *retarray = NULL;
6861 struct regexp *const rx = ReANY(r);
6863 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6865 if (flags & RXapif_ALL)
6868 if (rx && RXp_PAREN_NAMES(rx)) {
6869 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6872 SV* sv_dat=HeVAL(he_str);
6873 I32 *nums=(I32*)SvPVX(sv_dat);
6874 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6875 if ((I32)(rx->nparens) >= nums[i]
6876 && rx->offs[nums[i]].start != -1
6877 && rx->offs[nums[i]].end != -1)
6880 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6885 ret = newSVsv(&PL_sv_undef);
6888 av_push(retarray, ret);
6891 return newRV_noinc(MUTABLE_SV(retarray));
6898 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6901 struct regexp *const rx = ReANY(r);
6903 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6905 if (rx && RXp_PAREN_NAMES(rx)) {
6906 if (flags & RXapif_ALL) {
6907 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6909 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6911 SvREFCNT_dec_NN(sv);
6923 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6925 struct regexp *const rx = ReANY(r);
6927 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6929 if ( rx && RXp_PAREN_NAMES(rx) ) {
6930 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6932 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6939 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6941 struct regexp *const rx = ReANY(r);
6942 GET_RE_DEBUG_FLAGS_DECL;
6944 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6946 if (rx && RXp_PAREN_NAMES(rx)) {
6947 HV *hv = RXp_PAREN_NAMES(rx);
6949 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6952 SV* sv_dat = HeVAL(temphe);
6953 I32 *nums = (I32*)SvPVX(sv_dat);
6954 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6955 if ((I32)(rx->lastparen) >= nums[i] &&
6956 rx->offs[nums[i]].start != -1 &&
6957 rx->offs[nums[i]].end != -1)
6963 if (parno || flags & RXapif_ALL) {
6964 return newSVhek(HeKEY_hek(temphe));
6972 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6977 struct regexp *const rx = ReANY(r);
6979 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6981 if (rx && RXp_PAREN_NAMES(rx)) {
6982 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6983 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6984 } else if (flags & RXapif_ONE) {
6985 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6986 av = MUTABLE_AV(SvRV(ret));
6987 length = av_len(av);
6988 SvREFCNT_dec_NN(ret);
6989 return newSViv(length + 1);
6991 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6995 return &PL_sv_undef;
6999 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7001 struct regexp *const rx = ReANY(r);
7004 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7006 if (rx && RXp_PAREN_NAMES(rx)) {
7007 HV *hv= RXp_PAREN_NAMES(rx);
7009 (void)hv_iterinit(hv);
7010 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7013 SV* sv_dat = HeVAL(temphe);
7014 I32 *nums = (I32*)SvPVX(sv_dat);
7015 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7016 if ((I32)(rx->lastparen) >= nums[i] &&
7017 rx->offs[nums[i]].start != -1 &&
7018 rx->offs[nums[i]].end != -1)
7024 if (parno || flags & RXapif_ALL) {
7025 av_push(av, newSVhek(HeKEY_hek(temphe)));
7030 return newRV_noinc(MUTABLE_SV(av));
7034 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7037 struct regexp *const rx = ReANY(r);
7043 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7045 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7046 || n == RX_BUFF_IDX_CARET_FULLMATCH
7047 || n == RX_BUFF_IDX_CARET_POSTMATCH
7050 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7052 /* on something like
7055 * the KEEPCOPY is set on the PMOP rather than the regex */
7056 if (PL_curpm && r == PM_GETRE(PL_curpm))
7057 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7066 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7067 /* no need to distinguish between them any more */
7068 n = RX_BUFF_IDX_FULLMATCH;
7070 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7071 && rx->offs[0].start != -1)
7073 /* $`, ${^PREMATCH} */
7074 i = rx->offs[0].start;
7078 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7079 && rx->offs[0].end != -1)
7081 /* $', ${^POSTMATCH} */
7082 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7083 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7086 if ( 0 <= n && n <= (I32)rx->nparens &&
7087 (s1 = rx->offs[n].start) != -1 &&
7088 (t1 = rx->offs[n].end) != -1)
7090 /* $&, ${^MATCH}, $1 ... */
7092 s = rx->subbeg + s1 - rx->suboffset;
7097 assert(s >= rx->subbeg);
7098 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7100 #if NO_TAINT_SUPPORT
7101 sv_setpvn(sv, s, i);
7103 const int oldtainted = TAINT_get;
7105 sv_setpvn(sv, s, i);
7106 TAINT_set(oldtainted);
7108 if ( (rx->extflags & RXf_CANY_SEEN)
7109 ? (RXp_MATCH_UTF8(rx)
7110 && (!i || is_utf8_string((U8*)s, i)))
7111 : (RXp_MATCH_UTF8(rx)) )
7118 if (RXp_MATCH_TAINTED(rx)) {
7119 if (SvTYPE(sv) >= SVt_PVMG) {
7120 MAGIC* const mg = SvMAGIC(sv);
7123 SvMAGIC_set(sv, mg->mg_moremagic);
7125 if ((mgt = SvMAGIC(sv))) {
7126 mg->mg_moremagic = mgt;
7127 SvMAGIC_set(sv, mg);
7138 sv_setsv(sv,&PL_sv_undef);
7144 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7145 SV const * const value)
7147 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7149 PERL_UNUSED_ARG(rx);
7150 PERL_UNUSED_ARG(paren);
7151 PERL_UNUSED_ARG(value);
7154 Perl_croak_no_modify();
7158 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7161 struct regexp *const rx = ReANY(r);
7165 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7167 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7168 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7169 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7172 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7174 /* on something like
7177 * the KEEPCOPY is set on the PMOP rather than the regex */
7178 if (PL_curpm && r == PM_GETRE(PL_curpm))
7179 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7185 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7187 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7188 case RX_BUFF_IDX_PREMATCH: /* $` */
7189 if (rx->offs[0].start != -1) {
7190 i = rx->offs[0].start;
7199 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7200 case RX_BUFF_IDX_POSTMATCH: /* $' */
7201 if (rx->offs[0].end != -1) {
7202 i = rx->sublen - rx->offs[0].end;
7204 s1 = rx->offs[0].end;
7211 default: /* $& / ${^MATCH}, $1, $2, ... */
7212 if (paren <= (I32)rx->nparens &&
7213 (s1 = rx->offs[paren].start) != -1 &&
7214 (t1 = rx->offs[paren].end) != -1)
7220 if (ckWARN(WARN_UNINITIALIZED))
7221 report_uninit((const SV *)sv);
7226 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7227 const char * const s = rx->subbeg - rx->suboffset + s1;
7232 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7239 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7241 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7242 PERL_UNUSED_ARG(rx);
7246 return newSVpvs("Regexp");
7249 /* Scans the name of a named buffer from the pattern.
7250 * If flags is REG_RSN_RETURN_NULL returns null.
7251 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7252 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7253 * to the parsed name as looked up in the RExC_paren_names hash.
7254 * If there is an error throws a vFAIL().. type exception.
7257 #define REG_RSN_RETURN_NULL 0
7258 #define REG_RSN_RETURN_NAME 1
7259 #define REG_RSN_RETURN_DATA 2
7262 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7264 char *name_start = RExC_parse;
7266 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7268 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7269 /* skip IDFIRST by using do...while */
7272 RExC_parse += UTF8SKIP(RExC_parse);
7273 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7277 } while (isWORDCHAR(*RExC_parse));
7279 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7280 vFAIL("Group name must start with a non-digit word character");
7284 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7285 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7286 if ( flags == REG_RSN_RETURN_NAME)
7288 else if (flags==REG_RSN_RETURN_DATA) {
7291 if ( ! sv_name ) /* should not happen*/
7292 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7293 if (RExC_paren_names)
7294 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7296 sv_dat = HeVAL(he_str);
7298 vFAIL("Reference to nonexistent named group");
7302 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7303 (unsigned long) flags);
7305 assert(0); /* NOT REACHED */
7310 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7311 int rem=(int)(RExC_end - RExC_parse); \
7320 if (RExC_lastparse!=RExC_parse) \
7321 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7324 iscut ? "..." : "<" \
7327 PerlIO_printf(Perl_debug_log,"%16s",""); \
7330 num = RExC_size + 1; \
7332 num=REG_NODE_NUM(RExC_emit); \
7333 if (RExC_lastnum!=num) \
7334 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7336 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7337 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7338 (int)((depth*2)), "", \
7342 RExC_lastparse=RExC_parse; \
7347 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7348 DEBUG_PARSE_MSG((funcname)); \
7349 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7351 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7352 DEBUG_PARSE_MSG((funcname)); \
7353 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7356 /* This section of code defines the inversion list object and its methods. The
7357 * interfaces are highly subject to change, so as much as possible is static to
7358 * this file. An inversion list is here implemented as a malloc'd C UV array
7359 * as an SVt_INVLIST scalar.
7361 * An inversion list for Unicode is an array of code points, sorted by ordinal
7362 * number. The zeroth element is the first code point in the list. The 1th
7363 * element is the first element beyond that not in the list. In other words,
7364 * the first range is
7365 * invlist[0]..(invlist[1]-1)
7366 * The other ranges follow. Thus every element whose index is divisible by two
7367 * marks the beginning of a range that is in the list, and every element not
7368 * divisible by two marks the beginning of a range not in the list. A single
7369 * element inversion list that contains the single code point N generally
7370 * consists of two elements
7373 * (The exception is when N is the highest representable value on the
7374 * machine, in which case the list containing just it would be a single
7375 * element, itself. By extension, if the last range in the list extends to
7376 * infinity, then the first element of that range will be in the inversion list
7377 * at a position that is divisible by two, and is the final element in the
7379 * Taking the complement (inverting) an inversion list is quite simple, if the
7380 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7381 * This implementation reserves an element at the beginning of each inversion
7382 * list to always contain 0; there is an additional flag in the header which
7383 * indicates if the list begins at the 0, or is offset to begin at the next
7386 * More about inversion lists can be found in "Unicode Demystified"
7387 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7388 * More will be coming when functionality is added later.
7390 * The inversion list data structure is currently implemented as an SV pointing
7391 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7392 * array of UV whose memory management is automatically handled by the existing
7393 * facilities for SV's.
7395 * Some of the methods should always be private to the implementation, and some
7396 * should eventually be made public */
7398 /* The header definitions are in F<inline_invlist.c> */
7400 PERL_STATIC_INLINE UV*
7401 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7403 /* Returns a pointer to the first element in the inversion list's array.
7404 * This is called upon initialization of an inversion list. Where the
7405 * array begins depends on whether the list has the code point U+0000 in it
7406 * or not. The other parameter tells it whether the code that follows this
7407 * call is about to put a 0 in the inversion list or not. The first
7408 * element is either the element reserved for 0, if TRUE, or the element
7409 * after it, if FALSE */
7411 bool* offset = get_invlist_offset_addr(invlist);
7412 UV* zero_addr = (UV *) SvPVX(invlist);
7414 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7417 assert(! _invlist_len(invlist));
7421 /* 1^1 = 0; 1^0 = 1 */
7422 *offset = 1 ^ will_have_0;
7423 return zero_addr + *offset;
7426 PERL_STATIC_INLINE UV*
7427 S_invlist_array(pTHX_ SV* const invlist)
7429 /* Returns the pointer to the inversion list's array. Every time the
7430 * length changes, this needs to be called in case malloc or realloc moved
7433 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7435 /* Must not be empty. If these fail, you probably didn't check for <len>
7436 * being non-zero before trying to get the array */
7437 assert(_invlist_len(invlist));
7439 /* The very first element always contains zero, The array begins either
7440 * there, or if the inversion list is offset, at the element after it.
7441 * The offset header field determines which; it contains 0 or 1 to indicate
7442 * how much additionally to add */
7443 assert(0 == *(SvPVX(invlist)));
7444 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7447 PERL_STATIC_INLINE void
7448 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7450 /* Sets the current number of elements stored in the inversion list.
7451 * Updates SvCUR correspondingly */
7453 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7455 assert(SvTYPE(invlist) == SVt_INVLIST);
7460 : TO_INTERNAL_SIZE(len + offset));
7461 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7464 PERL_STATIC_INLINE IV*
7465 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7467 /* Return the address of the IV that is reserved to hold the cached index
7470 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7472 assert(SvTYPE(invlist) == SVt_INVLIST);
7474 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7477 PERL_STATIC_INLINE IV
7478 S_invlist_previous_index(pTHX_ SV* const invlist)
7480 /* Returns cached index of previous search */
7482 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7484 return *get_invlist_previous_index_addr(invlist);
7487 PERL_STATIC_INLINE void
7488 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7490 /* Caches <index> for later retrieval */
7492 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7494 assert(index == 0 || index < (int) _invlist_len(invlist));
7496 *get_invlist_previous_index_addr(invlist) = index;
7499 PERL_STATIC_INLINE UV
7500 S_invlist_max(pTHX_ SV* const invlist)
7502 /* Returns the maximum number of elements storable in the inversion list's
7503 * array, without having to realloc() */
7505 PERL_ARGS_ASSERT_INVLIST_MAX;
7507 assert(SvTYPE(invlist) == SVt_INVLIST);
7509 /* Assumes worst case, in which the 0 element is not counted in the
7510 * inversion list, so subtracts 1 for that */
7511 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7512 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7513 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7516 #ifndef PERL_IN_XSUB_RE
7518 Perl__new_invlist(pTHX_ IV initial_size)
7521 /* Return a pointer to a newly constructed inversion list, with enough
7522 * space to store 'initial_size' elements. If that number is negative, a
7523 * system default is used instead */
7527 if (initial_size < 0) {
7531 /* Allocate the initial space */
7532 new_list = newSV_type(SVt_INVLIST);
7534 /* First 1 is in case the zero element isn't in the list; second 1 is for
7536 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7537 invlist_set_len(new_list, 0, 0);
7539 /* Force iterinit() to be used to get iteration to work */
7540 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7542 *get_invlist_previous_index_addr(new_list) = 0;
7549 S__new_invlist_C_array(pTHX_ const UV* const list)
7551 /* Return a pointer to a newly constructed inversion list, initialized to
7552 * point to <list>, which has to be in the exact correct inversion list
7553 * form, including internal fields. Thus this is a dangerous routine that
7554 * should not be used in the wrong hands. The passed in 'list' contains
7555 * several header fields at the beginning that are not part of the
7556 * inversion list body proper */
7558 const STRLEN length = (STRLEN) list[0];
7559 const UV version_id = list[1];
7560 const bool offset = cBOOL(list[2]);
7561 #define HEADER_LENGTH 3
7562 /* If any of the above changes in any way, you must change HEADER_LENGTH
7563 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7564 * perl -E 'say int(rand 2**31-1)'
7566 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7567 data structure type, so that one being
7568 passed in can be validated to be an
7569 inversion list of the correct vintage.
7572 SV* invlist = newSV_type(SVt_INVLIST);
7574 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7576 if (version_id != INVLIST_VERSION_ID) {
7577 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7580 /* The generated array passed in includes header elements that aren't part
7581 * of the list proper, so start it just after them */
7582 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7584 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7585 shouldn't touch it */
7587 *(get_invlist_offset_addr(invlist)) = offset;
7589 /* The 'length' passed to us is the physical number of elements in the
7590 * inversion list. But if there is an offset the logical number is one
7592 invlist_set_len(invlist, length - offset, offset);
7594 invlist_set_previous_index(invlist, 0);
7596 /* Initialize the iteration pointer. */
7597 invlist_iterfinish(invlist);
7603 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7605 /* Grow the maximum size of an inversion list */
7607 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7609 assert(SvTYPE(invlist) == SVt_INVLIST);
7611 /* Add one to account for the zero element at the beginning which may not
7612 * be counted by the calling parameters */
7613 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7616 PERL_STATIC_INLINE void
7617 S_invlist_trim(pTHX_ SV* const invlist)
7619 PERL_ARGS_ASSERT_INVLIST_TRIM;
7621 assert(SvTYPE(invlist) == SVt_INVLIST);
7623 /* Change the length of the inversion list to how many entries it currently
7625 SvPV_shrink_to_cur((SV *) invlist);
7629 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7631 /* Subject to change or removal. Append the range from 'start' to 'end' at
7632 * the end of the inversion list. The range must be above any existing
7636 UV max = invlist_max(invlist);
7637 UV len = _invlist_len(invlist);
7640 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7642 if (len == 0) { /* Empty lists must be initialized */
7643 offset = start != 0;
7644 array = _invlist_array_init(invlist, ! offset);
7647 /* Here, the existing list is non-empty. The current max entry in the
7648 * list is generally the first value not in the set, except when the
7649 * set extends to the end of permissible values, in which case it is
7650 * the first entry in that final set, and so this call is an attempt to
7651 * append out-of-order */
7653 UV final_element = len - 1;
7654 array = invlist_array(invlist);
7655 if (array[final_element] > start
7656 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7658 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7659 array[final_element], start,
7660 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7663 /* Here, it is a legal append. If the new range begins with the first
7664 * value not in the set, it is extending the set, so the new first
7665 * value not in the set is one greater than the newly extended range.
7667 offset = *get_invlist_offset_addr(invlist);
7668 if (array[final_element] == start) {
7669 if (end != UV_MAX) {
7670 array[final_element] = end + 1;
7673 /* But if the end is the maximum representable on the machine,
7674 * just let the range that this would extend to have no end */
7675 invlist_set_len(invlist, len - 1, offset);
7681 /* Here the new range doesn't extend any existing set. Add it */
7683 len += 2; /* Includes an element each for the start and end of range */
7685 /* If wll overflow the existing space, extend, which may cause the array to
7688 invlist_extend(invlist, len);
7690 /* Have to set len here to avoid assert failure in invlist_array() */
7691 invlist_set_len(invlist, len, offset);
7693 array = invlist_array(invlist);
7696 invlist_set_len(invlist, len, offset);
7699 /* The next item on the list starts the range, the one after that is
7700 * one past the new range. */
7701 array[len - 2] = start;
7702 if (end != UV_MAX) {
7703 array[len - 1] = end + 1;
7706 /* But if the end is the maximum representable on the machine, just let
7707 * the range have no end */
7708 invlist_set_len(invlist, len - 1, offset);
7712 #ifndef PERL_IN_XSUB_RE
7715 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7717 /* Searches the inversion list for the entry that contains the input code
7718 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7719 * return value is the index into the list's array of the range that
7724 IV high = _invlist_len(invlist);
7725 const IV highest_element = high - 1;
7728 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7730 /* If list is empty, return failure. */
7735 /* (We can't get the array unless we know the list is non-empty) */
7736 array = invlist_array(invlist);
7738 mid = invlist_previous_index(invlist);
7739 assert(mid >=0 && mid <= highest_element);
7741 /* <mid> contains the cache of the result of the previous call to this
7742 * function (0 the first time). See if this call is for the same result,
7743 * or if it is for mid-1. This is under the theory that calls to this
7744 * function will often be for related code points that are near each other.
7745 * And benchmarks show that caching gives better results. We also test
7746 * here if the code point is within the bounds of the list. These tests
7747 * replace others that would have had to be made anyway to make sure that
7748 * the array bounds were not exceeded, and these give us extra information
7749 * at the same time */
7750 if (cp >= array[mid]) {
7751 if (cp >= array[highest_element]) {
7752 return highest_element;
7755 /* Here, array[mid] <= cp < array[highest_element]. This means that
7756 * the final element is not the answer, so can exclude it; it also
7757 * means that <mid> is not the final element, so can refer to 'mid + 1'
7759 if (cp < array[mid + 1]) {
7765 else { /* cp < aray[mid] */
7766 if (cp < array[0]) { /* Fail if outside the array */
7770 if (cp >= array[mid - 1]) {
7775 /* Binary search. What we are looking for is <i> such that
7776 * array[i] <= cp < array[i+1]
7777 * The loop below converges on the i+1. Note that there may not be an
7778 * (i+1)th element in the array, and things work nonetheless */
7779 while (low < high) {
7780 mid = (low + high) / 2;
7781 assert(mid <= highest_element);
7782 if (array[mid] <= cp) { /* cp >= array[mid] */
7785 /* We could do this extra test to exit the loop early.
7786 if (cp < array[low]) {
7791 else { /* cp < array[mid] */
7798 invlist_set_previous_index(invlist, high);
7803 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7805 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7806 * but is used when the swash has an inversion list. This makes this much
7807 * faster, as it uses a binary search instead of a linear one. This is
7808 * intimately tied to that function, and perhaps should be in utf8.c,
7809 * except it is intimately tied to inversion lists as well. It assumes
7810 * that <swatch> is all 0's on input */
7813 const IV len = _invlist_len(invlist);
7817 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7819 if (len == 0) { /* Empty inversion list */
7823 array = invlist_array(invlist);
7825 /* Find which element it is */
7826 i = _invlist_search(invlist, start);
7828 /* We populate from <start> to <end> */
7829 while (current < end) {
7832 /* The inversion list gives the results for every possible code point
7833 * after the first one in the list. Only those ranges whose index is
7834 * even are ones that the inversion list matches. For the odd ones,
7835 * and if the initial code point is not in the list, we have to skip
7836 * forward to the next element */
7837 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7839 if (i >= len) { /* Finished if beyond the end of the array */
7843 if (current >= end) { /* Finished if beyond the end of what we
7845 if (LIKELY(end < UV_MAX)) {
7849 /* We get here when the upper bound is the maximum
7850 * representable on the machine, and we are looking for just
7851 * that code point. Have to special case it */
7853 goto join_end_of_list;
7856 assert(current >= start);
7858 /* The current range ends one below the next one, except don't go past
7861 upper = (i < len && array[i] < end) ? array[i] : end;
7863 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7864 * for each code point in it */
7865 for (; current < upper; current++) {
7866 const STRLEN offset = (STRLEN)(current - start);
7867 swatch[offset >> 3] |= 1 << (offset & 7);
7872 /* Quit if at the end of the list */
7875 /* But first, have to deal with the highest possible code point on
7876 * the platform. The previous code assumes that <end> is one
7877 * beyond where we want to populate, but that is impossible at the
7878 * platform's infinity, so have to handle it specially */
7879 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7881 const STRLEN offset = (STRLEN)(end - start);
7882 swatch[offset >> 3] |= 1 << (offset & 7);
7887 /* Advance to the next range, which will be for code points not in the
7896 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7898 /* Take the union of two inversion lists and point <output> to it. *output
7899 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7900 * the reference count to that list will be decremented if not already a
7901 * temporary (mortal); otherwise *output will be made correspondingly
7902 * mortal. The first list, <a>, may be NULL, in which case a copy of the
7903 * second list is returned. If <complement_b> is TRUE, the union is taken
7904 * of the complement (inversion) of <b> instead of b itself.
7906 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7907 * Richard Gillam, published by Addison-Wesley, and explained at some
7908 * length there. The preface says to incorporate its examples into your
7909 * code at your own risk.
7911 * The algorithm is like a merge sort.
7913 * XXX A potential performance improvement is to keep track as we go along
7914 * if only one of the inputs contributes to the result, meaning the other
7915 * is a subset of that one. In that case, we can skip the final copy and
7916 * return the larger of the input lists, but then outside code might need
7917 * to keep track of whether to free the input list or not */
7919 const UV* array_a; /* a's array */
7921 UV len_a; /* length of a's array */
7924 SV* u; /* the resulting union */
7928 UV i_a = 0; /* current index into a's array */
7932 /* running count, as explained in the algorithm source book; items are
7933 * stopped accumulating and are output when the count changes to/from 0.
7934 * The count is incremented when we start a range that's in the set, and
7935 * decremented when we start a range that's not in the set. So its range
7936 * is 0 to 2. Only when the count is zero is something not in the set.
7940 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7943 /* If either one is empty, the union is the other one */
7944 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7945 bool make_temp = FALSE; /* Should we mortalize the result? */
7949 if (! (make_temp = cBOOL(SvTEMP(a)))) {
7955 *output = invlist_clone(b);
7957 _invlist_invert(*output);
7959 } /* else *output already = b; */
7962 sv_2mortal(*output);
7966 else if ((len_b = _invlist_len(b)) == 0) {
7967 bool make_temp = FALSE;
7969 if (! (make_temp = cBOOL(SvTEMP(b)))) {
7974 /* The complement of an empty list is a list that has everything in it,
7975 * so the union with <a> includes everything too */
7978 if (! (make_temp = cBOOL(SvTEMP(a)))) {
7982 *output = _new_invlist(1);
7983 _append_range_to_invlist(*output, 0, UV_MAX);
7985 else if (*output != a) {
7986 *output = invlist_clone(a);
7988 /* else *output already = a; */
7991 sv_2mortal(*output);
7996 /* Here both lists exist and are non-empty */
7997 array_a = invlist_array(a);
7998 array_b = invlist_array(b);
8000 /* If are to take the union of 'a' with the complement of b, set it
8001 * up so are looking at b's complement. */
8004 /* To complement, we invert: if the first element is 0, remove it. To
8005 * do this, we just pretend the array starts one later */
8006 if (array_b[0] == 0) {
8012 /* But if the first element is not zero, we pretend the list starts
8013 * at the 0 that is always stored immediately before the array. */
8019 /* Size the union for the worst case: that the sets are completely
8021 u = _new_invlist(len_a + len_b);
8023 /* Will contain U+0000 if either component does */
8024 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8025 || (len_b > 0 && array_b[0] == 0));
8027 /* Go through each list item by item, stopping when exhausted one of
8029 while (i_a < len_a && i_b < len_b) {
8030 UV cp; /* The element to potentially add to the union's array */
8031 bool cp_in_set; /* is it in the the input list's set or not */
8033 /* We need to take one or the other of the two inputs for the union.
8034 * Since we are merging two sorted lists, we take the smaller of the
8035 * next items. In case of a tie, we take the one that is in its set
8036 * first. If we took one not in the set first, it would decrement the
8037 * count, possibly to 0 which would cause it to be output as ending the
8038 * range, and the next time through we would take the same number, and
8039 * output it again as beginning the next range. By doing it the
8040 * opposite way, there is no possibility that the count will be
8041 * momentarily decremented to 0, and thus the two adjoining ranges will
8042 * be seamlessly merged. (In a tie and both are in the set or both not
8043 * in the set, it doesn't matter which we take first.) */
8044 if (array_a[i_a] < array_b[i_b]
8045 || (array_a[i_a] == array_b[i_b]
8046 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8048 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8052 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8053 cp = array_b[i_b++];
8056 /* Here, have chosen which of the two inputs to look at. Only output
8057 * if the running count changes to/from 0, which marks the
8058 * beginning/end of a range in that's in the set */
8061 array_u[i_u++] = cp;
8068 array_u[i_u++] = cp;
8073 /* Here, we are finished going through at least one of the lists, which
8074 * means there is something remaining in at most one. We check if the list
8075 * that hasn't been exhausted is positioned such that we are in the middle
8076 * of a range in its set or not. (i_a and i_b point to the element beyond
8077 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8078 * is potentially more to output.
8079 * There are four cases:
8080 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8081 * in the union is entirely from the non-exhausted set.
8082 * 2) Both were in their sets, count is 2. Nothing further should
8083 * be output, as everything that remains will be in the exhausted
8084 * list's set, hence in the union; decrementing to 1 but not 0 insures
8086 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8087 * Nothing further should be output because the union includes
8088 * everything from the exhausted set. Not decrementing ensures that.
8089 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8090 * decrementing to 0 insures that we look at the remainder of the
8091 * non-exhausted set */
8092 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8093 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8098 /* The final length is what we've output so far, plus what else is about to
8099 * be output. (If 'count' is non-zero, then the input list we exhausted
8100 * has everything remaining up to the machine's limit in its set, and hence
8101 * in the union, so there will be no further output. */
8104 /* At most one of the subexpressions will be non-zero */
8105 len_u += (len_a - i_a) + (len_b - i_b);
8108 /* Set result to final length, which can change the pointer to array_u, so
8110 if (len_u != _invlist_len(u)) {
8111 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8113 array_u = invlist_array(u);
8116 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8117 * the other) ended with everything above it not in its set. That means
8118 * that the remaining part of the union is precisely the same as the
8119 * non-exhausted list, so can just copy it unchanged. (If both list were
8120 * exhausted at the same time, then the operations below will be both 0.)
8123 IV copy_count; /* At most one will have a non-zero copy count */
8124 if ((copy_count = len_a - i_a) > 0) {
8125 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8127 else if ((copy_count = len_b - i_b) > 0) {
8128 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8132 /* We may be removing a reference to one of the inputs. If so, the output
8133 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8134 * count decremented) */
8135 if (a == *output || b == *output) {
8136 assert(! invlist_is_iterating(*output));
8137 if ((SvTEMP(*output))) {
8141 SvREFCNT_dec_NN(*output);
8151 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8153 /* Take the intersection of two inversion lists and point <i> to it. *i
8154 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8155 * the reference count to that list will be decremented if not already a
8156 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8157 * The first list, <a>, may be NULL, in which case an empty list is
8158 * returned. If <complement_b> is TRUE, the result will be the
8159 * intersection of <a> and the complement (or inversion) of <b> instead of
8162 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8163 * Richard Gillam, published by Addison-Wesley, and explained at some
8164 * length there. The preface says to incorporate its examples into your
8165 * code at your own risk. In fact, it had bugs
8167 * The algorithm is like a merge sort, and is essentially the same as the
8171 const UV* array_a; /* a's array */
8173 UV len_a; /* length of a's array */
8176 SV* r; /* the resulting intersection */
8180 UV i_a = 0; /* current index into a's array */
8184 /* running count, as explained in the algorithm source book; items are
8185 * stopped accumulating and are output when the count changes to/from 2.
8186 * The count is incremented when we start a range that's in the set, and
8187 * decremented when we start a range that's not in the set. So its range
8188 * is 0 to 2. Only when the count is 2 is something in the intersection.
8192 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8195 /* Special case if either one is empty */
8196 len_a = (a == NULL) ? 0 : _invlist_len(a);
8197 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8198 bool make_temp = FALSE;
8200 if (len_a != 0 && complement_b) {
8202 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8203 * be empty. Here, also we are using 'b's complement, which hence
8204 * must be every possible code point. Thus the intersection is
8208 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8213 *i = invlist_clone(a);
8215 /* else *i is already 'a' */
8223 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8224 * intersection must be empty */
8226 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8231 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8235 *i = _new_invlist(0);
8243 /* Here both lists exist and are non-empty */
8244 array_a = invlist_array(a);
8245 array_b = invlist_array(b);
8247 /* If are to take the intersection of 'a' with the complement of b, set it
8248 * up so are looking at b's complement. */
8251 /* To complement, we invert: if the first element is 0, remove it. To
8252 * do this, we just pretend the array starts one later */
8253 if (array_b[0] == 0) {
8259 /* But if the first element is not zero, we pretend the list starts
8260 * at the 0 that is always stored immediately before the array. */
8266 /* Size the intersection for the worst case: that the intersection ends up
8267 * fragmenting everything to be completely disjoint */
8268 r= _new_invlist(len_a + len_b);
8270 /* Will contain U+0000 iff both components do */
8271 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8272 && len_b > 0 && array_b[0] == 0);
8274 /* Go through each list item by item, stopping when exhausted one of
8276 while (i_a < len_a && i_b < len_b) {
8277 UV cp; /* The element to potentially add to the intersection's
8279 bool cp_in_set; /* Is it in the input list's set or not */
8281 /* We need to take one or the other of the two inputs for the
8282 * intersection. Since we are merging two sorted lists, we take the
8283 * smaller of the next items. In case of a tie, we take the one that
8284 * is not in its set first (a difference from the union algorithm). If
8285 * we took one in the set first, it would increment the count, possibly
8286 * to 2 which would cause it to be output as starting a range in the
8287 * intersection, and the next time through we would take that same
8288 * number, and output it again as ending the set. By doing it the
8289 * opposite of this, there is no possibility that the count will be
8290 * momentarily incremented to 2. (In a tie and both are in the set or
8291 * both not in the set, it doesn't matter which we take first.) */
8292 if (array_a[i_a] < array_b[i_b]
8293 || (array_a[i_a] == array_b[i_b]
8294 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8296 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8300 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8304 /* Here, have chosen which of the two inputs to look at. Only output
8305 * if the running count changes to/from 2, which marks the
8306 * beginning/end of a range that's in the intersection */
8310 array_r[i_r++] = cp;
8315 array_r[i_r++] = cp;
8321 /* Here, we are finished going through at least one of the lists, which
8322 * means there is something remaining in at most one. We check if the list
8323 * that has been exhausted is positioned such that we are in the middle
8324 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8325 * the ones we care about.) There are four cases:
8326 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8327 * nothing left in the intersection.
8328 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8329 * above 2. What should be output is exactly that which is in the
8330 * non-exhausted set, as everything it has is also in the intersection
8331 * set, and everything it doesn't have can't be in the intersection
8332 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8333 * gets incremented to 2. Like the previous case, the intersection is
8334 * everything that remains in the non-exhausted set.
8335 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8336 * remains 1. And the intersection has nothing more. */
8337 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8338 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8343 /* The final length is what we've output so far plus what else is in the
8344 * intersection. At most one of the subexpressions below will be non-zero */
8347 len_r += (len_a - i_a) + (len_b - i_b);
8350 /* Set result to final length, which can change the pointer to array_r, so
8352 if (len_r != _invlist_len(r)) {
8353 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8355 array_r = invlist_array(r);
8358 /* Finish outputting any remaining */
8359 if (count >= 2) { /* At most one will have a non-zero copy count */
8361 if ((copy_count = len_a - i_a) > 0) {
8362 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8364 else if ((copy_count = len_b - i_b) > 0) {
8365 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8369 /* We may be removing a reference to one of the inputs. If so, the output
8370 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8371 * count decremented) */
8372 if (a == *i || b == *i) {
8373 assert(! invlist_is_iterating(*i));
8378 SvREFCNT_dec_NN(*i);
8388 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8390 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8391 * set. A pointer to the inversion list is returned. This may actually be
8392 * a new list, in which case the passed in one has been destroyed. The
8393 * passed in inversion list can be NULL, in which case a new one is created
8394 * with just the one range in it */
8399 if (invlist == NULL) {
8400 invlist = _new_invlist(2);
8404 len = _invlist_len(invlist);
8407 /* If comes after the final entry actually in the list, can just append it
8410 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8411 && start >= invlist_array(invlist)[len - 1]))
8413 _append_range_to_invlist(invlist, start, end);
8417 /* Here, can't just append things, create and return a new inversion list
8418 * which is the union of this range and the existing inversion list */
8419 range_invlist = _new_invlist(2);
8420 _append_range_to_invlist(range_invlist, start, end);
8422 _invlist_union(invlist, range_invlist, &invlist);
8424 /* The temporary can be freed */
8425 SvREFCNT_dec_NN(range_invlist);
8432 PERL_STATIC_INLINE SV*
8433 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8434 return _add_range_to_invlist(invlist, cp, cp);
8437 #ifndef PERL_IN_XSUB_RE
8439 Perl__invlist_invert(pTHX_ SV* const invlist)
8441 /* Complement the input inversion list. This adds a 0 if the list didn't
8442 * have a zero; removes it otherwise. As described above, the data
8443 * structure is set up so that this is very efficient */
8445 PERL_ARGS_ASSERT__INVLIST_INVERT;
8447 assert(! invlist_is_iterating(invlist));
8449 /* The inverse of matching nothing is matching everything */
8450 if (_invlist_len(invlist) == 0) {
8451 _append_range_to_invlist(invlist, 0, UV_MAX);
8455 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8459 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8461 /* Complement the input inversion list (which must be a Unicode property,
8462 * all of which don't match above the Unicode maximum code point.) And
8463 * Perl has chosen to not have the inversion match above that either. This
8464 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8470 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8472 _invlist_invert(invlist);
8474 len = _invlist_len(invlist);
8476 if (len != 0) { /* If empty do nothing */
8477 array = invlist_array(invlist);
8478 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8479 /* Add 0x110000. First, grow if necessary */
8481 if (invlist_max(invlist) < len) {
8482 invlist_extend(invlist, len);
8483 array = invlist_array(invlist);
8485 invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8486 array[len - 1] = PERL_UNICODE_MAX + 1;
8488 else { /* Remove the 0x110000 */
8489 invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8497 PERL_STATIC_INLINE SV*
8498 S_invlist_clone(pTHX_ SV* const invlist)
8501 /* Return a new inversion list that is a copy of the input one, which is
8502 * unchanged. The new list will not be mortal even if the old one was. */
8504 /* Need to allocate extra space to accommodate Perl's addition of a
8505 * trailing NUL to SvPV's, since it thinks they are always strings */
8506 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8507 STRLEN physical_length = SvCUR(invlist);
8508 bool offset = *(get_invlist_offset_addr(invlist));
8510 PERL_ARGS_ASSERT_INVLIST_CLONE;
8512 *(get_invlist_offset_addr(new_invlist)) = offset;
8513 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8514 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8519 PERL_STATIC_INLINE STRLEN*
8520 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8522 /* Return the address of the UV that contains the current iteration
8525 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8527 assert(SvTYPE(invlist) == SVt_INVLIST);
8529 return &(((XINVLIST*) SvANY(invlist))->iterator);
8532 PERL_STATIC_INLINE void
8533 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8535 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8537 *get_invlist_iter_addr(invlist) = 0;
8540 PERL_STATIC_INLINE void
8541 S_invlist_iterfinish(pTHX_ SV* invlist)
8543 /* Terminate iterator for invlist. This is to catch development errors.
8544 * Any iteration that is interrupted before completed should call this
8545 * function. Functions that add code points anywhere else but to the end
8546 * of an inversion list assert that they are not in the middle of an
8547 * iteration. If they were, the addition would make the iteration
8548 * problematical: if the iteration hadn't reached the place where things
8549 * were being added, it would be ok */
8551 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8553 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8557 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8559 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8560 * This call sets in <*start> and <*end>, the next range in <invlist>.
8561 * Returns <TRUE> if successful and the next call will return the next
8562 * range; <FALSE> if was already at the end of the list. If the latter,
8563 * <*start> and <*end> are unchanged, and the next call to this function
8564 * will start over at the beginning of the list */
8566 STRLEN* pos = get_invlist_iter_addr(invlist);
8567 UV len = _invlist_len(invlist);
8570 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8573 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8577 array = invlist_array(invlist);
8579 *start = array[(*pos)++];
8585 *end = array[(*pos)++] - 1;
8591 PERL_STATIC_INLINE bool
8592 S_invlist_is_iterating(pTHX_ SV* const invlist)
8594 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8596 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8599 PERL_STATIC_INLINE UV
8600 S_invlist_highest(pTHX_ SV* const invlist)
8602 /* Returns the highest code point that matches an inversion list. This API
8603 * has an ambiguity, as it returns 0 under either the highest is actually
8604 * 0, or if the list is empty. If this distinction matters to you, check
8605 * for emptiness before calling this function */
8607 UV len = _invlist_len(invlist);
8610 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8616 array = invlist_array(invlist);
8618 /* The last element in the array in the inversion list always starts a
8619 * range that goes to infinity. That range may be for code points that are
8620 * matched in the inversion list, or it may be for ones that aren't
8621 * matched. In the latter case, the highest code point in the set is one
8622 * less than the beginning of this range; otherwise it is the final element
8623 * of this range: infinity */
8624 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8626 : array[len - 1] - 1;
8629 #ifndef PERL_IN_XSUB_RE
8631 Perl__invlist_contents(pTHX_ SV* const invlist)
8633 /* Get the contents of an inversion list into a string SV so that they can
8634 * be printed out. It uses the format traditionally done for debug tracing
8638 SV* output = newSVpvs("\n");
8640 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8642 assert(! invlist_is_iterating(invlist));
8644 invlist_iterinit(invlist);
8645 while (invlist_iternext(invlist, &start, &end)) {
8646 if (end == UV_MAX) {
8647 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8649 else if (end != start) {
8650 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8654 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8662 #ifndef PERL_IN_XSUB_RE
8664 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8666 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8667 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8668 * the string 'indent'. The output looks like this:
8669 [0] 0x000A .. 0x000D
8671 [4] 0x2028 .. 0x2029
8672 [6] 0x3104 .. INFINITY
8673 * This means that the first range of code points matched by the list are
8674 * 0xA through 0xD; the second range contains only the single code point
8675 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8676 * are used to define each range (except if the final range extends to
8677 * infinity, only a single element is needed). The array index of the
8678 * first element for the corresponding range is given in brackets. */
8683 PERL_ARGS_ASSERT__INVLIST_DUMP;
8685 if (invlist_is_iterating(invlist)) {
8686 Perl_dump_indent(aTHX_ level, file,
8687 "%sCan't dump inversion list because is in middle of iterating\n",
8692 invlist_iterinit(invlist);
8693 while (invlist_iternext(invlist, &start, &end)) {
8694 if (end == UV_MAX) {
8695 Perl_dump_indent(aTHX_ level, file,
8696 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8697 indent, (UV)count, start);
8699 else if (end != start) {
8700 Perl_dump_indent(aTHX_ level, file,
8701 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8702 indent, (UV)count, start, end);
8705 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8706 indent, (UV)count, start);
8713 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8715 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8717 /* Return a boolean as to if the two passed in inversion lists are
8718 * identical. The final argument, if TRUE, says to take the complement of
8719 * the second inversion list before doing the comparison */
8721 const UV* array_a = invlist_array(a);
8722 const UV* array_b = invlist_array(b);
8723 UV len_a = _invlist_len(a);
8724 UV len_b = _invlist_len(b);
8726 UV i = 0; /* current index into the arrays */
8727 bool retval = TRUE; /* Assume are identical until proven otherwise */
8729 PERL_ARGS_ASSERT__INVLISTEQ;
8731 /* If are to compare 'a' with the complement of b, set it
8732 * up so are looking at b's complement. */
8735 /* The complement of nothing is everything, so <a> would have to have
8736 * just one element, starting at zero (ending at infinity) */
8738 return (len_a == 1 && array_a[0] == 0);
8740 else if (array_b[0] == 0) {
8742 /* Otherwise, to complement, we invert. Here, the first element is
8743 * 0, just remove it. To do this, we just pretend the array starts
8751 /* But if the first element is not zero, we pretend the list starts
8752 * at the 0 that is always stored immediately before the array. */
8758 /* Make sure that the lengths are the same, as well as the final element
8759 * before looping through the remainder. (Thus we test the length, final,
8760 * and first elements right off the bat) */
8761 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8764 else for (i = 0; i < len_a - 1; i++) {
8765 if (array_a[i] != array_b[i]) {
8775 #undef HEADER_LENGTH
8776 #undef TO_INTERNAL_SIZE
8777 #undef FROM_INTERNAL_SIZE
8778 #undef INVLIST_VERSION_ID
8780 /* End of inversion list object */
8783 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8785 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8786 * constructs, and updates RExC_flags with them. On input, RExC_parse
8787 * should point to the first flag; it is updated on output to point to the
8788 * final ')' or ':'. There needs to be at least one flag, or this will
8791 /* for (?g), (?gc), and (?o) warnings; warning
8792 about (?c) will warn about (?g) -- japhy */
8794 #define WASTED_O 0x01
8795 #define WASTED_G 0x02
8796 #define WASTED_C 0x04
8797 #define WASTED_GC (WASTED_G|WASTED_C)
8798 I32 wastedflags = 0x00;
8799 U32 posflags = 0, negflags = 0;
8800 U32 *flagsp = &posflags;
8801 char has_charset_modifier = '\0';
8803 bool has_use_defaults = FALSE;
8804 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8806 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8808 /* '^' as an initial flag sets certain defaults */
8809 if (UCHARAT(RExC_parse) == '^') {
8811 has_use_defaults = TRUE;
8812 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8813 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8814 ? REGEX_UNICODE_CHARSET
8815 : REGEX_DEPENDS_CHARSET);
8818 cs = get_regex_charset(RExC_flags);
8819 if (cs == REGEX_DEPENDS_CHARSET
8820 && (RExC_utf8 || RExC_uni_semantics))
8822 cs = REGEX_UNICODE_CHARSET;
8825 while (*RExC_parse) {
8826 /* && strchr("iogcmsx", *RExC_parse) */
8827 /* (?g), (?gc) and (?o) are useless here
8828 and must be globally applied -- japhy */
8829 switch (*RExC_parse) {
8831 /* Code for the imsx flags */
8832 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8834 case LOCALE_PAT_MOD:
8835 if (has_charset_modifier) {
8836 goto excess_modifier;
8838 else if (flagsp == &negflags) {
8841 cs = REGEX_LOCALE_CHARSET;
8842 has_charset_modifier = LOCALE_PAT_MOD;
8843 RExC_contains_locale = 1;
8845 case UNICODE_PAT_MOD:
8846 if (has_charset_modifier) {
8847 goto excess_modifier;
8849 else if (flagsp == &negflags) {
8852 cs = REGEX_UNICODE_CHARSET;
8853 has_charset_modifier = UNICODE_PAT_MOD;
8855 case ASCII_RESTRICT_PAT_MOD:
8856 if (flagsp == &negflags) {
8859 if (has_charset_modifier) {
8860 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8861 goto excess_modifier;
8863 /* Doubled modifier implies more restricted */
8864 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8867 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8869 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8871 case DEPENDS_PAT_MOD:
8872 if (has_use_defaults) {
8873 goto fail_modifiers;
8875 else if (flagsp == &negflags) {
8878 else if (has_charset_modifier) {
8879 goto excess_modifier;
8882 /* The dual charset means unicode semantics if the
8883 * pattern (or target, not known until runtime) are
8884 * utf8, or something in the pattern indicates unicode
8886 cs = (RExC_utf8 || RExC_uni_semantics)
8887 ? REGEX_UNICODE_CHARSET
8888 : REGEX_DEPENDS_CHARSET;
8889 has_charset_modifier = DEPENDS_PAT_MOD;
8893 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8894 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8896 else if (has_charset_modifier == *(RExC_parse - 1)) {
8897 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8900 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8905 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8907 case ONCE_PAT_MOD: /* 'o' */
8908 case GLOBAL_PAT_MOD: /* 'g' */
8909 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8910 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8911 if (! (wastedflags & wflagbit) ) {
8912 wastedflags |= wflagbit;
8913 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8916 "Useless (%s%c) - %suse /%c modifier",
8917 flagsp == &negflags ? "?-" : "?",
8919 flagsp == &negflags ? "don't " : "",
8926 case CONTINUE_PAT_MOD: /* 'c' */
8927 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8928 if (! (wastedflags & WASTED_C) ) {
8929 wastedflags |= WASTED_GC;
8930 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8933 "Useless (%sc) - %suse /gc modifier",
8934 flagsp == &negflags ? "?-" : "?",
8935 flagsp == &negflags ? "don't " : ""
8940 case KEEPCOPY_PAT_MOD: /* 'p' */
8941 if (flagsp == &negflags) {
8943 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8945 *flagsp |= RXf_PMf_KEEPCOPY;
8949 /* A flag is a default iff it is following a minus, so
8950 * if there is a minus, it means will be trying to
8951 * re-specify a default which is an error */
8952 if (has_use_defaults || flagsp == &negflags) {
8953 goto fail_modifiers;
8956 wastedflags = 0; /* reset so (?g-c) warns twice */
8960 RExC_flags |= posflags;
8961 RExC_flags &= ~negflags;
8962 set_regex_charset(&RExC_flags, cs);
8963 if (RExC_flags & RXf_PMf_FOLD) {
8964 RExC_contains_i = 1;
8970 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
8971 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
8972 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
8981 - reg - regular expression, i.e. main body or parenthesized thing
8983 * Caller must absorb opening parenthesis.
8985 * Combining parenthesis handling with the base level of regular expression
8986 * is a trifle forced, but the need to tie the tails of the branches to what
8987 * follows makes it hard to avoid.
8989 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8991 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8993 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8996 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8997 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8998 needs to be restarted.
8999 Otherwise would only return NULL if regbranch() returns NULL, which
9002 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9003 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9004 * 2 is like 1, but indicates that nextchar() has been called to advance
9005 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9006 * this flag alerts us to the need to check for that */
9009 regnode *ret; /* Will be the head of the group. */
9012 regnode *ender = NULL;
9015 U32 oregflags = RExC_flags;
9016 bool have_branch = 0;
9018 I32 freeze_paren = 0;
9019 I32 after_freeze = 0;
9021 char * parse_start = RExC_parse; /* MJD */
9022 char * const oregcomp_parse = RExC_parse;
9024 GET_RE_DEBUG_FLAGS_DECL;
9026 PERL_ARGS_ASSERT_REG;
9027 DEBUG_PARSE("reg ");
9029 *flagp = 0; /* Tentatively. */
9032 /* Make an OPEN node, if parenthesized. */
9035 /* Under /x, space and comments can be gobbled up between the '(' and
9036 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9037 * intervening space, as the sequence is a token, and a token should be
9039 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9041 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9042 char *start_verb = RExC_parse;
9043 STRLEN verb_len = 0;
9044 char *start_arg = NULL;
9045 unsigned char op = 0;
9047 int internal_argval = 0; /* internal_argval is only useful if !argok */
9049 if (has_intervening_patws && SIZE_ONLY) {
9050 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9052 while ( *RExC_parse && *RExC_parse != ')' ) {
9053 if ( *RExC_parse == ':' ) {
9054 start_arg = RExC_parse + 1;
9060 verb_len = RExC_parse - start_verb;
9063 while ( *RExC_parse && *RExC_parse != ')' )
9065 if ( *RExC_parse != ')' )
9066 vFAIL("Unterminated verb pattern argument");
9067 if ( RExC_parse == start_arg )
9070 if ( *RExC_parse != ')' )
9071 vFAIL("Unterminated verb pattern");
9074 switch ( *start_verb ) {
9075 case 'A': /* (*ACCEPT) */
9076 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9078 internal_argval = RExC_nestroot;
9081 case 'C': /* (*COMMIT) */
9082 if ( memEQs(start_verb,verb_len,"COMMIT") )
9085 case 'F': /* (*FAIL) */
9086 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9091 case ':': /* (*:NAME) */
9092 case 'M': /* (*MARK:NAME) */
9093 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9098 case 'P': /* (*PRUNE) */
9099 if ( memEQs(start_verb,verb_len,"PRUNE") )
9102 case 'S': /* (*SKIP) */
9103 if ( memEQs(start_verb,verb_len,"SKIP") )
9106 case 'T': /* (*THEN) */
9107 /* [19:06] <TimToady> :: is then */
9108 if ( memEQs(start_verb,verb_len,"THEN") ) {
9110 RExC_seen |= REG_SEEN_CUTGROUP;
9115 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9117 "Unknown verb pattern '%"UTF8f"'",
9118 UTF8fARG(UTF, verb_len, start_verb));
9121 if ( start_arg && internal_argval ) {
9122 vFAIL3("Verb pattern '%.*s' may not have an argument",
9123 verb_len, start_verb);
9124 } else if ( argok < 0 && !start_arg ) {
9125 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9126 verb_len, start_verb);
9128 ret = reganode(pRExC_state, op, internal_argval);
9129 if ( ! internal_argval && ! SIZE_ONLY ) {
9131 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9132 ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9133 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9140 if (!internal_argval)
9141 RExC_seen |= REG_SEEN_VERBARG;
9142 } else if ( start_arg ) {
9143 vFAIL3("Verb pattern '%.*s' may not have an argument",
9144 verb_len, start_verb);
9146 ret = reg_node(pRExC_state, op);
9148 nextchar(pRExC_state);
9151 else if (*RExC_parse == '?') { /* (?...) */
9152 bool is_logical = 0;
9153 const char * const seqstart = RExC_parse;
9154 if (has_intervening_patws && SIZE_ONLY) {
9155 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9159 paren = *RExC_parse++;
9160 ret = NULL; /* For look-ahead/behind. */
9163 case 'P': /* (?P...) variants for those used to PCRE/Python */
9164 paren = *RExC_parse++;
9165 if ( paren == '<') /* (?P<...>) named capture */
9167 else if (paren == '>') { /* (?P>name) named recursion */
9168 goto named_recursion;
9170 else if (paren == '=') { /* (?P=...) named backref */
9171 /* this pretty much dupes the code for \k<NAME> in regatom(), if
9172 you change this make sure you change that */
9173 char* name_start = RExC_parse;
9175 SV *sv_dat = reg_scan_name(pRExC_state,
9176 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9177 if (RExC_parse == name_start || *RExC_parse != ')')
9178 vFAIL2("Sequence %.3s... not terminated",parse_start);
9181 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9182 RExC_rxi->data->data[num]=(void*)sv_dat;
9183 SvREFCNT_inc_simple_void(sv_dat);
9186 ret = reganode(pRExC_state,
9189 : (ASCII_FOLD_RESTRICTED)
9191 : (AT_LEAST_UNI_SEMANTICS)
9199 Set_Node_Offset(ret, parse_start+1);
9200 Set_Node_Cur_Length(ret, parse_start);
9202 nextchar(pRExC_state);
9206 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9208 case '<': /* (?<...) */
9209 if (*RExC_parse == '!')
9211 else if (*RExC_parse != '=')
9217 case '\'': /* (?'...') */
9218 name_start= RExC_parse;
9219 svname = reg_scan_name(pRExC_state,
9220 SIZE_ONLY ? /* reverse test from the others */
9221 REG_RSN_RETURN_NAME :
9222 REG_RSN_RETURN_NULL);
9223 if (RExC_parse == name_start) {
9225 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9228 if (*RExC_parse != paren)
9229 vFAIL2("Sequence (?%c... not terminated",
9230 paren=='>' ? '<' : paren);
9234 if (!svname) /* shouldn't happen */
9236 "panic: reg_scan_name returned NULL");
9237 if (!RExC_paren_names) {
9238 RExC_paren_names= newHV();
9239 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9241 RExC_paren_name_list= newAV();
9242 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9245 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9247 sv_dat = HeVAL(he_str);
9249 /* croak baby croak */
9251 "panic: paren_name hash element allocation failed");
9252 } else if ( SvPOK(sv_dat) ) {
9253 /* (?|...) can mean we have dupes so scan to check
9254 its already been stored. Maybe a flag indicating
9255 we are inside such a construct would be useful,
9256 but the arrays are likely to be quite small, so
9257 for now we punt -- dmq */
9258 IV count = SvIV(sv_dat);
9259 I32 *pv = (I32*)SvPVX(sv_dat);
9261 for ( i = 0 ; i < count ; i++ ) {
9262 if ( pv[i] == RExC_npar ) {
9268 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9269 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9270 pv[count] = RExC_npar;
9271 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9274 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9275 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9277 SvIV_set(sv_dat, 1);
9280 /* Yes this does cause a memory leak in debugging Perls */
9281 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9282 SvREFCNT_dec_NN(svname);
9285 /*sv_dump(sv_dat);*/
9287 nextchar(pRExC_state);
9289 goto capturing_parens;
9291 RExC_seen |= REG_SEEN_LOOKBEHIND;
9292 RExC_in_lookbehind++;
9294 case '=': /* (?=...) */
9295 RExC_seen_zerolen++;
9297 case '!': /* (?!...) */
9298 RExC_seen_zerolen++;
9299 if (*RExC_parse == ')') {
9300 ret=reg_node(pRExC_state, OPFAIL);
9301 nextchar(pRExC_state);
9305 case '|': /* (?|...) */
9306 /* branch reset, behave like a (?:...) except that
9307 buffers in alternations share the same numbers */
9309 after_freeze = freeze_paren = RExC_npar;
9311 case ':': /* (?:...) */
9312 case '>': /* (?>...) */
9314 case '$': /* (?$...) */
9315 case '@': /* (?@...) */
9316 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9318 case '#': /* (?#...) */
9319 /* XXX As soon as we disallow separating the '?' and '*' (by
9320 * spaces or (?#...) comment), it is believed that this case
9321 * will be unreachable and can be removed. See
9323 while (*RExC_parse && *RExC_parse != ')')
9325 if (*RExC_parse != ')')
9326 FAIL("Sequence (?#... not terminated");
9327 nextchar(pRExC_state);
9330 case '0' : /* (?0) */
9331 case 'R' : /* (?R) */
9332 if (*RExC_parse != ')')
9333 FAIL("Sequence (?R) not terminated");
9334 ret = reg_node(pRExC_state, GOSTART);
9335 *flagp |= POSTPONED;
9336 nextchar(pRExC_state);
9339 { /* named and numeric backreferences */
9341 case '&': /* (?&NAME) */
9342 parse_start = RExC_parse - 1;
9345 SV *sv_dat = reg_scan_name(pRExC_state,
9346 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9347 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9349 goto gen_recurse_regop;
9350 assert(0); /* NOT REACHED */
9352 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9354 vFAIL("Illegal pattern");
9356 goto parse_recursion;
9358 case '-': /* (?-1) */
9359 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9360 RExC_parse--; /* rewind to let it be handled later */
9364 case '1': case '2': case '3': case '4': /* (?1) */
9365 case '5': case '6': case '7': case '8': case '9':
9368 num = atoi(RExC_parse);
9369 parse_start = RExC_parse - 1; /* MJD */
9370 if (*RExC_parse == '-')
9372 while (isDIGIT(*RExC_parse))
9374 if (*RExC_parse!=')')
9375 vFAIL("Expecting close bracket");
9378 if ( paren == '-' ) {
9380 Diagram of capture buffer numbering.
9381 Top line is the normal capture buffer numbers
9382 Bottom line is the negative indexing as from
9386 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9390 num = RExC_npar + num;
9393 vFAIL("Reference to nonexistent group");
9395 } else if ( paren == '+' ) {
9396 num = RExC_npar + num - 1;
9399 ret = reganode(pRExC_state, GOSUB, num);
9401 if (num > (I32)RExC_rx->nparens) {
9403 vFAIL("Reference to nonexistent group");
9405 ARG2L_SET( ret, RExC_recurse_count++);
9407 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9408 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9412 RExC_seen |= REG_SEEN_RECURSE;
9413 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9414 Set_Node_Offset(ret, parse_start); /* MJD */
9416 *flagp |= POSTPONED;
9417 nextchar(pRExC_state);
9419 } /* named and numeric backreferences */
9420 assert(0); /* NOT REACHED */
9422 case '?': /* (??...) */
9424 if (*RExC_parse != '{') {
9427 "Sequence (%"UTF8f"...) not recognized",
9428 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9431 *flagp |= POSTPONED;
9432 paren = *RExC_parse++;
9434 case '{': /* (?{...}) */
9437 struct reg_code_block *cb;
9439 RExC_seen_zerolen++;
9441 if ( !pRExC_state->num_code_blocks
9442 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9443 || pRExC_state->code_blocks[pRExC_state->code_index].start
9444 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9447 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9448 FAIL("panic: Sequence (?{...}): no code block found\n");
9449 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9451 /* this is a pre-compiled code block (?{...}) */
9452 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9453 RExC_parse = RExC_start + cb->end;
9456 if (cb->src_regex) {
9457 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9458 RExC_rxi->data->data[n] =
9459 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9460 RExC_rxi->data->data[n+1] = (void*)o;
9463 n = add_data(pRExC_state,
9464 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9465 RExC_rxi->data->data[n] = (void*)o;
9468 pRExC_state->code_index++;
9469 nextchar(pRExC_state);
9473 ret = reg_node(pRExC_state, LOGICAL);
9474 eval = reganode(pRExC_state, EVAL, n);
9477 /* for later propagation into (??{}) return value */
9478 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9480 REGTAIL(pRExC_state, ret, eval);
9481 /* deal with the length of this later - MJD */
9484 ret = reganode(pRExC_state, EVAL, n);
9485 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9486 Set_Node_Offset(ret, parse_start);
9489 case '(': /* (?(?{...})...) and (?(?=...)...) */
9492 if (RExC_parse[0] == '?') { /* (?(?...)) */
9493 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9494 || RExC_parse[1] == '<'
9495 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9499 ret = reg_node(pRExC_state, LOGICAL);
9503 tail = reg(pRExC_state, 1, &flag, depth+1);
9504 if (flag & RESTART_UTF8) {
9505 *flagp = RESTART_UTF8;
9508 REGTAIL(pRExC_state, ret, tail);
9512 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9513 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9515 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9516 char *name_start= RExC_parse++;
9518 SV *sv_dat=reg_scan_name(pRExC_state,
9519 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9520 if (RExC_parse == name_start || *RExC_parse != ch)
9521 vFAIL2("Sequence (?(%c... not terminated",
9522 (ch == '>' ? '<' : ch));
9525 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9526 RExC_rxi->data->data[num]=(void*)sv_dat;
9527 SvREFCNT_inc_simple_void(sv_dat);
9529 ret = reganode(pRExC_state,NGROUPP,num);
9530 goto insert_if_check_paren;
9532 else if (RExC_parse[0] == 'D' &&
9533 RExC_parse[1] == 'E' &&
9534 RExC_parse[2] == 'F' &&
9535 RExC_parse[3] == 'I' &&
9536 RExC_parse[4] == 'N' &&
9537 RExC_parse[5] == 'E')
9539 ret = reganode(pRExC_state,DEFINEP,0);
9542 goto insert_if_check_paren;
9544 else if (RExC_parse[0] == 'R') {
9547 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9548 parno = atoi(RExC_parse++);
9549 while (isDIGIT(*RExC_parse))
9551 } else if (RExC_parse[0] == '&') {
9554 sv_dat = reg_scan_name(pRExC_state,
9555 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9556 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9558 ret = reganode(pRExC_state,INSUBP,parno);
9559 goto insert_if_check_paren;
9561 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9565 parno = atoi(RExC_parse++);
9567 while (isDIGIT(*RExC_parse))
9569 ret = reganode(pRExC_state, GROUPP, parno);
9571 insert_if_check_paren:
9572 if (*(tmp = nextchar(pRExC_state)) != ')') {
9574 /* Like the name implies, nextchar deals in chars,
9575 * not characters, so if under UTF, undo its work
9576 * and skip over the the next character.
9579 RExC_parse += UTF8SKIP(RExC_parse);
9581 vFAIL("Switch condition not recognized");
9584 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9585 br = regbranch(pRExC_state, &flags, 1,depth+1);
9587 if (flags & RESTART_UTF8) {
9588 *flagp = RESTART_UTF8;
9591 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9594 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9595 c = *nextchar(pRExC_state);
9600 vFAIL("(?(DEFINE)....) does not allow branches");
9601 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9602 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9603 if (flags & RESTART_UTF8) {
9604 *flagp = RESTART_UTF8;
9607 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9610 REGTAIL(pRExC_state, ret, lastbr);
9613 c = *nextchar(pRExC_state);
9618 vFAIL("Switch (?(condition)... contains too many branches");
9619 ender = reg_node(pRExC_state, TAIL);
9620 REGTAIL(pRExC_state, br, ender);
9622 REGTAIL(pRExC_state, lastbr, ender);
9623 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9626 REGTAIL(pRExC_state, ret, ender);
9627 RExC_size++; /* XXX WHY do we need this?!!
9628 For large programs it seems to be required
9629 but I can't figure out why. -- dmq*/
9633 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9634 vFAIL("Unknown switch condition (?(...))");
9637 case '[': /* (?[ ... ]) */
9638 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9641 RExC_parse--; /* for vFAIL to print correctly */
9642 vFAIL("Sequence (? incomplete");
9644 default: /* e.g., (?i) */
9647 parse_lparen_question_flags(pRExC_state);
9648 if (UCHARAT(RExC_parse) != ':') {
9649 nextchar(pRExC_state);
9654 nextchar(pRExC_state);
9664 ret = reganode(pRExC_state, OPEN, parno);
9667 RExC_nestroot = parno;
9668 if (RExC_seen & REG_SEEN_RECURSE
9669 && !RExC_open_parens[parno-1])
9671 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9672 "Setting open paren #%"IVdf" to %d\n",
9673 (IV)parno, REG_NODE_NUM(ret)));
9674 RExC_open_parens[parno-1]= ret;
9677 Set_Node_Length(ret, 1); /* MJD */
9678 Set_Node_Offset(ret, RExC_parse); /* MJD */
9686 /* Pick up the branches, linking them together. */
9687 parse_start = RExC_parse; /* MJD */
9688 br = regbranch(pRExC_state, &flags, 1,depth+1);
9690 /* branch_len = (paren != 0); */
9693 if (flags & RESTART_UTF8) {
9694 *flagp = RESTART_UTF8;
9697 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9699 if (*RExC_parse == '|') {
9700 if (!SIZE_ONLY && RExC_extralen) {
9701 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9704 reginsert(pRExC_state, BRANCH, br, depth+1);
9705 Set_Node_Length(br, paren != 0);
9706 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9710 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9712 else if (paren == ':') {
9713 *flagp |= flags&SIMPLE;
9715 if (is_open) { /* Starts with OPEN. */
9716 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9718 else if (paren != '?') /* Not Conditional */
9720 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9722 while (*RExC_parse == '|') {
9723 if (!SIZE_ONLY && RExC_extralen) {
9724 ender = reganode(pRExC_state, LONGJMP,0);
9725 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9728 RExC_extralen += 2; /* Account for LONGJMP. */
9729 nextchar(pRExC_state);
9731 if (RExC_npar > after_freeze)
9732 after_freeze = RExC_npar;
9733 RExC_npar = freeze_paren;
9735 br = regbranch(pRExC_state, &flags, 0, depth+1);
9738 if (flags & RESTART_UTF8) {
9739 *flagp = RESTART_UTF8;
9742 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9744 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9746 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9749 if (have_branch || paren != ':') {
9750 /* Make a closing node, and hook it on the end. */
9753 ender = reg_node(pRExC_state, TAIL);
9756 ender = reganode(pRExC_state, CLOSE, parno);
9757 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9758 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9759 "Setting close paren #%"IVdf" to %d\n",
9760 (IV)parno, REG_NODE_NUM(ender)));
9761 RExC_close_parens[parno-1]= ender;
9762 if (RExC_nestroot == parno)
9765 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9766 Set_Node_Length(ender,1); /* MJD */
9772 *flagp &= ~HASWIDTH;
9775 ender = reg_node(pRExC_state, SUCCEED);
9778 ender = reg_node(pRExC_state, END);
9780 assert(!RExC_opend); /* there can only be one! */
9785 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9786 SV * const mysv_val1=sv_newmortal();
9787 SV * const mysv_val2=sv_newmortal();
9788 DEBUG_PARSE_MSG("lsbr");
9789 regprop(RExC_rx, mysv_val1, lastbr);
9790 regprop(RExC_rx, mysv_val2, ender);
9791 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9792 SvPV_nolen_const(mysv_val1),
9793 (IV)REG_NODE_NUM(lastbr),
9794 SvPV_nolen_const(mysv_val2),
9795 (IV)REG_NODE_NUM(ender),
9796 (IV)(ender - lastbr)
9799 REGTAIL(pRExC_state, lastbr, ender);
9801 if (have_branch && !SIZE_ONLY) {
9804 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9806 /* Hook the tails of the branches to the closing node. */
9807 for (br = ret; br; br = regnext(br)) {
9808 const U8 op = PL_regkind[OP(br)];
9810 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9811 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9814 else if (op == BRANCHJ) {
9815 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9816 /* for now we always disable this optimisation * /
9817 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9823 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9824 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9825 SV * const mysv_val1=sv_newmortal();
9826 SV * const mysv_val2=sv_newmortal();
9827 DEBUG_PARSE_MSG("NADA");
9828 regprop(RExC_rx, mysv_val1, ret);
9829 regprop(RExC_rx, mysv_val2, ender);
9830 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9831 SvPV_nolen_const(mysv_val1),
9832 (IV)REG_NODE_NUM(ret),
9833 SvPV_nolen_const(mysv_val2),
9834 (IV)REG_NODE_NUM(ender),
9839 if (OP(ender) == TAIL) {
9844 for ( opt= br + 1; opt < ender ; opt++ )
9846 NEXT_OFF(br)= ender - br;
9854 static const char parens[] = "=!<,>";
9856 if (paren && (p = strchr(parens, paren))) {
9857 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9858 int flag = (p - parens) > 1;
9861 node = SUSPEND, flag = 0;
9862 reginsert(pRExC_state, node,ret, depth+1);
9863 Set_Node_Cur_Length(ret, parse_start);
9864 Set_Node_Offset(ret, parse_start + 1);
9866 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9870 /* Check for proper termination. */
9872 /* restore original flags, but keep (?p) */
9873 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9874 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9875 RExC_parse = oregcomp_parse;
9876 vFAIL("Unmatched (");
9879 else if (!paren && RExC_parse < RExC_end) {
9880 if (*RExC_parse == ')') {
9882 vFAIL("Unmatched )");
9885 FAIL("Junk on end of regexp"); /* "Can't happen". */
9886 assert(0); /* NOTREACHED */
9889 if (RExC_in_lookbehind) {
9890 RExC_in_lookbehind--;
9892 if (after_freeze > RExC_npar)
9893 RExC_npar = after_freeze;
9898 - regbranch - one alternative of an | operator
9900 * Implements the concatenation operator.
9902 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9906 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9910 regnode *chain = NULL;
9912 I32 flags = 0, c = 0;
9913 GET_RE_DEBUG_FLAGS_DECL;
9915 PERL_ARGS_ASSERT_REGBRANCH;
9917 DEBUG_PARSE("brnc");
9922 if (!SIZE_ONLY && RExC_extralen)
9923 ret = reganode(pRExC_state, BRANCHJ,0);
9925 ret = reg_node(pRExC_state, BRANCH);
9926 Set_Node_Length(ret, 1);
9930 if (!first && SIZE_ONLY)
9931 RExC_extralen += 1; /* BRANCHJ */
9933 *flagp = WORST; /* Tentatively. */
9936 nextchar(pRExC_state);
9937 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9939 latest = regpiece(pRExC_state, &flags,depth+1);
9940 if (latest == NULL) {
9941 if (flags & TRYAGAIN)
9943 if (flags & RESTART_UTF8) {
9944 *flagp = RESTART_UTF8;
9947 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9949 else if (ret == NULL)
9951 *flagp |= flags&(HASWIDTH|POSTPONED);
9952 if (chain == NULL) /* First piece. */
9953 *flagp |= flags&SPSTART;
9956 REGTAIL(pRExC_state, chain, latest);
9961 if (chain == NULL) { /* Loop ran zero times. */
9962 chain = reg_node(pRExC_state, NOTHING);
9967 *flagp |= flags&SIMPLE;
9974 - regpiece - something followed by possible [*+?]
9976 * Note that the branching code sequences used for ? and the general cases
9977 * of * and + are somewhat optimized: they use the same NOTHING node as
9978 * both the endmarker for their branch list and the body of the last branch.
9979 * It might seem that this node could be dispensed with entirely, but the
9980 * endmarker role is not redundant.
9982 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9984 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9988 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9995 const char * const origparse = RExC_parse;
9997 I32 max = REG_INFTY;
9998 #ifdef RE_TRACK_PATTERN_OFFSETS
10001 const char *maxpos = NULL;
10003 /* Save the original in case we change the emitted regop to a FAIL. */
10004 regnode * const orig_emit = RExC_emit;
10006 GET_RE_DEBUG_FLAGS_DECL;
10008 PERL_ARGS_ASSERT_REGPIECE;
10010 DEBUG_PARSE("piec");
10012 ret = regatom(pRExC_state, &flags,depth+1);
10014 if (flags & (TRYAGAIN|RESTART_UTF8))
10015 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10017 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10023 if (op == '{' && regcurly(RExC_parse, FALSE)) {
10025 #ifdef RE_TRACK_PATTERN_OFFSETS
10026 parse_start = RExC_parse; /* MJD */
10028 next = RExC_parse + 1;
10029 while (isDIGIT(*next) || *next == ',') {
10030 if (*next == ',') {
10038 if (*next == '}') { /* got one */
10042 min = atoi(RExC_parse);
10043 if (*maxpos == ',')
10046 maxpos = RExC_parse;
10047 max = atoi(maxpos);
10048 if (!max && *maxpos != '0')
10049 max = REG_INFTY; /* meaning "infinity" */
10050 else if (max >= REG_INFTY)
10051 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10053 nextchar(pRExC_state);
10054 if (max < min) { /* If can't match, warn and optimize to fail
10057 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10059 /* We can't back off the size because we have to reserve
10060 * enough space for all the things we are about to throw
10061 * away, but we can shrink it by the ammount we are about
10062 * to re-use here */
10063 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10066 RExC_emit = orig_emit;
10068 ret = reg_node(pRExC_state, OPFAIL);
10073 if ((flags&SIMPLE)) {
10074 RExC_naughty += 2 + RExC_naughty / 2;
10075 reginsert(pRExC_state, CURLY, ret, depth+1);
10076 Set_Node_Offset(ret, parse_start+1); /* MJD */
10077 Set_Node_Cur_Length(ret, parse_start);
10080 regnode * const w = reg_node(pRExC_state, WHILEM);
10083 REGTAIL(pRExC_state, ret, w);
10084 if (!SIZE_ONLY && RExC_extralen) {
10085 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10086 reginsert(pRExC_state, NOTHING,ret, depth+1);
10087 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10089 reginsert(pRExC_state, CURLYX,ret, depth+1);
10091 Set_Node_Offset(ret, parse_start+1);
10092 Set_Node_Length(ret,
10093 op == '{' ? (RExC_parse - parse_start) : 1);
10095 if (!SIZE_ONLY && RExC_extralen)
10096 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10097 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10099 RExC_whilem_seen++, RExC_extralen += 3;
10100 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10107 *flagp |= HASWIDTH;
10109 ARG1_SET(ret, (U16)min);
10110 ARG2_SET(ret, (U16)max);
10117 if (!ISMULT1(op)) {
10122 #if 0 /* Now runtime fix should be reliable. */
10124 /* if this is reinstated, don't forget to put this back into perldiag:
10126 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10128 (F) The part of the regexp subject to either the * or + quantifier
10129 could match an empty string. The {#} shows in the regular
10130 expression about where the problem was discovered.
10134 if (!(flags&HASWIDTH) && op != '?')
10135 vFAIL("Regexp *+ operand could be empty");
10138 #ifdef RE_TRACK_PATTERN_OFFSETS
10139 parse_start = RExC_parse;
10141 nextchar(pRExC_state);
10143 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10145 if (op == '*' && (flags&SIMPLE)) {
10146 reginsert(pRExC_state, STAR, ret, depth+1);
10150 else if (op == '*') {
10154 else if (op == '+' && (flags&SIMPLE)) {
10155 reginsert(pRExC_state, PLUS, ret, depth+1);
10159 else if (op == '+') {
10163 else if (op == '?') {
10168 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10169 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10170 ckWARN2reg(RExC_parse,
10171 "%"UTF8f" matches null string many times",
10172 UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10174 (void)ReREFCNT_inc(RExC_rx_sv);
10177 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10178 nextchar(pRExC_state);
10179 reginsert(pRExC_state, MINMOD, ret, depth+1);
10180 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10183 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10185 nextchar(pRExC_state);
10186 ender = reg_node(pRExC_state, SUCCEED);
10187 REGTAIL(pRExC_state, ret, ender);
10188 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10190 ender = reg_node(pRExC_state, TAIL);
10191 REGTAIL(pRExC_state, ret, ender);
10194 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10196 vFAIL("Nested quantifiers");
10203 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10204 const bool strict /* Apply stricter parsing rules? */
10208 /* This is expected to be called by a parser routine that has recognized '\N'
10209 and needs to handle the rest. RExC_parse is expected to point at the first
10210 char following the N at the time of the call. On successful return,
10211 RExC_parse has been updated to point to just after the sequence identified
10212 by this routine, and <*flagp> has been updated.
10214 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10217 \N may begin either a named sequence, or if outside a character class, mean
10218 to match a non-newline. For non single-quoted regexes, the tokenizer has
10219 attempted to decide which, and in the case of a named sequence, converted it
10220 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10221 where c1... are the characters in the sequence. For single-quoted regexes,
10222 the tokenizer passes the \N sequence through unchanged; this code will not
10223 attempt to determine this nor expand those, instead raising a syntax error.
10224 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10225 or there is no '}', it signals that this \N occurrence means to match a
10228 Only the \N{U+...} form should occur in a character class, for the same
10229 reason that '.' inside a character class means to just match a period: it
10230 just doesn't make sense.
10232 The function raises an error (via vFAIL), and doesn't return for various
10233 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10234 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10235 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10236 only possible if node_p is non-NULL.
10239 If <valuep> is non-null, it means the caller can accept an input sequence
10240 consisting of a just a single code point; <*valuep> is set to that value
10241 if the input is such.
10243 If <node_p> is non-null it signifies that the caller can accept any other
10244 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10246 1) \N means not-a-NL: points to a newly created REG_ANY node;
10247 2) \N{}: points to a new NOTHING node;
10248 3) otherwise: points to a new EXACT node containing the resolved
10250 Note that FALSE is returned for single code point sequences if <valuep> is
10254 char * endbrace; /* '}' following the name */
10256 char *endchar; /* Points to '.' or '}' ending cur char in the input
10258 bool has_multiple_chars; /* true if the input stream contains a sequence of
10259 more than one character */
10261 GET_RE_DEBUG_FLAGS_DECL;
10263 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10265 GET_RE_DEBUG_FLAGS;
10267 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10269 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10270 * modifier. The other meaning does not */
10271 p = (RExC_flags & RXf_PMf_EXTENDED)
10272 ? regwhite( pRExC_state, RExC_parse )
10275 /* Disambiguate between \N meaning a named character versus \N meaning
10276 * [^\n]. The former is assumed when it can't be the latter. */
10277 if (*p != '{' || regcurly(p, FALSE)) {
10280 /* no bare \N in a charclass */
10281 if (in_char_class) {
10282 vFAIL("\\N in a character class must be a named character: \\N{...}");
10286 nextchar(pRExC_state);
10287 *node_p = reg_node(pRExC_state, REG_ANY);
10288 *flagp |= HASWIDTH|SIMPLE;
10291 Set_Node_Length(*node_p, 1); /* MJD */
10295 /* Here, we have decided it should be a named character or sequence */
10297 /* The test above made sure that the next real character is a '{', but
10298 * under the /x modifier, it could be separated by space (or a comment and
10299 * \n) and this is not allowed (for consistency with \x{...} and the
10300 * tokenizer handling of \N{NAME}). */
10301 if (*RExC_parse != '{') {
10302 vFAIL("Missing braces on \\N{}");
10305 RExC_parse++; /* Skip past the '{' */
10307 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10308 || ! (endbrace == RExC_parse /* nothing between the {} */
10309 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
10310 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10312 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10313 vFAIL("\\N{NAME} must be resolved by the lexer");
10316 if (endbrace == RExC_parse) { /* empty: \N{} */
10319 *node_p = reg_node(pRExC_state,NOTHING);
10321 else if (in_char_class) {
10322 if (SIZE_ONLY && in_char_class) {
10324 RExC_parse++; /* Position after the "}" */
10325 vFAIL("Zero length \\N{}");
10328 ckWARNreg(RExC_parse,
10329 "Ignoring zero length \\N{} in character class");
10337 nextchar(pRExC_state);
10341 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10342 RExC_parse += 2; /* Skip past the 'U+' */
10344 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10346 /* Code points are separated by dots. If none, there is only one code
10347 * point, and is terminated by the brace */
10348 has_multiple_chars = (endchar < endbrace);
10350 if (valuep && (! has_multiple_chars || in_char_class)) {
10351 /* We only pay attention to the first char of
10352 multichar strings being returned in char classes. I kinda wonder
10353 if this makes sense as it does change the behaviour
10354 from earlier versions, OTOH that behaviour was broken
10355 as well. XXX Solution is to recharacterize as
10356 [rest-of-class]|multi1|multi2... */
10358 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10359 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10360 | PERL_SCAN_DISALLOW_PREFIX
10361 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10363 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10365 /* The tokenizer should have guaranteed validity, but it's possible to
10366 * bypass it by using single quoting, so check */
10367 if (length_of_hex == 0
10368 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10370 RExC_parse += length_of_hex; /* Includes all the valid */
10371 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10372 ? UTF8SKIP(RExC_parse)
10374 /* Guard against malformed utf8 */
10375 if (RExC_parse >= endchar) {
10376 RExC_parse = endchar;
10378 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10381 if (in_char_class && has_multiple_chars) {
10383 RExC_parse = endbrace;
10384 vFAIL("\\N{} in character class restricted to one character");
10387 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10391 RExC_parse = endbrace + 1;
10393 else if (! node_p || ! has_multiple_chars) {
10395 /* Here, the input is legal, but not according to the caller's
10396 * options. We fail without advancing the parse, so that the
10397 * caller can try again */
10403 /* What is done here is to convert this to a sub-pattern of the form
10404 * (?:\x{char1}\x{char2}...)
10405 * and then call reg recursively. That way, it retains its atomicness,
10406 * while not having to worry about special handling that some code
10407 * points may have. toke.c has converted the original Unicode values
10408 * to native, so that we can just pass on the hex values unchanged. We
10409 * do have to set a flag to keep recoding from happening in the
10412 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10414 char *orig_end = RExC_end;
10417 while (RExC_parse < endbrace) {
10419 /* Convert to notation the rest of the code understands */
10420 sv_catpv(substitute_parse, "\\x{");
10421 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10422 sv_catpv(substitute_parse, "}");
10424 /* Point to the beginning of the next character in the sequence. */
10425 RExC_parse = endchar + 1;
10426 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10428 sv_catpv(substitute_parse, ")");
10430 RExC_parse = SvPV(substitute_parse, len);
10432 /* Don't allow empty number */
10434 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10436 RExC_end = RExC_parse + len;
10438 /* The values are Unicode, and therefore not subject to recoding */
10439 RExC_override_recoding = 1;
10441 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10442 if (flags & RESTART_UTF8) {
10443 *flagp = RESTART_UTF8;
10446 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10449 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10451 RExC_parse = endbrace;
10452 RExC_end = orig_end;
10453 RExC_override_recoding = 0;
10455 nextchar(pRExC_state);
10465 * It returns the code point in utf8 for the value in *encp.
10466 * value: a code value in the source encoding
10467 * encp: a pointer to an Encode object
10469 * If the result from Encode is not a single character,
10470 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10473 S_reg_recode(pTHX_ const char value, SV **encp)
10476 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10477 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10478 const STRLEN newlen = SvCUR(sv);
10479 UV uv = UNICODE_REPLACEMENT;
10481 PERL_ARGS_ASSERT_REG_RECODE;
10485 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10488 if (!newlen || numlen != newlen) {
10489 uv = UNICODE_REPLACEMENT;
10495 PERL_STATIC_INLINE U8
10496 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10500 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10506 op = get_regex_charset(RExC_flags);
10507 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10508 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10509 been, so there is no hole */
10512 return op + EXACTF;
10515 PERL_STATIC_INLINE void
10516 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10518 /* This knows the details about sizing an EXACTish node, setting flags for
10519 * it (by setting <*flagp>, and potentially populating it with a single
10522 * If <len> (the length in bytes) is non-zero, this function assumes that
10523 * the node has already been populated, and just does the sizing. In this
10524 * case <code_point> should be the final code point that has already been
10525 * placed into the node. This value will be ignored except that under some
10526 * circumstances <*flagp> is set based on it.
10528 * If <len> is zero, the function assumes that the node is to contain only
10529 * the single character given by <code_point> and calculates what <len>
10530 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10531 * additionally will populate the node's STRING with <code_point>, if <len>
10532 * is 0. In both cases <*flagp> is appropriately set
10534 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10535 * 255, must be folded (the former only when the rules indicate it can
10538 bool len_passed_in = cBOOL(len != 0);
10539 U8 character[UTF8_MAXBYTES_CASE+1];
10541 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10543 if (! len_passed_in) {
10545 if (FOLD && (! LOC || code_point > 255)) {
10546 _to_uni_fold_flags(code_point,
10549 FOLD_FLAGS_FULL | ((LOC)
10550 ? FOLD_FLAGS_LOCALE
10551 : (ASCII_FOLD_RESTRICTED)
10552 ? FOLD_FLAGS_NOMIX_ASCII
10556 uvchr_to_utf8( character, code_point);
10557 len = UTF8SKIP(character);
10561 || code_point != LATIN_SMALL_LETTER_SHARP_S
10562 || ASCII_FOLD_RESTRICTED
10563 || ! AT_LEAST_UNI_SEMANTICS)
10565 *character = (U8) code_point;
10570 *(character + 1) = 's';
10576 RExC_size += STR_SZ(len);
10579 RExC_emit += STR_SZ(len);
10580 STR_LEN(node) = len;
10581 if (! len_passed_in) {
10582 Copy((char *) character, STRING(node), len, char);
10586 *flagp |= HASWIDTH;
10588 /* A single character node is SIMPLE, except for the special-cased SHARP S
10590 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10591 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10592 || ! FOLD || ! DEPENDS_SEMANTICS))
10599 /* return atoi(p), unless it's too big to sensibly be a backref,
10600 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10603 S_backref_value(char *p)
10607 for (;isDIGIT(*q); q++); /* calculate length of num */
10608 if (q - p == 0 || q - p > 9)
10615 - regatom - the lowest level
10617 Try to identify anything special at the start of the pattern. If there
10618 is, then handle it as required. This may involve generating a single regop,
10619 such as for an assertion; or it may involve recursing, such as to
10620 handle a () structure.
10622 If the string doesn't start with something special then we gobble up
10623 as much literal text as we can.
10625 Once we have been able to handle whatever type of thing started the
10626 sequence, we return.
10628 Note: we have to be careful with escapes, as they can be both literal
10629 and special, and in the case of \10 and friends, context determines which.
10631 A summary of the code structure is:
10633 switch (first_byte) {
10634 cases for each special:
10635 handle this special;
10638 switch (2nd byte) {
10639 cases for each unambiguous special:
10640 handle this special;
10642 cases for each ambigous special/literal:
10644 if (special) handle here
10646 default: // unambiguously literal:
10649 default: // is a literal char
10652 create EXACTish node for literal;
10653 while (more input and node isn't full) {
10654 switch (input_byte) {
10655 cases for each special;
10656 make sure parse pointer is set so that the next call to
10657 regatom will see this special first
10658 goto loopdone; // EXACTish node terminated by prev. char
10660 append char to EXACTISH node;
10662 get next input byte;
10666 return the generated node;
10668 Specifically there are two separate switches for handling
10669 escape sequences, with the one for handling literal escapes requiring
10670 a dummy entry for all of the special escapes that are actually handled
10673 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10675 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10677 Otherwise does not return NULL.
10681 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10684 regnode *ret = NULL;
10686 char *parse_start = RExC_parse;
10690 GET_RE_DEBUG_FLAGS_DECL;
10692 *flagp = WORST; /* Tentatively. */
10694 DEBUG_PARSE("atom");
10696 PERL_ARGS_ASSERT_REGATOM;
10699 switch ((U8)*RExC_parse) {
10701 RExC_seen_zerolen++;
10702 nextchar(pRExC_state);
10703 if (RExC_flags & RXf_PMf_MULTILINE)
10704 ret = reg_node(pRExC_state, MBOL);
10705 else if (RExC_flags & RXf_PMf_SINGLELINE)
10706 ret = reg_node(pRExC_state, SBOL);
10708 ret = reg_node(pRExC_state, BOL);
10709 Set_Node_Length(ret, 1); /* MJD */
10712 nextchar(pRExC_state);
10714 RExC_seen_zerolen++;
10715 if (RExC_flags & RXf_PMf_MULTILINE)
10716 ret = reg_node(pRExC_state, MEOL);
10717 else if (RExC_flags & RXf_PMf_SINGLELINE)
10718 ret = reg_node(pRExC_state, SEOL);
10720 ret = reg_node(pRExC_state, EOL);
10721 Set_Node_Length(ret, 1); /* MJD */
10724 nextchar(pRExC_state);
10725 if (RExC_flags & RXf_PMf_SINGLELINE)
10726 ret = reg_node(pRExC_state, SANY);
10728 ret = reg_node(pRExC_state, REG_ANY);
10729 *flagp |= HASWIDTH|SIMPLE;
10731 Set_Node_Length(ret, 1); /* MJD */
10735 char * const oregcomp_parse = ++RExC_parse;
10736 ret = regclass(pRExC_state, flagp,depth+1,
10737 FALSE, /* means parse the whole char class */
10738 TRUE, /* allow multi-char folds */
10739 FALSE, /* don't silence non-portable warnings. */
10741 if (*RExC_parse != ']') {
10742 RExC_parse = oregcomp_parse;
10743 vFAIL("Unmatched [");
10746 if (*flagp & RESTART_UTF8)
10748 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10751 nextchar(pRExC_state);
10752 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10756 nextchar(pRExC_state);
10757 ret = reg(pRExC_state, 2, &flags,depth+1);
10759 if (flags & TRYAGAIN) {
10760 if (RExC_parse == RExC_end) {
10761 /* Make parent create an empty node if needed. */
10762 *flagp |= TRYAGAIN;
10767 if (flags & RESTART_UTF8) {
10768 *flagp = RESTART_UTF8;
10771 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10773 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10777 if (flags & TRYAGAIN) {
10778 *flagp |= TRYAGAIN;
10781 vFAIL("Internal urp");
10782 /* Supposed to be caught earlier. */
10785 if (!regcurly(RExC_parse, FALSE)) {
10794 vFAIL("Quantifier follows nothing");
10799 This switch handles escape sequences that resolve to some kind
10800 of special regop and not to literal text. Escape sequnces that
10801 resolve to literal text are handled below in the switch marked
10804 Every entry in this switch *must* have a corresponding entry
10805 in the literal escape switch. However, the opposite is not
10806 required, as the default for this switch is to jump to the
10807 literal text handling code.
10809 switch ((U8)*++RExC_parse) {
10811 /* Special Escapes */
10813 RExC_seen_zerolen++;
10814 ret = reg_node(pRExC_state, SBOL);
10816 goto finish_meta_pat;
10818 ret = reg_node(pRExC_state, GPOS);
10819 RExC_seen |= REG_SEEN_GPOS;
10821 goto finish_meta_pat;
10823 RExC_seen_zerolen++;
10824 ret = reg_node(pRExC_state, KEEPS);
10826 /* XXX:dmq : disabling in-place substitution seems to
10827 * be necessary here to avoid cases of memory corruption, as
10828 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10830 RExC_seen |= REG_SEEN_LOOKBEHIND;
10831 goto finish_meta_pat;
10833 ret = reg_node(pRExC_state, SEOL);
10835 RExC_seen_zerolen++; /* Do not optimize RE away */
10836 goto finish_meta_pat;
10838 ret = reg_node(pRExC_state, EOS);
10840 RExC_seen_zerolen++; /* Do not optimize RE away */
10841 goto finish_meta_pat;
10843 ret = reg_node(pRExC_state, CANY);
10844 RExC_seen |= REG_SEEN_CANY;
10845 *flagp |= HASWIDTH|SIMPLE;
10846 goto finish_meta_pat;
10848 ret = reg_node(pRExC_state, CLUMP);
10849 *flagp |= HASWIDTH;
10850 goto finish_meta_pat;
10856 arg = ANYOF_WORDCHAR;
10860 RExC_seen_zerolen++;
10861 RExC_seen |= REG_SEEN_LOOKBEHIND;
10862 op = BOUND + get_regex_charset(RExC_flags);
10863 if (op > BOUNDA) { /* /aa is same as /a */
10866 ret = reg_node(pRExC_state, op);
10867 FLAGS(ret) = get_regex_charset(RExC_flags);
10869 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10870 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10872 goto finish_meta_pat;
10874 RExC_seen_zerolen++;
10875 RExC_seen |= REG_SEEN_LOOKBEHIND;
10876 op = NBOUND + get_regex_charset(RExC_flags);
10877 if (op > NBOUNDA) { /* /aa is same as /a */
10880 ret = reg_node(pRExC_state, op);
10881 FLAGS(ret) = get_regex_charset(RExC_flags);
10883 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10884 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10886 goto finish_meta_pat;
10896 ret = reg_node(pRExC_state, LNBREAK);
10897 *flagp |= HASWIDTH|SIMPLE;
10898 goto finish_meta_pat;
10906 goto join_posix_op_known;
10912 arg = ANYOF_VERTWS;
10914 goto join_posix_op_known;
10924 op = POSIXD + get_regex_charset(RExC_flags);
10925 if (op > POSIXA) { /* /aa is same as /a */
10929 join_posix_op_known:
10932 op += NPOSIXD - POSIXD;
10935 ret = reg_node(pRExC_state, op);
10937 FLAGS(ret) = namedclass_to_classnum(arg);
10940 *flagp |= HASWIDTH|SIMPLE;
10944 nextchar(pRExC_state);
10945 Set_Node_Length(ret, 2); /* MJD */
10951 char* parse_start = RExC_parse - 2;
10956 ret = regclass(pRExC_state, flagp,depth+1,
10957 TRUE, /* means just parse this element */
10958 FALSE, /* don't allow multi-char folds */
10959 FALSE, /* don't silence non-portable warnings.
10960 It would be a bug if these returned
10963 /* regclass() can only return RESTART_UTF8 if multi-char folds
10966 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10971 Set_Node_Offset(ret, parse_start + 2);
10972 Set_Node_Cur_Length(ret, parse_start);
10973 nextchar(pRExC_state);
10977 /* Handle \N and \N{NAME} with multiple code points here and not
10978 * below because it can be multicharacter. join_exact() will join
10979 * them up later on. Also this makes sure that things like
10980 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10981 * The options to the grok function call causes it to fail if the
10982 * sequence is just a single code point. We then go treat it as
10983 * just another character in the current EXACT node, and hence it
10984 * gets uniform treatment with all the other characters. The
10985 * special treatment for quantifiers is not needed for such single
10986 * character sequences */
10988 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10989 FALSE /* not strict */ )) {
10990 if (*flagp & RESTART_UTF8)
10996 case 'k': /* Handle \k<NAME> and \k'NAME' */
10999 char ch= RExC_parse[1];
11000 if (ch != '<' && ch != '\'' && ch != '{') {
11002 vFAIL2("Sequence %.2s... not terminated",parse_start);
11004 /* this pretty much dupes the code for (?P=...) in reg(), if
11005 you change this make sure you change that */
11006 char* name_start = (RExC_parse += 2);
11008 SV *sv_dat = reg_scan_name(pRExC_state,
11009 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11010 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11011 if (RExC_parse == name_start || *RExC_parse != ch)
11012 vFAIL2("Sequence %.3s... not terminated",parse_start);
11015 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11016 RExC_rxi->data->data[num]=(void*)sv_dat;
11017 SvREFCNT_inc_simple_void(sv_dat);
11021 ret = reganode(pRExC_state,
11024 : (ASCII_FOLD_RESTRICTED)
11026 : (AT_LEAST_UNI_SEMANTICS)
11032 *flagp |= HASWIDTH;
11034 /* override incorrect value set in reganode MJD */
11035 Set_Node_Offset(ret, parse_start+1);
11036 Set_Node_Cur_Length(ret, parse_start);
11037 nextchar(pRExC_state);
11043 case '1': case '2': case '3': case '4':
11044 case '5': case '6': case '7': case '8': case '9':
11049 if (*RExC_parse == 'g') {
11053 if (*RExC_parse == '{') {
11057 if (*RExC_parse == '-') {
11061 if (hasbrace && !isDIGIT(*RExC_parse)) {
11062 if (isrel) RExC_parse--;
11064 goto parse_named_seq;
11067 num = S_backref_value(RExC_parse);
11069 vFAIL("Reference to invalid group 0");
11070 else if (num == I32_MAX) {
11071 if (isDIGIT(*RExC_parse))
11072 vFAIL("Reference to nonexistent group");
11074 vFAIL("Unterminated \\g... pattern");
11078 num = RExC_npar - num;
11080 vFAIL("Reference to nonexistent or unclosed group");
11084 num = S_backref_value(RExC_parse);
11085 /* bare \NNN might be backref or octal */
11086 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11087 && *RExC_parse != '8' && *RExC_parse != '9'))
11088 /* Probably a character specified in octal, e.g. \35 */
11092 /* at this point RExC_parse definitely points to a backref
11095 #ifdef RE_TRACK_PATTERN_OFFSETS
11096 char * const parse_start = RExC_parse - 1; /* MJD */
11098 while (isDIGIT(*RExC_parse))
11101 if (*RExC_parse != '}')
11102 vFAIL("Unterminated \\g{...} pattern");
11106 if (num > (I32)RExC_rx->nparens)
11107 vFAIL("Reference to nonexistent group");
11110 ret = reganode(pRExC_state,
11113 : (ASCII_FOLD_RESTRICTED)
11115 : (AT_LEAST_UNI_SEMANTICS)
11121 *flagp |= HASWIDTH;
11123 /* override incorrect value set in reganode MJD */
11124 Set_Node_Offset(ret, parse_start+1);
11125 Set_Node_Cur_Length(ret, parse_start);
11127 nextchar(pRExC_state);
11132 if (RExC_parse >= RExC_end)
11133 FAIL("Trailing \\");
11136 /* Do not generate "unrecognized" warnings here, we fall
11137 back into the quick-grab loop below */
11144 if (RExC_flags & RXf_PMf_EXTENDED) {
11145 if ( reg_skipcomment( pRExC_state ) )
11152 parse_start = RExC_parse - 1;
11161 #define MAX_NODE_STRING_SIZE 127
11162 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11164 U8 upper_parse = MAX_NODE_STRING_SIZE;
11166 U8 node_type = compute_EXACTish(pRExC_state);
11167 bool next_is_quantifier;
11168 char * oldp = NULL;
11170 /* We can convert EXACTF nodes to EXACTFU if they contain only
11171 * characters that match identically regardless of the target
11172 * string's UTF8ness. The reason to do this is that EXACTF is not
11173 * trie-able, EXACTFU is. (We don't need to figure this out until
11175 bool maybe_exactfu = node_type == EXACTF && PASS2;
11177 /* If a folding node contains only code points that don't
11178 * participate in folds, it can be changed into an EXACT node,
11179 * which allows the optimizer more things to look for */
11182 ret = reg_node(pRExC_state, node_type);
11184 /* In pass1, folded, we use a temporary buffer instead of the
11185 * actual node, as the node doesn't exist yet */
11186 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11192 /* We do the EXACTFish to EXACT node only if folding, and not if in
11193 * locale, as whether a character folds or not isn't known until
11194 * runtime. (And we don't need to figure this out until pass 2) */
11195 maybe_exact = FOLD && ! LOC && PASS2;
11197 /* XXX The node can hold up to 255 bytes, yet this only goes to
11198 * 127. I (khw) do not know why. Keeping it somewhat less than
11199 * 255 allows us to not have to worry about overflow due to
11200 * converting to utf8 and fold expansion, but that value is
11201 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11202 * split up by this limit into a single one using the real max of
11203 * 255. Even at 127, this breaks under rare circumstances. If
11204 * folding, we do not want to split a node at a character that is a
11205 * non-final in a multi-char fold, as an input string could just
11206 * happen to want to match across the node boundary. The join
11207 * would solve that problem if the join actually happens. But a
11208 * series of more than two nodes in a row each of 127 would cause
11209 * the first join to succeed to get to 254, but then there wouldn't
11210 * be room for the next one, which could at be one of those split
11211 * multi-char folds. I don't know of any fool-proof solution. One
11212 * could back off to end with only a code point that isn't such a
11213 * non-final, but it is possible for there not to be any in the
11215 for (p = RExC_parse - 1;
11216 len < upper_parse && p < RExC_end;
11221 if (RExC_flags & RXf_PMf_EXTENDED)
11222 p = regwhite( pRExC_state, p );
11233 /* Literal Escapes Switch
11235 This switch is meant to handle escape sequences that
11236 resolve to a literal character.
11238 Every escape sequence that represents something
11239 else, like an assertion or a char class, is handled
11240 in the switch marked 'Special Escapes' above in this
11241 routine, but also has an entry here as anything that
11242 isn't explicitly mentioned here will be treated as
11243 an unescaped equivalent literal.
11246 switch ((U8)*++p) {
11247 /* These are all the special escapes. */
11248 case 'A': /* Start assertion */
11249 case 'b': case 'B': /* Word-boundary assertion*/
11250 case 'C': /* Single char !DANGEROUS! */
11251 case 'd': case 'D': /* digit class */
11252 case 'g': case 'G': /* generic-backref, pos assertion */
11253 case 'h': case 'H': /* HORIZWS */
11254 case 'k': case 'K': /* named backref, keep marker */
11255 case 'p': case 'P': /* Unicode property */
11256 case 'R': /* LNBREAK */
11257 case 's': case 'S': /* space class */
11258 case 'v': case 'V': /* VERTWS */
11259 case 'w': case 'W': /* word class */
11260 case 'X': /* eXtended Unicode "combining character sequence" */
11261 case 'z': case 'Z': /* End of line/string assertion */
11265 /* Anything after here is an escape that resolves to a
11266 literal. (Except digits, which may or may not)
11272 case 'N': /* Handle a single-code point named character. */
11273 /* The options cause it to fail if a multiple code
11274 * point sequence. Handle those in the switch() above
11276 RExC_parse = p + 1;
11277 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11278 flagp, depth, FALSE,
11279 FALSE /* not strict */ ))
11281 if (*flagp & RESTART_UTF8)
11282 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11283 RExC_parse = p = oldp;
11287 if (ender > 0xff) {
11304 ender = ASCII_TO_NATIVE('\033');
11314 const char* error_msg;
11316 bool valid = grok_bslash_o(&p,
11319 TRUE, /* out warnings */
11320 FALSE, /* not strict */
11321 TRUE, /* Output warnings
11326 RExC_parse = p; /* going to die anyway; point
11327 to exact spot of failure */
11331 if (PL_encoding && ender < 0x100) {
11332 goto recode_encoding;
11334 if (ender > 0xff) {
11341 UV result = UV_MAX; /* initialize to erroneous
11343 const char* error_msg;
11345 bool valid = grok_bslash_x(&p,
11348 TRUE, /* out warnings */
11349 FALSE, /* not strict */
11350 TRUE, /* Output warnings
11355 RExC_parse = p; /* going to die anyway; point
11356 to exact spot of failure */
11361 if (PL_encoding && ender < 0x100) {
11362 goto recode_encoding;
11364 if (ender > 0xff) {
11371 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11373 case '8': case '9': /* must be a backreference */
11376 case '1': case '2': case '3':case '4':
11377 case '5': case '6': case '7':
11378 /* When we parse backslash escapes there is ambiguity
11379 * between backreferences and octal escapes. Any escape
11380 * from \1 - \9 is a backreference, any multi-digit
11381 * escape which does not start with 0 and which when
11382 * evaluated as decimal could refer to an already
11383 * parsed capture buffer is a backslash. Anything else
11386 * Note this implies that \118 could be interpreted as
11387 * 118 OR as "\11" . "8" depending on whether there
11388 * were 118 capture buffers defined already in the
11390 if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11391 { /* Not to be treated as an octal constant, go
11398 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11400 ender = grok_oct(p, &numlen, &flags, NULL);
11401 if (ender > 0xff) {
11405 if (SIZE_ONLY /* like \08, \178 */
11408 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11410 reg_warn_non_literal_string(
11412 form_short_octal_warning(p, numlen));
11415 if (PL_encoding && ender < 0x100)
11416 goto recode_encoding;
11419 if (! RExC_override_recoding) {
11420 SV* enc = PL_encoding;
11421 ender = reg_recode((const char)(U8)ender, &enc);
11422 if (!enc && SIZE_ONLY)
11423 ckWARNreg(p, "Invalid escape in the specified encoding");
11429 FAIL("Trailing \\");
11432 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11433 /* Include any { following the alpha to emphasize
11434 * that it could be part of an escape at some point
11436 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11437 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11439 goto normal_default;
11440 } /* End of switch on '\' */
11442 default: /* A literal character */
11445 && RExC_flags & RXf_PMf_EXTENDED
11446 && ckWARN_d(WARN_DEPRECATED)
11447 && is_PATWS_non_low(p, UTF))
11449 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11450 "Escape literal pattern white space under /x");
11454 if (UTF8_IS_START(*p) && UTF) {
11456 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11457 &numlen, UTF8_ALLOW_DEFAULT);
11463 } /* End of switch on the literal */
11465 /* Here, have looked at the literal character and <ender>
11466 * contains its ordinal, <p> points to the character after it
11469 if ( RExC_flags & RXf_PMf_EXTENDED)
11470 p = regwhite( pRExC_state, p );
11472 /* If the next thing is a quantifier, it applies to this
11473 * character only, which means that this character has to be in
11474 * its own node and can't just be appended to the string in an
11475 * existing node, so if there are already other characters in
11476 * the node, close the node with just them, and set up to do
11477 * this character again next time through, when it will be the
11478 * only thing in its new node */
11479 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11487 const STRLEN unilen = reguni(pRExC_state, ender, s);
11493 /* The loop increments <len> each time, as all but this
11494 * path (and one other) through it add a single byte to
11495 * the EXACTish node. But this one has changed len to
11496 * be the correct final value, so subtract one to
11497 * cancel out the increment that follows */
11501 REGC((char)ender, s++);
11504 else /* FOLD */ if (! ( UTF
11505 /* See comments for join_exact() as to why we fold this
11506 * non-UTF at compile time */
11507 || (node_type == EXACTFU
11508 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11510 if (IS_IN_SOME_FOLD_L1(ender)) {
11511 maybe_exact = FALSE;
11513 /* See if the character's fold differs between /d and
11514 * /u. This includes the multi-char fold SHARP S to
11517 && (PL_fold[ender] != PL_fold_latin1[ender]
11518 || ender == LATIN_SMALL_LETTER_SHARP_S
11520 && isARG2_lower_or_UPPER_ARG1('s', ender)
11521 && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11523 maybe_exactfu = FALSE;
11526 *(s++) = (char) ender;
11530 /* Prime the casefolded buffer. Locale rules, which apply
11531 * only to code points < 256, aren't known until execution,
11532 * so for them, just output the original character using
11533 * utf8. If we start to fold non-UTF patterns, be sure to
11534 * update join_exact() */
11535 if (LOC && ender < 256) {
11536 if (UVCHR_IS_INVARIANT(ender)) {
11540 *s = UTF8_TWO_BYTE_HI(ender);
11541 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11546 UV folded = _to_uni_fold_flags(
11551 | ((LOC) ? FOLD_FLAGS_LOCALE
11552 : (ASCII_FOLD_RESTRICTED)
11553 ? FOLD_FLAGS_NOMIX_ASCII
11557 /* If this node only contains non-folding code points
11558 * so far, see if this new one is also non-folding */
11560 if (folded != ender) {
11561 maybe_exact = FALSE;
11564 /* Here the fold is the original; we have
11565 * to check further to see if anything
11567 if (! PL_utf8_foldable) {
11568 SV* swash = swash_init("utf8",
11570 &PL_sv_undef, 1, 0);
11572 _get_swash_invlist(swash);
11573 SvREFCNT_dec_NN(swash);
11575 if (_invlist_contains_cp(PL_utf8_foldable,
11578 maybe_exact = FALSE;
11586 /* The loop increments <len> each time, as all but this
11587 * path (and one other) through it add a single byte to the
11588 * EXACTish node. But this one has changed len to be the
11589 * correct final value, so subtract one to cancel out the
11590 * increment that follows */
11591 len += foldlen - 1;
11594 if (next_is_quantifier) {
11596 /* Here, the next input is a quantifier, and to get here,
11597 * the current character is the only one in the node.
11598 * Also, here <len> doesn't include the final byte for this
11604 } /* End of loop through literal characters */
11606 /* Here we have either exhausted the input or ran out of room in
11607 * the node. (If we encountered a character that can't be in the
11608 * node, transfer is made directly to <loopdone>, and so we
11609 * wouldn't have fallen off the end of the loop.) In the latter
11610 * case, we artificially have to split the node into two, because
11611 * we just don't have enough space to hold everything. This
11612 * creates a problem if the final character participates in a
11613 * multi-character fold in the non-final position, as a match that
11614 * should have occurred won't, due to the way nodes are matched,
11615 * and our artificial boundary. So back off until we find a non-
11616 * problematic character -- one that isn't at the beginning or
11617 * middle of such a fold. (Either it doesn't participate in any
11618 * folds, or appears only in the final position of all the folds it
11619 * does participate in.) A better solution with far fewer false
11620 * positives, and that would fill the nodes more completely, would
11621 * be to actually have available all the multi-character folds to
11622 * test against, and to back-off only far enough to be sure that
11623 * this node isn't ending with a partial one. <upper_parse> is set
11624 * further below (if we need to reparse the node) to include just
11625 * up through that final non-problematic character that this code
11626 * identifies, so when it is set to less than the full node, we can
11627 * skip the rest of this */
11628 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11630 const STRLEN full_len = len;
11632 assert(len >= MAX_NODE_STRING_SIZE);
11634 /* Here, <s> points to the final byte of the final character.
11635 * Look backwards through the string until find a non-
11636 * problematic character */
11640 /* These two have no multi-char folds to non-UTF characters
11642 if (ASCII_FOLD_RESTRICTED || LOC) {
11646 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11650 if (! PL_NonL1NonFinalFold) {
11651 PL_NonL1NonFinalFold = _new_invlist_C_array(
11652 NonL1_Perl_Non_Final_Folds_invlist);
11655 /* Point to the first byte of the final character */
11656 s = (char *) utf8_hop((U8 *) s, -1);
11658 while (s >= s0) { /* Search backwards until find
11659 non-problematic char */
11660 if (UTF8_IS_INVARIANT(*s)) {
11662 /* There are no ascii characters that participate
11663 * in multi-char folds under /aa. In EBCDIC, the
11664 * non-ascii invariants are all control characters,
11665 * so don't ever participate in any folds. */
11666 if (ASCII_FOLD_RESTRICTED
11667 || ! IS_NON_FINAL_FOLD(*s))
11672 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11674 /* No Latin1 characters participate in multi-char
11675 * folds under /l */
11677 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11683 else if (! _invlist_contains_cp(
11684 PL_NonL1NonFinalFold,
11685 valid_utf8_to_uvchr((U8 *) s, NULL)))
11690 /* Here, the current character is problematic in that
11691 * it does occur in the non-final position of some
11692 * fold, so try the character before it, but have to
11693 * special case the very first byte in the string, so
11694 * we don't read outside the string */
11695 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11696 } /* End of loop backwards through the string */
11698 /* If there were only problematic characters in the string,
11699 * <s> will point to before s0, in which case the length
11700 * should be 0, otherwise include the length of the
11701 * non-problematic character just found */
11702 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11705 /* Here, have found the final character, if any, that is
11706 * non-problematic as far as ending the node without splitting
11707 * it across a potential multi-char fold. <len> contains the
11708 * number of bytes in the node up-to and including that
11709 * character, or is 0 if there is no such character, meaning
11710 * the whole node contains only problematic characters. In
11711 * this case, give up and just take the node as-is. We can't
11716 /* If the node ends in an 's' we make sure it stays EXACTF,
11717 * as if it turns into an EXACTFU, it could later get
11718 * joined with another 's' that would then wrongly match
11720 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11722 maybe_exactfu = FALSE;
11726 /* Here, the node does contain some characters that aren't
11727 * problematic. If one such is the final character in the
11728 * node, we are done */
11729 if (len == full_len) {
11732 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11734 /* If the final character is problematic, but the
11735 * penultimate is not, back-off that last character to
11736 * later start a new node with it */
11741 /* Here, the final non-problematic character is earlier
11742 * in the input than the penultimate character. What we do
11743 * is reparse from the beginning, going up only as far as
11744 * this final ok one, thus guaranteeing that the node ends
11745 * in an acceptable character. The reason we reparse is
11746 * that we know how far in the character is, but we don't
11747 * know how to correlate its position with the input parse.
11748 * An alternate implementation would be to build that
11749 * correlation as we go along during the original parse,
11750 * but that would entail extra work for every node, whereas
11751 * this code gets executed only when the string is too
11752 * large for the node, and the final two characters are
11753 * problematic, an infrequent occurrence. Yet another
11754 * possible strategy would be to save the tail of the
11755 * string, and the next time regatom is called, initialize
11756 * with that. The problem with this is that unless you
11757 * back off one more character, you won't be guaranteed
11758 * regatom will get called again, unless regbranch,
11759 * regpiece ... are also changed. If you do back off that
11760 * extra character, so that there is input guaranteed to
11761 * force calling regatom, you can't handle the case where
11762 * just the first character in the node is acceptable. I
11763 * (khw) decided to try this method which doesn't have that
11764 * pitfall; if performance issues are found, we can do a
11765 * combination of the current approach plus that one */
11771 } /* End of verifying node ends with an appropriate char */
11773 loopdone: /* Jumped to when encounters something that shouldn't be in
11776 /* I (khw) don't know if you can get here with zero length, but the
11777 * old code handled this situation by creating a zero-length EXACT
11778 * node. Might as well be NOTHING instead */
11784 /* If 'maybe_exact' is still set here, means there are no
11785 * code points in the node that participate in folds;
11786 * similarly for 'maybe_exactfu' and code points that match
11787 * differently depending on UTF8ness of the target string
11792 else if (maybe_exactfu) {
11796 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11799 RExC_parse = p - 1;
11800 Set_Node_Cur_Length(ret, parse_start);
11801 nextchar(pRExC_state);
11803 /* len is STRLEN which is unsigned, need to copy to signed */
11806 vFAIL("Internal disaster");
11809 } /* End of label 'defchar:' */
11811 } /* End of giant switch on input character */
11817 S_regwhite( RExC_state_t *pRExC_state, char *p )
11819 const char *e = RExC_end;
11821 PERL_ARGS_ASSERT_REGWHITE;
11826 else if (*p == '#') {
11829 if (*p++ == '\n') {
11835 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11844 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11846 /* Returns the next non-pattern-white space, non-comment character (the
11847 * latter only if 'recognize_comment is true) in the string p, which is
11848 * ended by RExC_end. If there is no line break ending a comment,
11849 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11850 const char *e = RExC_end;
11852 PERL_ARGS_ASSERT_REGPATWS;
11856 if ((len = is_PATWS_safe(p, e, UTF))) {
11859 else if (recognize_comment && *p == '#') {
11863 if (is_LNBREAK_safe(p, e, UTF)) {
11869 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11878 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
11880 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
11881 * sets up the bitmap and any flags, removing those code points from the
11882 * inversion list, setting it to NULL should it become completely empty */
11884 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
11885 assert(PL_regkind[OP(node)] == ANYOF);
11887 ANYOF_BITMAP_ZERO(node);
11888 if (*invlist_ptr) {
11890 /* This gets set if we actually need to modify things */
11891 bool change_invlist = FALSE;
11895 /* Start looking through *invlist_ptr */
11896 invlist_iterinit(*invlist_ptr);
11897 while (invlist_iternext(*invlist_ptr, &start, &end)) {
11901 if (end == UV_MAX && start <= 256) {
11902 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
11905 /* Quit if are above what we should change */
11910 change_invlist = TRUE;
11912 /* Set all the bits in the range, up to the max that we are doing */
11913 high = (end < 255) ? end : 255;
11914 for (i = start; i <= (int) high; i++) {
11915 if (! ANYOF_BITMAP_TEST(node, i)) {
11916 ANYOF_BITMAP_SET(node, i);
11920 invlist_iterfinish(*invlist_ptr);
11922 /* Done with loop; remove any code points that are in the bitmap from
11923 * *invlist_ptr; similarly for code points above latin1 if we have a flag
11924 * to match all of them anyways */
11925 if (change_invlist) {
11926 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
11928 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
11929 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
11932 /* If have completely emptied it, remove it completely */
11933 if (_invlist_len(*invlist_ptr) == 0) {
11934 SvREFCNT_dec_NN(*invlist_ptr);
11935 *invlist_ptr = NULL;
11940 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11941 Character classes ([:foo:]) can also be negated ([:^foo:]).
11942 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11943 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11944 but trigger failures because they are currently unimplemented. */
11946 #define POSIXCC_DONE(c) ((c) == ':')
11947 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11948 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11950 PERL_STATIC_INLINE I32
11951 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11954 I32 namedclass = OOB_NAMEDCLASS;
11956 PERL_ARGS_ASSERT_REGPPOSIXCC;
11958 if (value == '[' && RExC_parse + 1 < RExC_end &&
11959 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11960 POSIXCC(UCHARAT(RExC_parse)))
11962 const char c = UCHARAT(RExC_parse);
11963 char* const s = RExC_parse++;
11965 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11967 if (RExC_parse == RExC_end) {
11970 /* Try to give a better location for the error (than the end of
11971 * the string) by looking for the matching ']' */
11973 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11976 vFAIL2("Unmatched '%c' in POSIX class", c);
11978 /* Grandfather lone [:, [=, [. */
11982 const char* const t = RExC_parse++; /* skip over the c */
11985 if (UCHARAT(RExC_parse) == ']') {
11986 const char *posixcc = s + 1;
11987 RExC_parse++; /* skip over the ending ] */
11990 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11991 const I32 skip = t - posixcc;
11993 /* Initially switch on the length of the name. */
11996 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11997 this is the Perl \w
11999 namedclass = ANYOF_WORDCHAR;
12002 /* Names all of length 5. */
12003 /* alnum alpha ascii blank cntrl digit graph lower
12004 print punct space upper */
12005 /* Offset 4 gives the best switch position. */
12006 switch (posixcc[4]) {
12008 if (memEQ(posixcc, "alph", 4)) /* alpha */
12009 namedclass = ANYOF_ALPHA;
12012 if (memEQ(posixcc, "spac", 4)) /* space */
12013 namedclass = ANYOF_PSXSPC;
12016 if (memEQ(posixcc, "grap", 4)) /* graph */
12017 namedclass = ANYOF_GRAPH;
12020 if (memEQ(posixcc, "asci", 4)) /* ascii */
12021 namedclass = ANYOF_ASCII;
12024 if (memEQ(posixcc, "blan", 4)) /* blank */
12025 namedclass = ANYOF_BLANK;
12028 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12029 namedclass = ANYOF_CNTRL;
12032 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12033 namedclass = ANYOF_ALPHANUMERIC;
12036 if (memEQ(posixcc, "lowe", 4)) /* lower */
12037 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12038 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12039 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12042 if (memEQ(posixcc, "digi", 4)) /* digit */
12043 namedclass = ANYOF_DIGIT;
12044 else if (memEQ(posixcc, "prin", 4)) /* print */
12045 namedclass = ANYOF_PRINT;
12046 else if (memEQ(posixcc, "punc", 4)) /* punct */
12047 namedclass = ANYOF_PUNCT;
12052 if (memEQ(posixcc, "xdigit", 6))
12053 namedclass = ANYOF_XDIGIT;
12057 if (namedclass == OOB_NAMEDCLASS)
12059 "POSIX class [:%"UTF8f":] unknown",
12060 UTF8fARG(UTF, t - s - 1, s + 1));
12062 /* The #defines are structured so each complement is +1 to
12063 * the normal one */
12067 assert (posixcc[skip] == ':');
12068 assert (posixcc[skip+1] == ']');
12069 } else if (!SIZE_ONLY) {
12070 /* [[=foo=]] and [[.foo.]] are still future. */
12072 /* adjust RExC_parse so the warning shows after
12073 the class closes */
12074 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12076 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12079 /* Maternal grandfather:
12080 * "[:" ending in ":" but not in ":]" */
12082 vFAIL("Unmatched '[' in POSIX class");
12085 /* Grandfather lone [:, [=, [. */
12095 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12097 /* This applies some heuristics at the current parse position (which should
12098 * be at a '[') to see if what follows might be intended to be a [:posix:]
12099 * class. It returns true if it really is a posix class, of course, but it
12100 * also can return true if it thinks that what was intended was a posix
12101 * class that didn't quite make it.
12103 * It will return true for
12105 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12106 * ')' indicating the end of the (?[
12107 * [:any garbage including %^&$ punctuation:]
12109 * This is designed to be called only from S_handle_regex_sets; it could be
12110 * easily adapted to be called from the spot at the beginning of regclass()
12111 * that checks to see in a normal bracketed class if the surrounding []
12112 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12113 * change long-standing behavior, so I (khw) didn't do that */
12114 char* p = RExC_parse + 1;
12115 char first_char = *p;
12117 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12119 assert(*(p - 1) == '[');
12121 if (! POSIXCC(first_char)) {
12126 while (p < RExC_end && isWORDCHAR(*p)) p++;
12128 if (p >= RExC_end) {
12132 if (p - RExC_parse > 2 /* Got at least 1 word character */
12133 && (*p == first_char
12134 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12139 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12142 && p - RExC_parse > 2 /* [:] evaluates to colon;
12143 [::] is a bad posix class. */
12144 && first_char == *(p - 1));
12148 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12149 char * const oregcomp_parse)
12151 /* Handle the (?[...]) construct to do set operations */
12154 UV start, end; /* End points of code point ranges */
12156 char *save_end, *save_parse;
12161 const bool save_fold = FOLD;
12163 GET_RE_DEBUG_FLAGS_DECL;
12165 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12168 vFAIL("(?[...]) not valid in locale");
12170 RExC_uni_semantics = 1;
12172 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12173 * (such as EXACT). Thus we can skip most everything if just sizing. We
12174 * call regclass to handle '[]' so as to not have to reinvent its parsing
12175 * rules here (throwing away the size it computes each time). And, we exit
12176 * upon an unescaped ']' that isn't one ending a regclass. To do both
12177 * these things, we need to realize that something preceded by a backslash
12178 * is escaped, so we have to keep track of backslashes */
12180 UV depth = 0; /* how many nested (?[...]) constructs */
12182 Perl_ck_warner_d(aTHX_
12183 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12184 "The regex_sets feature is experimental" REPORT_LOCATION,
12185 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12186 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12188 while (RExC_parse < RExC_end) {
12189 SV* current = NULL;
12190 RExC_parse = regpatws(pRExC_state, RExC_parse,
12191 TRUE); /* means recognize comments */
12192 switch (*RExC_parse) {
12194 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12199 /* Skip the next byte (which could cause us to end up in
12200 * the middle of a UTF-8 character, but since none of those
12201 * are confusable with anything we currently handle in this
12202 * switch (invariants all), it's safe. We'll just hit the
12203 * default: case next time and keep on incrementing until
12204 * we find one of the invariants we do handle. */
12209 /* If this looks like it is a [:posix:] class, leave the
12210 * parse pointer at the '[' to fool regclass() into
12211 * thinking it is part of a '[[:posix:]]'. That function
12212 * will use strict checking to force a syntax error if it
12213 * doesn't work out to a legitimate class */
12214 bool is_posix_class
12215 = could_it_be_a_POSIX_class(pRExC_state);
12216 if (! is_posix_class) {
12220 /* regclass() can only return RESTART_UTF8 if multi-char
12221 folds are allowed. */
12222 if (!regclass(pRExC_state, flagp,depth+1,
12223 is_posix_class, /* parse the whole char
12224 class only if not a
12226 FALSE, /* don't allow multi-char folds */
12227 TRUE, /* silence non-portable warnings. */
12229 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12232 /* function call leaves parse pointing to the ']', except
12233 * if we faked it */
12234 if (is_posix_class) {
12238 SvREFCNT_dec(current); /* In case it returned something */
12243 if (depth--) break;
12245 if (RExC_parse < RExC_end
12246 && *RExC_parse == ')')
12248 node = reganode(pRExC_state, ANYOF, 0);
12249 RExC_size += ANYOF_SKIP;
12250 nextchar(pRExC_state);
12251 Set_Node_Length(node,
12252 RExC_parse - oregcomp_parse + 1); /* MJD */
12261 FAIL("Syntax error in (?[...])");
12264 /* Pass 2 only after this. Everything in this construct is a
12265 * metacharacter. Operands begin with either a '\' (for an escape
12266 * sequence), or a '[' for a bracketed character class. Any other
12267 * character should be an operator, or parenthesis for grouping. Both
12268 * types of operands are handled by calling regclass() to parse them. It
12269 * is called with a parameter to indicate to return the computed inversion
12270 * list. The parsing here is implemented via a stack. Each entry on the
12271 * stack is a single character representing one of the operators, or the
12272 * '('; or else a pointer to an operand inversion list. */
12274 #define IS_OPERAND(a) (! SvIOK(a))
12276 /* The stack starts empty. It is a syntax error if the first thing parsed
12277 * is a binary operator; everything else is pushed on the stack. When an
12278 * operand is parsed, the top of the stack is examined. If it is a binary
12279 * operator, the item before it should be an operand, and both are replaced
12280 * by the result of doing that operation on the new operand and the one on
12281 * the stack. Thus a sequence of binary operands is reduced to a single
12282 * one before the next one is parsed.
12284 * A unary operator may immediately follow a binary in the input, for
12287 * When an operand is parsed and the top of the stack is a unary operator,
12288 * the operation is performed, and then the stack is rechecked to see if
12289 * this new operand is part of a binary operation; if so, it is handled as
12292 * A '(' is simply pushed on the stack; it is valid only if the stack is
12293 * empty, or the top element of the stack is an operator or another '('
12294 * (for which the parenthesized expression will become an operand). By the
12295 * time the corresponding ')' is parsed everything in between should have
12296 * been parsed and evaluated to a single operand (or else is a syntax
12297 * error), and is handled as a regular operand */
12299 sv_2mortal((SV *)(stack = newAV()));
12301 while (RExC_parse < RExC_end) {
12302 I32 top_index = av_tindex(stack);
12304 SV* current = NULL;
12306 /* Skip white space */
12307 RExC_parse = regpatws(pRExC_state, RExC_parse,
12308 TRUE); /* means recognize comments */
12309 if (RExC_parse >= RExC_end) {
12310 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12312 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12319 if (av_tindex(stack) >= 0 /* This makes sure that we can
12320 safely subtract 1 from
12321 RExC_parse in the next clause.
12322 If we have something on the
12323 stack, we have parsed something
12325 && UCHARAT(RExC_parse - 1) == '('
12326 && RExC_parse < RExC_end)
12328 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12329 * This happens when we have some thing like
12331 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12333 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12335 * Here we would be handling the interpolated
12336 * '$thai_or_lao'. We handle this by a recursive call to
12337 * ourselves which returns the inversion list the
12338 * interpolated expression evaluates to. We use the flags
12339 * from the interpolated pattern. */
12340 U32 save_flags = RExC_flags;
12341 const char * const save_parse = ++RExC_parse;
12343 parse_lparen_question_flags(pRExC_state);
12345 if (RExC_parse == save_parse /* Makes sure there was at
12346 least one flag (or this
12347 embedding wasn't compiled)
12349 || RExC_parse >= RExC_end - 4
12350 || UCHARAT(RExC_parse) != ':'
12351 || UCHARAT(++RExC_parse) != '('
12352 || UCHARAT(++RExC_parse) != '?'
12353 || UCHARAT(++RExC_parse) != '[')
12356 /* In combination with the above, this moves the
12357 * pointer to the point just after the first erroneous
12358 * character (or if there are no flags, to where they
12359 * should have been) */
12360 if (RExC_parse >= RExC_end - 4) {
12361 RExC_parse = RExC_end;
12363 else if (RExC_parse != save_parse) {
12364 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12366 vFAIL("Expecting '(?flags:(?[...'");
12369 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12370 depth+1, oregcomp_parse);
12372 /* Here, 'current' contains the embedded expression's
12373 * inversion list, and RExC_parse points to the trailing
12374 * ']'; the next character should be the ')' which will be
12375 * paired with the '(' that has been put on the stack, so
12376 * the whole embedded expression reduces to '(operand)' */
12379 RExC_flags = save_flags;
12380 goto handle_operand;
12385 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12386 vFAIL("Unexpected character");
12389 /* regclass() can only return RESTART_UTF8 if multi-char
12390 folds are allowed. */
12391 if (!regclass(pRExC_state, flagp,depth+1,
12392 TRUE, /* means parse just the next thing */
12393 FALSE, /* don't allow multi-char folds */
12394 FALSE, /* don't silence non-portable warnings. */
12396 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12398 /* regclass() will return with parsing just the \ sequence,
12399 * leaving the parse pointer at the next thing to parse */
12401 goto handle_operand;
12403 case '[': /* Is a bracketed character class */
12405 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12407 if (! is_posix_class) {
12411 /* regclass() can only return RESTART_UTF8 if multi-char
12412 folds are allowed. */
12413 if(!regclass(pRExC_state, flagp,depth+1,
12414 is_posix_class, /* parse the whole char class
12415 only if not a posix class */
12416 FALSE, /* don't allow multi-char folds */
12417 FALSE, /* don't silence non-portable warnings. */
12419 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12421 /* function call leaves parse pointing to the ']', except if we
12423 if (is_posix_class) {
12427 goto handle_operand;
12436 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12437 || ! IS_OPERAND(*top_ptr))
12440 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12442 av_push(stack, newSVuv(curchar));
12446 av_push(stack, newSVuv(curchar));
12450 if (top_index >= 0) {
12451 top_ptr = av_fetch(stack, top_index, FALSE);
12453 if (IS_OPERAND(*top_ptr)) {
12455 vFAIL("Unexpected '(' with no preceding operator");
12458 av_push(stack, newSVuv(curchar));
12465 || ! (current = av_pop(stack))
12466 || ! IS_OPERAND(current)
12467 || ! (lparen = av_pop(stack))
12468 || IS_OPERAND(lparen)
12469 || SvUV(lparen) != '(')
12471 SvREFCNT_dec(current);
12473 vFAIL("Unexpected ')'");
12476 SvREFCNT_dec_NN(lparen);
12483 /* Here, we have an operand to process, in 'current' */
12485 if (top_index < 0) { /* Just push if stack is empty */
12486 av_push(stack, current);
12489 SV* top = av_pop(stack);
12491 char current_operator;
12493 if (IS_OPERAND(top)) {
12494 SvREFCNT_dec_NN(top);
12495 SvREFCNT_dec_NN(current);
12496 vFAIL("Operand with no preceding operator");
12498 current_operator = (char) SvUV(top);
12499 switch (current_operator) {
12500 case '(': /* Push the '(' back on followed by the new
12502 av_push(stack, top);
12503 av_push(stack, current);
12504 SvREFCNT_inc(top); /* Counters the '_dec' done
12505 just after the 'break', so
12506 it doesn't get wrongly freed
12511 _invlist_invert(current);
12513 /* Unlike binary operators, the top of the stack,
12514 * now that this unary one has been popped off, may
12515 * legally be an operator, and we now have operand
12518 SvREFCNT_dec_NN(top);
12519 goto handle_operand;
12522 prev = av_pop(stack);
12523 _invlist_intersection(prev,
12526 av_push(stack, current);
12531 prev = av_pop(stack);
12532 _invlist_union(prev, current, ¤t);
12533 av_push(stack, current);
12537 prev = av_pop(stack);;
12538 _invlist_subtract(prev, current, ¤t);
12539 av_push(stack, current);
12542 case '^': /* The union minus the intersection */
12548 prev = av_pop(stack);
12549 _invlist_union(prev, current, &u);
12550 _invlist_intersection(prev, current, &i);
12551 /* _invlist_subtract will overwrite current
12552 without freeing what it already contains */
12554 _invlist_subtract(u, i, ¤t);
12555 av_push(stack, current);
12556 SvREFCNT_dec_NN(i);
12557 SvREFCNT_dec_NN(u);
12558 SvREFCNT_dec_NN(element);
12563 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12565 SvREFCNT_dec_NN(top);
12566 SvREFCNT_dec(prev);
12570 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12573 if (av_tindex(stack) < 0 /* Was empty */
12574 || ((final = av_pop(stack)) == NULL)
12575 || ! IS_OPERAND(final)
12576 || av_tindex(stack) >= 0) /* More left on stack */
12578 vFAIL("Incomplete expression within '(?[ ])'");
12581 /* Here, 'final' is the resultant inversion list from evaluating the
12582 * expression. Return it if so requested */
12583 if (return_invlist) {
12584 *return_invlist = final;
12588 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12589 * expecting a string of ranges and individual code points */
12590 invlist_iterinit(final);
12591 result_string = newSVpvs("");
12592 while (invlist_iternext(final, &start, &end)) {
12593 if (start == end) {
12594 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12597 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12602 save_parse = RExC_parse;
12603 RExC_parse = SvPV(result_string, len);
12604 save_end = RExC_end;
12605 RExC_end = RExC_parse + len;
12607 /* We turn off folding around the call, as the class we have constructed
12608 * already has all folding taken into consideration, and we don't want
12609 * regclass() to add to that */
12610 RExC_flags &= ~RXf_PMf_FOLD;
12611 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12613 node = regclass(pRExC_state, flagp,depth+1,
12614 FALSE, /* means parse the whole char class */
12615 FALSE, /* don't allow multi-char folds */
12616 TRUE, /* silence non-portable warnings. The above may very
12617 well have generated non-portable code points, but
12618 they're valid on this machine */
12621 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12624 RExC_flags |= RXf_PMf_FOLD;
12626 RExC_parse = save_parse + 1;
12627 RExC_end = save_end;
12628 SvREFCNT_dec_NN(final);
12629 SvREFCNT_dec_NN(result_string);
12631 nextchar(pRExC_state);
12632 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12637 /* The names of properties whose definitions are not known at compile time are
12638 * stored in this SV, after a constant heading. So if the length has been
12639 * changed since initialization, then there is a run-time definition. */
12640 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12643 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12644 const bool stop_at_1, /* Just parse the next thing, don't
12645 look for a full character class */
12646 bool allow_multi_folds,
12647 const bool silence_non_portable, /* Don't output warnings
12650 SV** ret_invlist) /* Return an inversion list, not a node */
12652 /* parse a bracketed class specification. Most of these will produce an
12653 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12654 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12655 * under /i with multi-character folds: it will be rewritten following the
12656 * paradigm of this example, where the <multi-fold>s are characters which
12657 * fold to multiple character sequences:
12658 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12659 * gets effectively rewritten as:
12660 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12661 * reg() gets called (recursively) on the rewritten version, and this
12662 * function will return what it constructs. (Actually the <multi-fold>s
12663 * aren't physically removed from the [abcdefghi], it's just that they are
12664 * ignored in the recursion by means of a flag:
12665 * <RExC_in_multi_char_class>.)
12667 * ANYOF nodes contain a bit map for the first 256 characters, with the
12668 * corresponding bit set if that character is in the list. For characters
12669 * above 255, a range list or swash is used. There are extra bits for \w,
12670 * etc. in locale ANYOFs, as what these match is not determinable at
12673 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12674 * to be restarted. This can only happen if ret_invlist is non-NULL.
12678 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12680 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12683 IV namedclass = OOB_NAMEDCLASS;
12684 char *rangebegin = NULL;
12685 bool need_class = 0;
12687 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12688 than just initialized. */
12689 SV* properties = NULL; /* Code points that match \p{} \P{} */
12690 SV* posixes = NULL; /* Code points that match classes like, [:word:],
12691 extended beyond the Latin1 range */
12692 UV element_count = 0; /* Number of distinct elements in the class.
12693 Optimizations may be possible if this is tiny */
12694 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12695 character; used under /i */
12697 char * stop_ptr = RExC_end; /* where to stop parsing */
12698 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12700 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12702 /* Unicode properties are stored in a swash; this holds the current one
12703 * being parsed. If this swash is the only above-latin1 component of the
12704 * character class, an optimization is to pass it directly on to the
12705 * execution engine. Otherwise, it is set to NULL to indicate that there
12706 * are other things in the class that have to be dealt with at execution
12708 SV* swash = NULL; /* Code points that match \p{} \P{} */
12710 /* Set if a component of this character class is user-defined; just passed
12711 * on to the engine */
12712 bool has_user_defined_property = FALSE;
12714 /* inversion list of code points this node matches only when the target
12715 * string is in UTF-8. (Because is under /d) */
12716 SV* depends_list = NULL;
12718 /* inversion list of code points this node matches. For much of the
12719 * function, it includes only those that match regardless of the utf8ness
12720 * of the target string */
12721 SV* cp_list = NULL;
12724 /* In a range, counts how many 0-2 of the ends of it came from literals,
12725 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12726 UV literal_endpoint = 0;
12728 bool invert = FALSE; /* Is this class to be complemented */
12730 /* Is there any thing like \W or [:^digit:] that matches above the legal
12731 * Unicode range? */
12732 bool runtime_posix_matches_above_Unicode = FALSE;
12734 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12735 case we need to change the emitted regop to an EXACT. */
12736 const char * orig_parse = RExC_parse;
12737 const SSize_t orig_size = RExC_size;
12738 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12739 GET_RE_DEBUG_FLAGS_DECL;
12741 PERL_ARGS_ASSERT_REGCLASS;
12743 PERL_UNUSED_ARG(depth);
12746 DEBUG_PARSE("clas");
12748 /* Assume we are going to generate an ANYOF node. */
12749 ret = reganode(pRExC_state, ANYOF, 0);
12752 RExC_size += ANYOF_SKIP;
12753 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12756 ANYOF_FLAGS(ret) = 0;
12758 RExC_emit += ANYOF_SKIP;
12760 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12762 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12763 initial_listsv_len = SvCUR(listsv);
12764 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12768 RExC_parse = regpatws(pRExC_state, RExC_parse,
12769 FALSE /* means don't recognize comments */);
12772 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12775 allow_multi_folds = FALSE;
12778 RExC_parse = regpatws(pRExC_state, RExC_parse,
12779 FALSE /* means don't recognize comments */);
12783 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12784 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12785 const char *s = RExC_parse;
12786 const char c = *s++;
12788 while (isWORDCHAR(*s))
12790 if (*s && c == *s && s[1] == ']') {
12791 SAVEFREESV(RExC_rx_sv);
12793 "POSIX syntax [%c %c] belongs inside character classes",
12795 (void)ReREFCNT_inc(RExC_rx_sv);
12799 /* If the caller wants us to just parse a single element, accomplish this
12800 * by faking the loop ending condition */
12801 if (stop_at_1 && RExC_end > RExC_parse) {
12802 stop_ptr = RExC_parse + 1;
12805 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12806 if (UCHARAT(RExC_parse) == ']')
12807 goto charclassloop;
12811 if (RExC_parse >= stop_ptr) {
12816 RExC_parse = regpatws(pRExC_state, RExC_parse,
12817 FALSE /* means don't recognize comments */);
12820 if (UCHARAT(RExC_parse) == ']') {
12826 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12827 save_value = value;
12828 save_prevvalue = prevvalue;
12831 rangebegin = RExC_parse;
12835 value = utf8n_to_uvchr((U8*)RExC_parse,
12836 RExC_end - RExC_parse,
12837 &numlen, UTF8_ALLOW_DEFAULT);
12838 RExC_parse += numlen;
12841 value = UCHARAT(RExC_parse++);
12844 && RExC_parse < RExC_end
12845 && POSIXCC(UCHARAT(RExC_parse)))
12847 namedclass = regpposixcc(pRExC_state, value, strict);
12849 else if (value == '\\') {
12851 value = utf8n_to_uvchr((U8*)RExC_parse,
12852 RExC_end - RExC_parse,
12853 &numlen, UTF8_ALLOW_DEFAULT);
12854 RExC_parse += numlen;
12857 value = UCHARAT(RExC_parse++);
12859 /* Some compilers cannot handle switching on 64-bit integer
12860 * values, therefore value cannot be an UV. Yes, this will
12861 * be a problem later if we want switch on Unicode.
12862 * A similar issue a little bit later when switching on
12863 * namedclass. --jhi */
12865 /* If the \ is escaping white space when white space is being
12866 * skipped, it means that that white space is wanted literally, and
12867 * is already in 'value'. Otherwise, need to translate the escape
12868 * into what it signifies. */
12869 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12871 case 'w': namedclass = ANYOF_WORDCHAR; break;
12872 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12873 case 's': namedclass = ANYOF_SPACE; break;
12874 case 'S': namedclass = ANYOF_NSPACE; break;
12875 case 'd': namedclass = ANYOF_DIGIT; break;
12876 case 'D': namedclass = ANYOF_NDIGIT; break;
12877 case 'v': namedclass = ANYOF_VERTWS; break;
12878 case 'V': namedclass = ANYOF_NVERTWS; break;
12879 case 'h': namedclass = ANYOF_HORIZWS; break;
12880 case 'H': namedclass = ANYOF_NHORIZWS; break;
12881 case 'N': /* Handle \N{NAME} in class */
12883 /* We only pay attention to the first char of
12884 multichar strings being returned. I kinda wonder
12885 if this makes sense as it does change the behaviour
12886 from earlier versions, OTOH that behaviour was broken
12888 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12889 TRUE, /* => charclass */
12892 if (*flagp & RESTART_UTF8)
12893 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12903 /* We will handle any undefined properties ourselves */
12904 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12906 if (RExC_parse >= RExC_end)
12907 vFAIL2("Empty \\%c{}", (U8)value);
12908 if (*RExC_parse == '{') {
12909 const U8 c = (U8)value;
12910 e = strchr(RExC_parse++, '}');
12912 vFAIL2("Missing right brace on \\%c{}", c);
12913 while (isSPACE(UCHARAT(RExC_parse)))
12915 if (e == RExC_parse)
12916 vFAIL2("Empty \\%c{}", c);
12917 n = e - RExC_parse;
12918 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12930 if (UCHARAT(RExC_parse) == '^') {
12933 /* toggle. (The rhs xor gets the single bit that
12934 * differs between P and p; the other xor inverts just
12936 value ^= 'P' ^ 'p';
12938 while (isSPACE(UCHARAT(RExC_parse))) {
12943 /* Try to get the definition of the property into
12944 * <invlist>. If /i is in effect, the effective property
12945 * will have its name be <__NAME_i>. The design is
12946 * discussed in commit
12947 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12948 formatted = Perl_form(aTHX_
12950 (FOLD) ? "__" : "",
12955 name = savepvn(formatted, strlen(formatted));
12957 /* Look up the property name, and get its swash and
12958 * inversion list, if the property is found */
12960 SvREFCNT_dec_NN(swash);
12962 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12965 NULL, /* No inversion list */
12968 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12970 SvREFCNT_dec_NN(swash);
12974 /* Here didn't find it. It could be a user-defined
12975 * property that will be available at run-time. If we
12976 * accept only compile-time properties, is an error;
12977 * otherwise add it to the list for run-time look up */
12979 RExC_parse = e + 1;
12981 "Property '%"UTF8f"' is unknown",
12982 UTF8fARG(UTF, n, name));
12984 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
12985 (value == 'p' ? '+' : '!'),
12986 UTF8fARG(UTF, n, name));
12987 has_user_defined_property = TRUE;
12989 /* We don't know yet, so have to assume that the
12990 * property could match something in the Latin1 range,
12991 * hence something that isn't utf8. Note that this
12992 * would cause things in <depends_list> to match
12993 * inappropriately, except that any \p{}, including
12994 * this one forces Unicode semantics, which means there
12995 * is <no depends_list> */
12996 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13000 /* Here, did get the swash and its inversion list. If
13001 * the swash is from a user-defined property, then this
13002 * whole character class should be regarded as such */
13003 has_user_defined_property =
13005 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
13007 /* Invert if asking for the complement */
13008 if (value == 'P') {
13009 _invlist_union_complement_2nd(properties,
13013 /* The swash can't be used as-is, because we've
13014 * inverted things; delay removing it to here after
13015 * have copied its invlist above */
13016 SvREFCNT_dec_NN(swash);
13020 _invlist_union(properties, invlist, &properties);
13025 RExC_parse = e + 1;
13026 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13029 /* \p means they want Unicode semantics */
13030 RExC_uni_semantics = 1;
13033 case 'n': value = '\n'; break;
13034 case 'r': value = '\r'; break;
13035 case 't': value = '\t'; break;
13036 case 'f': value = '\f'; break;
13037 case 'b': value = '\b'; break;
13038 case 'e': value = ASCII_TO_NATIVE('\033');break;
13039 case 'a': value = '\a'; break;
13041 RExC_parse--; /* function expects to be pointed at the 'o' */
13043 const char* error_msg;
13044 bool valid = grok_bslash_o(&RExC_parse,
13047 SIZE_ONLY, /* warnings in pass
13050 silence_non_portable,
13056 if (PL_encoding && value < 0x100) {
13057 goto recode_encoding;
13061 RExC_parse--; /* function expects to be pointed at the 'x' */
13063 const char* error_msg;
13064 bool valid = grok_bslash_x(&RExC_parse,
13067 TRUE, /* Output warnings */
13069 silence_non_portable,
13075 if (PL_encoding && value < 0x100)
13076 goto recode_encoding;
13079 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13081 case '0': case '1': case '2': case '3': case '4':
13082 case '5': case '6': case '7':
13084 /* Take 1-3 octal digits */
13085 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13086 numlen = (strict) ? 4 : 3;
13087 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13088 RExC_parse += numlen;
13091 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13092 vFAIL("Need exactly 3 octal digits");
13094 else if (! SIZE_ONLY /* like \08, \178 */
13096 && RExC_parse < RExC_end
13097 && isDIGIT(*RExC_parse)
13098 && ckWARN(WARN_REGEXP))
13100 SAVEFREESV(RExC_rx_sv);
13101 reg_warn_non_literal_string(
13103 form_short_octal_warning(RExC_parse, numlen));
13104 (void)ReREFCNT_inc(RExC_rx_sv);
13107 if (PL_encoding && value < 0x100)
13108 goto recode_encoding;
13112 if (! RExC_override_recoding) {
13113 SV* enc = PL_encoding;
13114 value = reg_recode((const char)(U8)value, &enc);
13117 vFAIL("Invalid escape in the specified encoding");
13119 else if (SIZE_ONLY) {
13120 ckWARNreg(RExC_parse,
13121 "Invalid escape in the specified encoding");
13127 /* Allow \_ to not give an error */
13128 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13130 vFAIL2("Unrecognized escape \\%c in character class",
13134 SAVEFREESV(RExC_rx_sv);
13135 ckWARN2reg(RExC_parse,
13136 "Unrecognized escape \\%c in character class passed through",
13138 (void)ReREFCNT_inc(RExC_rx_sv);
13142 } /* End of switch on char following backslash */
13143 } /* end of handling backslash escape sequences */
13146 literal_endpoint++;
13149 /* Here, we have the current token in 'value' */
13151 /* What matches in a locale is not known until runtime. This includes
13152 * what the Posix classes (like \w, [:space:]) match. Room must be
13153 * reserved (one time per outer bracketed class) to store such classes,
13154 * either if Perl is compiled so that locale nodes always should have
13155 * this space, or if there is such posix class info to be stored. The
13156 * space will contain a bit for each named class that is to be matched
13157 * against. This isn't needed for \p{} and pseudo-classes, as they are
13158 * not affected by locale, and hence are dealt with separately */
13161 && (ANYOF_LOCALE == ANYOF_POSIXL
13162 || (namedclass > OOB_NAMEDCLASS
13163 && namedclass < ANYOF_POSIXL_MAX)))
13167 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13170 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13172 ANYOF_POSIXL_ZERO(ret);
13173 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13176 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13179 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13180 * literal, as is the character that began the false range, i.e.
13181 * the 'a' in the examples */
13184 const int w = (RExC_parse >= rangebegin)
13185 ? RExC_parse - rangebegin
13189 "False [] range \"%"UTF8f"\"",
13190 UTF8fARG(UTF, w, rangebegin));
13193 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13194 ckWARN2reg(RExC_parse,
13195 "False [] range \"%"UTF8f"\"",
13196 UTF8fARG(UTF, w, rangebegin));
13197 (void)ReREFCNT_inc(RExC_rx_sv);
13198 cp_list = add_cp_to_invlist(cp_list, '-');
13199 cp_list = add_cp_to_invlist(cp_list, prevvalue);
13203 range = 0; /* this was not a true range */
13204 element_count += 2; /* So counts for three values */
13207 classnum = namedclass_to_classnum(namedclass);
13209 if (LOC && namedclass < ANYOF_POSIXL_MAX
13210 #ifndef HAS_ISASCII
13211 && classnum != _CC_ASCII
13213 #ifndef HAS_ISBLANK
13214 && classnum != _CC_BLANK
13217 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13218 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13222 posixl_matches_all = TRUE;
13225 ANYOF_POSIXL_SET(ret, namedclass);
13227 /* XXX After have made all the posix classes known at compile time
13228 * we can move the LOC handling below to above */
13231 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13232 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13234 /* Here, should be \h, \H, \v, or \V. Neither /d nor
13235 * /l make a difference in what these match. There
13236 * would be problems if these characters had folds
13237 * other than themselves, as cp_list is subject to
13239 if (classnum != _CC_VERTSPACE) {
13240 assert( namedclass == ANYOF_HORIZWS
13241 || namedclass == ANYOF_NHORIZWS);
13243 /* It turns out that \h is just a synonym for
13245 classnum = _CC_BLANK;
13248 _invlist_union_maybe_complement_2nd(
13250 PL_XPosix_ptrs[classnum],
13251 cBOOL(namedclass % 2), /* Complement if odd
13252 (NHORIZWS, NVERTWS)
13257 else if (classnum == _CC_ASCII) {
13260 ANYOF_POSIXL_SET(ret, namedclass);
13263 #endif /* Not isascii(); just use the hard-coded definition for it */
13264 _invlist_union_maybe_complement_2nd(
13266 PL_Posix_ptrs[_CC_ASCII],
13267 cBOOL(namedclass % 2), /* Complement if odd
13271 else { /* Garden variety class */
13273 /* The ascii range inversion list */
13274 SV* ascii_source = PL_Posix_ptrs[classnum];
13276 /* The full Latin1 range inversion list */
13277 SV* l1_source = PL_L1Posix_ptrs[classnum];
13279 /* This code is structured into two major clauses. The
13280 * first is for classes whose complete definitions may not
13281 * already be known. If not, the Latin1 definition
13282 * (guaranteed to already known) is used plus code is
13283 * generated to load the rest at run-time (only if needed).
13284 * If the complete definition is known, it drops down to
13285 * the second clause, where the complete definition is
13288 if (classnum < _FIRST_NON_SWASH_CC) {
13290 /* Here, the class has a swash, which may or not
13291 * already be loaded */
13293 /* The name of the property to use to match the full
13294 * eXtended Unicode range swash for this character
13296 const char *Xname = swash_property_names[classnum];
13298 /* If returning the inversion list, we can't defer
13299 * getting this until runtime */
13300 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
13301 PL_utf8_swash_ptrs[classnum] =
13302 _core_swash_init("utf8", Xname, &PL_sv_undef,
13305 NULL, /* No inversion list */
13306 NULL /* No flags */
13308 assert(PL_utf8_swash_ptrs[classnum]);
13310 if ( ! PL_utf8_swash_ptrs[classnum]) {
13311 if (namedclass % 2 == 0) { /* A non-complemented
13313 /* If not /a matching, there are code points we
13314 * don't know at compile time. Arrange for the
13315 * unknown matches to be loaded at run-time, if
13317 if (! AT_LEAST_ASCII_RESTRICTED) {
13318 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13321 if (LOC) { /* Under locale, set run-time
13323 ANYOF_POSIXL_SET(ret, namedclass);
13326 /* Add the current class's code points to
13327 * the running total */
13328 _invlist_union(posixes,
13329 (AT_LEAST_ASCII_RESTRICTED)
13335 else { /* A complemented class */
13336 if (AT_LEAST_ASCII_RESTRICTED) {
13337 /* Under /a should match everything above
13338 * ASCII, plus the complement of the set's
13340 _invlist_union_complement_2nd(posixes,
13345 /* Arrange for the unknown matches to be
13346 * loaded at run-time, if needed */
13347 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13349 runtime_posix_matches_above_Unicode = TRUE;
13351 ANYOF_POSIXL_SET(ret, namedclass);
13355 /* We want to match everything in
13356 * Latin1, except those things that
13357 * l1_source matches */
13358 SV* scratch_list = NULL;
13359 _invlist_subtract(PL_Latin1, l1_source,
13362 /* Add the list from this class to the
13365 posixes = scratch_list;
13368 _invlist_union(posixes,
13371 SvREFCNT_dec_NN(scratch_list);
13373 if (DEPENDS_SEMANTICS) {
13375 |= ANYOF_NON_UTF8_LATIN1_ALL;
13380 goto namedclass_done;
13383 /* Here, there is a swash loaded for the class. If no
13384 * inversion list for it yet, get it */
13385 if (! PL_XPosix_ptrs[classnum]) {
13386 PL_XPosix_ptrs[classnum]
13387 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13391 /* Here there is an inversion list already loaded for the
13394 if (namedclass % 2 == 0) { /* A non-complemented class,
13395 like ANYOF_PUNCT */
13397 /* For non-locale, just add it to any existing list
13399 _invlist_union(posixes,
13400 (AT_LEAST_ASCII_RESTRICTED)
13402 : PL_XPosix_ptrs[classnum],
13405 else { /* Locale */
13406 SV* scratch_list = NULL;
13408 /* For above Latin1 code points, we use the full
13410 _invlist_intersection(PL_AboveLatin1,
13411 PL_XPosix_ptrs[classnum],
13413 /* And set the output to it, adding instead if
13414 * there already is an output. Checking if
13415 * 'posixes' is NULL first saves an extra clone.
13416 * Its reference count will be decremented at the
13417 * next union, etc, or if this is the only
13418 * instance, at the end of the routine */
13420 posixes = scratch_list;
13423 _invlist_union(posixes, scratch_list, &posixes);
13424 SvREFCNT_dec_NN(scratch_list);
13427 #ifndef HAS_ISBLANK
13428 if (namedclass != ANYOF_BLANK) {
13430 /* Set this class in the node for runtime
13432 ANYOF_POSIXL_SET(ret, namedclass);
13433 #ifndef HAS_ISBLANK
13436 /* No isblank(), use the hard-coded ASCII-range
13437 * blanks, adding them to the running total. */
13439 _invlist_union(posixes, ascii_source, &posixes);
13444 else { /* A complemented class, like ANYOF_NPUNCT */
13446 _invlist_union_complement_2nd(
13448 (AT_LEAST_ASCII_RESTRICTED)
13450 : PL_XPosix_ptrs[classnum],
13452 /* Under /d, everything in the upper half of the
13453 * Latin1 range matches this complement */
13454 if (DEPENDS_SEMANTICS) {
13455 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13458 else { /* Locale */
13459 SV* scratch_list = NULL;
13460 _invlist_subtract(PL_AboveLatin1,
13461 PL_XPosix_ptrs[classnum],
13464 posixes = scratch_list;
13467 _invlist_union(posixes, scratch_list, &posixes);
13468 SvREFCNT_dec_NN(scratch_list);
13470 #ifndef HAS_ISBLANK
13471 if (namedclass != ANYOF_NBLANK) {
13473 ANYOF_POSIXL_SET(ret, namedclass);
13474 #ifndef HAS_ISBLANK
13477 /* Get the list of all code points in Latin1
13478 * that are not ASCII blanks, and add them to
13479 * the running total */
13480 _invlist_subtract(PL_Latin1, ascii_source,
13482 _invlist_union(posixes, scratch_list, &posixes);
13483 SvREFCNT_dec_NN(scratch_list);
13490 continue; /* Go get next character */
13492 } /* end of namedclass \blah */
13494 /* Here, we have a single value. If 'range' is set, it is the ending
13495 * of a range--check its validity. Later, we will handle each
13496 * individual code point in the range. If 'range' isn't set, this
13497 * could be the beginning of a range, so check for that by looking
13498 * ahead to see if the next real character to be processed is the range
13499 * indicator--the minus sign */
13502 RExC_parse = regpatws(pRExC_state, RExC_parse,
13503 FALSE /* means don't recognize comments */);
13507 if (prevvalue > value) /* b-a */ {
13508 const int w = RExC_parse - rangebegin;
13510 "Invalid [] range \"%"UTF8f"\"",
13511 UTF8fARG(UTF, w, rangebegin));
13512 range = 0; /* not a valid range */
13516 prevvalue = value; /* save the beginning of the potential range */
13517 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13518 && *RExC_parse == '-')
13520 char* next_char_ptr = RExC_parse + 1;
13521 if (skip_white) { /* Get the next real char after the '-' */
13522 next_char_ptr = regpatws(pRExC_state,
13524 FALSE); /* means don't recognize
13528 /* If the '-' is at the end of the class (just before the ']',
13529 * it is a literal minus; otherwise it is a range */
13530 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13531 RExC_parse = next_char_ptr;
13533 /* a bad range like \w-, [:word:]- ? */
13534 if (namedclass > OOB_NAMEDCLASS) {
13535 if (strict || ckWARN(WARN_REGEXP)) {
13537 RExC_parse >= rangebegin ?
13538 RExC_parse - rangebegin : 0;
13540 vFAIL4("False [] range \"%*.*s\"",
13545 "False [] range \"%*.*s\"",
13550 cp_list = add_cp_to_invlist(cp_list, '-');
13554 range = 1; /* yeah, it's a range! */
13555 continue; /* but do it the next time */
13560 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13563 /* non-Latin1 code point implies unicode semantics. Must be set in
13564 * pass1 so is there for the whole of pass 2 */
13566 RExC_uni_semantics = 1;
13569 /* Ready to process either the single value, or the completed range.
13570 * For single-valued non-inverted ranges, we consider the possibility
13571 * of multi-char folds. (We made a conscious decision to not do this
13572 * for the other cases because it can often lead to non-intuitive
13573 * results. For example, you have the peculiar case that:
13574 * "s s" =~ /^[^\xDF]+$/i => Y
13575 * "ss" =~ /^[^\xDF]+$/i => N
13577 * See [perl #89750] */
13578 if (FOLD && allow_multi_folds && value == prevvalue) {
13579 if (value == LATIN_SMALL_LETTER_SHARP_S
13580 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13583 /* Here <value> is indeed a multi-char fold. Get what it is */
13585 U8 foldbuf[UTF8_MAXBYTES_CASE];
13588 UV folded = _to_uni_fold_flags(
13593 | ((LOC) ? FOLD_FLAGS_LOCALE
13594 : (ASCII_FOLD_RESTRICTED)
13595 ? FOLD_FLAGS_NOMIX_ASCII
13599 /* Here, <folded> should be the first character of the
13600 * multi-char fold of <value>, with <foldbuf> containing the
13601 * whole thing. But, if this fold is not allowed (because of
13602 * the flags), <fold> will be the same as <value>, and should
13603 * be processed like any other character, so skip the special
13605 if (folded != value) {
13607 /* Skip if we are recursed, currently parsing the class
13608 * again. Otherwise add this character to the list of
13609 * multi-char folds. */
13610 if (! RExC_in_multi_char_class) {
13611 AV** this_array_ptr;
13613 STRLEN cp_count = utf8_length(foldbuf,
13614 foldbuf + foldlen);
13615 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13617 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13620 if (! multi_char_matches) {
13621 multi_char_matches = newAV();
13624 /* <multi_char_matches> is actually an array of arrays.
13625 * There will be one or two top-level elements: [2],
13626 * and/or [3]. The [2] element is an array, each
13627 * element thereof is a character which folds to TWO
13628 * characters; [3] is for folds to THREE characters.
13629 * (Unicode guarantees a maximum of 3 characters in any
13630 * fold.) When we rewrite the character class below,
13631 * we will do so such that the longest folds are
13632 * written first, so that it prefers the longest
13633 * matching strings first. This is done even if it
13634 * turns out that any quantifier is non-greedy, out of
13635 * programmer laziness. Tom Christiansen has agreed
13636 * that this is ok. This makes the test for the
13637 * ligature 'ffi' come before the test for 'ff' */
13638 if (av_exists(multi_char_matches, cp_count)) {
13639 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13641 this_array = *this_array_ptr;
13644 this_array = newAV();
13645 av_store(multi_char_matches, cp_count,
13648 av_push(this_array, multi_fold);
13651 /* This element should not be processed further in this
13654 value = save_value;
13655 prevvalue = save_prevvalue;
13661 /* Deal with this element of the class */
13664 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13666 SV* this_range = _new_invlist(1);
13667 _append_range_to_invlist(this_range, prevvalue, value);
13669 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13670 * If this range was specified using something like 'i-j', we want
13671 * to include only the 'i' and the 'j', and not anything in
13672 * between, so exclude non-ASCII, non-alphabetics from it.
13673 * However, if the range was specified with something like
13674 * [\x89-\x91] or [\x89-j], all code points within it should be
13675 * included. literal_endpoint==2 means both ends of the range used
13676 * a literal character, not \x{foo} */
13677 if (literal_endpoint == 2
13678 && ((prevvalue >= 'a' && value <= 'z')
13679 || (prevvalue >= 'A' && value <= 'Z')))
13681 _invlist_intersection(this_range, PL_ASCII,
13683 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13686 _invlist_union(cp_list, this_range, &cp_list);
13687 literal_endpoint = 0;
13691 range = 0; /* this range (if it was one) is done now */
13692 } /* End of loop through all the text within the brackets */
13694 /* If anything in the class expands to more than one character, we have to
13695 * deal with them by building up a substitute parse string, and recursively
13696 * calling reg() on it, instead of proceeding */
13697 if (multi_char_matches) {
13698 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13701 char *save_end = RExC_end;
13702 char *save_parse = RExC_parse;
13703 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13708 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13709 because too confusing */
13711 sv_catpv(substitute_parse, "(?:");
13715 /* Look at the longest folds first */
13716 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13718 if (av_exists(multi_char_matches, cp_count)) {
13719 AV** this_array_ptr;
13722 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13724 while ((this_sequence = av_pop(*this_array_ptr)) !=
13727 if (! first_time) {
13728 sv_catpv(substitute_parse, "|");
13730 first_time = FALSE;
13732 sv_catpv(substitute_parse, SvPVX(this_sequence));
13737 /* If the character class contains anything else besides these
13738 * multi-character folds, have to include it in recursive parsing */
13739 if (element_count) {
13740 sv_catpv(substitute_parse, "|[");
13741 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13742 sv_catpv(substitute_parse, "]");
13745 sv_catpv(substitute_parse, ")");
13748 /* This is a way to get the parse to skip forward a whole named
13749 * sequence instead of matching the 2nd character when it fails the
13751 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13755 RExC_parse = SvPV(substitute_parse, len);
13756 RExC_end = RExC_parse + len;
13757 RExC_in_multi_char_class = 1;
13758 RExC_emit = (regnode *)orig_emit;
13760 ret = reg(pRExC_state, 1, ®_flags, depth+1);
13762 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13764 RExC_parse = save_parse;
13765 RExC_end = save_end;
13766 RExC_in_multi_char_class = 0;
13767 SvREFCNT_dec_NN(multi_char_matches);
13771 /* If the character class contains only a single element, it may be
13772 * optimizable into another node type which is smaller and runs faster.
13773 * Check if this is the case for this class */
13774 if ((element_count == 1 && ! ret_invlist)
13775 || UNLIKELY(posixl_matches_all))
13780 if (UNLIKELY(posixl_matches_all)) {
13783 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13784 \w or [:digit:] or \p{foo}
13787 /* All named classes are mapped into POSIXish nodes, with its FLAG
13788 * argument giving which class it is */
13789 switch ((I32)namedclass) {
13790 case ANYOF_UNIPROP:
13793 /* These don't depend on the charset modifiers. They always
13794 * match under /u rules */
13795 case ANYOF_NHORIZWS:
13796 case ANYOF_HORIZWS:
13797 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13800 case ANYOF_NVERTWS:
13805 /* The actual POSIXish node for all the rest depends on the
13806 * charset modifier. The ones in the first set depend only on
13807 * ASCII or, if available on this platform, locale */
13811 op = (LOC) ? POSIXL : POSIXA;
13822 /* under /a could be alpha */
13824 if (ASCII_RESTRICTED) {
13825 namedclass = ANYOF_ALPHA + (namedclass % 2);
13833 /* The rest have more possibilities depending on the charset.
13834 * We take advantage of the enum ordering of the charset
13835 * modifiers to get the exact node type, */
13837 op = POSIXD + get_regex_charset(RExC_flags);
13838 if (op > POSIXA) { /* /aa is same as /a */
13841 #ifndef HAS_ISBLANK
13843 && (namedclass == ANYOF_BLANK
13844 || namedclass == ANYOF_NBLANK))
13851 /* The odd numbered ones are the complements of the
13852 * next-lower even number one */
13853 if (namedclass % 2 == 1) {
13857 arg = namedclass_to_classnum(namedclass);
13861 else if (value == prevvalue) {
13863 /* Here, the class consists of just a single code point */
13866 if (! LOC && value == '\n') {
13867 op = REG_ANY; /* Optimize [^\n] */
13868 *flagp |= HASWIDTH|SIMPLE;
13872 else if (value < 256 || UTF) {
13874 /* Optimize a single value into an EXACTish node, but not if it
13875 * would require converting the pattern to UTF-8. */
13876 op = compute_EXACTish(pRExC_state);
13878 } /* Otherwise is a range */
13879 else if (! LOC) { /* locale could vary these */
13880 if (prevvalue == '0') {
13881 if (value == '9') {
13888 /* Here, we have changed <op> away from its initial value iff we found
13889 * an optimization */
13892 /* Throw away this ANYOF regnode, and emit the calculated one,
13893 * which should correspond to the beginning, not current, state of
13895 const char * cur_parse = RExC_parse;
13896 RExC_parse = (char *)orig_parse;
13900 /* To get locale nodes to not use the full ANYOF size would
13901 * require moving the code above that writes the portions
13902 * of it that aren't in other nodes to after this point.
13903 * e.g. ANYOF_POSIXL_SET */
13904 RExC_size = orig_size;
13908 RExC_emit = (regnode *)orig_emit;
13909 if (PL_regkind[op] == POSIXD) {
13911 op += NPOSIXD - POSIXD;
13916 ret = reg_node(pRExC_state, op);
13918 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13922 *flagp |= HASWIDTH|SIMPLE;
13924 else if (PL_regkind[op] == EXACT) {
13925 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13928 RExC_parse = (char *) cur_parse;
13930 SvREFCNT_dec(posixes);
13931 SvREFCNT_dec(cp_list);
13938 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13940 /* If folding, we calculate all characters that could fold to or from the
13941 * ones already on the list */
13942 if (FOLD && cp_list) {
13943 UV start, end; /* End points of code point ranges */
13945 SV* fold_intersection = NULL;
13947 /* If the highest code point is within Latin1, we can use the
13948 * compiled-in Alphas list, and not have to go out to disk. This
13949 * yields two false positives, the masculine and feminine ordinal
13950 * indicators, which are weeded out below using the
13951 * IS_IN_SOME_FOLD_L1() macro */
13952 if (invlist_highest(cp_list) < 256) {
13953 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13954 &fold_intersection);
13958 /* Here, there are non-Latin1 code points, so we will have to go
13959 * fetch the list of all the characters that participate in folds
13961 if (! PL_utf8_foldable) {
13962 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13963 &PL_sv_undef, 1, 0);
13964 PL_utf8_foldable = _get_swash_invlist(swash);
13965 SvREFCNT_dec_NN(swash);
13968 /* This is a hash that for a particular fold gives all characters
13969 * that are involved in it */
13970 if (! PL_utf8_foldclosures) {
13972 /* If we were unable to find any folds, then we likely won't be
13973 * able to find the closures. So just create an empty list.
13974 * Folding will effectively be restricted to the non-Unicode
13975 * rules hard-coded into Perl. (This case happens legitimately
13976 * during compilation of Perl itself before the Unicode tables
13977 * are generated) */
13978 if (_invlist_len(PL_utf8_foldable) == 0) {
13979 PL_utf8_foldclosures = newHV();
13982 /* If the folds haven't been read in, call a fold function
13984 if (! PL_utf8_tofold) {
13985 U8 dummy[UTF8_MAXBYTES_CASE+1];
13987 /* This string is just a short named one above \xff */
13988 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13989 assert(PL_utf8_tofold); /* Verify that worked */
13991 PL_utf8_foldclosures =
13992 _swash_inversion_hash(PL_utf8_tofold);
13996 /* Only the characters in this class that participate in folds need
13997 * be checked. Get the intersection of this class and all the
13998 * possible characters that are foldable. This can quickly narrow
13999 * down a large class */
14000 _invlist_intersection(PL_utf8_foldable, cp_list,
14001 &fold_intersection);
14004 /* Now look at the foldable characters in this class individually */
14005 invlist_iterinit(fold_intersection);
14006 while (invlist_iternext(fold_intersection, &start, &end)) {
14009 /* Locale folding for Latin1 characters is deferred until runtime */
14010 if (LOC && start < 256) {
14014 /* Look at every character in the range */
14015 for (j = start; j <= end; j++) {
14017 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14023 /* We have the latin1 folding rules hard-coded here so that
14024 * an innocent-looking character class, like /[ks]/i won't
14025 * have to go out to disk to find the possible matches.
14026 * XXX It would be better to generate these via regen, in
14027 * case a new version of the Unicode standard adds new
14028 * mappings, though that is not really likely, and may be
14029 * caught by the default: case of the switch below. */
14031 if (IS_IN_SOME_FOLD_L1(j)) {
14033 /* ASCII is always matched; non-ASCII is matched only
14034 * under Unicode rules */
14035 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14037 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14041 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14045 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14046 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14048 /* Certain Latin1 characters have matches outside
14049 * Latin1. To get here, <j> is one of those
14050 * characters. None of these matches is valid for
14051 * ASCII characters under /aa, which is why the 'if'
14052 * just above excludes those. These matches only
14053 * happen when the target string is utf8. The code
14054 * below adds the single fold closures for <j> to the
14055 * inversion list. */
14060 add_cp_to_invlist(cp_list, KELVIN_SIGN);
14064 cp_list = add_cp_to_invlist(cp_list,
14065 LATIN_SMALL_LETTER_LONG_S);
14068 cp_list = add_cp_to_invlist(cp_list,
14069 GREEK_CAPITAL_LETTER_MU);
14070 cp_list = add_cp_to_invlist(cp_list,
14071 GREEK_SMALL_LETTER_MU);
14073 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14074 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14076 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14078 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14079 cp_list = add_cp_to_invlist(cp_list,
14080 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14082 case LATIN_SMALL_LETTER_SHARP_S:
14083 cp_list = add_cp_to_invlist(cp_list,
14084 LATIN_CAPITAL_LETTER_SHARP_S);
14086 case 'F': case 'f':
14087 case 'I': case 'i':
14088 case 'L': case 'l':
14089 case 'T': case 't':
14090 case 'A': case 'a':
14091 case 'H': case 'h':
14092 case 'J': case 'j':
14093 case 'N': case 'n':
14094 case 'W': case 'w':
14095 case 'Y': case 'y':
14096 /* These all are targets of multi-character
14097 * folds from code points that require UTF8 to
14098 * express, so they can't match unless the
14099 * target string is in UTF-8, so no action here
14100 * is necessary, as regexec.c properly handles
14101 * the general case for UTF-8 matching and
14102 * multi-char folds */
14105 /* Use deprecated warning to increase the
14106 * chances of this being output */
14107 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14114 /* Here is an above Latin1 character. We don't have the rules
14115 * hard-coded for it. First, get its fold. This is the simple
14116 * fold, as the multi-character folds have been handled earlier
14117 * and separated out */
14118 _to_uni_fold_flags(j, foldbuf, &foldlen,
14120 ? FOLD_FLAGS_LOCALE
14121 : (ASCII_FOLD_RESTRICTED)
14122 ? FOLD_FLAGS_NOMIX_ASCII
14125 /* Single character fold of above Latin1. Add everything in
14126 * its fold closure to the list that this node should match.
14127 * The fold closures data structure is a hash with the keys
14128 * being the UTF-8 of every character that is folded to, like
14129 * 'k', and the values each an array of all code points that
14130 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14131 * Multi-character folds are not included */
14132 if ((listp = hv_fetch(PL_utf8_foldclosures,
14133 (char *) foldbuf, foldlen, FALSE)))
14135 AV* list = (AV*) *listp;
14137 for (k = 0; k <= av_len(list); k++) {
14138 SV** c_p = av_fetch(list, k, FALSE);
14141 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14145 /* /aa doesn't allow folds between ASCII and non-; /l
14146 * doesn't allow them between above and below 256 */
14147 if ((ASCII_FOLD_RESTRICTED
14148 && (isASCII(c) != isASCII(j)))
14149 || (LOC && c < 256)) {
14153 /* Folds involving non-ascii Latin1 characters
14154 * under /d are added to a separate list */
14155 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14157 cp_list = add_cp_to_invlist(cp_list, c);
14160 depends_list = add_cp_to_invlist(depends_list, c);
14166 SvREFCNT_dec_NN(fold_intersection);
14169 /* And combine the result (if any) with any inversion list from posix
14170 * classes. The lists are kept separate up to now because we don't want to
14171 * fold the classes (folding of those is automatically handled by the swash
14172 * fetching code) */
14174 if (! DEPENDS_SEMANTICS) {
14176 _invlist_union(cp_list, posixes, &cp_list);
14177 SvREFCNT_dec_NN(posixes);
14184 /* Under /d, we put into a separate list the Latin1 things that
14185 * match only when the target string is utf8 */
14186 SV* nonascii_but_latin1_properties = NULL;
14187 _invlist_intersection(posixes, PL_UpperLatin1,
14188 &nonascii_but_latin1_properties);
14189 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14192 _invlist_union(cp_list, posixes, &cp_list);
14193 SvREFCNT_dec_NN(posixes);
14199 if (depends_list) {
14200 _invlist_union(depends_list, nonascii_but_latin1_properties,
14202 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14205 depends_list = nonascii_but_latin1_properties;
14210 /* And combine the result (if any) with any inversion list from properties.
14211 * The lists are kept separate up to now so that we can distinguish the two
14212 * in regards to matching above-Unicode. A run-time warning is generated
14213 * if a Unicode property is matched against a non-Unicode code point. But,
14214 * we allow user-defined properties to match anything, without any warning,
14215 * and we also suppress the warning if there is a portion of the character
14216 * class that isn't a Unicode property, and which matches above Unicode, \W
14217 * or [\x{110000}] for example.
14218 * (Note that in this case, unlike the Posix one above, there is no
14219 * <depends_list>, because having a Unicode property forces Unicode
14222 bool warn_super = ! has_user_defined_property;
14225 /* If it matters to the final outcome, see if a non-property
14226 * component of the class matches above Unicode. If so, the
14227 * warning gets suppressed. This is true even if just a single
14228 * such code point is specified, as though not strictly correct if
14229 * another such code point is matched against, the fact that they
14230 * are using above-Unicode code points indicates they should know
14231 * the issues involved */
14233 bool non_prop_matches_above_Unicode =
14234 runtime_posix_matches_above_Unicode
14235 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14237 non_prop_matches_above_Unicode =
14238 ! non_prop_matches_above_Unicode;
14240 warn_super = ! non_prop_matches_above_Unicode;
14243 _invlist_union(properties, cp_list, &cp_list);
14244 SvREFCNT_dec_NN(properties);
14247 cp_list = properties;
14251 OP(ret) = ANYOF_WARN_SUPER;
14255 /* Here, we have calculated what code points should be in the character
14258 * Now we can see about various optimizations. Fold calculation (which we
14259 * did above) needs to take place before inversion. Otherwise /[^k]/i
14260 * would invert to include K, which under /i would match k, which it
14261 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14262 * folded until runtime */
14264 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14265 * at compile time. Besides not inverting folded locale now, we can't
14266 * invert if there are things such as \w, which aren't known until runtime
14269 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14271 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14273 _invlist_invert(cp_list);
14275 /* Any swash can't be used as-is, because we've inverted things */
14277 SvREFCNT_dec_NN(swash);
14281 /* Clear the invert flag since have just done it here */
14286 *ret_invlist = cp_list;
14287 SvREFCNT_dec(swash);
14289 /* Discard the generated node */
14291 RExC_size = orig_size;
14294 RExC_emit = orig_emit;
14299 /* If we didn't do folding, it's because some information isn't available
14300 * until runtime; set the run-time fold flag for these. (We don't have to
14301 * worry about properties folding, as that is taken care of by the swash
14305 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14308 /* Some character classes are equivalent to other nodes. Such nodes take
14309 * up less room and generally fewer operations to execute than ANYOF nodes.
14310 * Above, we checked for and optimized into some such equivalents for
14311 * certain common classes that are easy to test. Getting to this point in
14312 * the code means that the class didn't get optimized there. Since this
14313 * code is only executed in Pass 2, it is too late to save space--it has
14314 * been allocated in Pass 1, and currently isn't given back. But turning
14315 * things into an EXACTish node can allow the optimizer to join it to any
14316 * adjacent such nodes. And if the class is equivalent to things like /./,
14317 * expensive run-time swashes can be avoided. Now that we have more
14318 * complete information, we can find things necessarily missed by the
14319 * earlier code. I (khw) am not sure how much to look for here. It would
14320 * be easy, but perhaps too slow, to check any candidates against all the
14321 * node types they could possibly match using _invlistEQ(). */
14326 && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14327 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14330 U8 op = END; /* The optimzation node-type */
14331 const char * cur_parse= RExC_parse;
14333 invlist_iterinit(cp_list);
14334 if (! invlist_iternext(cp_list, &start, &end)) {
14336 /* Here, the list is empty. This happens, for example, when a
14337 * Unicode property is the only thing in the character class, and
14338 * it doesn't match anything. (perluniprops.pod notes such
14341 *flagp |= HASWIDTH|SIMPLE;
14343 else if (start == end) { /* The range is a single code point */
14344 if (! invlist_iternext(cp_list, &start, &end)
14346 /* Don't do this optimization if it would require changing
14347 * the pattern to UTF-8 */
14348 && (start < 256 || UTF))
14350 /* Here, the list contains a single code point. Can optimize
14351 * into an EXACT node */
14360 /* A locale node under folding with one code point can be
14361 * an EXACTFL, as its fold won't be calculated until
14367 /* Here, we are generally folding, but there is only one
14368 * code point to match. If we have to, we use an EXACT
14369 * node, but it would be better for joining with adjacent
14370 * nodes in the optimization pass if we used the same
14371 * EXACTFish node that any such are likely to be. We can
14372 * do this iff the code point doesn't participate in any
14373 * folds. For example, an EXACTF of a colon is the same as
14374 * an EXACT one, since nothing folds to or from a colon. */
14376 if (IS_IN_SOME_FOLD_L1(value)) {
14381 if (! PL_utf8_foldable) {
14382 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14383 &PL_sv_undef, 1, 0);
14384 PL_utf8_foldable = _get_swash_invlist(swash);
14385 SvREFCNT_dec_NN(swash);
14387 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14392 /* If we haven't found the node type, above, it means we
14393 * can use the prevailing one */
14395 op = compute_EXACTish(pRExC_state);
14400 else if (start == 0) {
14401 if (end == UV_MAX) {
14403 *flagp |= HASWIDTH|SIMPLE;
14406 else if (end == '\n' - 1
14407 && invlist_iternext(cp_list, &start, &end)
14408 && start == '\n' + 1 && end == UV_MAX)
14411 *flagp |= HASWIDTH|SIMPLE;
14415 invlist_iterfinish(cp_list);
14418 RExC_parse = (char *)orig_parse;
14419 RExC_emit = (regnode *)orig_emit;
14421 ret = reg_node(pRExC_state, op);
14423 RExC_parse = (char *)cur_parse;
14425 if (PL_regkind[op] == EXACT) {
14426 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14429 SvREFCNT_dec_NN(cp_list);
14434 /* Here, <cp_list> contains all the code points we can determine at
14435 * compile time that match under all conditions. Go through it, and
14436 * for things that belong in the bitmap, put them there, and delete from
14437 * <cp_list>. While we are at it, see if everything above 255 is in the
14438 * list, and if so, set a flag to speed up execution */
14440 populate_ANYOF_from_invlist(ret, &cp_list);
14443 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14446 /* Here, the bitmap has been populated with all the Latin1 code points that
14447 * always match. Can now add to the overall list those that match only
14448 * when the target string is UTF-8 (<depends_list>). */
14449 if (depends_list) {
14451 _invlist_union(cp_list, depends_list, &cp_list);
14452 SvREFCNT_dec_NN(depends_list);
14455 cp_list = depends_list;
14459 /* If there is a swash and more than one element, we can't use the swash in
14460 * the optimization below. */
14461 if (swash && element_count > 1) {
14462 SvREFCNT_dec_NN(swash);
14466 set_ANYOF_arg(pRExC_state, ret, cp_list,
14467 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14469 swash, has_user_defined_property);
14471 *flagp |= HASWIDTH|SIMPLE;
14475 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14478 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14479 regnode* const node,
14481 SV* const runtime_defns,
14483 const bool has_user_defined_property)
14485 /* Sets the arg field of an ANYOF-type node 'node', using information about
14486 * the node passed-in. If there is nothing outside the node's bitmap, the
14487 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14488 * the count returned by add_data(), having allocated and stored an array,
14489 * av, that that count references, as follows:
14490 * av[0] stores the character class description in its textual form.
14491 * This is used later (regexec.c:Perl_regclass_swash()) to
14492 * initialize the appropriate swash, and is also useful for dumping
14493 * the regnode. This is set to &PL_sv_undef if the textual
14494 * description is not needed at run-time (as happens if the other
14495 * elements completely define the class)
14496 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14497 * computed from av[0]. But if no further computation need be done,
14498 * the swash is stored here now (and av[0] is &PL_sv_undef).
14499 * av[2] stores the cp_list inversion list for use in addition or instead
14500 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14501 * (Otherwise everything needed is already in av[0] and av[1])
14502 * av[3] is set if any component of the class is from a user-defined
14503 * property; used only if av[2] exists */
14507 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14509 if (! cp_list && ! runtime_defns) {
14510 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14513 AV * const av = newAV();
14516 av_store(av, 0, (runtime_defns)
14517 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14519 av_store(av, 1, swash);
14520 SvREFCNT_dec_NN(cp_list);
14523 av_store(av, 1, &PL_sv_undef);
14525 av_store(av, 2, cp_list);
14526 av_store(av, 3, newSVuv(has_user_defined_property));
14530 rv = newRV_noinc(MUTABLE_SV(av));
14531 n = add_data(pRExC_state, STR_WITH_LEN("s"));
14532 RExC_rxi->data->data[n] = (void*)rv;
14538 /* reg_skipcomment()
14540 Absorbs an /x style # comments from the input stream.
14541 Returns true if there is more text remaining in the stream.
14542 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14543 terminates the pattern without including a newline.
14545 Note its the callers responsibility to ensure that we are
14546 actually in /x mode
14551 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14555 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14557 while (RExC_parse < RExC_end)
14558 if (*RExC_parse++ == '\n') {
14563 /* we ran off the end of the pattern without ending
14564 the comment, so we have to add an \n when wrapping */
14565 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14573 Advances the parse position, and optionally absorbs
14574 "whitespace" from the inputstream.
14576 Without /x "whitespace" means (?#...) style comments only,
14577 with /x this means (?#...) and # comments and whitespace proper.
14579 Returns the RExC_parse point from BEFORE the scan occurs.
14581 This is the /x friendly way of saying RExC_parse++.
14585 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14587 char* const retval = RExC_parse++;
14589 PERL_ARGS_ASSERT_NEXTCHAR;
14592 if (RExC_end - RExC_parse >= 3
14593 && *RExC_parse == '('
14594 && RExC_parse[1] == '?'
14595 && RExC_parse[2] == '#')
14597 while (*RExC_parse != ')') {
14598 if (RExC_parse == RExC_end)
14599 FAIL("Sequence (?#... not terminated");
14605 if (RExC_flags & RXf_PMf_EXTENDED) {
14606 if (isSPACE(*RExC_parse)) {
14610 else if (*RExC_parse == '#') {
14611 if ( reg_skipcomment( pRExC_state ) )
14620 - reg_node - emit a node
14622 STATIC regnode * /* Location. */
14623 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14627 regnode * const ret = RExC_emit;
14628 GET_RE_DEBUG_FLAGS_DECL;
14630 PERL_ARGS_ASSERT_REG_NODE;
14633 SIZE_ALIGN(RExC_size);
14637 if (RExC_emit >= RExC_emit_bound)
14638 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14639 op, RExC_emit, RExC_emit_bound);
14641 NODE_ALIGN_FILL(ret);
14643 FILL_ADVANCE_NODE(ptr, op);
14644 #ifdef RE_TRACK_PATTERN_OFFSETS
14645 if (RExC_offsets) { /* MJD */
14646 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14647 "reg_node", __LINE__,
14649 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14650 ? "Overwriting end of array!\n" : "OK",
14651 (UV)(RExC_emit - RExC_emit_start),
14652 (UV)(RExC_parse - RExC_start),
14653 (UV)RExC_offsets[0]));
14654 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14662 - reganode - emit a node with an argument
14664 STATIC regnode * /* Location. */
14665 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14669 regnode * const ret = RExC_emit;
14670 GET_RE_DEBUG_FLAGS_DECL;
14672 PERL_ARGS_ASSERT_REGANODE;
14675 SIZE_ALIGN(RExC_size);
14680 assert(2==regarglen[op]+1);
14682 Anything larger than this has to allocate the extra amount.
14683 If we changed this to be:
14685 RExC_size += (1 + regarglen[op]);
14687 then it wouldn't matter. Its not clear what side effect
14688 might come from that so its not done so far.
14693 if (RExC_emit >= RExC_emit_bound)
14694 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14695 op, RExC_emit, RExC_emit_bound);
14697 NODE_ALIGN_FILL(ret);
14699 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14700 #ifdef RE_TRACK_PATTERN_OFFSETS
14701 if (RExC_offsets) { /* MJD */
14702 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14706 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14707 "Overwriting end of array!\n" : "OK",
14708 (UV)(RExC_emit - RExC_emit_start),
14709 (UV)(RExC_parse - RExC_start),
14710 (UV)RExC_offsets[0]));
14711 Set_Cur_Node_Offset;
14719 - reguni - emit (if appropriate) a Unicode character
14722 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14726 PERL_ARGS_ASSERT_REGUNI;
14728 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14732 - reginsert - insert an operator in front of already-emitted operand
14734 * Means relocating the operand.
14737 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14743 const int offset = regarglen[(U8)op];
14744 const int size = NODE_STEP_REGNODE + offset;
14745 GET_RE_DEBUG_FLAGS_DECL;
14747 PERL_ARGS_ASSERT_REGINSERT;
14748 PERL_UNUSED_ARG(depth);
14749 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14750 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14759 if (RExC_open_parens) {
14761 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14762 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14763 if ( RExC_open_parens[paren] >= opnd ) {
14764 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14765 RExC_open_parens[paren] += size;
14767 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14769 if ( RExC_close_parens[paren] >= opnd ) {
14770 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14771 RExC_close_parens[paren] += size;
14773 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14778 while (src > opnd) {
14779 StructCopy(--src, --dst, regnode);
14780 #ifdef RE_TRACK_PATTERN_OFFSETS
14781 if (RExC_offsets) { /* MJD 20010112 */
14782 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14786 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14787 ? "Overwriting end of array!\n" : "OK",
14788 (UV)(src - RExC_emit_start),
14789 (UV)(dst - RExC_emit_start),
14790 (UV)RExC_offsets[0]));
14791 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14792 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14798 place = opnd; /* Op node, where operand used to be. */
14799 #ifdef RE_TRACK_PATTERN_OFFSETS
14800 if (RExC_offsets) { /* MJD */
14801 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14805 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14806 ? "Overwriting end of array!\n" : "OK",
14807 (UV)(place - RExC_emit_start),
14808 (UV)(RExC_parse - RExC_start),
14809 (UV)RExC_offsets[0]));
14810 Set_Node_Offset(place, RExC_parse);
14811 Set_Node_Length(place, 1);
14814 src = NEXTOPER(place);
14815 FILL_ADVANCE_NODE(place, op);
14816 Zero(src, offset, regnode);
14820 - regtail - set the next-pointer at the end of a node chain of p to val.
14821 - SEE ALSO: regtail_study
14823 /* TODO: All three parms should be const */
14825 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14829 GET_RE_DEBUG_FLAGS_DECL;
14831 PERL_ARGS_ASSERT_REGTAIL;
14833 PERL_UNUSED_ARG(depth);
14839 /* Find last node. */
14842 regnode * const temp = regnext(scan);
14844 SV * const mysv=sv_newmortal();
14845 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14846 regprop(RExC_rx, mysv, scan);
14847 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14848 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14849 (temp == NULL ? "->" : ""),
14850 (temp == NULL ? PL_reg_name[OP(val)] : "")
14858 if (reg_off_by_arg[OP(scan)]) {
14859 ARG_SET(scan, val - scan);
14862 NEXT_OFF(scan) = val - scan;
14868 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14869 - Look for optimizable sequences at the same time.
14870 - currently only looks for EXACT chains.
14872 This is experimental code. The idea is to use this routine to perform
14873 in place optimizations on branches and groups as they are constructed,
14874 with the long term intention of removing optimization from study_chunk so
14875 that it is purely analytical.
14877 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14878 to control which is which.
14881 /* TODO: All four parms should be const */
14884 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14889 #ifdef EXPERIMENTAL_INPLACESCAN
14892 GET_RE_DEBUG_FLAGS_DECL;
14894 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14900 /* Find last node. */
14904 regnode * const temp = regnext(scan);
14905 #ifdef EXPERIMENTAL_INPLACESCAN
14906 if (PL_regkind[OP(scan)] == EXACT) {
14907 bool has_exactf_sharp_s; /* Unexamined in this routine */
14908 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14913 switch (OP(scan)) {
14916 case EXACTFA_NO_TRIE:
14921 if( exact == PSEUDO )
14923 else if ( exact != OP(scan) )
14932 SV * const mysv=sv_newmortal();
14933 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14934 regprop(RExC_rx, mysv, scan);
14935 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14936 SvPV_nolen_const(mysv),
14937 REG_NODE_NUM(scan),
14938 PL_reg_name[exact]);
14945 SV * const mysv_val=sv_newmortal();
14946 DEBUG_PARSE_MSG("");
14947 regprop(RExC_rx, mysv_val, val);
14948 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14949 SvPV_nolen_const(mysv_val),
14950 (IV)REG_NODE_NUM(val),
14954 if (reg_off_by_arg[OP(scan)]) {
14955 ARG_SET(scan, val - scan);
14958 NEXT_OFF(scan) = val - scan;
14966 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14971 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14976 for (bit=0; bit<32; bit++) {
14977 if (flags & (1<<bit)) {
14978 if (!set++ && lead)
14979 PerlIO_printf(Perl_debug_log, "%s",lead);
14980 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14985 PerlIO_printf(Perl_debug_log, "\n");
14987 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14992 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14998 for (bit=0; bit<32; bit++) {
14999 if (flags & (1<<bit)) {
15000 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15003 if (!set++ && lead)
15004 PerlIO_printf(Perl_debug_log, "%s",lead);
15005 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15008 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15009 if (!set++ && lead) {
15010 PerlIO_printf(Perl_debug_log, "%s",lead);
15013 case REGEX_UNICODE_CHARSET:
15014 PerlIO_printf(Perl_debug_log, "UNICODE");
15016 case REGEX_LOCALE_CHARSET:
15017 PerlIO_printf(Perl_debug_log, "LOCALE");
15019 case REGEX_ASCII_RESTRICTED_CHARSET:
15020 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15022 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15023 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15026 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15032 PerlIO_printf(Perl_debug_log, "\n");
15034 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15040 Perl_regdump(pTHX_ const regexp *r)
15044 SV * const sv = sv_newmortal();
15045 SV *dsv= sv_newmortal();
15046 RXi_GET_DECL(r,ri);
15047 GET_RE_DEBUG_FLAGS_DECL;
15049 PERL_ARGS_ASSERT_REGDUMP;
15051 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15053 /* Header fields of interest. */
15054 if (r->anchored_substr) {
15055 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15056 RE_SV_DUMPLEN(r->anchored_substr), 30);
15057 PerlIO_printf(Perl_debug_log,
15058 "anchored %s%s at %"IVdf" ",
15059 s, RE_SV_TAIL(r->anchored_substr),
15060 (IV)r->anchored_offset);
15061 } else if (r->anchored_utf8) {
15062 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15063 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15064 PerlIO_printf(Perl_debug_log,
15065 "anchored utf8 %s%s at %"IVdf" ",
15066 s, RE_SV_TAIL(r->anchored_utf8),
15067 (IV)r->anchored_offset);
15069 if (r->float_substr) {
15070 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15071 RE_SV_DUMPLEN(r->float_substr), 30);
15072 PerlIO_printf(Perl_debug_log,
15073 "floating %s%s at %"IVdf"..%"UVuf" ",
15074 s, RE_SV_TAIL(r->float_substr),
15075 (IV)r->float_min_offset, (UV)r->float_max_offset);
15076 } else if (r->float_utf8) {
15077 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15078 RE_SV_DUMPLEN(r->float_utf8), 30);
15079 PerlIO_printf(Perl_debug_log,
15080 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15081 s, RE_SV_TAIL(r->float_utf8),
15082 (IV)r->float_min_offset, (UV)r->float_max_offset);
15084 if (r->check_substr || r->check_utf8)
15085 PerlIO_printf(Perl_debug_log,
15087 (r->check_substr == r->float_substr
15088 && r->check_utf8 == r->float_utf8
15089 ? "(checking floating" : "(checking anchored"));
15090 if (r->extflags & RXf_NOSCAN)
15091 PerlIO_printf(Perl_debug_log, " noscan");
15092 if (r->extflags & RXf_CHECK_ALL)
15093 PerlIO_printf(Perl_debug_log, " isall");
15094 if (r->check_substr || r->check_utf8)
15095 PerlIO_printf(Perl_debug_log, ") ");
15097 if (ri->regstclass) {
15098 regprop(r, sv, ri->regstclass);
15099 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15101 if (r->extflags & RXf_ANCH) {
15102 PerlIO_printf(Perl_debug_log, "anchored");
15103 if (r->extflags & RXf_ANCH_BOL)
15104 PerlIO_printf(Perl_debug_log, "(BOL)");
15105 if (r->extflags & RXf_ANCH_MBOL)
15106 PerlIO_printf(Perl_debug_log, "(MBOL)");
15107 if (r->extflags & RXf_ANCH_SBOL)
15108 PerlIO_printf(Perl_debug_log, "(SBOL)");
15109 if (r->extflags & RXf_ANCH_GPOS)
15110 PerlIO_printf(Perl_debug_log, "(GPOS)");
15111 PerlIO_putc(Perl_debug_log, ' ');
15113 if (r->extflags & RXf_GPOS_SEEN)
15114 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15115 if (r->intflags & PREGf_SKIP)
15116 PerlIO_printf(Perl_debug_log, "plus ");
15117 if (r->intflags & PREGf_IMPLICIT)
15118 PerlIO_printf(Perl_debug_log, "implicit ");
15119 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15120 if (r->extflags & RXf_EVAL_SEEN)
15121 PerlIO_printf(Perl_debug_log, "with eval ");
15122 PerlIO_printf(Perl_debug_log, "\n");
15124 regdump_extflags("r->extflags: ",r->extflags);
15125 regdump_intflags("r->intflags: ",r->intflags);
15128 PERL_ARGS_ASSERT_REGDUMP;
15129 PERL_UNUSED_CONTEXT;
15130 PERL_UNUSED_ARG(r);
15131 #endif /* DEBUGGING */
15135 - regprop - printable representation of opcode
15139 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15145 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15146 static const char * const anyofs[] = {
15147 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15148 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15149 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15150 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15151 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15152 || _CC_VERTSPACE != 16
15153 #error Need to adjust order of anyofs[]
15190 RXi_GET_DECL(prog,progi);
15191 GET_RE_DEBUG_FLAGS_DECL;
15193 PERL_ARGS_ASSERT_REGPROP;
15197 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15198 /* It would be nice to FAIL() here, but this may be called from
15199 regexec.c, and it would be hard to supply pRExC_state. */
15200 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15201 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15203 k = PL_regkind[OP(o)];
15206 sv_catpvs(sv, " ");
15207 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15208 * is a crude hack but it may be the best for now since
15209 * we have no flag "this EXACTish node was UTF-8"
15211 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15212 PERL_PV_ESCAPE_UNI_DETECT |
15213 PERL_PV_ESCAPE_NONASCII |
15214 PERL_PV_PRETTY_ELLIPSES |
15215 PERL_PV_PRETTY_LTGT |
15216 PERL_PV_PRETTY_NOCLEAR
15218 } else if (k == TRIE) {
15219 /* print the details of the trie in dumpuntil instead, as
15220 * progi->data isn't available here */
15221 const char op = OP(o);
15222 const U32 n = ARG(o);
15223 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15224 (reg_ac_data *)progi->data->data[n] :
15226 const reg_trie_data * const trie
15227 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15229 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15230 DEBUG_TRIE_COMPILE_r(
15231 Perl_sv_catpvf(aTHX_ sv,
15232 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15233 (UV)trie->startstate,
15234 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15235 (UV)trie->wordcount,
15238 (UV)TRIE_CHARCOUNT(trie),
15239 (UV)trie->uniquecharcount
15242 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15243 sv_catpvs(sv, "[");
15244 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15246 : TRIE_BITMAP(trie));
15247 sv_catpvs(sv, "]");
15250 } else if (k == CURLY) {
15251 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15252 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15253 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15255 else if (k == WHILEM && o->flags) /* Ordinal/of */
15256 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15257 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15258 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15259 if ( RXp_PAREN_NAMES(prog) ) {
15260 if ( k != REF || (OP(o) < NREF)) {
15261 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15262 SV **name= av_fetch(list, ARG(o), 0 );
15264 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15267 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15268 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15269 I32 *nums=(I32*)SvPVX(sv_dat);
15270 SV **name= av_fetch(list, nums[0], 0 );
15273 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15274 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15275 (n ? "," : ""), (IV)nums[n]);
15277 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15281 } else if (k == GOSUB)
15282 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15283 else if (k == VERB) {
15285 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15286 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15287 } else if (k == LOGICAL)
15288 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
15289 else if (k == ANYOF) {
15290 const U8 flags = ANYOF_FLAGS(o);
15294 if (flags & ANYOF_LOCALE)
15295 sv_catpvs(sv, "{loc}");
15296 if (flags & ANYOF_LOC_FOLD)
15297 sv_catpvs(sv, "{i}");
15298 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15299 if (flags & ANYOF_INVERT)
15300 sv_catpvs(sv, "^");
15302 /* output what the standard cp 0-255 bitmap matches */
15303 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15305 /* output any special charclass tests (used entirely under use
15307 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15309 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15310 if (ANYOF_POSIXL_TEST(o,i)) {
15311 sv_catpv(sv, anyofs[i]);
15317 if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15318 || ANYOF_NONBITMAP(o))
15321 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15322 if (flags & ANYOF_INVERT)
15323 /*make sure the invert info is in each */
15324 sv_catpvs(sv, "^");
15327 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15328 sv_catpvs(sv, "{non-utf8-latin1-all}");
15331 /* output information about the unicode matching */
15332 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15333 sv_catpvs(sv, "{unicode_all}");
15334 else if (ANYOF_NONBITMAP(o)) {
15335 SV *lv; /* Set if there is something outside the bit map. */
15336 bool byte_output = FALSE; /* If something in the bitmap has been
15339 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15340 sv_catpvs(sv, "{outside bitmap}");
15343 sv_catpvs(sv, "{utf8}");
15346 /* Get the stuff that wasn't in the bitmap */
15347 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15348 if (lv && lv != &PL_sv_undef) {
15349 char *s = savesvpv(lv);
15350 char * const origs = s;
15352 while (*s && *s != '\n')
15356 const char * const t = ++s;
15359 sv_catpvs(sv, " ");
15365 /* Truncate very long output */
15366 if (s - origs > 256) {
15367 Perl_sv_catpvf(aTHX_ sv,
15369 (int) (s - origs - 1),
15375 else if (*s == '\t') {
15389 SvREFCNT_dec_NN(lv);
15394 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15396 else if (k == POSIXD || k == NPOSIXD) {
15397 U8 index = FLAGS(o) * 2;
15398 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15399 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15402 if (*anyofs[index] != '[') {
15405 sv_catpv(sv, anyofs[index]);
15406 if (*anyofs[index] != '[') {
15411 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15412 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15414 PERL_UNUSED_CONTEXT;
15415 PERL_UNUSED_ARG(sv);
15416 PERL_UNUSED_ARG(o);
15417 PERL_UNUSED_ARG(prog);
15418 #endif /* DEBUGGING */
15422 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15423 { /* Assume that RE_INTUIT is set */
15425 struct regexp *const prog = ReANY(r);
15426 GET_RE_DEBUG_FLAGS_DECL;
15428 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15429 PERL_UNUSED_CONTEXT;
15433 const char * const s = SvPV_nolen_const(prog->check_substr
15434 ? prog->check_substr : prog->check_utf8);
15436 if (!PL_colorset) reginitcolors();
15437 PerlIO_printf(Perl_debug_log,
15438 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15440 prog->check_substr ? "" : "utf8 ",
15441 PL_colors[5],PL_colors[0],
15444 (strlen(s) > 60 ? "..." : ""));
15447 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15453 handles refcounting and freeing the perl core regexp structure. When
15454 it is necessary to actually free the structure the first thing it
15455 does is call the 'free' method of the regexp_engine associated to
15456 the regexp, allowing the handling of the void *pprivate; member
15457 first. (This routine is not overridable by extensions, which is why
15458 the extensions free is called first.)
15460 See regdupe and regdupe_internal if you change anything here.
15462 #ifndef PERL_IN_XSUB_RE
15464 Perl_pregfree(pTHX_ REGEXP *r)
15470 Perl_pregfree2(pTHX_ REGEXP *rx)
15473 struct regexp *const r = ReANY(rx);
15474 GET_RE_DEBUG_FLAGS_DECL;
15476 PERL_ARGS_ASSERT_PREGFREE2;
15478 if (r->mother_re) {
15479 ReREFCNT_dec(r->mother_re);
15481 CALLREGFREE_PVT(rx); /* free the private data */
15482 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15483 Safefree(r->xpv_len_u.xpvlenu_pv);
15486 SvREFCNT_dec(r->anchored_substr);
15487 SvREFCNT_dec(r->anchored_utf8);
15488 SvREFCNT_dec(r->float_substr);
15489 SvREFCNT_dec(r->float_utf8);
15490 Safefree(r->substrs);
15492 RX_MATCH_COPY_FREE(rx);
15493 #ifdef PERL_ANY_COW
15494 SvREFCNT_dec(r->saved_copy);
15497 SvREFCNT_dec(r->qr_anoncv);
15498 rx->sv_u.svu_rx = 0;
15503 This is a hacky workaround to the structural issue of match results
15504 being stored in the regexp structure which is in turn stored in
15505 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15506 could be PL_curpm in multiple contexts, and could require multiple
15507 result sets being associated with the pattern simultaneously, such
15508 as when doing a recursive match with (??{$qr})
15510 The solution is to make a lightweight copy of the regexp structure
15511 when a qr// is returned from the code executed by (??{$qr}) this
15512 lightweight copy doesn't actually own any of its data except for
15513 the starp/end and the actual regexp structure itself.
15519 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15521 struct regexp *ret;
15522 struct regexp *const r = ReANY(rx);
15523 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15525 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15528 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15530 SvOK_off((SV *)ret_x);
15532 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15533 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15534 made both spots point to the same regexp body.) */
15535 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15536 assert(!SvPVX(ret_x));
15537 ret_x->sv_u.svu_rx = temp->sv_any;
15538 temp->sv_any = NULL;
15539 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15540 SvREFCNT_dec_NN(temp);
15541 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15542 ing below will not set it. */
15543 SvCUR_set(ret_x, SvCUR(rx));
15546 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15547 sv_force_normal(sv) is called. */
15549 ret = ReANY(ret_x);
15551 SvFLAGS(ret_x) |= SvUTF8(rx);
15552 /* We share the same string buffer as the original regexp, on which we
15553 hold a reference count, incremented when mother_re is set below.
15554 The string pointer is copied here, being part of the regexp struct.
15556 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15557 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15559 const I32 npar = r->nparens+1;
15560 Newx(ret->offs, npar, regexp_paren_pair);
15561 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15564 Newx(ret->substrs, 1, struct reg_substr_data);
15565 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15567 SvREFCNT_inc_void(ret->anchored_substr);
15568 SvREFCNT_inc_void(ret->anchored_utf8);
15569 SvREFCNT_inc_void(ret->float_substr);
15570 SvREFCNT_inc_void(ret->float_utf8);
15572 /* check_substr and check_utf8, if non-NULL, point to either their
15573 anchored or float namesakes, and don't hold a second reference. */
15575 RX_MATCH_COPIED_off(ret_x);
15576 #ifdef PERL_ANY_COW
15577 ret->saved_copy = NULL;
15579 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15580 SvREFCNT_inc_void(ret->qr_anoncv);
15586 /* regfree_internal()
15588 Free the private data in a regexp. This is overloadable by
15589 extensions. Perl takes care of the regexp structure in pregfree(),
15590 this covers the *pprivate pointer which technically perl doesn't
15591 know about, however of course we have to handle the
15592 regexp_internal structure when no extension is in use.
15594 Note this is called before freeing anything in the regexp
15599 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15602 struct regexp *const r = ReANY(rx);
15603 RXi_GET_DECL(r,ri);
15604 GET_RE_DEBUG_FLAGS_DECL;
15606 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15612 SV *dsv= sv_newmortal();
15613 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15614 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15615 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15616 PL_colors[4],PL_colors[5],s);
15619 #ifdef RE_TRACK_PATTERN_OFFSETS
15621 Safefree(ri->u.offsets); /* 20010421 MJD */
15623 if (ri->code_blocks) {
15625 for (n = 0; n < ri->num_code_blocks; n++)
15626 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15627 Safefree(ri->code_blocks);
15631 int n = ri->data->count;
15634 /* If you add a ->what type here, update the comment in regcomp.h */
15635 switch (ri->data->what[n]) {
15641 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15644 Safefree(ri->data->data[n]);
15650 { /* Aho Corasick add-on structure for a trie node.
15651 Used in stclass optimization only */
15653 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15655 refcount = --aho->refcount;
15658 PerlMemShared_free(aho->states);
15659 PerlMemShared_free(aho->fail);
15660 /* do this last!!!! */
15661 PerlMemShared_free(ri->data->data[n]);
15662 PerlMemShared_free(ri->regstclass);
15668 /* trie structure. */
15670 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15672 refcount = --trie->refcount;
15675 PerlMemShared_free(trie->charmap);
15676 PerlMemShared_free(trie->states);
15677 PerlMemShared_free(trie->trans);
15679 PerlMemShared_free(trie->bitmap);
15681 PerlMemShared_free(trie->jump);
15682 PerlMemShared_free(trie->wordinfo);
15683 /* do this last!!!! */
15684 PerlMemShared_free(ri->data->data[n]);
15689 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15692 Safefree(ri->data->what);
15693 Safefree(ri->data);
15699 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15700 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15701 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15704 re_dup - duplicate a regexp.
15706 This routine is expected to clone a given regexp structure. It is only
15707 compiled under USE_ITHREADS.
15709 After all of the core data stored in struct regexp is duplicated
15710 the regexp_engine.dupe method is used to copy any private data
15711 stored in the *pprivate pointer. This allows extensions to handle
15712 any duplication it needs to do.
15714 See pregfree() and regfree_internal() if you change anything here.
15716 #if defined(USE_ITHREADS)
15717 #ifndef PERL_IN_XSUB_RE
15719 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15723 const struct regexp *r = ReANY(sstr);
15724 struct regexp *ret = ReANY(dstr);
15726 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15728 npar = r->nparens+1;
15729 Newx(ret->offs, npar, regexp_paren_pair);
15730 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15732 if (ret->substrs) {
15733 /* Do it this way to avoid reading from *r after the StructCopy().
15734 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15735 cache, it doesn't matter. */
15736 const bool anchored = r->check_substr
15737 ? r->check_substr == r->anchored_substr
15738 : r->check_utf8 == r->anchored_utf8;
15739 Newx(ret->substrs, 1, struct reg_substr_data);
15740 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15742 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15743 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15744 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15745 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15747 /* check_substr and check_utf8, if non-NULL, point to either their
15748 anchored or float namesakes, and don't hold a second reference. */
15750 if (ret->check_substr) {
15752 assert(r->check_utf8 == r->anchored_utf8);
15753 ret->check_substr = ret->anchored_substr;
15754 ret->check_utf8 = ret->anchored_utf8;
15756 assert(r->check_substr == r->float_substr);
15757 assert(r->check_utf8 == r->float_utf8);
15758 ret->check_substr = ret->float_substr;
15759 ret->check_utf8 = ret->float_utf8;
15761 } else if (ret->check_utf8) {
15763 ret->check_utf8 = ret->anchored_utf8;
15765 ret->check_utf8 = ret->float_utf8;
15770 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15771 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15774 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15776 if (RX_MATCH_COPIED(dstr))
15777 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15779 ret->subbeg = NULL;
15780 #ifdef PERL_ANY_COW
15781 ret->saved_copy = NULL;
15784 /* Whether mother_re be set or no, we need to copy the string. We
15785 cannot refrain from copying it when the storage points directly to
15786 our mother regexp, because that's
15787 1: a buffer in a different thread
15788 2: something we no longer hold a reference on
15789 so we need to copy it locally. */
15790 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15791 ret->mother_re = NULL;
15793 #endif /* PERL_IN_XSUB_RE */
15798 This is the internal complement to regdupe() which is used to copy
15799 the structure pointed to by the *pprivate pointer in the regexp.
15800 This is the core version of the extension overridable cloning hook.
15801 The regexp structure being duplicated will be copied by perl prior
15802 to this and will be provided as the regexp *r argument, however
15803 with the /old/ structures pprivate pointer value. Thus this routine
15804 may override any copying normally done by perl.
15806 It returns a pointer to the new regexp_internal structure.
15810 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15813 struct regexp *const r = ReANY(rx);
15814 regexp_internal *reti;
15816 RXi_GET_DECL(r,ri);
15818 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15822 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15823 Copy(ri->program, reti->program, len+1, regnode);
15825 reti->num_code_blocks = ri->num_code_blocks;
15826 if (ri->code_blocks) {
15828 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15829 struct reg_code_block);
15830 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15831 struct reg_code_block);
15832 for (n = 0; n < ri->num_code_blocks; n++)
15833 reti->code_blocks[n].src_regex = (REGEXP*)
15834 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15837 reti->code_blocks = NULL;
15839 reti->regstclass = NULL;
15842 struct reg_data *d;
15843 const int count = ri->data->count;
15846 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15847 char, struct reg_data);
15848 Newx(d->what, count, U8);
15851 for (i = 0; i < count; i++) {
15852 d->what[i] = ri->data->what[i];
15853 switch (d->what[i]) {
15854 /* see also regcomp.h and regfree_internal() */
15855 case 'a': /* actually an AV, but the dup function is identical. */
15859 case 'u': /* actually an HV, but the dup function is identical. */
15860 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15863 /* This is cheating. */
15864 Newx(d->data[i], 1, regnode_ssc);
15865 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
15866 reti->regstclass = (regnode*)d->data[i];
15869 /* Trie stclasses are readonly and can thus be shared
15870 * without duplication. We free the stclass in pregfree
15871 * when the corresponding reg_ac_data struct is freed.
15873 reti->regstclass= ri->regstclass;
15877 ((reg_trie_data*)ri->data->data[i])->refcount++;
15882 d->data[i] = ri->data->data[i];
15885 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15894 reti->name_list_idx = ri->name_list_idx;
15896 #ifdef RE_TRACK_PATTERN_OFFSETS
15897 if (ri->u.offsets) {
15898 Newx(reti->u.offsets, 2*len+1, U32);
15899 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15902 SetProgLen(reti,len);
15905 return (void*)reti;
15908 #endif /* USE_ITHREADS */
15910 #ifndef PERL_IN_XSUB_RE
15913 - regnext - dig the "next" pointer out of a node
15916 Perl_regnext(pTHX_ regnode *p)
15924 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15925 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15928 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15937 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
15940 STRLEN l1 = strlen(pat1);
15941 STRLEN l2 = strlen(pat2);
15944 const char *message;
15946 PERL_ARGS_ASSERT_RE_CROAK2;
15952 Copy(pat1, buf, l1 , char);
15953 Copy(pat2, buf + l1, l2 , char);
15954 buf[l1 + l2] = '\n';
15955 buf[l1 + l2 + 1] = '\0';
15956 va_start(args, pat2);
15957 msv = vmess(buf, &args);
15959 message = SvPV_const(msv,l1);
15962 Copy(message, buf, l1 , char);
15963 /* l1-1 to avoid \n */
15964 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
15967 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15969 #ifndef PERL_IN_XSUB_RE
15971 Perl_save_re_context(pTHX)
15975 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15977 const REGEXP * const rx = PM_GETRE(PL_curpm);
15980 for (i = 1; i <= RX_NPARENS(rx); i++) {
15981 char digits[TYPE_CHARS(long)];
15982 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15983 GV *const *const gvp
15984 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15987 GV * const gv = *gvp;
15988 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16000 S_put_byte(pTHX_ SV *sv, int c)
16002 PERL_ARGS_ASSERT_PUT_BYTE;
16004 /* Our definition of isPRINT() ignores locales, so only bytes that are
16005 not part of UTF-8 are considered printable. I assume that the same
16006 holds for UTF-EBCDIC.
16007 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
16008 which Wikipedia says:
16010 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
16011 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
16012 identical, to the ASCII delete (DEL) or rubout control character. ...
16013 it is typically mapped to hexadecimal code 9F, in order to provide a
16014 unique character mapping in both directions)
16016 So the old condition can be simplified to !isPRINT(c) */
16019 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16020 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16021 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16022 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16023 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16026 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16031 const char string = c;
16032 if (c == '-' || c == ']' || c == '\\' || c == '^')
16033 sv_catpvs(sv, "\\");
16034 sv_catpvn(sv, &string, 1);
16039 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16041 /* Appends to 'sv' a displayable version of the innards of the bracketed
16042 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16043 * output anything */
16046 int rangestart = -1;
16047 bool has_output_anything = FALSE;
16049 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16051 for (i = 0; i <= 256; i++) {
16052 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16053 if (rangestart == -1)
16055 } else if (rangestart != -1) {
16057 if (i <= rangestart + 3) { /* Individual chars in short ranges */
16058 for (; rangestart < i; rangestart++)
16059 put_byte(sv, rangestart);
16062 || ! isALPHANUMERIC(rangestart)
16063 || ! isALPHANUMERIC(j)
16064 || isDIGIT(rangestart) != isDIGIT(j)
16065 || isUPPER(rangestart) != isUPPER(j)
16066 || isLOWER(rangestart) != isLOWER(j)
16068 /* This final test should get optimized out except
16069 * on EBCDIC platforms, where it causes ranges that
16070 * cross discontinuities like i/j to be shown as hex
16071 * instead of the misleading, e.g. H-K (since that
16072 * range includes more than H, I, J, K). */
16073 || (j - rangestart)
16074 != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16076 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16078 (j < 256) ? j : 255);
16080 else { /* Here, the ends of the range are both digits, or both
16081 uppercase, or both lowercase; and there's no
16082 discontinuity in the range (which could happen on EBCDIC
16084 put_byte(sv, rangestart);
16085 sv_catpvs(sv, "-");
16089 has_output_anything = TRUE;
16093 return has_output_anything;
16096 #define CLEAR_OPTSTART \
16097 if (optstart) STMT_START { \
16098 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16102 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16104 STATIC const regnode *
16105 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16106 const regnode *last, const regnode *plast,
16107 SV* sv, I32 indent, U32 depth)
16110 U8 op = PSEUDO; /* Arbitrary non-END op. */
16111 const regnode *next;
16112 const regnode *optstart= NULL;
16114 RXi_GET_DECL(r,ri);
16115 GET_RE_DEBUG_FLAGS_DECL;
16117 PERL_ARGS_ASSERT_DUMPUNTIL;
16119 #ifdef DEBUG_DUMPUNTIL
16120 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16121 last ? last-start : 0,plast ? plast-start : 0);
16124 if (plast && plast < last)
16127 while (PL_regkind[op] != END && (!last || node < last)) {
16128 /* While that wasn't END last time... */
16131 if (op == CLOSE || op == WHILEM)
16133 next = regnext((regnode *)node);
16136 if (OP(node) == OPTIMIZED) {
16137 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16144 regprop(r, sv, node);
16145 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16146 (int)(2*indent + 1), "", SvPVX_const(sv));
16148 if (OP(node) != OPTIMIZED) {
16149 if (next == NULL) /* Next ptr. */
16150 PerlIO_printf(Perl_debug_log, " (0)");
16151 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16152 PerlIO_printf(Perl_debug_log, " (FAIL)");
16154 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16155 (void)PerlIO_putc(Perl_debug_log, '\n');
16159 if (PL_regkind[(U8)op] == BRANCHJ) {
16162 const regnode *nnode = (OP(next) == LONGJMP
16163 ? regnext((regnode *)next)
16165 if (last && nnode > last)
16167 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16170 else if (PL_regkind[(U8)op] == BRANCH) {
16172 DUMPUNTIL(NEXTOPER(node), next);
16174 else if ( PL_regkind[(U8)op] == TRIE ) {
16175 const regnode *this_trie = node;
16176 const char op = OP(node);
16177 const U32 n = ARG(node);
16178 const reg_ac_data * const ac = op>=AHOCORASICK ?
16179 (reg_ac_data *)ri->data->data[n] :
16181 const reg_trie_data * const trie =
16182 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16184 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16186 const regnode *nextbranch= NULL;
16189 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16190 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16192 PerlIO_printf(Perl_debug_log, "%*s%s ",
16193 (int)(2*(indent+3)), "",
16194 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16195 PL_colors[0], PL_colors[1],
16196 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16197 PERL_PV_PRETTY_ELLIPSES |
16198 PERL_PV_PRETTY_LTGT
16203 U16 dist= trie->jump[word_idx+1];
16204 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16205 (UV)((dist ? this_trie + dist : next) - start));
16208 nextbranch= this_trie + trie->jump[0];
16209 DUMPUNTIL(this_trie + dist, nextbranch);
16211 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16212 nextbranch= regnext((regnode *)nextbranch);
16214 PerlIO_printf(Perl_debug_log, "\n");
16217 if (last && next > last)
16222 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16223 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16224 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16226 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16228 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16230 else if ( op == PLUS || op == STAR) {
16231 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16233 else if (PL_regkind[(U8)op] == ANYOF) {
16234 /* arglen 1 + class block */
16235 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16236 ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16237 node = NEXTOPER(node);
16239 else if (PL_regkind[(U8)op] == EXACT) {
16240 /* Literal string, where present. */
16241 node += NODE_SZ_STR(node) - 1;
16242 node = NEXTOPER(node);
16245 node = NEXTOPER(node);
16246 node += regarglen[(U8)op];
16248 if (op == CURLYX || op == OPEN)
16252 #ifdef DEBUG_DUMPUNTIL
16253 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16258 #endif /* DEBUGGING */
16262 * c-indentation-style: bsd
16263 * c-basic-offset: 4
16264 * indent-tabs-mode: nil
16267 * ex: set ts=8 sts=4 sw=4 et: