5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 struct RExC_state_t {
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
106 char *precomp; /* uncompiled string. */
107 REGEXP *rx_sv; /* The SV that is the regexp. */
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
110 char *start; /* Start of input for compile */
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
113 SSize_t whilem_seen; /* number of WHILEM in this expr */
114 regnode *emit_start; /* Start of emitted-code area */
115 regnode *emit_bound; /* First regnode outside of the allocated space */
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
125 SSize_t size; /* Code size. */
126 I32 npar; /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
127 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
140 HV *paren_names; /* Paren names */
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
144 U8 *study_chunk_recursed; /* bitmap of which parens we have moved through */
145 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
149 I32 override_recoding;
150 I32 in_multi_char_class;
151 struct reg_code_block *code_blocks; /* positions of literal (?{})
153 int num_code_blocks; /* size of code_blocks[] */
154 int code_index; /* next code_blocks[] slot */
156 char *starttry; /* -Dr: where regtry was called. */
157 #define RExC_starttry (pRExC_state->starttry)
159 SV *runtime_code_qr; /* qr with the runtime code blocks */
161 const char *lastparse;
163 AV *paren_name_list; /* idx -> name */
164 #define RExC_lastparse (pRExC_state->lastparse)
165 #define RExC_lastnum (pRExC_state->lastnum)
166 #define RExC_paren_name_list (pRExC_state->paren_name_list)
170 #define RExC_flags (pRExC_state->flags)
171 #define RExC_pm_flags (pRExC_state->pm_flags)
172 #define RExC_precomp (pRExC_state->precomp)
173 #define RExC_rx_sv (pRExC_state->rx_sv)
174 #define RExC_rx (pRExC_state->rx)
175 #define RExC_rxi (pRExC_state->rxi)
176 #define RExC_start (pRExC_state->start)
177 #define RExC_end (pRExC_state->end)
178 #define RExC_parse (pRExC_state->parse)
179 #define RExC_whilem_seen (pRExC_state->whilem_seen)
180 #ifdef RE_TRACK_PATTERN_OFFSETS
181 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
183 #define RExC_emit (pRExC_state->emit)
184 #define RExC_emit_dummy (pRExC_state->emit_dummy)
185 #define RExC_emit_start (pRExC_state->emit_start)
186 #define RExC_emit_bound (pRExC_state->emit_bound)
187 #define RExC_naughty (pRExC_state->naughty)
188 #define RExC_sawback (pRExC_state->sawback)
189 #define RExC_seen (pRExC_state->seen)
190 #define RExC_size (pRExC_state->size)
191 #define RExC_npar (pRExC_state->npar)
192 #define RExC_nestroot (pRExC_state->nestroot)
193 #define RExC_extralen (pRExC_state->extralen)
194 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
195 #define RExC_utf8 (pRExC_state->utf8)
196 #define RExC_uni_semantics (pRExC_state->uni_semantics)
197 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
198 #define RExC_open_parens (pRExC_state->open_parens)
199 #define RExC_close_parens (pRExC_state->close_parens)
200 #define RExC_opend (pRExC_state->opend)
201 #define RExC_paren_names (pRExC_state->paren_names)
202 #define RExC_recurse (pRExC_state->recurse)
203 #define RExC_recurse_count (pRExC_state->recurse_count)
204 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
205 #define RExC_study_chunk_recursed_bytes (pRExC_state->study_chunk_recursed_bytes)
206 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
207 #define RExC_contains_locale (pRExC_state->contains_locale)
208 #define RExC_contains_i (pRExC_state->contains_i)
209 #define RExC_override_recoding (pRExC_state->override_recoding)
210 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
213 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
214 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
215 ((*s) == '{' && regcurly(s, FALSE)))
218 * Flags to be passed up and down.
220 #define WORST 0 /* Worst case. */
221 #define HASWIDTH 0x01 /* Known to match non-null strings. */
223 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
224 * character. (There needs to be a case: in the switch statement in regexec.c
225 * for any node marked SIMPLE.) Note that this is not the same thing as
228 #define SPSTART 0x04 /* Starts with * or + */
229 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
230 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
231 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
233 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
235 /* whether trie related optimizations are enabled */
236 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
237 #define TRIE_STUDY_OPT
238 #define FULL_TRIE_STUDY
244 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
245 #define PBITVAL(paren) (1 << ((paren) & 7))
246 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
247 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
248 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
250 #define REQUIRE_UTF8 STMT_START { \
252 *flagp = RESTART_UTF8; \
257 /* This converts the named class defined in regcomp.h to its equivalent class
258 * number defined in handy.h. */
259 #define namedclass_to_classnum(class) ((int) ((class) / 2))
260 #define classnum_to_namedclass(classnum) ((classnum) * 2)
262 #define _invlist_union_complement_2nd(a, b, output) \
263 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
264 #define _invlist_intersection_complement_2nd(a, b, output) \
265 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
267 /* About scan_data_t.
269 During optimisation we recurse through the regexp program performing
270 various inplace (keyhole style) optimisations. In addition study_chunk
271 and scan_commit populate this data structure with information about
272 what strings MUST appear in the pattern. We look for the longest
273 string that must appear at a fixed location, and we look for the
274 longest string that may appear at a floating location. So for instance
279 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
280 strings (because they follow a .* construct). study_chunk will identify
281 both FOO and BAR as being the longest fixed and floating strings respectively.
283 The strings can be composites, for instance
287 will result in a composite fixed substring 'foo'.
289 For each string some basic information is maintained:
291 - offset or min_offset
292 This is the position the string must appear at, or not before.
293 It also implicitly (when combined with minlenp) tells us how many
294 characters must match before the string we are searching for.
295 Likewise when combined with minlenp and the length of the string it
296 tells us how many characters must appear after the string we have
300 Only used for floating strings. This is the rightmost point that
301 the string can appear at. If set to SSize_t_MAX it indicates that the
302 string can occur infinitely far to the right.
305 A pointer to the minimum number of characters of the pattern that the
306 string was found inside. This is important as in the case of positive
307 lookahead or positive lookbehind we can have multiple patterns
312 The minimum length of the pattern overall is 3, the minimum length
313 of the lookahead part is 3, but the minimum length of the part that
314 will actually match is 1. So 'FOO's minimum length is 3, but the
315 minimum length for the F is 1. This is important as the minimum length
316 is used to determine offsets in front of and behind the string being
317 looked for. Since strings can be composites this is the length of the
318 pattern at the time it was committed with a scan_commit. Note that
319 the length is calculated by study_chunk, so that the minimum lengths
320 are not known until the full pattern has been compiled, thus the
321 pointer to the value.
325 In the case of lookbehind the string being searched for can be
326 offset past the start point of the final matching string.
327 If this value was just blithely removed from the min_offset it would
328 invalidate some of the calculations for how many chars must match
329 before or after (as they are derived from min_offset and minlen and
330 the length of the string being searched for).
331 When the final pattern is compiled and the data is moved from the
332 scan_data_t structure into the regexp structure the information
333 about lookbehind is factored in, with the information that would
334 have been lost precalculated in the end_shift field for the
337 The fields pos_min and pos_delta are used to store the minimum offset
338 and the delta to the maximum offset at the current point in the pattern.
342 typedef struct scan_data_t {
343 /*I32 len_min; unused */
344 /*I32 len_delta; unused */
348 SSize_t last_end; /* min value, <0 unless valid. */
349 SSize_t last_start_min;
350 SSize_t last_start_max;
351 SV **longest; /* Either &l_fixed, or &l_float. */
352 SV *longest_fixed; /* longest fixed string found in pattern */
353 SSize_t offset_fixed; /* offset where it starts */
354 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
355 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
356 SV *longest_float; /* longest floating string found in pattern */
357 SSize_t offset_float_min; /* earliest point in string it can appear */
358 SSize_t offset_float_max; /* latest point in string it can appear */
359 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
360 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
363 SSize_t *last_closep;
364 regnode_ssc *start_class;
367 /* The below is perhaps overboard, but this allows us to save a test at the
368 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
369 * and 'a' differ by a single bit; the same with the upper and lower case of
370 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
371 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
372 * then inverts it to form a mask, with just a single 0, in the bit position
373 * where the upper- and lowercase differ. XXX There are about 40 other
374 * instances in the Perl core where this micro-optimization could be used.
375 * Should decide if maintenance cost is worse, before changing those
377 * Returns a boolean as to whether or not 'v' is either a lowercase or
378 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
379 * compile-time constant, the generated code is better than some optimizing
380 * compilers figure out, amounting to a mask and test. The results are
381 * meaningless if 'c' is not one of [A-Za-z] */
382 #define isARG2_lower_or_UPPER_ARG1(c, v) \
383 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
386 * Forward declarations for pregcomp()'s friends.
389 static const scan_data_t zero_scan_data =
390 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
392 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
393 #define SF_BEFORE_SEOL 0x0001
394 #define SF_BEFORE_MEOL 0x0002
395 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
396 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
398 #define SF_FIX_SHIFT_EOL (+2)
399 #define SF_FL_SHIFT_EOL (+4)
401 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
402 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
404 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
405 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
406 #define SF_IS_INF 0x0040
407 #define SF_HAS_PAR 0x0080
408 #define SF_IN_PAR 0x0100
409 #define SF_HAS_EVAL 0x0200
410 #define SCF_DO_SUBSTR 0x0400
411 #define SCF_DO_STCLASS_AND 0x0800
412 #define SCF_DO_STCLASS_OR 0x1000
413 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
414 #define SCF_WHILEM_VISITED_POS 0x2000
416 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
417 #define SCF_SEEN_ACCEPT 0x8000
418 #define SCF_TRIE_DOING_RESTUDY 0x10000
420 #define UTF cBOOL(RExC_utf8)
422 /* The enums for all these are ordered so things work out correctly */
423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
425 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
426 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
427 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
428 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
429 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
431 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
433 /* For programs that want to be strictly Unicode compatible by dying if any
434 * attempt is made to match a non-Unicode code point against a Unicode
436 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
438 #define OOB_NAMEDCLASS -1
440 /* There is no code point that is out-of-bounds, so this is problematic. But
441 * its only current use is to initialize a variable that is always set before
443 #define OOB_UNICODE 0xDEADBEEF
445 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
446 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
449 /* length of regex to show in messages that don't mark a position within */
450 #define RegexLengthToShowInErrorMessages 127
453 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
454 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
455 * op/pragma/warn/regcomp.
457 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
458 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
460 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
462 #define REPORT_LOCATION_ARGS(offset) \
463 UTF8fARG(UTF, offset, RExC_precomp), \
464 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
467 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
468 * arg. Show regex, up to a maximum length. If it's too long, chop and add
471 #define _FAIL(code) STMT_START { \
472 const char *ellipses = ""; \
473 IV len = RExC_end - RExC_precomp; \
476 SAVEFREESV(RExC_rx_sv); \
477 if (len > RegexLengthToShowInErrorMessages) { \
478 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
479 len = RegexLengthToShowInErrorMessages - 10; \
485 #define FAIL(msg) _FAIL( \
486 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
487 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
489 #define FAIL2(msg,arg) _FAIL( \
490 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
491 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
494 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
496 #define Simple_vFAIL(m) STMT_START { \
497 const IV offset = RExC_parse - RExC_precomp; \
498 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
499 m, REPORT_LOCATION_ARGS(offset)); \
503 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
505 #define vFAIL(m) STMT_START { \
507 SAVEFREESV(RExC_rx_sv); \
512 * Like Simple_vFAIL(), but accepts two arguments.
514 #define Simple_vFAIL2(m,a1) STMT_START { \
515 const IV offset = RExC_parse - RExC_precomp; \
516 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
517 REPORT_LOCATION_ARGS(offset)); \
521 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
523 #define vFAIL2(m,a1) STMT_START { \
525 SAVEFREESV(RExC_rx_sv); \
526 Simple_vFAIL2(m, a1); \
531 * Like Simple_vFAIL(), but accepts three arguments.
533 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
534 const IV offset = RExC_parse - RExC_precomp; \
535 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
536 REPORT_LOCATION_ARGS(offset)); \
540 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
542 #define vFAIL3(m,a1,a2) STMT_START { \
544 SAVEFREESV(RExC_rx_sv); \
545 Simple_vFAIL3(m, a1, a2); \
549 * Like Simple_vFAIL(), but accepts four arguments.
551 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
552 const IV offset = RExC_parse - RExC_precomp; \
553 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
554 REPORT_LOCATION_ARGS(offset)); \
557 #define vFAIL4(m,a1,a2,a3) STMT_START { \
559 SAVEFREESV(RExC_rx_sv); \
560 Simple_vFAIL4(m, a1, a2, a3); \
563 /* A specialized version of vFAIL2 that works with UTF8f */
564 #define vFAIL2utf8f(m, a1) STMT_START { \
565 const IV offset = RExC_parse - RExC_precomp; \
567 SAVEFREESV(RExC_rx_sv); \
568 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
569 REPORT_LOCATION_ARGS(offset)); \
573 /* m is not necessarily a "literal string", in this macro */
574 #define reg_warn_non_literal_string(loc, m) STMT_START { \
575 const IV offset = loc - RExC_precomp; \
576 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
577 m, REPORT_LOCATION_ARGS(offset)); \
580 #define ckWARNreg(loc,m) STMT_START { \
581 const IV offset = loc - RExC_precomp; \
582 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
583 REPORT_LOCATION_ARGS(offset)); \
586 #define vWARN_dep(loc, m) STMT_START { \
587 const IV offset = loc - RExC_precomp; \
588 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
589 REPORT_LOCATION_ARGS(offset)); \
592 #define ckWARNdep(loc,m) STMT_START { \
593 const IV offset = loc - RExC_precomp; \
594 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
596 REPORT_LOCATION_ARGS(offset)); \
599 #define ckWARNregdep(loc,m) STMT_START { \
600 const IV offset = loc - RExC_precomp; \
601 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
603 REPORT_LOCATION_ARGS(offset)); \
606 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
607 const IV offset = loc - RExC_precomp; \
608 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
610 a1, REPORT_LOCATION_ARGS(offset)); \
613 #define ckWARN2reg(loc, m, a1) STMT_START { \
614 const IV offset = loc - RExC_precomp; \
615 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
616 a1, REPORT_LOCATION_ARGS(offset)); \
619 #define vWARN3(loc, m, a1, a2) STMT_START { \
620 const IV offset = loc - RExC_precomp; \
621 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
622 a1, a2, REPORT_LOCATION_ARGS(offset)); \
625 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
626 const IV offset = loc - RExC_precomp; \
627 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
628 a1, a2, REPORT_LOCATION_ARGS(offset)); \
631 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
632 const IV offset = loc - RExC_precomp; \
633 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
634 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
637 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
638 const IV offset = loc - RExC_precomp; \
639 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
640 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
643 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
644 const IV offset = loc - RExC_precomp; \
645 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
646 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
650 /* Allow for side effects in s */
651 #define REGC(c,s) STMT_START { \
652 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
655 /* Macros for recording node offsets. 20001227 mjd@plover.com
656 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
657 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
658 * Element 0 holds the number n.
659 * Position is 1 indexed.
661 #ifndef RE_TRACK_PATTERN_OFFSETS
662 #define Set_Node_Offset_To_R(node,byte)
663 #define Set_Node_Offset(node,byte)
664 #define Set_Cur_Node_Offset
665 #define Set_Node_Length_To_R(node,len)
666 #define Set_Node_Length(node,len)
667 #define Set_Node_Cur_Length(node,start)
668 #define Node_Offset(n)
669 #define Node_Length(n)
670 #define Set_Node_Offset_Length(node,offset,len)
671 #define ProgLen(ri) ri->u.proglen
672 #define SetProgLen(ri,x) ri->u.proglen = x
674 #define ProgLen(ri) ri->u.offsets[0]
675 #define SetProgLen(ri,x) ri->u.offsets[0] = x
676 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
678 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
679 __LINE__, (int)(node), (int)(byte))); \
681 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
683 RExC_offsets[2*(node)-1] = (byte); \
688 #define Set_Node_Offset(node,byte) \
689 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
690 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
692 #define Set_Node_Length_To_R(node,len) STMT_START { \
694 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
695 __LINE__, (int)(node), (int)(len))); \
697 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
699 RExC_offsets[2*(node)] = (len); \
704 #define Set_Node_Length(node,len) \
705 Set_Node_Length_To_R((node)-RExC_emit_start, len)
706 #define Set_Node_Cur_Length(node, start) \
707 Set_Node_Length(node, RExC_parse - start)
709 /* Get offsets and lengths */
710 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
711 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
713 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
714 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
715 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
719 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
720 #define EXPERIMENTAL_INPLACESCAN
721 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
723 #define DEBUG_RExC_seen() \
724 DEBUG_OPTIMISE_MORE_r({ \
725 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
727 if (RExC_seen & REG_SEEN_ZERO_LEN) \
728 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \
730 if (RExC_seen & REG_SEEN_LOOKBEHIND) \
731 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \
733 if (RExC_seen & REG_SEEN_GPOS) \
734 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \
736 if (RExC_seen & REG_SEEN_CANY) \
737 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \
739 if (RExC_seen & REG_SEEN_RECURSE) \
740 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \
742 if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \
743 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \
745 if (RExC_seen & REG_SEEN_VERBARG) \
746 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \
748 if (RExC_seen & REG_SEEN_CUTGROUP) \
749 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \
751 if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \
752 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \
754 if (RExC_seen & REG_SEEN_EXACTF_SHARP_S) \
755 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S "); \
757 if (RExC_seen & REG_SEEN_GOSTART) \
758 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \
760 PerlIO_printf(Perl_debug_log,"\n"); \
763 #define DEBUG_STUDYDATA(str,data,depth) \
764 DEBUG_OPTIMISE_MORE_r(if(data){ \
765 PerlIO_printf(Perl_debug_log, \
766 "%*s" str "Pos:%"IVdf"/%"IVdf \
767 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
768 (int)(depth)*2, "", \
769 (IV)((data)->pos_min), \
770 (IV)((data)->pos_delta), \
771 (UV)((data)->flags), \
772 (IV)((data)->whilem_c), \
773 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
774 is_inf ? "INF " : "" \
776 if ((data)->last_found) \
777 PerlIO_printf(Perl_debug_log, \
778 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
779 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
780 SvPVX_const((data)->last_found), \
781 (IV)((data)->last_end), \
782 (IV)((data)->last_start_min), \
783 (IV)((data)->last_start_max), \
784 ((data)->longest && \
785 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
786 SvPVX_const((data)->longest_fixed), \
787 (IV)((data)->offset_fixed), \
788 ((data)->longest && \
789 (data)->longest==&((data)->longest_float)) ? "*" : "", \
790 SvPVX_const((data)->longest_float), \
791 (IV)((data)->offset_float_min), \
792 (IV)((data)->offset_float_max) \
794 PerlIO_printf(Perl_debug_log,"\n"); \
797 /* Mark that we cannot extend a found fixed substring at this point.
798 Update the longest found anchored substring and the longest found
799 floating substrings if needed. */
802 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
803 SSize_t *minlenp, int is_inf)
805 const STRLEN l = CHR_SVLEN(data->last_found);
806 const STRLEN old_l = CHR_SVLEN(*data->longest);
807 GET_RE_DEBUG_FLAGS_DECL;
809 PERL_ARGS_ASSERT_SCAN_COMMIT;
811 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
812 SvSetMagicSV(*data->longest, data->last_found);
813 if (*data->longest == data->longest_fixed) {
814 data->offset_fixed = l ? data->last_start_min : data->pos_min;
815 if (data->flags & SF_BEFORE_EOL)
817 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
819 data->flags &= ~SF_FIX_BEFORE_EOL;
820 data->minlen_fixed=minlenp;
821 data->lookbehind_fixed=0;
823 else { /* *data->longest == data->longest_float */
824 data->offset_float_min = l ? data->last_start_min : data->pos_min;
825 data->offset_float_max = (l
826 ? data->last_start_max
827 : (data->pos_delta == SSize_t_MAX
829 : data->pos_min + data->pos_delta));
831 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
832 data->offset_float_max = SSize_t_MAX;
833 if (data->flags & SF_BEFORE_EOL)
835 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
837 data->flags &= ~SF_FL_BEFORE_EOL;
838 data->minlen_float=minlenp;
839 data->lookbehind_float=0;
842 SvCUR_set(data->last_found, 0);
844 SV * const sv = data->last_found;
845 if (SvUTF8(sv) && SvMAGICAL(sv)) {
846 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
852 data->flags &= ~SF_BEFORE_EOL;
853 DEBUG_STUDYDATA("commit: ",data,0);
856 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
857 * list that describes which code points it matches */
860 S_ssc_anything(pTHX_ regnode_ssc *ssc)
862 /* Set the SSC 'ssc' to match an empty string or any code point */
864 PERL_ARGS_ASSERT_SSC_ANYTHING;
866 assert(OP(ssc) == ANYOF_SYNTHETIC);
868 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
869 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
870 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
874 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
876 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
877 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
878 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
879 * in any way, so there's no point in using it */
884 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
886 assert(OP(ssc) == ANYOF_SYNTHETIC);
888 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
892 /* See if the list consists solely of the range 0 - Infinity */
893 invlist_iterinit(ssc->invlist);
894 ret = invlist_iternext(ssc->invlist, &start, &end)
898 invlist_iterfinish(ssc->invlist);
904 /* If e.g., both \w and \W are set, matches everything */
905 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
907 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
908 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
918 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
920 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
921 * string, any code point, or any posix class under locale */
923 PERL_ARGS_ASSERT_SSC_INIT;
925 Zero(ssc, 1, regnode_ssc);
926 OP(ssc) = ANYOF_SYNTHETIC;
927 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
930 /* If any portion of the regex is to operate under locale rules,
931 * initialization includes it. The reason this isn't done for all regexes
932 * is that the optimizer was written under the assumption that locale was
933 * all-or-nothing. Given the complexity and lack of documentation in the
934 * optimizer, and that there are inadequate test cases for locale, many
935 * parts of it may not work properly, it is safest to avoid locale unless
937 if (RExC_contains_locale) {
938 ANYOF_POSIXL_SETALL(ssc);
939 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
940 if (RExC_contains_i) {
941 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
945 ANYOF_POSIXL_ZERO(ssc);
950 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
951 const regnode_ssc *ssc)
953 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
954 * to the list of code points matched, and locale posix classes; hence does
955 * not check its flags) */
960 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
962 assert(OP(ssc) == ANYOF_SYNTHETIC);
964 invlist_iterinit(ssc->invlist);
965 ret = invlist_iternext(ssc->invlist, &start, &end)
969 invlist_iterfinish(ssc->invlist);
975 if (RExC_contains_locale) {
976 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
977 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
978 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
982 if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
991 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
992 const regnode_charclass_posixl* const node)
994 /* Returns a mortal inversion list defining which code points are matched
995 * by 'node', which is of type ANYOF. Handles complementing the result if
996 * appropriate. If some code points aren't knowable at this time, the
997 * returned list must, and will, contain every possible code point. */
999 SV* invlist = sv_2mortal(_new_invlist(0));
1001 const U32 n = ARG(node);
1003 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1005 /* Look at the data structure created by S_set_ANYOF_arg() */
1006 if (n != ANYOF_NONBITMAP_EMPTY) {
1007 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1008 AV * const av = MUTABLE_AV(SvRV(rv));
1009 SV **const ary = AvARRAY(av);
1010 assert(RExC_rxi->data->what[n] == 's');
1012 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1013 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1015 else if (ary[0] && ary[0] != &PL_sv_undef) {
1017 /* Here, no compile-time swash, and there are things that won't be
1018 * known until runtime -- we have to assume it could be anything */
1019 return _add_range_to_invlist(invlist, 0, UV_MAX);
1023 /* Here no compile-time swash, and no run-time only data. Use the
1024 * node's inversion list */
1025 invlist = sv_2mortal(invlist_clone(ary[2]));
1029 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1030 * inversion list for the others, but if there are code points that should
1031 * match only conditionally on the target string being UTF-8, those are
1032 * placed in the inversion list, and not the bitmap. Since there are
1033 * circumstances under which they could match, they are included in the
1034 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1035 * here, so that when we invert below, the end result actually does include
1036 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1037 * before we add the unconditionally matched code points */
1038 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1039 _invlist_intersection_complement_2nd(invlist,
1044 /* Add in the points from the bit map */
1045 for (i = 0; i < 256; i++) {
1046 if (ANYOF_BITMAP_TEST(node, i)) {
1047 invlist = add_cp_to_invlist(invlist, i);
1051 /* If this can match all upper Latin1 code points, have to add them
1053 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1054 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1057 /* Similarly for these */
1058 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1059 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1062 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1063 _invlist_invert(invlist);
1069 /* These two functions currently do the exact same thing */
1070 #define ssc_init_zero ssc_init
1072 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1073 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1076 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1078 /* Take the flags 'and_with' and accumulate them anded into the flags for
1079 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1080 * The flags 'and_with' should not come from another SSC (otherwise the
1081 * EMPTY_STRING flag won't work) */
1083 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS;
1085 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1087 /* Use just the SSC-related flags from 'and_with' */
1088 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS);
1089 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1092 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1093 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1094 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1097 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1098 const regnode_ssc *and_with)
1100 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1101 * another SSC or a regular ANYOF class. Can create false positives. */
1106 PERL_ARGS_ASSERT_SSC_AND;
1108 assert(OP(ssc) == ANYOF_SYNTHETIC);
1110 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1111 * the code point inversion list and just the relevant flags */
1112 if (OP(and_with) == ANYOF_SYNTHETIC) {
1113 anded_cp_list = and_with->invlist;
1114 anded_flags = ANYOF_FLAGS(and_with);
1116 /* XXX This is a kludge around what appears to be deficiencies in the
1117 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1118 * there are paths through the optimizer where it doesn't get weeded
1119 * out when it should. And if we don't make some extra provision for
1120 * it like the code just below, it doesn't get added when it should.
1121 * This solution is to add it only when AND'ing, which is here, and
1122 * only when what is being AND'ed is the pristine, original node
1123 * matching anything. Thus it is like adding it to ssc_anything() but
1124 * only when the result is to be AND'ed. Probably the same solution
1125 * could be adopted for the same problem we have with /l matching,
1126 * which is solved differently in S_ssc_init(), and that would lead to
1127 * fewer false positives than that solution has. But if this solution
1128 * creates bugs, the consequences are only that a warning isn't raised
1129 * that should be; while the consequences for having /l bugs is
1130 * incorrect matches */
1131 if (ssc_is_anything(and_with)) {
1132 anded_flags |= ANYOF_WARN_SUPER;
1136 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1137 (regnode_charclass_posixl*) and_with);
1138 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1141 ANYOF_FLAGS(ssc) &= anded_flags;
1143 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1144 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1145 * 'and_with' may be inverted. When not inverted, we have the situation of
1147 * (C1 | P1) & (C2 | P2)
1148 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1149 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1150 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1151 * <= ((C1 & C2) | P1 | P2)
1152 * Alternatively, the last few steps could be:
1153 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1154 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1155 * <= (C1 | C2 | (P1 & P2))
1156 * We favor the second approach if either P1 or P2 is non-empty. This is
1157 * because these components are a barrier to doing optimizations, as what
1158 * they match cannot be known until the moment of matching as they are
1159 * dependent on the current locale, 'AND"ing them likely will reduce or
1161 * But we can do better if we know that C1,P1 are in their initial state (a
1162 * frequent occurrence), each matching everything:
1163 * (<everything>) & (C2 | P2) = C2 | P2
1164 * Similarly, if C2,P2 are in their initial state (again a frequent
1165 * occurrence), the result is a no-op
1166 * (C1 | P1) & (<everything>) = C1 | P1
1169 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1170 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1171 * <= (C1 & ~C2) | (P1 & ~P2)
1174 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1175 && OP(and_with) != ANYOF_SYNTHETIC)
1179 ssc_intersection(ssc,
1181 FALSE /* Has already been inverted */
1184 /* If either P1 or P2 is empty, the intersection will be also; can skip
1186 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1187 ANYOF_POSIXL_ZERO(ssc);
1189 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1191 /* Note that the Posix class component P from 'and_with' actually
1193 * P = Pa | Pb | ... | Pn
1194 * where each component is one posix class, such as in [\w\s].
1196 * ~P = ~(Pa | Pb | ... | Pn)
1197 * = ~Pa & ~Pb & ... & ~Pn
1198 * <= ~Pa | ~Pb | ... | ~Pn
1199 * The last is something we can easily calculate, but unfortunately
1200 * is likely to have many false positives. We could do better
1201 * in some (but certainly not all) instances if two classes in
1202 * P have known relationships. For example
1203 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1205 * :lower: & :print: = :lower:
1206 * And similarly for classes that must be disjoint. For example,
1207 * since \s and \w can have no elements in common based on rules in
1208 * the POSIX standard,
1209 * \w & ^\S = nothing
1210 * Unfortunately, some vendor locales do not meet the Posix
1211 * standard, in particular almost everything by Microsoft.
1212 * The loop below just changes e.g., \w into \W and vice versa */
1214 regnode_charclass_posixl temp;
1215 int add = 1; /* To calculate the index of the complement */
1217 ANYOF_POSIXL_ZERO(&temp);
1218 for (i = 0; i < ANYOF_MAX; i++) {
1220 || ! ANYOF_POSIXL_TEST(and_with, i)
1221 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1223 if (ANYOF_POSIXL_TEST(and_with, i)) {
1224 ANYOF_POSIXL_SET(&temp, i + add);
1226 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1228 ANYOF_POSIXL_AND(&temp, ssc);
1230 } /* else ssc already has no posixes */
1231 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1232 in its initial state */
1233 else if (OP(and_with) != ANYOF_SYNTHETIC
1234 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1236 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1237 * copy it over 'ssc' */
1238 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1239 if (OP(and_with) == ANYOF_SYNTHETIC) {
1240 StructCopy(and_with, ssc, regnode_ssc);
1243 ssc->invlist = anded_cp_list;
1244 ANYOF_POSIXL_ZERO(ssc);
1245 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1246 ANYOF_POSIXL_OR(and_with, ssc);
1250 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1251 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1253 /* One or the other of P1, P2 is non-empty. */
1254 ANYOF_POSIXL_AND(and_with, ssc);
1255 ssc_union(ssc, anded_cp_list, FALSE);
1257 else { /* P1 = P2 = empty */
1258 ssc_intersection(ssc, anded_cp_list, FALSE);
1264 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1265 const regnode_ssc *or_with)
1267 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1268 * another SSC or a regular ANYOF class. Can create false positives if
1269 * 'or_with' is to be inverted. */
1274 PERL_ARGS_ASSERT_SSC_OR;
1276 assert(OP(ssc) == ANYOF_SYNTHETIC);
1278 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1279 * the code point inversion list and just the relevant flags */
1280 if (OP(or_with) == ANYOF_SYNTHETIC) {
1281 ored_cp_list = or_with->invlist;
1282 ored_flags = ANYOF_FLAGS(or_with);
1285 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1286 (regnode_charclass_posixl*) or_with);
1287 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1290 ANYOF_FLAGS(ssc) |= ored_flags;
1292 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1293 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1294 * 'or_with' may be inverted. When not inverted, we have the simple
1295 * situation of computing:
1296 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1297 * If P1|P2 yields a situation with both a class and its complement are
1298 * set, like having both \w and \W, this matches all code points, and we
1299 * can delete these from the P component of the ssc going forward. XXX We
1300 * might be able to delete all the P components, but I (khw) am not certain
1301 * about this, and it is better to be safe.
1304 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1305 * <= (C1 | P1) | ~C2
1306 * <= (C1 | ~C2) | P1
1307 * (which results in actually simpler code than the non-inverted case)
1310 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1311 && OP(or_with) != ANYOF_SYNTHETIC)
1313 /* We ignore P2, leaving P1 going forward */
1315 else { /* Not inverted */
1316 ANYOF_POSIXL_OR(or_with, ssc);
1317 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1319 for (i = 0; i < ANYOF_MAX; i += 2) {
1320 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1322 ssc_match_all_cp(ssc);
1323 ANYOF_POSIXL_CLEAR(ssc, i);
1324 ANYOF_POSIXL_CLEAR(ssc, i+1);
1325 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1326 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1335 FALSE /* Already has been inverted */
1339 PERL_STATIC_INLINE void
1340 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1342 PERL_ARGS_ASSERT_SSC_UNION;
1344 assert(OP(ssc) == ANYOF_SYNTHETIC);
1346 _invlist_union_maybe_complement_2nd(ssc->invlist,
1352 PERL_STATIC_INLINE void
1353 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1355 const bool invert2nd)
1357 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1359 assert(OP(ssc) == ANYOF_SYNTHETIC);
1361 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1367 PERL_STATIC_INLINE void
1368 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1370 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1372 assert(OP(ssc) == ANYOF_SYNTHETIC);
1374 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1377 PERL_STATIC_INLINE void
1378 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1380 /* AND just the single code point 'cp' into the SSC 'ssc' */
1382 SV* cp_list = _new_invlist(2);
1384 PERL_ARGS_ASSERT_SSC_CP_AND;
1386 assert(OP(ssc) == ANYOF_SYNTHETIC);
1388 cp_list = add_cp_to_invlist(cp_list, cp);
1389 ssc_intersection(ssc, cp_list,
1390 FALSE /* Not inverted */
1392 SvREFCNT_dec_NN(cp_list);
1395 PERL_STATIC_INLINE void
1396 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1398 /* Set the SSC 'ssc' to not match any locale things */
1400 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1402 assert(OP(ssc) == ANYOF_SYNTHETIC);
1404 ANYOF_POSIXL_ZERO(ssc);
1405 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1409 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1411 /* The inversion list in the SSC is marked mortal; now we need a more
1412 * permanent copy, which is stored the same way that is done in a regular
1413 * ANYOF node, with the first 256 code points in a bit map */
1415 SV* invlist = invlist_clone(ssc->invlist);
1417 PERL_ARGS_ASSERT_SSC_FINALIZE;
1419 assert(OP(ssc) == ANYOF_SYNTHETIC);
1421 /* The code in this file assumes that all but these flags aren't relevant
1422 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1423 * time we reach here */
1424 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1426 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1428 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1430 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1433 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1434 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1435 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1436 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1441 dump_trie(trie,widecharmap,revcharmap)
1442 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1443 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1445 These routines dump out a trie in a somewhat readable format.
1446 The _interim_ variants are used for debugging the interim
1447 tables that are used to generate the final compressed
1448 representation which is what dump_trie expects.
1450 Part of the reason for their existence is to provide a form
1451 of documentation as to how the different representations function.
1456 Dumps the final compressed table form of the trie to Perl_debug_log.
1457 Used for debugging make_trie().
1461 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1462 AV *revcharmap, U32 depth)
1465 SV *sv=sv_newmortal();
1466 int colwidth= widecharmap ? 6 : 4;
1468 GET_RE_DEBUG_FLAGS_DECL;
1470 PERL_ARGS_ASSERT_DUMP_TRIE;
1472 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1473 (int)depth * 2 + 2,"",
1474 "Match","Base","Ofs" );
1476 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1477 SV ** const tmp = av_fetch( revcharmap, state, 0);
1479 PerlIO_printf( Perl_debug_log, "%*s",
1481 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1482 PL_colors[0], PL_colors[1],
1483 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1484 PERL_PV_ESCAPE_FIRSTCHAR
1489 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1490 (int)depth * 2 + 2,"");
1492 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1493 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1494 PerlIO_printf( Perl_debug_log, "\n");
1496 for( state = 1 ; state < trie->statecount ; state++ ) {
1497 const U32 base = trie->states[ state ].trans.base;
1499 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1501 if ( trie->states[ state ].wordnum ) {
1502 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1504 PerlIO_printf( Perl_debug_log, "%6s", "" );
1507 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1512 while( ( base + ofs < trie->uniquecharcount ) ||
1513 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1514 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1517 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1519 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1520 if ( ( base + ofs >= trie->uniquecharcount ) &&
1521 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1522 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1524 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1526 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1528 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1532 PerlIO_printf( Perl_debug_log, "]");
1535 PerlIO_printf( Perl_debug_log, "\n" );
1537 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1538 for (word=1; word <= trie->wordcount; word++) {
1539 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1540 (int)word, (int)(trie->wordinfo[word].prev),
1541 (int)(trie->wordinfo[word].len));
1543 PerlIO_printf(Perl_debug_log, "\n" );
1546 Dumps a fully constructed but uncompressed trie in list form.
1547 List tries normally only are used for construction when the number of
1548 possible chars (trie->uniquecharcount) is very high.
1549 Used for debugging make_trie().
1552 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1553 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1557 SV *sv=sv_newmortal();
1558 int colwidth= widecharmap ? 6 : 4;
1559 GET_RE_DEBUG_FLAGS_DECL;
1561 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1563 /* print out the table precompression. */
1564 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1565 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1566 "------:-----+-----------------\n" );
1568 for( state=1 ; state < next_alloc ; state ++ ) {
1571 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1572 (int)depth * 2 + 2,"", (UV)state );
1573 if ( ! trie->states[ state ].wordnum ) {
1574 PerlIO_printf( Perl_debug_log, "%5s| ","");
1576 PerlIO_printf( Perl_debug_log, "W%4x| ",
1577 trie->states[ state ].wordnum
1580 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1581 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1583 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1585 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1586 PL_colors[0], PL_colors[1],
1587 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1588 PERL_PV_ESCAPE_FIRSTCHAR
1590 TRIE_LIST_ITEM(state,charid).forid,
1591 (UV)TRIE_LIST_ITEM(state,charid).newstate
1594 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1595 (int)((depth * 2) + 14), "");
1598 PerlIO_printf( Perl_debug_log, "\n");
1603 Dumps a fully constructed but uncompressed trie in table form.
1604 This is the normal DFA style state transition table, with a few
1605 twists to facilitate compression later.
1606 Used for debugging make_trie().
1609 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1610 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1615 SV *sv=sv_newmortal();
1616 int colwidth= widecharmap ? 6 : 4;
1617 GET_RE_DEBUG_FLAGS_DECL;
1619 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1622 print out the table precompression so that we can do a visual check
1623 that they are identical.
1626 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1628 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1629 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1631 PerlIO_printf( Perl_debug_log, "%*s",
1633 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1634 PL_colors[0], PL_colors[1],
1635 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1636 PERL_PV_ESCAPE_FIRSTCHAR
1642 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1644 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1645 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1648 PerlIO_printf( Perl_debug_log, "\n" );
1650 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1652 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1653 (int)depth * 2 + 2,"",
1654 (UV)TRIE_NODENUM( state ) );
1656 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1657 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1659 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1661 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1663 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1664 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1666 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1667 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1675 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1676 startbranch: the first branch in the whole branch sequence
1677 first : start branch of sequence of branch-exact nodes.
1678 May be the same as startbranch
1679 last : Thing following the last branch.
1680 May be the same as tail.
1681 tail : item following the branch sequence
1682 count : words in the sequence
1683 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1684 depth : indent depth
1686 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1688 A trie is an N'ary tree where the branches are determined by digital
1689 decomposition of the key. IE, at the root node you look up the 1st character and
1690 follow that branch repeat until you find the end of the branches. Nodes can be
1691 marked as "accepting" meaning they represent a complete word. Eg:
1695 would convert into the following structure. Numbers represent states, letters
1696 following numbers represent valid transitions on the letter from that state, if
1697 the number is in square brackets it represents an accepting state, otherwise it
1698 will be in parenthesis.
1700 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1704 (1) +-i->(6)-+-s->[7]
1706 +-s->(3)-+-h->(4)-+-e->[5]
1708 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1710 This shows that when matching against the string 'hers' we will begin at state 1
1711 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1712 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1713 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1714 single traverse. We store a mapping from accepting to state to which word was
1715 matched, and then when we have multiple possibilities we try to complete the
1716 rest of the regex in the order in which they occured in the alternation.
1718 The only prior NFA like behaviour that would be changed by the TRIE support is
1719 the silent ignoring of duplicate alternations which are of the form:
1721 / (DUPE|DUPE) X? (?{ ... }) Y /x
1723 Thus EVAL blocks following a trie may be called a different number of times with
1724 and without the optimisation. With the optimisations dupes will be silently
1725 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1726 the following demonstrates:
1728 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1730 which prints out 'word' three times, but
1732 'words'=~/(word|word|word)(?{ print $1 })S/
1734 which doesnt print it out at all. This is due to other optimisations kicking in.
1736 Example of what happens on a structural level:
1738 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1740 1: CURLYM[1] {1,32767}(18)
1751 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1752 and should turn into:
1754 1: CURLYM[1] {1,32767}(18)
1756 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1764 Cases where tail != last would be like /(?foo|bar)baz/:
1774 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1775 and would end up looking like:
1778 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1785 d = uvchr_to_utf8_flags(d, uv, 0);
1787 is the recommended Unicode-aware way of saying
1792 #define TRIE_STORE_REVCHAR(val) \
1795 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1796 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1797 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1798 SvCUR_set(zlopp, kapow - flrbbbbb); \
1801 av_push(revcharmap, zlopp); \
1803 char ooooff = (char)val; \
1804 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1808 /* This gets the next character from the input, folding it if not already
1810 #define TRIE_READ_CHAR STMT_START { \
1813 /* if it is UTF then it is either already folded, or does not need \
1815 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1817 else if (folder == PL_fold_latin1) { \
1818 /* This folder implies Unicode rules, which in the range expressible \
1819 * by not UTF is the lower case, with the two exceptions, one of \
1820 * which should have been taken care of before calling this */ \
1821 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1822 uvc = toLOWER_L1(*uc); \
1823 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1826 /* raw data, will be folded later if needed */ \
1834 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1835 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1836 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1837 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1839 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1840 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1841 TRIE_LIST_CUR( state )++; \
1844 #define TRIE_LIST_NEW(state) STMT_START { \
1845 Newxz( trie->states[ state ].trans.list, \
1846 4, reg_trie_trans_le ); \
1847 TRIE_LIST_CUR( state ) = 1; \
1848 TRIE_LIST_LEN( state ) = 4; \
1851 #define TRIE_HANDLE_WORD(state) STMT_START { \
1852 U16 dupe= trie->states[ state ].wordnum; \
1853 regnode * const noper_next = regnext( noper ); \
1856 /* store the word for dumping */ \
1858 if (OP(noper) != NOTHING) \
1859 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1861 tmp = newSVpvn_utf8( "", 0, UTF ); \
1862 av_push( trie_words, tmp ); \
1866 trie->wordinfo[curword].prev = 0; \
1867 trie->wordinfo[curword].len = wordlen; \
1868 trie->wordinfo[curword].accept = state; \
1870 if ( noper_next < tail ) { \
1872 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1873 trie->jump[curword] = (U16)(noper_next - convert); \
1875 jumper = noper_next; \
1877 nextbranch= regnext(cur); \
1881 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1882 /* chain, so that when the bits of chain are later */\
1883 /* linked together, the dups appear in the chain */\
1884 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1885 trie->wordinfo[dupe].prev = curword; \
1887 /* we haven't inserted this word yet. */ \
1888 trie->states[ state ].wordnum = curword; \
1893 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1894 ( ( base + charid >= ucharcount \
1895 && base + charid < ubound \
1896 && state == trie->trans[ base - ucharcount + charid ].check \
1897 && trie->trans[ base - ucharcount + charid ].next ) \
1898 ? trie->trans[ base - ucharcount + charid ].next \
1899 : ( state==1 ? special : 0 ) \
1903 #define MADE_JUMP_TRIE 2
1904 #define MADE_EXACT_TRIE 4
1907 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1910 /* first pass, loop through and scan words */
1911 reg_trie_data *trie;
1912 HV *widecharmap = NULL;
1913 AV *revcharmap = newAV();
1919 regnode *jumper = NULL;
1920 regnode *nextbranch = NULL;
1921 regnode *convert = NULL;
1922 U32 *prev_states; /* temp array mapping each state to previous one */
1923 /* we just use folder as a flag in utf8 */
1924 const U8 * folder = NULL;
1927 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1928 AV *trie_words = NULL;
1929 /* along with revcharmap, this only used during construction but both are
1930 * useful during debugging so we store them in the struct when debugging.
1933 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1934 STRLEN trie_charcount=0;
1936 SV *re_trie_maxbuff;
1937 GET_RE_DEBUG_FLAGS_DECL;
1939 PERL_ARGS_ASSERT_MAKE_TRIE;
1941 PERL_UNUSED_ARG(depth);
1948 case EXACTFU: folder = PL_fold_latin1; break;
1949 case EXACTF: folder = PL_fold; break;
1950 case EXACTFL: folder = PL_fold_locale; break;
1951 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1954 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1956 trie->startstate = 1;
1957 trie->wordcount = word_count;
1958 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1959 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1961 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1962 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1963 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1966 trie_words = newAV();
1969 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1970 if (!SvIOK(re_trie_maxbuff)) {
1971 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1973 DEBUG_TRIE_COMPILE_r({
1974 PerlIO_printf( Perl_debug_log,
1975 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1976 (int)depth * 2 + 2, "",
1977 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1978 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1982 /* Find the node we are going to overwrite */
1983 if ( first == startbranch && OP( last ) != BRANCH ) {
1984 /* whole branch chain */
1987 /* branch sub-chain */
1988 convert = NEXTOPER( first );
1991 /* -- First loop and Setup --
1993 We first traverse the branches and scan each word to determine if it
1994 contains widechars, and how many unique chars there are, this is
1995 important as we have to build a table with at least as many columns as we
1998 We use an array of integers to represent the character codes 0..255
1999 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
2000 native representation of the character value as the key and IV's for the
2003 *TODO* If we keep track of how many times each character is used we can
2004 remap the columns so that the table compression later on is more
2005 efficient in terms of memory by ensuring the most common value is in the
2006 middle and the least common are on the outside. IMO this would be better
2007 than a most to least common mapping as theres a decent chance the most
2008 common letter will share a node with the least common, meaning the node
2009 will not be compressible. With a middle is most common approach the worst
2010 case is when we have the least common nodes twice.
2014 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2015 regnode *noper = NEXTOPER( cur );
2016 const U8 *uc = (U8*)STRING( noper );
2017 const U8 *e = uc + STR_LEN( noper );
2019 U32 wordlen = 0; /* required init */
2020 STRLEN minbytes = 0;
2021 STRLEN maxbytes = 0;
2022 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
2024 if (OP(noper) == NOTHING) {
2025 regnode *noper_next= regnext(noper);
2026 if (noper_next != tail && OP(noper_next) == flags) {
2028 uc= (U8*)STRING(noper);
2029 e= uc + STR_LEN(noper);
2030 trie->minlen= STR_LEN(noper);
2037 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2038 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2039 regardless of encoding */
2040 if (OP( noper ) == EXACTFU_SS) {
2041 /* false positives are ok, so just set this */
2042 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2045 for ( ; uc < e ; uc += len ) {
2046 TRIE_CHARCOUNT(trie)++;
2049 /* Acummulate to the current values, the range in the number of
2050 * bytes that this character could match. The max is presumed to
2051 * be the same as the folded input (which TRIE_READ_CHAR returns),
2052 * except that when this is not in UTF-8, it could be matched
2053 * against a string which is UTF-8, and the variant characters
2054 * could be 2 bytes instead of the 1 here. Likewise, for the
2055 * minimum number of bytes when not folded. When folding, the min
2056 * is assumed to be 1 byte could fold to match the single character
2057 * here, or in the case of a multi-char fold, 1 byte can fold to
2058 * the whole sequence. 'foldlen' is used to denote whether we are
2059 * in such a sequence, skipping the min setting if so. XXX TODO
2060 * Use the exact list of what folds to each character, from
2061 * PL_utf8_foldclosures */
2063 maxbytes += UTF8SKIP(uc);
2065 /* A non-UTF-8 string could be 1 byte to match our 2 */
2066 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2072 foldlen -= UTF8SKIP(uc);
2075 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2081 maxbytes += (UNI_IS_INVARIANT(*uc))
2092 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2099 U8 folded= folder[ (U8) uvc ];
2100 if ( !trie->charmap[ folded ] ) {
2101 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2102 TRIE_STORE_REVCHAR( folded );
2105 if ( !trie->charmap[ uvc ] ) {
2106 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2107 TRIE_STORE_REVCHAR( uvc );
2110 /* store the codepoint in the bitmap, and its folded
2112 TRIE_BITMAP_SET(trie, uvc);
2114 /* store the folded codepoint */
2115 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2118 /* store first byte of utf8 representation of
2119 variant codepoints */
2120 if (! UVCHR_IS_INVARIANT(uvc)) {
2121 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2124 set_bit = 0; /* We've done our bit :-) */
2129 widecharmap = newHV();
2131 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2134 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2136 if ( !SvTRUE( *svpp ) ) {
2137 sv_setiv( *svpp, ++trie->uniquecharcount );
2138 TRIE_STORE_REVCHAR(uvc);
2142 if( cur == first ) {
2143 trie->minlen = minbytes;
2144 trie->maxlen = maxbytes;
2145 } else if (minbytes < trie->minlen) {
2146 trie->minlen = minbytes;
2147 } else if (maxbytes > trie->maxlen) {
2148 trie->maxlen = maxbytes;
2150 } /* end first pass */
2151 DEBUG_TRIE_COMPILE_r(
2152 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2153 (int)depth * 2 + 2,"",
2154 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2155 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2156 (int)trie->minlen, (int)trie->maxlen )
2160 We now know what we are dealing with in terms of unique chars and
2161 string sizes so we can calculate how much memory a naive
2162 representation using a flat table will take. If it's over a reasonable
2163 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2164 conservative but potentially much slower representation using an array
2167 At the end we convert both representations into the same compressed
2168 form that will be used in regexec.c for matching with. The latter
2169 is a form that cannot be used to construct with but has memory
2170 properties similar to the list form and access properties similar
2171 to the table form making it both suitable for fast searches and
2172 small enough that its feasable to store for the duration of a program.
2174 See the comment in the code where the compressed table is produced
2175 inplace from the flat tabe representation for an explanation of how
2176 the compression works.
2181 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2184 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2186 Second Pass -- Array Of Lists Representation
2188 Each state will be represented by a list of charid:state records
2189 (reg_trie_trans_le) the first such element holds the CUR and LEN
2190 points of the allocated array. (See defines above).
2192 We build the initial structure using the lists, and then convert
2193 it into the compressed table form which allows faster lookups
2194 (but cant be modified once converted).
2197 STRLEN transcount = 1;
2199 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2200 "%*sCompiling trie using list compiler\n",
2201 (int)depth * 2 + 2, ""));
2203 trie->states = (reg_trie_state *)
2204 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2205 sizeof(reg_trie_state) );
2209 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2211 regnode *noper = NEXTOPER( cur );
2212 U8 *uc = (U8*)STRING( noper );
2213 const U8 *e = uc + STR_LEN( noper );
2214 U32 state = 1; /* required init */
2215 U16 charid = 0; /* sanity init */
2216 U32 wordlen = 0; /* required init */
2218 if (OP(noper) == NOTHING) {
2219 regnode *noper_next= regnext(noper);
2220 if (noper_next != tail && OP(noper_next) == flags) {
2222 uc= (U8*)STRING(noper);
2223 e= uc + STR_LEN(noper);
2227 if (OP(noper) != NOTHING) {
2228 for ( ; uc < e ; uc += len ) {
2233 charid = trie->charmap[ uvc ];
2235 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2239 charid=(U16)SvIV( *svpp );
2242 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2249 if ( !trie->states[ state ].trans.list ) {
2250 TRIE_LIST_NEW( state );
2252 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2253 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2254 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2259 newstate = next_alloc++;
2260 prev_states[newstate] = state;
2261 TRIE_LIST_PUSH( state, charid, newstate );
2266 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2270 TRIE_HANDLE_WORD(state);
2272 } /* end second pass */
2274 /* next alloc is the NEXT state to be allocated */
2275 trie->statecount = next_alloc;
2276 trie->states = (reg_trie_state *)
2277 PerlMemShared_realloc( trie->states,
2279 * sizeof(reg_trie_state) );
2281 /* and now dump it out before we compress it */
2282 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2283 revcharmap, next_alloc,
2287 trie->trans = (reg_trie_trans *)
2288 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2295 for( state=1 ; state < next_alloc ; state ++ ) {
2299 DEBUG_TRIE_COMPILE_MORE_r(
2300 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2304 if (trie->states[state].trans.list) {
2305 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2309 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2310 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2311 if ( forid < minid ) {
2313 } else if ( forid > maxid ) {
2317 if ( transcount < tp + maxid - minid + 1) {
2319 trie->trans = (reg_trie_trans *)
2320 PerlMemShared_realloc( trie->trans,
2322 * sizeof(reg_trie_trans) );
2323 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2325 base = trie->uniquecharcount + tp - minid;
2326 if ( maxid == minid ) {
2328 for ( ; zp < tp ; zp++ ) {
2329 if ( ! trie->trans[ zp ].next ) {
2330 base = trie->uniquecharcount + zp - minid;
2331 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2332 trie->trans[ zp ].check = state;
2338 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2339 trie->trans[ tp ].check = state;
2344 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2345 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2346 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2347 trie->trans[ tid ].check = state;
2349 tp += ( maxid - minid + 1 );
2351 Safefree(trie->states[ state ].trans.list);
2354 DEBUG_TRIE_COMPILE_MORE_r(
2355 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2358 trie->states[ state ].trans.base=base;
2360 trie->lasttrans = tp + 1;
2364 Second Pass -- Flat Table Representation.
2366 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2367 each. We know that we will need Charcount+1 trans at most to store
2368 the data (one row per char at worst case) So we preallocate both
2369 structures assuming worst case.
2371 We then construct the trie using only the .next slots of the entry
2374 We use the .check field of the first entry of the node temporarily
2375 to make compression both faster and easier by keeping track of how
2376 many non zero fields are in the node.
2378 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2381 There are two terms at use here: state as a TRIE_NODEIDX() which is
2382 a number representing the first entry of the node, and state as a
2383 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2384 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2385 if there are 2 entrys per node. eg:
2393 The table is internally in the right hand, idx form. However as we
2394 also have to deal with the states array which is indexed by nodenum
2395 we have to use TRIE_NODENUM() to convert.
2398 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2399 "%*sCompiling trie using table compiler\n",
2400 (int)depth * 2 + 2, ""));
2402 trie->trans = (reg_trie_trans *)
2403 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2404 * trie->uniquecharcount + 1,
2405 sizeof(reg_trie_trans) );
2406 trie->states = (reg_trie_state *)
2407 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2408 sizeof(reg_trie_state) );
2409 next_alloc = trie->uniquecharcount + 1;
2412 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2414 regnode *noper = NEXTOPER( cur );
2415 const U8 *uc = (U8*)STRING( noper );
2416 const U8 *e = uc + STR_LEN( noper );
2418 U32 state = 1; /* required init */
2420 U16 charid = 0; /* sanity init */
2421 U32 accept_state = 0; /* sanity init */
2423 U32 wordlen = 0; /* required init */
2425 if (OP(noper) == NOTHING) {
2426 regnode *noper_next= regnext(noper);
2427 if (noper_next != tail && OP(noper_next) == flags) {
2429 uc= (U8*)STRING(noper);
2430 e= uc + STR_LEN(noper);
2434 if ( OP(noper) != NOTHING ) {
2435 for ( ; uc < e ; uc += len ) {
2440 charid = trie->charmap[ uvc ];
2442 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2443 charid = svpp ? (U16)SvIV(*svpp) : 0;
2447 if ( !trie->trans[ state + charid ].next ) {
2448 trie->trans[ state + charid ].next = next_alloc;
2449 trie->trans[ state ].check++;
2450 prev_states[TRIE_NODENUM(next_alloc)]
2451 = TRIE_NODENUM(state);
2452 next_alloc += trie->uniquecharcount;
2454 state = trie->trans[ state + charid ].next;
2456 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2458 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2461 accept_state = TRIE_NODENUM( state );
2462 TRIE_HANDLE_WORD(accept_state);
2464 } /* end second pass */
2466 /* and now dump it out before we compress it */
2467 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2469 next_alloc, depth+1));
2473 * Inplace compress the table.*
2475 For sparse data sets the table constructed by the trie algorithm will
2476 be mostly 0/FAIL transitions or to put it another way mostly empty.
2477 (Note that leaf nodes will not contain any transitions.)
2479 This algorithm compresses the tables by eliminating most such
2480 transitions, at the cost of a modest bit of extra work during lookup:
2482 - Each states[] entry contains a .base field which indicates the
2483 index in the state[] array wheres its transition data is stored.
2485 - If .base is 0 there are no valid transitions from that node.
2487 - If .base is nonzero then charid is added to it to find an entry in
2490 -If trans[states[state].base+charid].check!=state then the
2491 transition is taken to be a 0/Fail transition. Thus if there are fail
2492 transitions at the front of the node then the .base offset will point
2493 somewhere inside the previous nodes data (or maybe even into a node
2494 even earlier), but the .check field determines if the transition is
2498 The following process inplace converts the table to the compressed
2499 table: We first do not compress the root node 1,and mark all its
2500 .check pointers as 1 and set its .base pointer as 1 as well. This
2501 allows us to do a DFA construction from the compressed table later,
2502 and ensures that any .base pointers we calculate later are greater
2505 - We set 'pos' to indicate the first entry of the second node.
2507 - We then iterate over the columns of the node, finding the first and
2508 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2509 and set the .check pointers accordingly, and advance pos
2510 appropriately and repreat for the next node. Note that when we copy
2511 the next pointers we have to convert them from the original
2512 NODEIDX form to NODENUM form as the former is not valid post
2515 - If a node has no transitions used we mark its base as 0 and do not
2516 advance the pos pointer.
2518 - If a node only has one transition we use a second pointer into the
2519 structure to fill in allocated fail transitions from other states.
2520 This pointer is independent of the main pointer and scans forward
2521 looking for null transitions that are allocated to a state. When it
2522 finds one it writes the single transition into the "hole". If the
2523 pointer doesnt find one the single transition is appended as normal.
2525 - Once compressed we can Renew/realloc the structures to release the
2528 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2529 specifically Fig 3.47 and the associated pseudocode.
2533 const U32 laststate = TRIE_NODENUM( next_alloc );
2536 trie->statecount = laststate;
2538 for ( state = 1 ; state < laststate ; state++ ) {
2540 const U32 stateidx = TRIE_NODEIDX( state );
2541 const U32 o_used = trie->trans[ stateidx ].check;
2542 U32 used = trie->trans[ stateidx ].check;
2543 trie->trans[ stateidx ].check = 0;
2545 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2546 if ( flag || trie->trans[ stateidx + charid ].next ) {
2547 if ( trie->trans[ stateidx + charid ].next ) {
2549 for ( ; zp < pos ; zp++ ) {
2550 if ( ! trie->trans[ zp ].next ) {
2554 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2555 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2556 trie->trans[ zp ].check = state;
2557 if ( ++zp > pos ) pos = zp;
2564 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2566 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2567 trie->trans[ pos ].check = state;
2572 trie->lasttrans = pos + 1;
2573 trie->states = (reg_trie_state *)
2574 PerlMemShared_realloc( trie->states, laststate
2575 * sizeof(reg_trie_state) );
2576 DEBUG_TRIE_COMPILE_MORE_r(
2577 PerlIO_printf( Perl_debug_log,
2578 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2579 (int)depth * 2 + 2,"",
2580 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2583 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2586 } /* end table compress */
2588 DEBUG_TRIE_COMPILE_MORE_r(
2589 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2590 (int)depth * 2 + 2, "",
2591 (UV)trie->statecount,
2592 (UV)trie->lasttrans)
2594 /* resize the trans array to remove unused space */
2595 trie->trans = (reg_trie_trans *)
2596 PerlMemShared_realloc( trie->trans, trie->lasttrans
2597 * sizeof(reg_trie_trans) );
2599 { /* Modify the program and insert the new TRIE node */
2600 U8 nodetype =(U8)(flags & 0xFF);
2604 regnode *optimize = NULL;
2605 #ifdef RE_TRACK_PATTERN_OFFSETS
2608 U32 mjd_nodelen = 0;
2609 #endif /* RE_TRACK_PATTERN_OFFSETS */
2610 #endif /* DEBUGGING */
2612 This means we convert either the first branch or the first Exact,
2613 depending on whether the thing following (in 'last') is a branch
2614 or not and whther first is the startbranch (ie is it a sub part of
2615 the alternation or is it the whole thing.)
2616 Assuming its a sub part we convert the EXACT otherwise we convert
2617 the whole branch sequence, including the first.
2619 /* Find the node we are going to overwrite */
2620 if ( first != startbranch || OP( last ) == BRANCH ) {
2621 /* branch sub-chain */
2622 NEXT_OFF( first ) = (U16)(last - first);
2623 #ifdef RE_TRACK_PATTERN_OFFSETS
2625 mjd_offset= Node_Offset((convert));
2626 mjd_nodelen= Node_Length((convert));
2629 /* whole branch chain */
2631 #ifdef RE_TRACK_PATTERN_OFFSETS
2634 const regnode *nop = NEXTOPER( convert );
2635 mjd_offset= Node_Offset((nop));
2636 mjd_nodelen= Node_Length((nop));
2640 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2641 (int)depth * 2 + 2, "",
2642 (UV)mjd_offset, (UV)mjd_nodelen)
2645 /* But first we check to see if there is a common prefix we can
2646 split out as an EXACT and put in front of the TRIE node. */
2647 trie->startstate= 1;
2648 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2650 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2654 const U32 base = trie->states[ state ].trans.base;
2656 if ( trie->states[state].wordnum )
2659 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2660 if ( ( base + ofs >= trie->uniquecharcount ) &&
2661 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2662 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2664 if ( ++count > 1 ) {
2665 SV **tmp = av_fetch( revcharmap, ofs, 0);
2666 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2667 if ( state == 1 ) break;
2669 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2671 PerlIO_printf(Perl_debug_log,
2672 "%*sNew Start State=%"UVuf" Class: [",
2673 (int)depth * 2 + 2, "",
2676 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2677 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2679 TRIE_BITMAP_SET(trie,*ch);
2681 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2683 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2687 TRIE_BITMAP_SET(trie,*ch);
2689 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2690 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2696 SV **tmp = av_fetch( revcharmap, idx, 0);
2698 char *ch = SvPV( *tmp, len );
2700 SV *sv=sv_newmortal();
2701 PerlIO_printf( Perl_debug_log,
2702 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2703 (int)depth * 2 + 2, "",
2705 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2706 PL_colors[0], PL_colors[1],
2707 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2708 PERL_PV_ESCAPE_FIRSTCHAR
2713 OP( convert ) = nodetype;
2714 str=STRING(convert);
2717 STR_LEN(convert) += len;
2723 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2728 trie->prefixlen = (state-1);
2730 regnode *n = convert+NODE_SZ_STR(convert);
2731 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2732 trie->startstate = state;
2733 trie->minlen -= (state - 1);
2734 trie->maxlen -= (state - 1);
2736 /* At least the UNICOS C compiler choked on this
2737 * being argument to DEBUG_r(), so let's just have
2740 #ifdef PERL_EXT_RE_BUILD
2746 regnode *fix = convert;
2747 U32 word = trie->wordcount;
2749 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2750 while( ++fix < n ) {
2751 Set_Node_Offset_Length(fix, 0, 0);
2754 SV ** const tmp = av_fetch( trie_words, word, 0 );
2756 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2757 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2759 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2767 NEXT_OFF(convert) = (U16)(tail - convert);
2768 DEBUG_r(optimize= n);
2774 if ( trie->maxlen ) {
2775 NEXT_OFF( convert ) = (U16)(tail - convert);
2776 ARG_SET( convert, data_slot );
2777 /* Store the offset to the first unabsorbed branch in
2778 jump[0], which is otherwise unused by the jump logic.
2779 We use this when dumping a trie and during optimisation. */
2781 trie->jump[0] = (U16)(nextbranch - convert);
2783 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2784 * and there is a bitmap
2785 * and the first "jump target" node we found leaves enough room
2786 * then convert the TRIE node into a TRIEC node, with the bitmap
2787 * embedded inline in the opcode - this is hypothetically faster.
2789 if ( !trie->states[trie->startstate].wordnum
2791 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2793 OP( convert ) = TRIEC;
2794 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2795 PerlMemShared_free(trie->bitmap);
2798 OP( convert ) = TRIE;
2800 /* store the type in the flags */
2801 convert->flags = nodetype;
2805 + regarglen[ OP( convert ) ];
2807 /* XXX We really should free up the resource in trie now,
2808 as we won't use them - (which resources?) dmq */
2810 /* needed for dumping*/
2811 DEBUG_r(if (optimize) {
2812 regnode *opt = convert;
2814 while ( ++opt < optimize) {
2815 Set_Node_Offset_Length(opt,0,0);
2818 Try to clean up some of the debris left after the
2821 while( optimize < jumper ) {
2822 mjd_nodelen += Node_Length((optimize));
2823 OP( optimize ) = OPTIMIZED;
2824 Set_Node_Offset_Length(optimize,0,0);
2827 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2829 } /* end node insert */
2831 /* Finish populating the prev field of the wordinfo array. Walk back
2832 * from each accept state until we find another accept state, and if
2833 * so, point the first word's .prev field at the second word. If the
2834 * second already has a .prev field set, stop now. This will be the
2835 * case either if we've already processed that word's accept state,
2836 * or that state had multiple words, and the overspill words were
2837 * already linked up earlier.
2844 for (word=1; word <= trie->wordcount; word++) {
2846 if (trie->wordinfo[word].prev)
2848 state = trie->wordinfo[word].accept;
2850 state = prev_states[state];
2853 prev = trie->states[state].wordnum;
2857 trie->wordinfo[word].prev = prev;
2859 Safefree(prev_states);
2863 /* and now dump out the compressed format */
2864 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2866 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2868 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2869 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2871 SvREFCNT_dec_NN(revcharmap);
2875 : trie->startstate>1
2881 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2883 /* The Trie is constructed and compressed now so we can build a fail array if
2886 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2888 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2892 We find the fail state for each state in the trie, this state is the longest
2893 proper suffix of the current state's 'word' that is also a proper prefix of
2894 another word in our trie. State 1 represents the word '' and is thus the
2895 default fail state. This allows the DFA not to have to restart after its
2896 tried and failed a word at a given point, it simply continues as though it
2897 had been matching the other word in the first place.
2899 'abcdgu'=~/abcdefg|cdgu/
2900 When we get to 'd' we are still matching the first word, we would encounter
2901 'g' which would fail, which would bring us to the state representing 'd' in
2902 the second word where we would try 'g' and succeed, proceeding to match
2905 /* add a fail transition */
2906 const U32 trie_offset = ARG(source);
2907 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2909 const U32 ucharcount = trie->uniquecharcount;
2910 const U32 numstates = trie->statecount;
2911 const U32 ubound = trie->lasttrans + ucharcount;
2915 U32 base = trie->states[ 1 ].trans.base;
2918 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2919 GET_RE_DEBUG_FLAGS_DECL;
2921 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2923 PERL_UNUSED_ARG(depth);
2927 ARG_SET( stclass, data_slot );
2928 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2929 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2930 aho->trie=trie_offset;
2931 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2932 Copy( trie->states, aho->states, numstates, reg_trie_state );
2933 Newxz( q, numstates, U32);
2934 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2937 /* initialize fail[0..1] to be 1 so that we always have
2938 a valid final fail state */
2939 fail[ 0 ] = fail[ 1 ] = 1;
2941 for ( charid = 0; charid < ucharcount ; charid++ ) {
2942 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2944 q[ q_write ] = newstate;
2945 /* set to point at the root */
2946 fail[ q[ q_write++ ] ]=1;
2949 while ( q_read < q_write) {
2950 const U32 cur = q[ q_read++ % numstates ];
2951 base = trie->states[ cur ].trans.base;
2953 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2954 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2956 U32 fail_state = cur;
2959 fail_state = fail[ fail_state ];
2960 fail_base = aho->states[ fail_state ].trans.base;
2961 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2963 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2964 fail[ ch_state ] = fail_state;
2965 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2967 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2969 q[ q_write++ % numstates] = ch_state;
2973 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2974 when we fail in state 1, this allows us to use the
2975 charclass scan to find a valid start char. This is based on the principle
2976 that theres a good chance the string being searched contains lots of stuff
2977 that cant be a start char.
2979 fail[ 0 ] = fail[ 1 ] = 0;
2980 DEBUG_TRIE_COMPILE_r({
2981 PerlIO_printf(Perl_debug_log,
2982 "%*sStclass Failtable (%"UVuf" states): 0",
2983 (int)(depth * 2), "", (UV)numstates
2985 for( q_read=1; q_read<numstates; q_read++ ) {
2986 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2988 PerlIO_printf(Perl_debug_log, "\n");
2991 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2995 #define DEBUG_PEEP(str,scan,depth) \
2996 DEBUG_OPTIMISE_r({if (scan){ \
2997 SV * const mysv=sv_newmortal(); \
2998 regnode *Next = regnext(scan); \
2999 regprop(RExC_rx, mysv, scan); \
3000 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3001 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3002 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3006 /* The below joins as many adjacent EXACTish nodes as possible into a single
3007 * one. The regop may be changed if the node(s) contain certain sequences that
3008 * require special handling. The joining is only done if:
3009 * 1) there is room in the current conglomerated node to entirely contain the
3011 * 2) they are the exact same node type
3013 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3014 * these get optimized out
3016 * If a node is to match under /i (folded), the number of characters it matches
3017 * can be different than its character length if it contains a multi-character
3018 * fold. *min_subtract is set to the total delta of the input nodes.
3020 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
3021 * and contains LATIN SMALL LETTER SHARP S
3023 * This is as good a place as any to discuss the design of handling these
3024 * multi-character fold sequences. It's been wrong in Perl for a very long
3025 * time. There are three code points in Unicode whose multi-character folds
3026 * were long ago discovered to mess things up. The previous designs for
3027 * dealing with these involved assigning a special node for them. This
3028 * approach doesn't work, as evidenced by this example:
3029 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3030 * Both these fold to "sss", but if the pattern is parsed to create a node that
3031 * would match just the \xDF, it won't be able to handle the case where a
3032 * successful match would have to cross the node's boundary. The new approach
3033 * that hopefully generally solves the problem generates an EXACTFU_SS node
3036 * It turns out that there are problems with all multi-character folds, and not
3037 * just these three. Now the code is general, for all such cases. The
3038 * approach taken is:
3039 * 1) This routine examines each EXACTFish node that could contain multi-
3040 * character fold sequences. It returns in *min_subtract how much to
3041 * subtract from the the actual length of the string to get a real minimum
3042 * match length; it is 0 if there are no multi-char folds. This delta is
3043 * used by the caller to adjust the min length of the match, and the delta
3044 * between min and max, so that the optimizer doesn't reject these
3045 * possibilities based on size constraints.
3046 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3047 * is used for an EXACTFU node that contains at least one "ss" sequence in
3048 * it. For non-UTF-8 patterns and strings, this is the only case where
3049 * there is a possible fold length change. That means that a regular
3050 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3051 * with length changes, and so can be processed faster. regexec.c takes
3052 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3053 * pre-folded by regcomp.c. This saves effort in regex matching.
3054 * However, the pre-folding isn't done for non-UTF8 patterns because the
3055 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3056 * down by forcing the pattern into UTF8 unless necessary. Also what
3057 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
3058 * possibilities for the non-UTF8 patterns are quite simple, except for
3059 * the sharp s. All the ones that don't involve a UTF-8 target string are
3060 * members of a fold-pair, and arrays are set up for all of them so that
3061 * the other member of the pair can be found quickly. Code elsewhere in
3062 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3063 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3064 * described in the next item.
3065 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
3066 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3067 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
3068 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
3069 * character in the pattern corresponds to at most a single character in
3070 * the target string. (And I do mean character, and not byte here, unlike
3071 * other parts of the documentation that have never been updated to
3072 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
3073 * two character string 'ss'; in EXACTFA nodes it can match
3074 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
3075 * instances where it is violated. I'm reluctant to try to change the
3076 * assumption, as the code involved is impenetrable to me (khw), so
3077 * instead the code here punts. This routine examines (when the pattern
3078 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3079 * boolean indicating whether or not the node contains a sharp s. When it
3080 * is true, the caller sets a flag that later causes the optimizer in this
3081 * file to not set values for the floating and fixed string lengths, and
3082 * thus avoids the optimizer code in regexec.c that makes the invalid
3083 * assumption. Thus, there is no optimization based on string lengths for
3084 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3085 * (The reason the assumption is wrong only in these two cases is that all
3086 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3087 * other folds to their expanded versions. We can't prefold sharp s to
3088 * 'ss' in EXACTF nodes because we don't know at compile time if it
3089 * actually matches 'ss' or not. It will match iff the target string is
3090 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3091 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3092 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3093 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3094 * require the pattern to be forced into UTF-8, the overhead of which we
3097 * Similarly, the code that generates tries doesn't currently handle
3098 * not-already-folded multi-char folds, and it looks like a pain to change
3099 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3100 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3101 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3102 * using /iaa matching will be doing so almost entirely with ASCII
3103 * strings, so this should rarely be encountered in practice */
3105 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3106 if (PL_regkind[OP(scan)] == EXACT) \
3107 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3110 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3111 /* Merge several consecutive EXACTish nodes into one. */
3112 regnode *n = regnext(scan);
3114 regnode *next = scan + NODE_SZ_STR(scan);
3118 regnode *stop = scan;
3119 GET_RE_DEBUG_FLAGS_DECL;
3121 PERL_UNUSED_ARG(depth);
3124 PERL_ARGS_ASSERT_JOIN_EXACT;
3125 #ifndef EXPERIMENTAL_INPLACESCAN
3126 PERL_UNUSED_ARG(flags);
3127 PERL_UNUSED_ARG(val);
3129 DEBUG_PEEP("join",scan,depth);
3131 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3132 * EXACT ones that are mergeable to the current one. */
3134 && (PL_regkind[OP(n)] == NOTHING
3135 || (stringok && OP(n) == OP(scan)))
3137 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3140 if (OP(n) == TAIL || n > next)
3142 if (PL_regkind[OP(n)] == NOTHING) {
3143 DEBUG_PEEP("skip:",n,depth);
3144 NEXT_OFF(scan) += NEXT_OFF(n);
3145 next = n + NODE_STEP_REGNODE;
3152 else if (stringok) {
3153 const unsigned int oldl = STR_LEN(scan);
3154 regnode * const nnext = regnext(n);
3156 /* XXX I (khw) kind of doubt that this works on platforms where
3157 * U8_MAX is above 255 because of lots of other assumptions */
3158 /* Don't join if the sum can't fit into a single node */
3159 if (oldl + STR_LEN(n) > U8_MAX)
3162 DEBUG_PEEP("merg",n,depth);
3165 NEXT_OFF(scan) += NEXT_OFF(n);
3166 STR_LEN(scan) += STR_LEN(n);
3167 next = n + NODE_SZ_STR(n);
3168 /* Now we can overwrite *n : */
3169 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3177 #ifdef EXPERIMENTAL_INPLACESCAN
3178 if (flags && !NEXT_OFF(n)) {
3179 DEBUG_PEEP("atch", val, depth);
3180 if (reg_off_by_arg[OP(n)]) {
3181 ARG_SET(n, val - n);
3184 NEXT_OFF(n) = val - n;
3192 *has_exactf_sharp_s = FALSE;
3194 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3195 * can now analyze for sequences of problematic code points. (Prior to
3196 * this final joining, sequences could have been split over boundaries, and
3197 * hence missed). The sequences only happen in folding, hence for any
3198 * non-EXACT EXACTish node */
3199 if (OP(scan) != EXACT) {
3200 const U8 * const s0 = (U8*) STRING(scan);
3202 const U8 * const s_end = s0 + STR_LEN(scan);
3204 /* One pass is made over the node's string looking for all the
3205 * possibilities. to avoid some tests in the loop, there are two main
3206 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3210 /* Examine the string for a multi-character fold sequence. UTF-8
3211 * patterns have all characters pre-folded by the time this code is
3213 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3214 length sequence we are looking for is 2 */
3217 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3218 if (! len) { /* Not a multi-char fold: get next char */
3223 /* Nodes with 'ss' require special handling, except for EXACTFL
3224 * and EXACTFA-ish for which there is no multi-char fold to
3226 if (len == 2 && *s == 's' && *(s+1) == 's'
3227 && OP(scan) != EXACTFL
3228 && OP(scan) != EXACTFA
3229 && OP(scan) != EXACTFA_NO_TRIE)
3232 OP(scan) = EXACTFU_SS;
3235 else { /* Here is a generic multi-char fold. */
3236 const U8* multi_end = s + len;
3238 /* Count how many characters in it. In the case of /l and
3239 * /aa, no folds which contain ASCII code points are
3240 * allowed, so check for those, and skip if found. (In
3241 * EXACTFL, no folds are allowed to any Latin1 code point,
3242 * not just ASCII. But there aren't any of these
3243 * currently, nor ever likely, so don't take the time to
3244 * test for them. The code that generates the
3245 * is_MULTI_foo() macros croaks should one actually get put
3246 * into Unicode .) */
3247 if (OP(scan) != EXACTFL
3248 && OP(scan) != EXACTFA
3249 && OP(scan) != EXACTFA_NO_TRIE)
3251 count = utf8_length(s, multi_end);
3255 while (s < multi_end) {
3258 goto next_iteration;
3268 /* The delta is how long the sequence is minus 1 (1 is how long
3269 * the character that folds to the sequence is) */
3270 *min_subtract += count - 1;
3274 else if (OP(scan) == EXACTFA) {
3276 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3277 * fold to the ASCII range (and there are no existing ones in the
3278 * upper latin1 range). But, as outlined in the comments preceding
3279 * this function, we need to flag any occurrences of the sharp s.
3280 * This character forbids trie formation (because of added
3283 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3284 OP(scan) = EXACTFA_NO_TRIE;
3285 *has_exactf_sharp_s = TRUE;
3292 else if (OP(scan) != EXACTFL) {
3294 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3295 * multi-char folds that are all Latin1. (This code knows that
3296 * there are no current multi-char folds possible with EXACTFL,
3297 * relying on fold_grind.t to catch any errors if the very unlikely
3298 * event happens that some get added in future Unicode versions.)
3299 * As explained in the comments preceding this function, we look
3300 * also for the sharp s in EXACTF nodes; it can be in the final
3301 * position. Otherwise we can stop looking 1 byte earlier because
3302 * have to find at least two characters for a multi-fold */
3303 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3306 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3307 if (! len) { /* Not a multi-char fold. */
3308 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3310 *has_exactf_sharp_s = TRUE;
3317 && isARG2_lower_or_UPPER_ARG1('s', *s)
3318 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3321 /* EXACTF nodes need to know that the minimum length
3322 * changed so that a sharp s in the string can match this
3323 * ss in the pattern, but they remain EXACTF nodes, as they
3324 * won't match this unless the target string is is UTF-8,
3325 * which we don't know until runtime */
3326 if (OP(scan) != EXACTF) {
3327 OP(scan) = EXACTFU_SS;
3331 *min_subtract += len - 1;
3338 /* Allow dumping but overwriting the collection of skipped
3339 * ops and/or strings with fake optimized ops */
3340 n = scan + NODE_SZ_STR(scan);
3348 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3352 /* REx optimizer. Converts nodes into quicker variants "in place".
3353 Finds fixed substrings. */
3355 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3356 to the position after last scanned or to NULL. */
3358 #define INIT_AND_WITHP \
3359 assert(!and_withp); \
3360 Newx(and_withp,1, regnode_ssc); \
3361 SAVEFREEPV(and_withp)
3363 /* this is a chain of data about sub patterns we are processing that
3364 need to be handled separately/specially in study_chunk. Its so
3365 we can simulate recursion without losing state. */
3367 typedef struct scan_frame {
3368 regnode *last; /* last node to process in this frame */
3369 regnode *next; /* next node to process when last is reached */
3370 struct scan_frame *prev; /*previous frame*/
3371 U32 prev_recursed_depth;
3372 I32 stop; /* what stopparen do we use */
3376 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3379 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3380 SSize_t *minlenp, SSize_t *deltap,
3385 regnode_ssc *and_withp,
3386 U32 flags, U32 depth)
3387 /* scanp: Start here (read-write). */
3388 /* deltap: Write maxlen-minlen here. */
3389 /* last: Stop before this one. */
3390 /* data: string data about the pattern */
3391 /* stopparen: treat close N as END */
3392 /* recursed: which subroutines have we recursed into */
3393 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3396 /* There must be at least this number of characters to match */
3399 regnode *scan = *scanp, *next;
3401 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3402 int is_inf_internal = 0; /* The studied chunk is infinite */
3403 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3404 scan_data_t data_fake;
3405 SV *re_trie_maxbuff = NULL;
3406 regnode *first_non_open = scan;
3407 SSize_t stopmin = SSize_t_MAX;
3408 scan_frame *frame = NULL;
3409 GET_RE_DEBUG_FLAGS_DECL;
3411 PERL_ARGS_ASSERT_STUDY_CHUNK;
3414 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3417 while (first_non_open && OP(first_non_open) == OPEN)
3418 first_non_open=regnext(first_non_open);
3423 while ( scan && OP(scan) != END && scan < last ){
3424 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3425 node length to get a real minimum (because
3426 the folded version may be shorter) */
3427 bool has_exactf_sharp_s = FALSE;
3428 /* Peephole optimizer: */
3429 DEBUG_OPTIMISE_MORE_r(
3431 PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3432 ((int) depth*2), "", (long)stopparen,
3433 (unsigned long)depth, (unsigned long)recursed_depth);
3434 if (recursed_depth) {
3437 for ( j = 0 ; j < recursed_depth ; j++ ) {
3438 PerlIO_printf(Perl_debug_log,"[");
3439 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3440 PerlIO_printf(Perl_debug_log,"%d",
3441 PAREN_TEST(RExC_study_chunk_recursed +
3442 (j * RExC_study_chunk_recursed_bytes), i)
3445 PerlIO_printf(Perl_debug_log,"]");
3448 PerlIO_printf(Perl_debug_log,"\n");
3451 DEBUG_STUDYDATA("Peep:", data, depth);
3452 DEBUG_PEEP("Peep", scan, depth);
3455 /* Its not clear to khw or hv why this is done here, and not in the
3456 * clauses that deal with EXACT nodes. khw's guess is that it's
3457 * because of a previous design */
3458 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3460 /* Follow the next-chain of the current node and optimize
3461 away all the NOTHINGs from it. */
3462 if (OP(scan) != CURLYX) {
3463 const int max = (reg_off_by_arg[OP(scan)]
3465 /* I32 may be smaller than U16 on CRAYs! */
3466 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3467 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3471 /* Skip NOTHING and LONGJMP. */
3472 while ((n = regnext(n))
3473 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3474 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3475 && off + noff < max)
3477 if (reg_off_by_arg[OP(scan)])
3480 NEXT_OFF(scan) = off;
3485 /* The principal pseudo-switch. Cannot be a switch, since we
3486 look into several different things. */
3487 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3488 || OP(scan) == IFTHEN) {
3489 next = regnext(scan);
3491 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3493 if (OP(next) == code || code == IFTHEN) {
3494 /* NOTE - There is similar code to this block below for
3495 * handling TRIE nodes on a re-study. If you change stuff here
3496 * check there too. */
3497 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3499 regnode * const startbranch=scan;
3501 if (flags & SCF_DO_SUBSTR)
3502 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3503 if (flags & SCF_DO_STCLASS)
3504 ssc_init_zero(pRExC_state, &accum);
3506 while (OP(scan) == code) {
3507 SSize_t deltanext, minnext, fake;
3509 regnode_ssc this_class;
3512 data_fake.flags = 0;
3514 data_fake.whilem_c = data->whilem_c;
3515 data_fake.last_closep = data->last_closep;
3518 data_fake.last_closep = &fake;
3520 data_fake.pos_delta = delta;
3521 next = regnext(scan);
3522 scan = NEXTOPER(scan);
3524 scan = NEXTOPER(scan);
3525 if (flags & SCF_DO_STCLASS) {
3526 ssc_init(pRExC_state, &this_class);
3527 data_fake.start_class = &this_class;
3528 f = SCF_DO_STCLASS_AND;
3530 if (flags & SCF_WHILEM_VISITED_POS)
3531 f |= SCF_WHILEM_VISITED_POS;
3533 /* we suppose the run is continuous, last=next...*/
3534 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3536 stopparen, recursed_depth, NULL, f,depth+1);
3539 if (deltanext == SSize_t_MAX) {
3540 is_inf = is_inf_internal = 1;
3542 } else if (max1 < minnext + deltanext)
3543 max1 = minnext + deltanext;
3545 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3547 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3548 if ( stopmin > minnext)
3549 stopmin = min + min1;
3550 flags &= ~SCF_DO_SUBSTR;
3552 data->flags |= SCF_SEEN_ACCEPT;
3555 if (data_fake.flags & SF_HAS_EVAL)
3556 data->flags |= SF_HAS_EVAL;
3557 data->whilem_c = data_fake.whilem_c;
3559 if (flags & SCF_DO_STCLASS)
3560 ssc_or(pRExC_state, &accum, &this_class);
3562 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3564 if (flags & SCF_DO_SUBSTR) {
3565 data->pos_min += min1;
3566 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3567 data->pos_delta = SSize_t_MAX;
3569 data->pos_delta += max1 - min1;
3570 if (max1 != min1 || is_inf)
3571 data->longest = &(data->longest_float);
3574 if (delta == SSize_t_MAX
3575 || SSize_t_MAX - delta - (max1 - min1) < 0)
3576 delta = SSize_t_MAX;
3578 delta += max1 - min1;
3579 if (flags & SCF_DO_STCLASS_OR) {
3580 ssc_or(pRExC_state, data->start_class, &accum);
3582 ssc_and(pRExC_state, data->start_class, and_withp);
3583 flags &= ~SCF_DO_STCLASS;
3586 else if (flags & SCF_DO_STCLASS_AND) {
3588 ssc_and(pRExC_state, data->start_class, &accum);
3589 flags &= ~SCF_DO_STCLASS;
3592 /* Switch to OR mode: cache the old value of
3593 * data->start_class */
3595 StructCopy(data->start_class, and_withp, regnode_ssc);
3596 flags &= ~SCF_DO_STCLASS_AND;
3597 StructCopy(&accum, data->start_class, regnode_ssc);
3598 flags |= SCF_DO_STCLASS_OR;
3602 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3605 Assuming this was/is a branch we are dealing with: 'scan'
3606 now points at the item that follows the branch sequence,
3607 whatever it is. We now start at the beginning of the
3608 sequence and look for subsequences of
3614 which would be constructed from a pattern like
3617 If we can find such a subsequence we need to turn the first
3618 element into a trie and then add the subsequent branch exact
3619 strings to the trie.
3623 1. patterns where the whole set of branches can be
3626 2. patterns where only a subset can be converted.
3628 In case 1 we can replace the whole set with a single regop
3629 for the trie. In case 2 we need to keep the start and end
3632 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3633 becomes BRANCH TRIE; BRANCH X;
3635 There is an additional case, that being where there is a
3636 common prefix, which gets split out into an EXACT like node
3637 preceding the TRIE node.
3639 If x(1..n)==tail then we can do a simple trie, if not we make
3640 a "jump" trie, such that when we match the appropriate word
3641 we "jump" to the appropriate tail node. Essentially we turn
3642 a nested if into a case structure of sorts.
3647 if (!re_trie_maxbuff) {
3648 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3649 if (!SvIOK(re_trie_maxbuff))
3650 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3652 if ( SvIV(re_trie_maxbuff)>=0 ) {
3654 regnode *first = (regnode *)NULL;
3655 regnode *last = (regnode *)NULL;
3656 regnode *tail = scan;
3661 SV * const mysv = sv_newmortal(); /* for dumping */
3663 /* var tail is used because there may be a TAIL
3664 regop in the way. Ie, the exacts will point to the
3665 thing following the TAIL, but the last branch will
3666 point at the TAIL. So we advance tail. If we
3667 have nested (?:) we may have to move through several
3671 while ( OP( tail ) == TAIL ) {
3672 /* this is the TAIL generated by (?:) */
3673 tail = regnext( tail );
3677 DEBUG_TRIE_COMPILE_r({
3678 regprop(RExC_rx, mysv, tail );
3679 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3680 (int)depth * 2 + 2, "",
3681 "Looking for TRIE'able sequences. Tail node is: ",
3682 SvPV_nolen_const( mysv )
3688 Step through the branches
3689 cur represents each branch,
3690 noper is the first thing to be matched as part
3692 noper_next is the regnext() of that node.
3694 We normally handle a case like this
3695 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3696 support building with NOJUMPTRIE, which restricts
3697 the trie logic to structures like /FOO|BAR/.
3699 If noper is a trieable nodetype then the branch is
3700 a possible optimization target. If we are building
3701 under NOJUMPTRIE then we require that noper_next is
3702 the same as scan (our current position in the regex
3705 Once we have two or more consecutive such branches
3706 we can create a trie of the EXACT's contents and
3707 stitch it in place into the program.
3709 If the sequence represents all of the branches in
3710 the alternation we replace the entire thing with a
3713 Otherwise when it is a subsequence we need to
3714 stitch it in place and replace only the relevant
3715 branches. This means the first branch has to remain
3716 as it is used by the alternation logic, and its
3717 next pointer, and needs to be repointed at the item
3718 on the branch chain following the last branch we
3719 have optimized away.
3721 This could be either a BRANCH, in which case the
3722 subsequence is internal, or it could be the item
3723 following the branch sequence in which case the
3724 subsequence is at the end (which does not
3725 necessarily mean the first node is the start of the
3728 TRIE_TYPE(X) is a define which maps the optype to a
3732 ----------------+-----------
3736 EXACTFU_SS | EXACTFU
3741 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3742 ( EXACT == (X) ) ? EXACT : \
3743 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3744 ( EXACTFA == (X) ) ? EXACTFA : \
3747 /* dont use tail as the end marker for this traverse */
3748 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3749 regnode * const noper = NEXTOPER( cur );
3750 U8 noper_type = OP( noper );
3751 U8 noper_trietype = TRIE_TYPE( noper_type );
3752 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3753 regnode * const noper_next = regnext( noper );
3754 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3755 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3758 DEBUG_TRIE_COMPILE_r({
3759 regprop(RExC_rx, mysv, cur);
3760 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3761 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3763 regprop(RExC_rx, mysv, noper);
3764 PerlIO_printf( Perl_debug_log, " -> %s",
3765 SvPV_nolen_const(mysv));
3768 regprop(RExC_rx, mysv, noper_next );
3769 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3770 SvPV_nolen_const(mysv));
3772 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3773 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3774 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3778 /* Is noper a trieable nodetype that can be merged
3779 * with the current trie (if there is one)? */
3783 ( noper_trietype == NOTHING)
3784 || ( trietype == NOTHING )
3785 || ( trietype == noper_trietype )
3788 && noper_next == tail
3792 /* Handle mergable triable node Either we are
3793 * the first node in a new trieable sequence,
3794 * in which case we do some bookkeeping,
3795 * otherwise we update the end pointer. */
3798 if ( noper_trietype == NOTHING ) {
3799 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3800 regnode * const noper_next = regnext( noper );
3801 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3802 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3805 if ( noper_next_trietype ) {
3806 trietype = noper_next_trietype;
3807 } else if (noper_next_type) {
3808 /* a NOTHING regop is 1 regop wide.
3809 * We need at least two for a trie
3810 * so we can't merge this in */
3814 trietype = noper_trietype;
3817 if ( trietype == NOTHING )
3818 trietype = noper_trietype;
3823 } /* end handle mergable triable node */
3825 /* handle unmergable node -
3826 * noper may either be a triable node which can
3827 * not be tried together with the current trie,
3828 * or a non triable node */
3830 /* If last is set and trietype is not
3831 * NOTHING then we have found at least two
3832 * triable branch sequences in a row of a
3833 * similar trietype so we can turn them
3834 * into a trie. If/when we allow NOTHING to
3835 * start a trie sequence this condition
3836 * will be required, and it isn't expensive
3837 * so we leave it in for now. */
3838 if ( trietype && trietype != NOTHING )
3839 make_trie( pRExC_state,
3840 startbranch, first, cur, tail, count,
3841 trietype, depth+1 );
3842 last = NULL; /* note: we clear/update
3843 first, trietype etc below,
3844 so we dont do it here */
3848 && noper_next == tail
3851 /* noper is triable, so we can start a new
3855 trietype = noper_trietype;
3857 /* if we already saw a first but the
3858 * current node is not triable then we have
3859 * to reset the first information. */
3864 } /* end handle unmergable node */
3865 } /* loop over branches */
3866 DEBUG_TRIE_COMPILE_r({
3867 regprop(RExC_rx, mysv, cur);
3868 PerlIO_printf( Perl_debug_log,
3869 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3870 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3873 if ( last && trietype ) {
3874 if ( trietype != NOTHING ) {
3875 /* the last branch of the sequence was part of
3876 * a trie, so we have to construct it here
3877 * outside of the loop */
3878 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3879 #ifdef TRIE_STUDY_OPT
3880 if ( ((made == MADE_EXACT_TRIE &&
3881 startbranch == first)
3882 || ( first_non_open == first )) &&
3884 flags |= SCF_TRIE_RESTUDY;
3885 if ( startbranch == first
3888 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3893 /* at this point we know whatever we have is a
3894 * NOTHING sequence/branch AND if 'startbranch'
3895 * is 'first' then we can turn the whole thing
3898 if ( startbranch == first ) {
3900 /* the entire thing is a NOTHING sequence,
3901 * something like this: (?:|) So we can
3902 * turn it into a plain NOTHING op. */
3903 DEBUG_TRIE_COMPILE_r({
3904 regprop(RExC_rx, mysv, cur);
3905 PerlIO_printf( Perl_debug_log,
3906 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3907 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3910 OP(startbranch)= NOTHING;
3911 NEXT_OFF(startbranch)= tail - startbranch;
3912 for ( opt= startbranch + 1; opt < tail ; opt++ )
3916 } /* end if ( last) */
3917 } /* TRIE_MAXBUF is non zero */
3922 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3923 scan = NEXTOPER(NEXTOPER(scan));
3924 } else /* single branch is optimized. */
3925 scan = NEXTOPER(scan);
3927 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3928 scan_frame *newframe = NULL;
3932 U32 my_recursed_depth= recursed_depth;
3934 if (OP(scan) != SUSPEND) {
3935 /* set the pointer */
3936 if (OP(scan) == GOSUB) {
3938 RExC_recurse[ARG2L(scan)] = scan;
3939 start = RExC_open_parens[paren-1];
3940 end = RExC_close_parens[paren-1];
3943 start = RExC_rxi->program + 1;
3948 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
3950 if (!recursed_depth) {
3951 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
3953 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
3954 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
3955 RExC_study_chunk_recursed_bytes, U8);
3957 /* we havent recursed into this paren yet, so recurse into it */
3958 DEBUG_STUDYDATA("set:", data,depth);
3959 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
3960 my_recursed_depth= recursed_depth + 1;
3961 Newx(newframe,1,scan_frame);
3963 DEBUG_STUDYDATA("inf:", data,depth);
3964 /* some form of infinite recursion, assume infinite length */
3965 if (flags & SCF_DO_SUBSTR) {
3966 SCAN_COMMIT(pRExC_state,data,minlenp);
3967 data->longest = &(data->longest_float);
3969 is_inf = is_inf_internal = 1;
3970 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3971 ssc_anything(data->start_class);
3972 flags &= ~SCF_DO_STCLASS;
3975 Newx(newframe,1,scan_frame);
3978 end = regnext(scan);
3983 SAVEFREEPV(newframe);
3984 newframe->next = regnext(scan);
3985 newframe->last = last;
3986 newframe->stop = stopparen;
3987 newframe->prev = frame;
3988 newframe->prev_recursed_depth = recursed_depth;
3990 DEBUG_STUDYDATA("frame-new:",data,depth);
3991 DEBUG_PEEP("fnew", scan, depth);
3998 recursed_depth= my_recursed_depth;
4003 else if (OP(scan) == EXACT) {
4004 SSize_t l = STR_LEN(scan);
4007 const U8 * const s = (U8*)STRING(scan);
4008 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4009 l = utf8_length(s, s + l);
4011 uc = *((U8*)STRING(scan));
4014 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4015 /* The code below prefers earlier match for fixed
4016 offset, later match for variable offset. */
4017 if (data->last_end == -1) { /* Update the start info. */
4018 data->last_start_min = data->pos_min;
4019 data->last_start_max = is_inf
4020 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4022 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4024 SvUTF8_on(data->last_found);
4026 SV * const sv = data->last_found;
4027 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4028 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4029 if (mg && mg->mg_len >= 0)
4030 mg->mg_len += utf8_length((U8*)STRING(scan),
4031 (U8*)STRING(scan)+STR_LEN(scan));
4033 data->last_end = data->pos_min + l;
4034 data->pos_min += l; /* As in the first entry. */
4035 data->flags &= ~SF_BEFORE_EOL;
4038 /* ANDing the code point leaves at most it, and not in locale, and
4039 * can't match null string */
4040 if (flags & SCF_DO_STCLASS_AND) {
4041 ssc_cp_and(data->start_class, uc);
4042 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4043 ssc_clear_locale(data->start_class);
4045 else if (flags & SCF_DO_STCLASS_OR) {
4046 ssc_add_cp(data->start_class, uc);
4047 ssc_and(pRExC_state, data->start_class, and_withp);
4049 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4050 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4052 flags &= ~SCF_DO_STCLASS;
4054 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4055 SSize_t l = STR_LEN(scan);
4056 UV uc = *((U8*)STRING(scan));
4057 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4058 separate code points */
4060 /* Search for fixed substrings supports EXACT only. */
4061 if (flags & SCF_DO_SUBSTR) {
4063 SCAN_COMMIT(pRExC_state, data, minlenp);
4066 const U8 * const s = (U8 *)STRING(scan);
4067 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4068 l = utf8_length(s, s + l);
4070 if (has_exactf_sharp_s) {
4071 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
4073 min += l - min_subtract;
4075 delta += min_subtract;
4076 if (flags & SCF_DO_SUBSTR) {
4077 data->pos_min += l - min_subtract;
4078 if (data->pos_min < 0) {
4081 data->pos_delta += min_subtract;
4083 data->longest = &(data->longest_float);
4086 if (OP(scan) == EXACTFL) {
4087 if (flags & SCF_DO_STCLASS_AND) {
4088 ssc_flags_and(data->start_class,
4089 ANYOF_LOCALE|ANYOF_LOC_FOLD);
4091 else if (flags & SCF_DO_STCLASS_OR) {
4092 ANYOF_FLAGS(data->start_class)
4093 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
4096 /* We don't know what the folds are; it could be anything. XXX
4097 * Actually, we only support UTF-8 encoding for code points
4098 * above Latin1, so we could know what those folds are. */
4099 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4103 else { /* Non-locale EXACTFish */
4104 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4105 if (flags & SCF_DO_STCLASS_AND) {
4106 ssc_clear_locale(data->start_class);
4108 if (uc < 256) { /* We know what the Latin1 folds are ... */
4109 if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
4110 know if anything folds
4112 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4113 PL_fold_latin1[uc]);
4114 if (OP(scan) != EXACTFA) { /* The folds below aren't
4116 if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4118 = add_cp_to_invlist(EXACTF_invlist,
4119 LATIN_SMALL_LETTER_SHARP_S);
4121 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4123 = add_cp_to_invlist(EXACTF_invlist, 's');
4125 = add_cp_to_invlist(EXACTF_invlist, 'S');
4129 /* We also know if there are above-Latin1 code points
4130 * that fold to this (none legal for ASCII and /iaa) */
4131 if ((! isASCII(uc) || OP(scan) != EXACTFA)
4132 && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4134 /* XXX We could know exactly what does fold to this
4135 * if the reverse folds are loaded, as currently in
4137 _invlist_union(EXACTF_invlist,
4143 else { /* Non-locale, above Latin1. XXX We don't currently
4144 know what participates in folds with this, so have
4145 to assume anything could */
4147 /* XXX We could know exactly what does fold to this if the
4148 * reverse folds are loaded, as currently in S_regclass().
4149 * But we do know that under /iaa nothing in the ASCII
4150 * range can participate */
4151 if (OP(scan) == EXACTFA) {
4152 _invlist_union_complement_2nd(EXACTF_invlist,
4153 PL_Posix_ptrs[_CC_ASCII],
4157 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4162 if (flags & SCF_DO_STCLASS_AND) {
4163 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4164 ANYOF_POSIXL_ZERO(data->start_class);
4165 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4167 else if (flags & SCF_DO_STCLASS_OR) {
4168 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4169 ssc_and(pRExC_state, data->start_class, and_withp);
4171 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4172 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4174 flags &= ~SCF_DO_STCLASS;
4175 SvREFCNT_dec(EXACTF_invlist);
4177 else if (REGNODE_VARIES(OP(scan))) {
4178 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4179 I32 fl = 0, f = flags;
4180 regnode * const oscan = scan;
4181 regnode_ssc this_class;
4182 regnode_ssc *oclass = NULL;
4183 I32 next_is_eval = 0;
4185 switch (PL_regkind[OP(scan)]) {
4186 case WHILEM: /* End of (?:...)* . */
4187 scan = NEXTOPER(scan);
4190 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4191 next = NEXTOPER(scan);
4192 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4194 maxcount = REG_INFTY;
4195 next = regnext(scan);
4196 scan = NEXTOPER(scan);
4200 if (flags & SCF_DO_SUBSTR)
4205 if (flags & SCF_DO_STCLASS) {
4207 maxcount = REG_INFTY;
4208 next = regnext(scan);
4209 scan = NEXTOPER(scan);
4212 is_inf = is_inf_internal = 1;
4213 scan = regnext(scan);
4214 if (flags & SCF_DO_SUBSTR) {
4215 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4216 data->longest = &(data->longest_float);
4218 goto optimize_curly_tail;
4220 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4221 && (scan->flags == stopparen))
4226 mincount = ARG1(scan);
4227 maxcount = ARG2(scan);
4229 next = regnext(scan);
4230 if (OP(scan) == CURLYX) {
4231 I32 lp = (data ? *(data->last_closep) : 0);
4232 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4234 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4235 next_is_eval = (OP(scan) == EVAL);
4237 if (flags & SCF_DO_SUBSTR) {
4238 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4239 pos_before = data->pos_min;
4243 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4245 data->flags |= SF_IS_INF;
4247 if (flags & SCF_DO_STCLASS) {
4248 ssc_init(pRExC_state, &this_class);
4249 oclass = data->start_class;
4250 data->start_class = &this_class;
4251 f |= SCF_DO_STCLASS_AND;
4252 f &= ~SCF_DO_STCLASS_OR;
4254 /* Exclude from super-linear cache processing any {n,m}
4255 regops for which the combination of input pos and regex
4256 pos is not enough information to determine if a match
4259 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4260 regex pos at the \s*, the prospects for a match depend not
4261 only on the input position but also on how many (bar\s*)
4262 repeats into the {4,8} we are. */
4263 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4264 f &= ~SCF_WHILEM_VISITED_POS;
4266 /* This will finish on WHILEM, setting scan, or on NULL: */
4267 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4268 last, data, stopparen, recursed_depth, NULL,
4270 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4272 if (flags & SCF_DO_STCLASS)
4273 data->start_class = oclass;
4274 if (mincount == 0 || minnext == 0) {
4275 if (flags & SCF_DO_STCLASS_OR) {
4276 ssc_or(pRExC_state, data->start_class, &this_class);
4278 else if (flags & SCF_DO_STCLASS_AND) {
4279 /* Switch to OR mode: cache the old value of
4280 * data->start_class */
4282 StructCopy(data->start_class, and_withp, regnode_ssc);
4283 flags &= ~SCF_DO_STCLASS_AND;
4284 StructCopy(&this_class, data->start_class, regnode_ssc);
4285 flags |= SCF_DO_STCLASS_OR;
4286 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4288 } else { /* Non-zero len */
4289 if (flags & SCF_DO_STCLASS_OR) {
4290 ssc_or(pRExC_state, data->start_class, &this_class);
4291 ssc_and(pRExC_state, data->start_class, and_withp);
4293 else if (flags & SCF_DO_STCLASS_AND)
4294 ssc_and(pRExC_state, data->start_class, &this_class);
4295 flags &= ~SCF_DO_STCLASS;
4297 if (!scan) /* It was not CURLYX, but CURLY. */
4299 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4300 /* ? quantifier ok, except for (?{ ... }) */
4301 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4302 && (minnext == 0) && (deltanext == 0)
4303 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4304 && maxcount <= REG_INFTY/3) /* Complement check for big count */
4306 /* Fatal warnings may leak the regexp without this: */
4307 SAVEFREESV(RExC_rx_sv);
4308 ckWARNreg(RExC_parse,
4309 "Quantifier unexpected on zero-length expression");
4310 (void)ReREFCNT_inc(RExC_rx_sv);
4313 min += minnext * mincount;
4314 is_inf_internal |= deltanext == SSize_t_MAX
4315 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4316 is_inf |= is_inf_internal;
4318 delta = SSize_t_MAX;
4320 delta += (minnext + deltanext) * maxcount - minnext * mincount;
4322 /* Try powerful optimization CURLYX => CURLYN. */
4323 if ( OP(oscan) == CURLYX && data
4324 && data->flags & SF_IN_PAR
4325 && !(data->flags & SF_HAS_EVAL)
4326 && !deltanext && minnext == 1 ) {
4327 /* Try to optimize to CURLYN. */
4328 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4329 regnode * const nxt1 = nxt;
4336 if (!REGNODE_SIMPLE(OP(nxt))
4337 && !(PL_regkind[OP(nxt)] == EXACT
4338 && STR_LEN(nxt) == 1))
4344 if (OP(nxt) != CLOSE)
4346 if (RExC_open_parens) {
4347 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4348 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4350 /* Now we know that nxt2 is the only contents: */
4351 oscan->flags = (U8)ARG(nxt);
4353 OP(nxt1) = NOTHING; /* was OPEN. */
4356 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4357 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4358 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4359 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4360 OP(nxt + 1) = OPTIMIZED; /* was count. */
4361 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4366 /* Try optimization CURLYX => CURLYM. */
4367 if ( OP(oscan) == CURLYX && data
4368 && !(data->flags & SF_HAS_PAR)
4369 && !(data->flags & SF_HAS_EVAL)
4370 && !deltanext /* atom is fixed width */
4371 && minnext != 0 /* CURLYM can't handle zero width */
4372 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4374 /* XXXX How to optimize if data == 0? */
4375 /* Optimize to a simpler form. */
4376 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4380 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4381 && (OP(nxt2) != WHILEM))
4383 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4384 /* Need to optimize away parenths. */
4385 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4386 /* Set the parenth number. */
4387 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4389 oscan->flags = (U8)ARG(nxt);
4390 if (RExC_open_parens) {
4391 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4392 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4394 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4395 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4398 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4399 OP(nxt + 1) = OPTIMIZED; /* was count. */
4400 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4401 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4404 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4405 regnode *nnxt = regnext(nxt1);
4407 if (reg_off_by_arg[OP(nxt1)])
4408 ARG_SET(nxt1, nxt2 - nxt1);
4409 else if (nxt2 - nxt1 < U16_MAX)
4410 NEXT_OFF(nxt1) = nxt2 - nxt1;
4412 OP(nxt) = NOTHING; /* Cannot beautify */
4417 /* Optimize again: */
4418 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4419 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4424 else if ((OP(oscan) == CURLYX)
4425 && (flags & SCF_WHILEM_VISITED_POS)
4426 /* See the comment on a similar expression above.
4427 However, this time it's not a subexpression
4428 we care about, but the expression itself. */
4429 && (maxcount == REG_INFTY)
4430 && data && ++data->whilem_c < 16) {
4431 /* This stays as CURLYX, we can put the count/of pair. */
4432 /* Find WHILEM (as in regexec.c) */
4433 regnode *nxt = oscan + NEXT_OFF(oscan);
4435 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4437 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4438 | (RExC_whilem_seen << 4)); /* On WHILEM */
4440 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4442 if (flags & SCF_DO_SUBSTR) {
4443 SV *last_str = NULL;
4444 int counted = mincount != 0;
4446 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4447 SSize_t b = pos_before >= data->last_start_min
4448 ? pos_before : data->last_start_min;
4450 const char * const s = SvPV_const(data->last_found, l);
4451 SSize_t old = b - data->last_start_min;
4454 old = utf8_hop((U8*)s, old) - (U8*)s;
4456 /* Get the added string: */
4457 last_str = newSVpvn_utf8(s + old, l, UTF);
4458 if (deltanext == 0 && pos_before == b) {
4459 /* What was added is a constant string */
4461 SvGROW(last_str, (mincount * l) + 1);
4462 repeatcpy(SvPVX(last_str) + l,
4463 SvPVX_const(last_str), l, mincount - 1);
4464 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4465 /* Add additional parts. */
4466 SvCUR_set(data->last_found,
4467 SvCUR(data->last_found) - l);
4468 sv_catsv(data->last_found, last_str);
4470 SV * sv = data->last_found;
4472 SvUTF8(sv) && SvMAGICAL(sv) ?
4473 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4474 if (mg && mg->mg_len >= 0)
4475 mg->mg_len += CHR_SVLEN(last_str) - l;
4477 data->last_end += l * (mincount - 1);
4480 /* start offset must point into the last copy */
4481 data->last_start_min += minnext * (mincount - 1);
4482 data->last_start_max += is_inf ? SSize_t_MAX
4483 : (maxcount - 1) * (minnext + data->pos_delta);
4486 /* It is counted once already... */
4487 data->pos_min += minnext * (mincount - counted);
4489 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4490 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4491 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4492 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4494 if (deltanext != SSize_t_MAX)
4495 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4496 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4497 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4499 if (deltanext == SSize_t_MAX ||
4500 -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4501 data->pos_delta = SSize_t_MAX;
4503 data->pos_delta += - counted * deltanext +
4504 (minnext + deltanext) * maxcount - minnext * mincount;
4505 if (mincount != maxcount) {
4506 /* Cannot extend fixed substrings found inside
4508 SCAN_COMMIT(pRExC_state,data,minlenp);
4509 if (mincount && last_str) {
4510 SV * const sv = data->last_found;
4511 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4512 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4516 sv_setsv(sv, last_str);
4517 data->last_end = data->pos_min;
4518 data->last_start_min =
4519 data->pos_min - CHR_SVLEN(last_str);
4520 data->last_start_max = is_inf
4522 : data->pos_min + data->pos_delta
4523 - CHR_SVLEN(last_str);
4525 data->longest = &(data->longest_float);
4527 SvREFCNT_dec(last_str);
4529 if (data && (fl & SF_HAS_EVAL))
4530 data->flags |= SF_HAS_EVAL;
4531 optimize_curly_tail:
4532 if (OP(oscan) != CURLYX) {
4533 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4535 NEXT_OFF(oscan) += NEXT_OFF(next);
4541 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4546 if (flags & SCF_DO_SUBSTR) {
4547 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4548 data->longest = &(data->longest_float);
4550 is_inf = is_inf_internal = 1;
4551 if (flags & SCF_DO_STCLASS_OR) {
4552 if (OP(scan) == CLUMP) {
4553 /* Actually is any start char, but very few code points
4554 * aren't start characters */
4555 ssc_match_all_cp(data->start_class);
4558 ssc_anything(data->start_class);
4561 flags &= ~SCF_DO_STCLASS;
4565 else if (OP(scan) == LNBREAK) {
4566 if (flags & SCF_DO_STCLASS) {
4567 if (flags & SCF_DO_STCLASS_AND) {
4568 ssc_intersection(data->start_class,
4569 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4570 ssc_clear_locale(data->start_class);
4571 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4573 else if (flags & SCF_DO_STCLASS_OR) {
4574 ssc_union(data->start_class,
4575 PL_XPosix_ptrs[_CC_VERTSPACE],
4577 ssc_and(pRExC_state, data->start_class, and_withp);
4579 /* See commit msg for
4580 * 749e076fceedeb708a624933726e7989f2302f6a */
4581 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4583 flags &= ~SCF_DO_STCLASS;
4586 delta++; /* Because of the 2 char string cr-lf */
4587 if (flags & SCF_DO_SUBSTR) {
4588 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4590 data->pos_delta += 1;
4591 data->longest = &(data->longest_float);
4594 else if (REGNODE_SIMPLE(OP(scan))) {
4596 if (flags & SCF_DO_SUBSTR) {
4597 SCAN_COMMIT(pRExC_state,data,minlenp);
4601 if (flags & SCF_DO_STCLASS) {
4603 SV* my_invlist = sv_2mortal(_new_invlist(0));
4607 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4608 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4610 /* Some of the logic below assumes that switching
4611 locale on will only add false positives. */
4616 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4620 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4621 ssc_match_all_cp(data->start_class);
4626 SV* REG_ANY_invlist = _new_invlist(2);
4627 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4629 if (flags & SCF_DO_STCLASS_OR) {
4630 ssc_union(data->start_class,
4632 TRUE /* TRUE => invert, hence all but \n
4636 else if (flags & SCF_DO_STCLASS_AND) {
4637 ssc_intersection(data->start_class,
4639 TRUE /* TRUE => invert */
4641 ssc_clear_locale(data->start_class);
4643 SvREFCNT_dec_NN(REG_ANY_invlist);
4648 if (flags & SCF_DO_STCLASS_AND)
4649 ssc_and(pRExC_state, data->start_class,
4650 (regnode_ssc*) scan);
4652 ssc_or(pRExC_state, data->start_class,
4653 (regnode_ssc*)scan);
4661 classnum = FLAGS(scan);
4662 namedclass = classnum_to_namedclass(classnum) + invert;
4663 if (flags & SCF_DO_STCLASS_AND) {
4664 bool was_there = cBOOL(
4665 ANYOF_POSIXL_TEST(data->start_class,
4667 ANYOF_POSIXL_ZERO(data->start_class);
4668 if (was_there) { /* Do an AND */
4669 ANYOF_POSIXL_SET(data->start_class, namedclass);
4671 /* No individual code points can now match */
4672 data->start_class->invlist
4673 = sv_2mortal(_new_invlist(0));
4676 int complement = namedclass + ((invert) ? -1 : 1);
4678 assert(flags & SCF_DO_STCLASS_OR);
4680 /* If the complement of this class was already there,
4681 * the result is that they match all code points,
4682 * (\d + \D == everything). Remove the classes from
4683 * future consideration. Locale is not relevant in
4685 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4686 ssc_match_all_cp(data->start_class);
4687 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4688 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4689 if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4691 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4694 else { /* The usual case; just add this class to the
4696 ANYOF_POSIXL_SET(data->start_class, namedclass);
4697 ANYOF_FLAGS(data->start_class)
4698 |= ANYOF_LOCALE|ANYOF_POSIXL;
4703 case NPOSIXA: /* For these, we always know the exact set of
4708 classnum = FLAGS(scan);
4709 my_invlist = PL_Posix_ptrs[classnum];
4718 classnum = FLAGS(scan);
4720 /* If we know all the code points that match the class, use
4721 * that; otherwise use the Latin1 code points, plus we have
4722 * to assume that it could match anything above Latin1 */
4723 if (PL_XPosix_ptrs[classnum]) {
4724 my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4727 _invlist_union(PL_L1Posix_ptrs[classnum],
4728 PL_AboveLatin1, &my_invlist);
4731 /* NPOSIXD matches all upper Latin1 code points unless the
4732 * target string being matched is UTF-8, which is
4733 * unknowable until match time */
4734 if (PL_regkind[OP(scan)] == NPOSIXD) {
4735 _invlist_union_complement_2nd(my_invlist,
4736 PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4741 if (flags & SCF_DO_STCLASS_AND) {
4742 ssc_intersection(data->start_class, my_invlist, invert);
4743 ssc_clear_locale(data->start_class);
4746 assert(flags & SCF_DO_STCLASS_OR);
4747 ssc_union(data->start_class, my_invlist, invert);
4750 if (flags & SCF_DO_STCLASS_OR)
4751 ssc_and(pRExC_state, data->start_class, and_withp);
4752 flags &= ~SCF_DO_STCLASS;
4755 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4756 data->flags |= (OP(scan) == MEOL
4759 SCAN_COMMIT(pRExC_state, data, minlenp);
4762 else if ( PL_regkind[OP(scan)] == BRANCHJ
4763 /* Lookbehind, or need to calculate parens/evals/stclass: */
4764 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4765 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4766 if ( OP(scan) == UNLESSM &&
4768 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4769 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4772 regnode *upto= regnext(scan);
4774 SV * const mysv_val=sv_newmortal();
4775 DEBUG_STUDYDATA("OPFAIL",data,depth);
4777 /*DEBUG_PARSE_MSG("opfail");*/
4778 regprop(RExC_rx, mysv_val, upto);
4779 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4780 SvPV_nolen_const(mysv_val),
4781 (IV)REG_NODE_NUM(upto),
4786 NEXT_OFF(scan) = upto - scan;
4787 for (opt= scan + 1; opt < upto ; opt++)
4788 OP(opt) = OPTIMIZED;
4792 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4793 || OP(scan) == UNLESSM )
4795 /* Negative Lookahead/lookbehind
4796 In this case we can't do fixed string optimisation.
4799 SSize_t deltanext, minnext, fake = 0;
4804 data_fake.flags = 0;
4806 data_fake.whilem_c = data->whilem_c;
4807 data_fake.last_closep = data->last_closep;
4810 data_fake.last_closep = &fake;
4811 data_fake.pos_delta = delta;
4812 if ( flags & SCF_DO_STCLASS && !scan->flags
4813 && OP(scan) == IFMATCH ) { /* Lookahead */
4814 ssc_init(pRExC_state, &intrnl);
4815 data_fake.start_class = &intrnl;
4816 f |= SCF_DO_STCLASS_AND;
4818 if (flags & SCF_WHILEM_VISITED_POS)
4819 f |= SCF_WHILEM_VISITED_POS;
4820 next = regnext(scan);
4821 nscan = NEXTOPER(NEXTOPER(scan));
4822 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4823 last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1);
4826 FAIL("Variable length lookbehind not implemented");
4828 else if (minnext > (I32)U8_MAX) {
4829 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4831 scan->flags = (U8)minnext;
4834 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4836 if (data_fake.flags & SF_HAS_EVAL)
4837 data->flags |= SF_HAS_EVAL;
4838 data->whilem_c = data_fake.whilem_c;
4840 if (f & SCF_DO_STCLASS_AND) {
4841 if (flags & SCF_DO_STCLASS_OR) {
4842 /* OR before, AND after: ideally we would recurse with
4843 * data_fake to get the AND applied by study of the
4844 * remainder of the pattern, and then derecurse;
4845 * *** HACK *** for now just treat as "no information".
4846 * See [perl #56690].
4848 ssc_init(pRExC_state, data->start_class);
4850 /* AND before and after: combine and continue */
4851 ssc_and(pRExC_state, data->start_class, &intrnl);
4855 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4857 /* Positive Lookahead/lookbehind
4858 In this case we can do fixed string optimisation,
4859 but we must be careful about it. Note in the case of
4860 lookbehind the positions will be offset by the minimum
4861 length of the pattern, something we won't know about
4862 until after the recurse.
4864 SSize_t deltanext, fake = 0;
4868 /* We use SAVEFREEPV so that when the full compile
4869 is finished perl will clean up the allocated
4870 minlens when it's all done. This way we don't
4871 have to worry about freeing them when we know
4872 they wont be used, which would be a pain.
4875 Newx( minnextp, 1, SSize_t );
4876 SAVEFREEPV(minnextp);
4879 StructCopy(data, &data_fake, scan_data_t);
4880 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4883 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4884 data_fake.last_found=newSVsv(data->last_found);
4888 data_fake.last_closep = &fake;
4889 data_fake.flags = 0;
4890 data_fake.pos_delta = delta;
4892 data_fake.flags |= SF_IS_INF;
4893 if ( flags & SCF_DO_STCLASS && !scan->flags
4894 && OP(scan) == IFMATCH ) { /* Lookahead */
4895 ssc_init(pRExC_state, &intrnl);
4896 data_fake.start_class = &intrnl;
4897 f |= SCF_DO_STCLASS_AND;
4899 if (flags & SCF_WHILEM_VISITED_POS)
4900 f |= SCF_WHILEM_VISITED_POS;
4901 next = regnext(scan);
4902 nscan = NEXTOPER(NEXTOPER(scan));
4904 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4905 last, &data_fake, stopparen, recursed_depth, NULL, f,depth+1);
4908 FAIL("Variable length lookbehind not implemented");
4910 else if (*minnextp > (I32)U8_MAX) {
4911 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4913 scan->flags = (U8)*minnextp;
4918 if (f & SCF_DO_STCLASS_AND) {
4919 ssc_and(pRExC_state, data->start_class, &intrnl);
4922 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4924 if (data_fake.flags & SF_HAS_EVAL)
4925 data->flags |= SF_HAS_EVAL;
4926 data->whilem_c = data_fake.whilem_c;
4927 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4928 if (RExC_rx->minlen<*minnextp)
4929 RExC_rx->minlen=*minnextp;
4930 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4931 SvREFCNT_dec_NN(data_fake.last_found);
4933 if ( data_fake.minlen_fixed != minlenp )
4935 data->offset_fixed= data_fake.offset_fixed;
4936 data->minlen_fixed= data_fake.minlen_fixed;
4937 data->lookbehind_fixed+= scan->flags;
4939 if ( data_fake.minlen_float != minlenp )
4941 data->minlen_float= data_fake.minlen_float;
4942 data->offset_float_min=data_fake.offset_float_min;
4943 data->offset_float_max=data_fake.offset_float_max;
4944 data->lookbehind_float+= scan->flags;
4951 else if (OP(scan) == OPEN) {
4952 if (stopparen != (I32)ARG(scan))
4955 else if (OP(scan) == CLOSE) {
4956 if (stopparen == (I32)ARG(scan)) {
4959 if ((I32)ARG(scan) == is_par) {
4960 next = regnext(scan);
4962 if ( next && (OP(next) != WHILEM) && next < last)
4963 is_par = 0; /* Disable optimization */
4966 *(data->last_closep) = ARG(scan);
4968 else if (OP(scan) == EVAL) {
4970 data->flags |= SF_HAS_EVAL;
4972 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4973 if (flags & SCF_DO_SUBSTR) {
4974 SCAN_COMMIT(pRExC_state,data,minlenp);
4975 flags &= ~SCF_DO_SUBSTR;
4977 if (data && OP(scan)==ACCEPT) {
4978 data->flags |= SCF_SEEN_ACCEPT;
4983 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4985 if (flags & SCF_DO_SUBSTR) {
4986 SCAN_COMMIT(pRExC_state,data,minlenp);
4987 data->longest = &(data->longest_float);
4989 is_inf = is_inf_internal = 1;
4990 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4991 ssc_anything(data->start_class);
4992 flags &= ~SCF_DO_STCLASS;
4994 else if (OP(scan) == GPOS) {
4995 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4996 !(delta || is_inf || (data && data->pos_delta)))
4998 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4999 RExC_rx->extflags |= RXf_ANCH_GPOS;
5000 if (RExC_rx->gofs < (STRLEN)min)
5001 RExC_rx->gofs = min;
5003 RExC_rx->extflags |= RXf_GPOS_FLOAT;
5007 #ifdef TRIE_STUDY_OPT
5008 #ifdef FULL_TRIE_STUDY
5009 else if (PL_regkind[OP(scan)] == TRIE) {
5010 /* NOTE - There is similar code to this block above for handling
5011 BRANCH nodes on the initial study. If you change stuff here
5013 regnode *trie_node= scan;
5014 regnode *tail= regnext(scan);
5015 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5016 SSize_t max1 = 0, min1 = SSize_t_MAX;
5019 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
5020 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
5021 if (flags & SCF_DO_STCLASS)
5022 ssc_init_zero(pRExC_state, &accum);
5028 const regnode *nextbranch= NULL;
5031 for ( word=1 ; word <= trie->wordcount ; word++)
5033 SSize_t deltanext=0, minnext=0, f = 0, fake;
5034 regnode_ssc this_class;
5036 data_fake.flags = 0;
5038 data_fake.whilem_c = data->whilem_c;
5039 data_fake.last_closep = data->last_closep;
5042 data_fake.last_closep = &fake;
5043 data_fake.pos_delta = delta;
5044 if (flags & SCF_DO_STCLASS) {
5045 ssc_init(pRExC_state, &this_class);
5046 data_fake.start_class = &this_class;
5047 f = SCF_DO_STCLASS_AND;
5049 if (flags & SCF_WHILEM_VISITED_POS)
5050 f |= SCF_WHILEM_VISITED_POS;
5052 if (trie->jump[word]) {
5054 nextbranch = trie_node + trie->jump[0];
5055 scan= trie_node + trie->jump[word];
5056 /* We go from the jump point to the branch that follows
5057 it. Note this means we need the vestigal unused branches
5058 even though they arent otherwise used.
5060 minnext = study_chunk(pRExC_state, &scan, minlenp,
5061 &deltanext, (regnode *)nextbranch, &data_fake,
5062 stopparen, recursed_depth, NULL, f,depth+1);
5064 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5065 nextbranch= regnext((regnode*)nextbranch);
5067 if (min1 > (SSize_t)(minnext + trie->minlen))
5068 min1 = minnext + trie->minlen;
5069 if (deltanext == SSize_t_MAX) {
5070 is_inf = is_inf_internal = 1;
5072 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5073 max1 = minnext + deltanext + trie->maxlen;
5075 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5077 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5078 if ( stopmin > min + min1)
5079 stopmin = min + min1;
5080 flags &= ~SCF_DO_SUBSTR;
5082 data->flags |= SCF_SEEN_ACCEPT;
5085 if (data_fake.flags & SF_HAS_EVAL)
5086 data->flags |= SF_HAS_EVAL;
5087 data->whilem_c = data_fake.whilem_c;
5089 if (flags & SCF_DO_STCLASS)
5090 ssc_or(pRExC_state, &accum, &this_class);
5093 if (flags & SCF_DO_SUBSTR) {
5094 data->pos_min += min1;
5095 data->pos_delta += max1 - min1;
5096 if (max1 != min1 || is_inf)
5097 data->longest = &(data->longest_float);
5100 delta += max1 - min1;
5101 if (flags & SCF_DO_STCLASS_OR) {
5102 ssc_or(pRExC_state, data->start_class, &accum);
5104 ssc_and(pRExC_state, data->start_class, and_withp);
5105 flags &= ~SCF_DO_STCLASS;
5108 else if (flags & SCF_DO_STCLASS_AND) {
5110 ssc_and(pRExC_state, data->start_class, &accum);
5111 flags &= ~SCF_DO_STCLASS;
5114 /* Switch to OR mode: cache the old value of
5115 * data->start_class */
5117 StructCopy(data->start_class, and_withp, regnode_ssc);
5118 flags &= ~SCF_DO_STCLASS_AND;
5119 StructCopy(&accum, data->start_class, regnode_ssc);
5120 flags |= SCF_DO_STCLASS_OR;
5127 else if (PL_regkind[OP(scan)] == TRIE) {
5128 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5131 min += trie->minlen;
5132 delta += (trie->maxlen - trie->minlen);
5133 flags &= ~SCF_DO_STCLASS; /* xxx */
5134 if (flags & SCF_DO_SUBSTR) {
5135 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
5136 data->pos_min += trie->minlen;
5137 data->pos_delta += (trie->maxlen - trie->minlen);
5138 if (trie->maxlen != trie->minlen)
5139 data->longest = &(data->longest_float);
5141 if (trie->jump) /* no more substrings -- for now /grr*/
5142 flags &= ~SCF_DO_SUBSTR;
5144 #endif /* old or new */
5145 #endif /* TRIE_STUDY_OPT */
5147 /* Else: zero-length, ignore. */
5148 scan = regnext(scan);
5150 /* If we are exiting a recursion we can unset its recursed bit
5151 * and allow ourselves to enter it again - no danger of an
5152 * infinite loop there.
5153 if (stopparen > -1 && recursed) {
5154 DEBUG_STUDYDATA("unset:", data,depth);
5155 PAREN_UNSET( recursed, stopparen);
5159 DEBUG_STUDYDATA("frame-end:",data,depth);
5160 DEBUG_PEEP("fend", scan, depth);
5161 /* restore previous context */
5164 stopparen = frame->stop;
5165 recursed_depth = frame->prev_recursed_depth;
5168 frame = frame->prev;
5169 goto fake_study_recurse;
5174 DEBUG_STUDYDATA("pre-fin:",data,depth);
5177 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5178 if (flags & SCF_DO_SUBSTR && is_inf)
5179 data->pos_delta = SSize_t_MAX - data->pos_min;
5180 if (is_par > (I32)U8_MAX)
5182 if (is_par && pars==1 && data) {
5183 data->flags |= SF_IN_PAR;
5184 data->flags &= ~SF_HAS_PAR;
5186 else if (pars && data) {
5187 data->flags |= SF_HAS_PAR;
5188 data->flags &= ~SF_IN_PAR;
5190 if (flags & SCF_DO_STCLASS_OR)
5191 ssc_and(pRExC_state, data->start_class, and_withp);
5192 if (flags & SCF_TRIE_RESTUDY)
5193 data->flags |= SCF_TRIE_RESTUDY;
5195 DEBUG_STUDYDATA("post-fin:",data,depth);
5197 return min < stopmin ? min : stopmin;
5201 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5203 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5205 PERL_ARGS_ASSERT_ADD_DATA;
5207 Renewc(RExC_rxi->data,
5208 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5209 char, struct reg_data);
5211 Renew(RExC_rxi->data->what, count + n, U8);
5213 Newx(RExC_rxi->data->what, n, U8);
5214 RExC_rxi->data->count = count + n;
5215 Copy(s, RExC_rxi->data->what + count, n, U8);
5219 /*XXX: todo make this not included in a non debugging perl */
5220 #ifndef PERL_IN_XSUB_RE
5222 Perl_reginitcolors(pTHX)
5225 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5227 char *t = savepv(s);
5231 t = strchr(t, '\t');
5237 PL_colors[i] = t = (char *)"";
5242 PL_colors[i++] = (char *)"";
5249 #ifdef TRIE_STUDY_OPT
5250 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5253 (data.flags & SCF_TRIE_RESTUDY) \
5261 #define CHECK_RESTUDY_GOTO_butfirst
5265 * pregcomp - compile a regular expression into internal code
5267 * Decides which engine's compiler to call based on the hint currently in
5271 #ifndef PERL_IN_XSUB_RE
5273 /* return the currently in-scope regex engine (or the default if none) */
5275 regexp_engine const *
5276 Perl_current_re_engine(pTHX)
5280 if (IN_PERL_COMPILETIME) {
5281 HV * const table = GvHV(PL_hintgv);
5284 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5285 return &PL_core_reg_engine;
5286 ptr = hv_fetchs(table, "regcomp", FALSE);
5287 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5288 return &PL_core_reg_engine;
5289 return INT2PTR(regexp_engine*,SvIV(*ptr));
5293 if (!PL_curcop->cop_hints_hash)
5294 return &PL_core_reg_engine;
5295 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5296 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5297 return &PL_core_reg_engine;
5298 return INT2PTR(regexp_engine*,SvIV(ptr));
5304 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5307 regexp_engine const *eng = current_re_engine();
5308 GET_RE_DEBUG_FLAGS_DECL;
5310 PERL_ARGS_ASSERT_PREGCOMP;
5312 /* Dispatch a request to compile a regexp to correct regexp engine. */
5314 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5317 return CALLREGCOMP_ENG(eng, pattern, flags);
5321 /* public(ish) entry point for the perl core's own regex compiling code.
5322 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5323 * pattern rather than a list of OPs, and uses the internal engine rather
5324 * than the current one */
5327 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5329 SV *pat = pattern; /* defeat constness! */
5330 PERL_ARGS_ASSERT_RE_COMPILE;
5331 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5332 #ifdef PERL_IN_XSUB_RE
5335 &PL_core_reg_engine,
5337 NULL, NULL, rx_flags, 0);
5341 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5342 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5343 * point to the realloced string and length.
5345 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5349 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5350 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5352 U8 *const src = (U8*)*pat_p;
5355 STRLEN s = 0, d = 0;
5357 GET_RE_DEBUG_FLAGS_DECL;
5359 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5360 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5362 Newx(dst, *plen_p * 2 + 1, U8);
5364 while (s < *plen_p) {
5365 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5368 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5369 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5371 if (n < num_code_blocks) {
5372 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5373 pRExC_state->code_blocks[n].start = d;
5374 assert(dst[d] == '(');
5377 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5378 pRExC_state->code_blocks[n].end = d;
5379 assert(dst[d] == ')');
5389 *pat_p = (char*) dst;
5391 RExC_orig_utf8 = RExC_utf8 = 1;
5396 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5397 * while recording any code block indices, and handling overloading,
5398 * nested qr// objects etc. If pat is null, it will allocate a new
5399 * string, or just return the first arg, if there's only one.
5401 * Returns the malloced/updated pat.
5402 * patternp and pat_count is the array of SVs to be concatted;
5403 * oplist is the optional list of ops that generated the SVs;
5404 * recompile_p is a pointer to a boolean that will be set if
5405 * the regex will need to be recompiled.
5406 * delim, if non-null is an SV that will be inserted between each element
5410 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5411 SV *pat, SV ** const patternp, int pat_count,
5412 OP *oplist, bool *recompile_p, SV *delim)
5416 bool use_delim = FALSE;
5417 bool alloced = FALSE;
5419 /* if we know we have at least two args, create an empty string,
5420 * then concatenate args to that. For no args, return an empty string */
5421 if (!pat && pat_count != 1) {
5422 pat = newSVpvn("", 0);
5427 for (svp = patternp; svp < patternp + pat_count; svp++) {
5430 STRLEN orig_patlen = 0;
5432 SV *msv = use_delim ? delim : *svp;
5433 if (!msv) msv = &PL_sv_undef;
5435 /* if we've got a delimiter, we go round the loop twice for each
5436 * svp slot (except the last), using the delimiter the second
5445 if (SvTYPE(msv) == SVt_PVAV) {
5446 /* we've encountered an interpolated array within
5447 * the pattern, e.g. /...@a..../. Expand the list of elements,
5448 * then recursively append elements.
5449 * The code in this block is based on S_pushav() */
5451 AV *const av = (AV*)msv;
5452 const SSize_t maxarg = AvFILL(av) + 1;
5456 assert(oplist->op_type == OP_PADAV
5457 || oplist->op_type == OP_RV2AV);
5458 oplist = oplist->op_sibling;;
5461 if (SvRMAGICAL(av)) {
5464 Newx(array, maxarg, SV*);
5466 for (i=0; i < maxarg; i++) {
5467 SV ** const svp = av_fetch(av, i, FALSE);
5468 array[i] = svp ? *svp : &PL_sv_undef;
5472 array = AvARRAY(av);
5474 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5475 array, maxarg, NULL, recompile_p,
5477 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5483 /* we make the assumption here that each op in the list of
5484 * op_siblings maps to one SV pushed onto the stack,
5485 * except for code blocks, with have both an OP_NULL and
5487 * This allows us to match up the list of SVs against the
5488 * list of OPs to find the next code block.
5490 * Note that PUSHMARK PADSV PADSV ..
5492 * PADRANGE PADSV PADSV ..
5493 * so the alignment still works. */
5496 if (oplist->op_type == OP_NULL
5497 && (oplist->op_flags & OPf_SPECIAL))
5499 assert(n < pRExC_state->num_code_blocks);
5500 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5501 pRExC_state->code_blocks[n].block = oplist;
5502 pRExC_state->code_blocks[n].src_regex = NULL;
5505 oplist = oplist->op_sibling; /* skip CONST */
5508 oplist = oplist->op_sibling;;
5511 /* apply magic and QR overloading to arg */
5514 if (SvROK(msv) && SvAMAGIC(msv)) {
5515 SV *sv = AMG_CALLunary(msv, regexp_amg);
5519 if (SvTYPE(sv) != SVt_REGEXP)
5520 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5525 /* try concatenation overload ... */
5526 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5527 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5530 /* overloading involved: all bets are off over literal
5531 * code. Pretend we haven't seen it */
5532 pRExC_state->num_code_blocks -= n;
5536 /* ... or failing that, try "" overload */
5537 while (SvAMAGIC(msv)
5538 && (sv = AMG_CALLunary(msv, string_amg))
5542 && SvRV(msv) == SvRV(sv))
5547 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5551 /* this is a partially unrolled
5552 * sv_catsv_nomg(pat, msv);
5553 * that allows us to adjust code block indices if
5556 char *dst = SvPV_force_nomg(pat, dlen);
5558 if (SvUTF8(msv) && !SvUTF8(pat)) {
5559 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5560 sv_setpvn(pat, dst, dlen);
5563 sv_catsv_nomg(pat, msv);
5570 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5573 /* extract any code blocks within any embedded qr//'s */
5574 if (rx && SvTYPE(rx) == SVt_REGEXP
5575 && RX_ENGINE((REGEXP*)rx)->op_comp)
5578 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5579 if (ri->num_code_blocks) {
5581 /* the presence of an embedded qr// with code means
5582 * we should always recompile: the text of the
5583 * qr// may not have changed, but it may be a
5584 * different closure than last time */
5586 Renew(pRExC_state->code_blocks,
5587 pRExC_state->num_code_blocks + ri->num_code_blocks,
5588 struct reg_code_block);
5589 pRExC_state->num_code_blocks += ri->num_code_blocks;
5591 for (i=0; i < ri->num_code_blocks; i++) {
5592 struct reg_code_block *src, *dst;
5593 STRLEN offset = orig_patlen
5594 + ReANY((REGEXP *)rx)->pre_prefix;
5595 assert(n < pRExC_state->num_code_blocks);
5596 src = &ri->code_blocks[i];
5597 dst = &pRExC_state->code_blocks[n];
5598 dst->start = src->start + offset;
5599 dst->end = src->end + offset;
5600 dst->block = src->block;
5601 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5610 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5619 /* see if there are any run-time code blocks in the pattern.
5620 * False positives are allowed */
5623 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5624 char *pat, STRLEN plen)
5629 for (s = 0; s < plen; s++) {
5630 if (n < pRExC_state->num_code_blocks
5631 && s == pRExC_state->code_blocks[n].start)
5633 s = pRExC_state->code_blocks[n].end;
5637 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5639 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5641 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5648 /* Handle run-time code blocks. We will already have compiled any direct
5649 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5650 * copy of it, but with any literal code blocks blanked out and
5651 * appropriate chars escaped; then feed it into
5653 * eval "qr'modified_pattern'"
5657 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5661 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5663 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5664 * and merge them with any code blocks of the original regexp.
5666 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5667 * instead, just save the qr and return FALSE; this tells our caller that
5668 * the original pattern needs upgrading to utf8.
5672 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5673 char *pat, STRLEN plen)
5677 GET_RE_DEBUG_FLAGS_DECL;
5679 if (pRExC_state->runtime_code_qr) {
5680 /* this is the second time we've been called; this should
5681 * only happen if the main pattern got upgraded to utf8
5682 * during compilation; re-use the qr we compiled first time
5683 * round (which should be utf8 too)
5685 qr = pRExC_state->runtime_code_qr;
5686 pRExC_state->runtime_code_qr = NULL;
5687 assert(RExC_utf8 && SvUTF8(qr));
5693 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5697 /* determine how many extra chars we need for ' and \ escaping */
5698 for (s = 0; s < plen; s++) {
5699 if (pat[s] == '\'' || pat[s] == '\\')
5703 Newx(newpat, newlen, char);
5705 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5707 for (s = 0; s < plen; s++) {
5708 if (n < pRExC_state->num_code_blocks
5709 && s == pRExC_state->code_blocks[n].start)
5711 /* blank out literal code block */
5712 assert(pat[s] == '(');
5713 while (s <= pRExC_state->code_blocks[n].end) {
5721 if (pat[s] == '\'' || pat[s] == '\\')
5726 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5730 PerlIO_printf(Perl_debug_log,
5731 "%sre-parsing pattern for runtime code:%s %s\n",
5732 PL_colors[4],PL_colors[5],newpat);
5735 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5741 PUSHSTACKi(PERLSI_REQUIRE);
5742 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5743 * parsing qr''; normally only q'' does this. It also alters
5745 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5746 SvREFCNT_dec_NN(sv);
5751 SV * const errsv = ERRSV;
5752 if (SvTRUE_NN(errsv))
5754 Safefree(pRExC_state->code_blocks);
5755 /* use croak_sv ? */
5756 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5759 assert(SvROK(qr_ref));
5761 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5762 /* the leaving below frees the tmp qr_ref.
5763 * Give qr a life of its own */
5771 if (!RExC_utf8 && SvUTF8(qr)) {
5772 /* first time through; the pattern got upgraded; save the
5773 * qr for the next time through */
5774 assert(!pRExC_state->runtime_code_qr);
5775 pRExC_state->runtime_code_qr = qr;
5780 /* extract any code blocks within the returned qr// */
5783 /* merge the main (r1) and run-time (r2) code blocks into one */
5785 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5786 struct reg_code_block *new_block, *dst;
5787 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5790 if (!r2->num_code_blocks) /* we guessed wrong */
5792 SvREFCNT_dec_NN(qr);
5797 r1->num_code_blocks + r2->num_code_blocks,
5798 struct reg_code_block);
5801 while ( i1 < r1->num_code_blocks
5802 || i2 < r2->num_code_blocks)
5804 struct reg_code_block *src;
5807 if (i1 == r1->num_code_blocks) {
5808 src = &r2->code_blocks[i2++];
5811 else if (i2 == r2->num_code_blocks)
5812 src = &r1->code_blocks[i1++];
5813 else if ( r1->code_blocks[i1].start
5814 < r2->code_blocks[i2].start)
5816 src = &r1->code_blocks[i1++];
5817 assert(src->end < r2->code_blocks[i2].start);
5820 assert( r1->code_blocks[i1].start
5821 > r2->code_blocks[i2].start);
5822 src = &r2->code_blocks[i2++];
5824 assert(src->end < r1->code_blocks[i1].start);
5827 assert(pat[src->start] == '(');
5828 assert(pat[src->end] == ')');
5829 dst->start = src->start;
5830 dst->end = src->end;
5831 dst->block = src->block;
5832 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5836 r1->num_code_blocks += r2->num_code_blocks;
5837 Safefree(r1->code_blocks);
5838 r1->code_blocks = new_block;
5841 SvREFCNT_dec_NN(qr);
5847 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5848 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5850 /* This is the common code for setting up the floating and fixed length
5851 * string data extracted from Perl_re_op_compile() below. Returns a boolean
5852 * as to whether succeeded or not */
5857 if (! (longest_length
5858 || (eol /* Can't have SEOL and MULTI */
5859 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5861 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5862 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5867 /* copy the information about the longest from the reg_scan_data
5868 over to the program. */
5869 if (SvUTF8(sv_longest)) {
5870 *rx_utf8 = sv_longest;
5873 *rx_substr = sv_longest;
5876 /* end_shift is how many chars that must be matched that
5877 follow this item. We calculate it ahead of time as once the
5878 lookbehind offset is added in we lose the ability to correctly
5880 ml = minlen ? *(minlen) : (SSize_t)longest_length;
5881 *rx_end_shift = ml - offset
5882 - longest_length + (SvTAIL(sv_longest) != 0)
5885 t = (eol/* Can't have SEOL and MULTI */
5886 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5887 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5893 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5894 * regular expression into internal code.
5895 * The pattern may be passed either as:
5896 * a list of SVs (patternp plus pat_count)
5897 * a list of OPs (expr)
5898 * If both are passed, the SV list is used, but the OP list indicates
5899 * which SVs are actually pre-compiled code blocks
5901 * The SVs in the list have magic and qr overloading applied to them (and
5902 * the list may be modified in-place with replacement SVs in the latter
5905 * If the pattern hasn't changed from old_re, then old_re will be
5908 * eng is the current engine. If that engine has an op_comp method, then
5909 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5910 * do the initial concatenation of arguments and pass on to the external
5913 * If is_bare_re is not null, set it to a boolean indicating whether the
5914 * arg list reduced (after overloading) to a single bare regex which has
5915 * been returned (i.e. /$qr/).
5917 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5919 * pm_flags contains the PMf_* flags, typically based on those from the
5920 * pm_flags field of the related PMOP. Currently we're only interested in
5921 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5923 * We can't allocate space until we know how big the compiled form will be,
5924 * but we can't compile it (and thus know how big it is) until we've got a
5925 * place to put the code. So we cheat: we compile it twice, once with code
5926 * generation turned off and size counting turned on, and once "for real".
5927 * This also means that we don't allocate space until we are sure that the
5928 * thing really will compile successfully, and we never have to move the
5929 * code and thus invalidate pointers into it. (Note that it has to be in
5930 * one piece because free() must be able to free it all.) [NB: not true in perl]
5932 * Beware that the optimization-preparation code in here knows about some
5933 * of the structure of the compiled regexp. [I'll say.]
5937 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5938 OP *expr, const regexp_engine* eng, REGEXP *old_re,
5939 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5944 regexp_internal *ri;
5952 SV *code_blocksv = NULL;
5953 SV** new_patternp = patternp;
5955 /* these are all flags - maybe they should be turned
5956 * into a single int with different bit masks */
5957 I32 sawlookahead = 0;
5962 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5964 bool runtime_code = 0;
5966 RExC_state_t RExC_state;
5967 RExC_state_t * const pRExC_state = &RExC_state;
5968 #ifdef TRIE_STUDY_OPT
5970 RExC_state_t copyRExC_state;
5972 GET_RE_DEBUG_FLAGS_DECL;
5974 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5976 DEBUG_r(if (!PL_colorset) reginitcolors());
5978 #ifndef PERL_IN_XSUB_RE
5979 /* Initialize these here instead of as-needed, as is quick and avoids
5980 * having to test them each time otherwise */
5981 if (! PL_AboveLatin1) {
5982 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5983 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5984 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5986 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5987 PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5988 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5990 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5991 = _new_invlist_C_array(L1PosixAlnum_invlist);
5992 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5993 = _new_invlist_C_array(PosixAlnum_invlist);
5995 PL_L1Posix_ptrs[_CC_ALPHA]
5996 = _new_invlist_C_array(L1PosixAlpha_invlist);
5997 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5999 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
6000 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
6002 /* Cased is the same as Alpha in the ASCII range */
6003 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
6004 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
6006 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
6007 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
6009 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
6010 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
6012 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
6013 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
6015 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
6016 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
6018 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
6019 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
6021 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
6022 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
6024 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
6025 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
6026 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
6027 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
6029 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
6030 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
6032 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
6034 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
6035 PL_L1Posix_ptrs[_CC_WORDCHAR]
6036 = _new_invlist_C_array(L1PosixWord_invlist);
6038 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
6039 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
6041 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
6045 pRExC_state->code_blocks = NULL;
6046 pRExC_state->num_code_blocks = 0;
6049 *is_bare_re = FALSE;
6051 if (expr && (expr->op_type == OP_LIST ||
6052 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6053 /* allocate code_blocks if needed */
6057 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6058 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6059 ncode++; /* count of DO blocks */
6061 pRExC_state->num_code_blocks = ncode;
6062 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6067 /* compile-time pattern with just OP_CONSTs and DO blocks */
6072 /* find how many CONSTs there are */
6075 if (expr->op_type == OP_CONST)
6078 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6079 if (o->op_type == OP_CONST)
6083 /* fake up an SV array */
6085 assert(!new_patternp);
6086 Newx(new_patternp, n, SV*);
6087 SAVEFREEPV(new_patternp);
6091 if (expr->op_type == OP_CONST)
6092 new_patternp[n] = cSVOPx_sv(expr);
6094 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6095 if (o->op_type == OP_CONST)
6096 new_patternp[n++] = cSVOPo_sv;
6101 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6102 "Assembling pattern from %d elements%s\n", pat_count,
6103 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6105 /* set expr to the first arg op */
6107 if (pRExC_state->num_code_blocks
6108 && expr->op_type != OP_CONST)
6110 expr = cLISTOPx(expr)->op_first;
6111 assert( expr->op_type == OP_PUSHMARK
6112 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6113 || expr->op_type == OP_PADRANGE);
6114 expr = expr->op_sibling;
6117 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6118 expr, &recompile, NULL);
6120 /* handle bare (possibly after overloading) regex: foo =~ $re */
6125 if (SvTYPE(re) == SVt_REGEXP) {
6129 Safefree(pRExC_state->code_blocks);
6130 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6131 "Precompiled pattern%s\n",
6132 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6138 exp = SvPV_nomg(pat, plen);
6140 if (!eng->op_comp) {
6141 if ((SvUTF8(pat) && IN_BYTES)
6142 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6144 /* make a temporary copy; either to convert to bytes,
6145 * or to avoid repeating get-magic / overloaded stringify */
6146 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6147 (IN_BYTES ? 0 : SvUTF8(pat)));
6149 Safefree(pRExC_state->code_blocks);
6150 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6153 /* ignore the utf8ness if the pattern is 0 length */
6154 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6155 RExC_uni_semantics = 0;
6156 RExC_contains_locale = 0;
6157 RExC_contains_i = 0;
6158 pRExC_state->runtime_code_qr = NULL;
6161 SV *dsv= sv_newmortal();
6162 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6163 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6164 PL_colors[4],PL_colors[5],s);
6168 /* we jump here if we upgrade the pattern to utf8 and have to
6171 if ((pm_flags & PMf_USE_RE_EVAL)
6172 /* this second condition covers the non-regex literal case,
6173 * i.e. $foo =~ '(?{})'. */
6174 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6176 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6178 /* return old regex if pattern hasn't changed */
6179 /* XXX: note in the below we have to check the flags as well as the pattern.
6181 * Things get a touch tricky as we have to compare the utf8 flag independently
6182 * from the compile flags.
6187 && !!RX_UTF8(old_re) == !!RExC_utf8
6188 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6189 && RX_PRECOMP(old_re)
6190 && RX_PRELEN(old_re) == plen
6191 && memEQ(RX_PRECOMP(old_re), exp, plen)
6192 && !runtime_code /* with runtime code, always recompile */ )
6194 Safefree(pRExC_state->code_blocks);
6198 rx_flags = orig_rx_flags;
6200 if (rx_flags & PMf_FOLD) {
6201 RExC_contains_i = 1;
6203 if (initial_charset == REGEX_LOCALE_CHARSET) {
6204 RExC_contains_locale = 1;
6206 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6208 /* Set to use unicode semantics if the pattern is in utf8 and has the
6209 * 'depends' charset specified, as it means unicode when utf8 */
6210 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6214 RExC_flags = rx_flags;
6215 RExC_pm_flags = pm_flags;
6218 if (TAINTING_get && TAINT_get)
6219 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6221 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6222 /* whoops, we have a non-utf8 pattern, whilst run-time code
6223 * got compiled as utf8. Try again with a utf8 pattern */
6224 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6225 pRExC_state->num_code_blocks);
6226 goto redo_first_pass;
6229 assert(!pRExC_state->runtime_code_qr);
6234 RExC_in_lookbehind = 0;
6235 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6237 RExC_override_recoding = 0;
6238 RExC_in_multi_char_class = 0;
6240 /* First pass: determine size, legality. */
6243 RExC_end = exp + plen;
6248 RExC_emit = (regnode *) &RExC_emit_dummy;
6249 RExC_whilem_seen = 0;
6250 RExC_open_parens = NULL;
6251 RExC_close_parens = NULL;
6253 RExC_paren_names = NULL;
6255 RExC_paren_name_list = NULL;
6257 RExC_recurse = NULL;
6258 RExC_study_chunk_recursed = NULL;
6259 RExC_study_chunk_recursed_bytes= 0;
6260 RExC_recurse_count = 0;
6261 pRExC_state->code_index = 0;
6263 #if 0 /* REGC() is (currently) a NOP at the first pass.
6264 * Clever compilers notice this and complain. --jhi */
6265 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6268 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6270 RExC_lastparse=NULL;
6272 /* reg may croak on us, not giving us a chance to free
6273 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6274 need it to survive as long as the regexp (qr/(?{})/).
6275 We must check that code_blocksv is not already set, because we may
6276 have jumped back to restart the sizing pass. */
6277 if (pRExC_state->code_blocks && !code_blocksv) {
6278 code_blocksv = newSV_type(SVt_PV);
6279 SAVEFREESV(code_blocksv);
6280 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6281 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6283 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6284 /* It's possible to write a regexp in ascii that represents Unicode
6285 codepoints outside of the byte range, such as via \x{100}. If we
6286 detect such a sequence we have to convert the entire pattern to utf8
6287 and then recompile, as our sizing calculation will have been based
6288 on 1 byte == 1 character, but we will need to use utf8 to encode
6289 at least some part of the pattern, and therefore must convert the whole
6292 if (flags & RESTART_UTF8) {
6293 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6294 pRExC_state->num_code_blocks);
6295 goto redo_first_pass;
6297 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6300 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6303 PerlIO_printf(Perl_debug_log,
6304 "Required size %"IVdf" nodes\n"
6305 "Starting second pass (creation)\n",
6308 RExC_lastparse=NULL;
6311 /* The first pass could have found things that force Unicode semantics */
6312 if ((RExC_utf8 || RExC_uni_semantics)
6313 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6315 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6318 /* Small enough for pointer-storage convention?
6319 If extralen==0, this means that we will not need long jumps. */
6320 if (RExC_size >= 0x10000L && RExC_extralen)
6321 RExC_size += RExC_extralen;
6324 if (RExC_whilem_seen > 15)
6325 RExC_whilem_seen = 15;
6327 /* Allocate space and zero-initialize. Note, the two step process
6328 of zeroing when in debug mode, thus anything assigned has to
6329 happen after that */
6330 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6332 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6333 char, regexp_internal);
6334 if ( r == NULL || ri == NULL )
6335 FAIL("Regexp out of space");
6337 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6338 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6340 /* bulk initialize base fields with 0. */
6341 Zero(ri, sizeof(regexp_internal), char);
6344 /* non-zero initialization begins here */
6347 r->extflags = rx_flags;
6348 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6350 if (pm_flags & PMf_IS_QR) {
6351 ri->code_blocks = pRExC_state->code_blocks;
6352 ri->num_code_blocks = pRExC_state->num_code_blocks;
6357 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6358 if (pRExC_state->code_blocks[n].src_regex)
6359 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6360 SAVEFREEPV(pRExC_state->code_blocks);
6364 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6365 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6367 /* The caret is output if there are any defaults: if not all the STD
6368 * flags are set, or if no character set specifier is needed */
6370 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6372 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6373 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6374 >> RXf_PMf_STD_PMMOD_SHIFT);
6375 const char *fptr = STD_PAT_MODS; /*"msix"*/
6377 /* Allocate for the worst case, which is all the std flags are turned
6378 * on. If more precision is desired, we could do a population count of
6379 * the flags set. This could be done with a small lookup table, or by
6380 * shifting, masking and adding, or even, when available, assembly
6381 * language for a machine-language population count.
6382 * We never output a minus, as all those are defaults, so are
6383 * covered by the caret */
6384 const STRLEN wraplen = plen + has_p + has_runon
6385 + has_default /* If needs a caret */
6387 /* If needs a character set specifier */
6388 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6389 + (sizeof(STD_PAT_MODS) - 1)
6390 + (sizeof("(?:)") - 1);
6392 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6393 r->xpv_len_u.xpvlenu_pv = p;
6395 SvFLAGS(rx) |= SVf_UTF8;
6398 /* If a default, cover it using the caret */
6400 *p++= DEFAULT_PAT_MOD;
6404 const char* const name = get_regex_charset_name(r->extflags, &len);
6405 Copy(name, p, len, char);
6409 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6412 while((ch = *fptr++)) {
6420 Copy(RExC_precomp, p, plen, char);
6421 assert ((RX_WRAPPED(rx) - p) < 16);
6422 r->pre_prefix = p - RX_WRAPPED(rx);
6428 SvCUR_set(rx, p - RX_WRAPPED(rx));
6432 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6434 /* setup various meta data about recursion, this all requires
6435 * RExC_npar to be correctly set, and a bit later on we clear it */
6436 if (RExC_seen & REG_SEEN_RECURSE) {
6437 Newxz(RExC_open_parens, RExC_npar,regnode *);
6438 SAVEFREEPV(RExC_open_parens);
6439 Newxz(RExC_close_parens,RExC_npar,regnode *);
6440 SAVEFREEPV(RExC_close_parens);
6442 if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) {
6443 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6444 * So its 1 if there are no parens. */
6445 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6446 ((RExC_npar & 0x07) != 0);
6447 Newx(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6448 SAVEFREEPV(RExC_study_chunk_recursed);
6451 /* Useful during FAIL. */
6452 #ifdef RE_TRACK_PATTERN_OFFSETS
6453 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6454 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6455 "%s %"UVuf" bytes for offset annotations.\n",
6456 ri->u.offsets ? "Got" : "Couldn't get",
6457 (UV)((2*RExC_size+1) * sizeof(U32))));
6459 SetProgLen(ri,RExC_size);
6464 /* Second pass: emit code. */
6465 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6466 RExC_pm_flags = pm_flags;
6468 RExC_end = exp + plen;
6471 RExC_emit_start = ri->program;
6472 RExC_emit = ri->program;
6473 RExC_emit_bound = ri->program + RExC_size + 1;
6474 pRExC_state->code_index = 0;
6476 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6477 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6479 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6481 /* XXXX To minimize changes to RE engine we always allocate
6482 3-units-long substrs field. */
6483 Newx(r->substrs, 1, struct reg_substr_data);
6484 if (RExC_recurse_count) {
6485 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6486 SAVEFREEPV(RExC_recurse);
6490 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6491 Zero(r->substrs, 1, struct reg_substr_data);
6492 if (RExC_study_chunk_recursed)
6493 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6495 #ifdef TRIE_STUDY_OPT
6497 StructCopy(&zero_scan_data, &data, scan_data_t);
6498 copyRExC_state = RExC_state;
6501 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6503 RExC_state = copyRExC_state;
6504 if (seen & REG_TOP_LEVEL_BRANCHES)
6505 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6507 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6508 StructCopy(&zero_scan_data, &data, scan_data_t);
6511 StructCopy(&zero_scan_data, &data, scan_data_t);
6514 /* Dig out information for optimizations. */
6515 r->extflags = RExC_flags; /* was pm_op */
6516 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6519 SvUTF8_on(rx); /* Unicode in it? */
6520 ri->regstclass = NULL;
6521 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6522 r->intflags |= PREGf_NAUGHTY;
6523 scan = ri->program + 1; /* First BRANCH. */
6525 /* testing for BRANCH here tells us whether there is "must appear"
6526 data in the pattern. If there is then we can use it for optimisations */
6527 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6529 STRLEN longest_float_length, longest_fixed_length;
6530 regnode_ssc ch_class; /* pointed to by data */
6532 SSize_t last_close = 0; /* pointed to by data */
6533 regnode *first= scan;
6534 regnode *first_next= regnext(first);
6536 * Skip introductions and multiplicators >= 1
6537 * so that we can extract the 'meat' of the pattern that must
6538 * match in the large if() sequence following.
6539 * NOTE that EXACT is NOT covered here, as it is normally
6540 * picked up by the optimiser separately.
6542 * This is unfortunate as the optimiser isnt handling lookahead
6543 * properly currently.
6546 while ((OP(first) == OPEN && (sawopen = 1)) ||
6547 /* An OR of *one* alternative - should not happen now. */
6548 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6549 /* for now we can't handle lookbehind IFMATCH*/
6550 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6551 (OP(first) == PLUS) ||
6552 (OP(first) == MINMOD) ||
6553 /* An {n,m} with n>0 */
6554 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6555 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6558 * the only op that could be a regnode is PLUS, all the rest
6559 * will be regnode_1 or regnode_2.
6561 * (yves doesn't think this is true)
6563 if (OP(first) == PLUS)
6566 if (OP(first) == MINMOD)
6568 first += regarglen[OP(first)];
6570 first = NEXTOPER(first);
6571 first_next= regnext(first);
6574 /* Starting-point info. */
6576 DEBUG_PEEP("first:",first,0);
6577 /* Ignore EXACT as we deal with it later. */
6578 if (PL_regkind[OP(first)] == EXACT) {
6579 if (OP(first) == EXACT)
6580 NOOP; /* Empty, get anchored substr later. */
6582 ri->regstclass = first;
6585 else if (PL_regkind[OP(first)] == TRIE &&
6586 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6589 /* this can happen only on restudy */
6590 if ( OP(first) == TRIE ) {
6591 struct regnode_1 *trieop = (struct regnode_1 *)
6592 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6593 StructCopy(first,trieop,struct regnode_1);
6594 trie_op=(regnode *)trieop;
6596 struct regnode_charclass *trieop = (struct regnode_charclass *)
6597 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6598 StructCopy(first,trieop,struct regnode_charclass);
6599 trie_op=(regnode *)trieop;
6602 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6603 ri->regstclass = trie_op;
6606 else if (REGNODE_SIMPLE(OP(first)))
6607 ri->regstclass = first;
6608 else if (PL_regkind[OP(first)] == BOUND ||
6609 PL_regkind[OP(first)] == NBOUND)
6610 ri->regstclass = first;
6611 else if (PL_regkind[OP(first)] == BOL) {
6612 r->extflags |= (OP(first) == MBOL
6614 : (OP(first) == SBOL
6617 first = NEXTOPER(first);
6620 else if (OP(first) == GPOS) {
6621 r->extflags |= RXf_ANCH_GPOS;
6622 first = NEXTOPER(first);
6625 else if ((!sawopen || !RExC_sawback) &&
6626 (OP(first) == STAR &&
6627 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6628 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6630 /* turn .* into ^.* with an implied $*=1 */
6632 (OP(NEXTOPER(first)) == REG_ANY)
6635 r->extflags |= type;
6636 r->intflags |= PREGf_IMPLICIT;
6637 first = NEXTOPER(first);
6640 if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6641 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6642 /* x+ must match at the 1st pos of run of x's */
6643 r->intflags |= PREGf_SKIP;
6645 /* Scan is after the zeroth branch, first is atomic matcher. */
6646 #ifdef TRIE_STUDY_OPT
6649 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6650 (IV)(first - scan + 1))
6654 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6655 (IV)(first - scan + 1))
6661 * If there's something expensive in the r.e., find the
6662 * longest literal string that must appear and make it the
6663 * regmust. Resolve ties in favor of later strings, since
6664 * the regstart check works with the beginning of the r.e.
6665 * and avoiding duplication strengthens checking. Not a
6666 * strong reason, but sufficient in the absence of others.
6667 * [Now we resolve ties in favor of the earlier string if
6668 * it happens that c_offset_min has been invalidated, since the
6669 * earlier string may buy us something the later one won't.]
6672 data.longest_fixed = newSVpvs("");
6673 data.longest_float = newSVpvs("");
6674 data.last_found = newSVpvs("");
6675 data.longest = &(data.longest_fixed);
6676 ENTER_with_name("study_chunk");
6677 SAVEFREESV(data.longest_fixed);
6678 SAVEFREESV(data.longest_float);
6679 SAVEFREESV(data.last_found);
6681 if (!ri->regstclass) {
6682 ssc_init(pRExC_state, &ch_class);
6683 data.start_class = &ch_class;
6684 stclass_flag = SCF_DO_STCLASS_AND;
6685 } else /* XXXX Check for BOUND? */
6687 data.last_closep = &last_close;
6690 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6692 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6693 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6697 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6700 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6701 && data.last_start_min == 0 && data.last_end > 0
6702 && !RExC_seen_zerolen
6703 && !(RExC_seen & REG_SEEN_VERBARG)
6704 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6705 r->extflags |= RXf_CHECK_ALL;
6706 scan_commit(pRExC_state, &data,&minlen,0);
6708 longest_float_length = CHR_SVLEN(data.longest_float);
6710 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6711 && data.offset_fixed == data.offset_float_min
6712 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6713 && S_setup_longest (aTHX_ pRExC_state,
6717 &(r->float_end_shift),
6718 data.lookbehind_float,
6719 data.offset_float_min,
6721 longest_float_length,
6722 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6723 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6725 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6726 r->float_max_offset = data.offset_float_max;
6727 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6728 r->float_max_offset -= data.lookbehind_float;
6729 SvREFCNT_inc_simple_void_NN(data.longest_float);
6732 r->float_substr = r->float_utf8 = NULL;
6733 longest_float_length = 0;
6736 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6738 if (S_setup_longest (aTHX_ pRExC_state,
6740 &(r->anchored_utf8),
6741 &(r->anchored_substr),
6742 &(r->anchored_end_shift),
6743 data.lookbehind_fixed,
6746 longest_fixed_length,
6747 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6748 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6750 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6751 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6754 r->anchored_substr = r->anchored_utf8 = NULL;
6755 longest_fixed_length = 0;
6757 LEAVE_with_name("study_chunk");
6760 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6761 ri->regstclass = NULL;
6763 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6765 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6766 && !ssc_is_anything(data.start_class))
6768 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6770 ssc_finalize(pRExC_state, data.start_class);
6772 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6773 StructCopy(data.start_class,
6774 (regnode_ssc*)RExC_rxi->data->data[n],
6776 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6777 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6778 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6779 regprop(r, sv, (regnode*)data.start_class);
6780 PerlIO_printf(Perl_debug_log,
6781 "synthetic stclass \"%s\".\n",
6782 SvPVX_const(sv));});
6783 data.start_class = NULL;
6786 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6787 if (longest_fixed_length > longest_float_length) {
6788 r->check_end_shift = r->anchored_end_shift;
6789 r->check_substr = r->anchored_substr;
6790 r->check_utf8 = r->anchored_utf8;
6791 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6792 if (r->extflags & RXf_ANCH_SINGLE)
6793 r->extflags |= RXf_NOSCAN;
6796 r->check_end_shift = r->float_end_shift;
6797 r->check_substr = r->float_substr;
6798 r->check_utf8 = r->float_utf8;
6799 r->check_offset_min = r->float_min_offset;
6800 r->check_offset_max = r->float_max_offset;
6802 if ((r->check_substr || r->check_utf8) ) {
6803 r->extflags |= RXf_USE_INTUIT;
6804 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6805 r->extflags |= RXf_INTUIT_TAIL;
6807 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6808 if ( (STRLEN)minlen < longest_float_length )
6809 minlen= longest_float_length;
6810 if ( (STRLEN)minlen < longest_fixed_length )
6811 minlen= longest_fixed_length;
6815 /* Several toplevels. Best we can is to set minlen. */
6817 regnode_ssc ch_class;
6818 SSize_t last_close = 0;
6820 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6822 scan = ri->program + 1;
6823 ssc_init(pRExC_state, &ch_class);
6824 data.start_class = &ch_class;
6825 data.last_closep = &last_close;
6828 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6830 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6831 |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6834 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6836 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6837 = r->float_substr = r->float_utf8 = NULL;
6839 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6840 && ! ssc_is_anything(data.start_class))
6842 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6844 ssc_finalize(pRExC_state, data.start_class);
6846 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6847 StructCopy(data.start_class,
6848 (regnode_ssc*)RExC_rxi->data->data[n],
6850 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6851 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6852 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6853 regprop(r, sv, (regnode*)data.start_class);
6854 PerlIO_printf(Perl_debug_log,
6855 "synthetic stclass \"%s\".\n",
6856 SvPVX_const(sv));});
6857 data.start_class = NULL;
6861 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6862 the "real" pattern. */
6864 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6865 (IV)minlen, (IV)r->minlen);
6867 r->minlenret = minlen;
6868 if (r->minlen < minlen)
6871 if (RExC_seen & REG_SEEN_GPOS)
6872 r->extflags |= RXf_GPOS_SEEN;
6873 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6874 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6875 if (pRExC_state->num_code_blocks)
6876 r->extflags |= RXf_EVAL_SEEN;
6877 if (RExC_seen & REG_SEEN_CANY)
6878 r->extflags |= RXf_CANY_SEEN;
6879 if (RExC_seen & REG_SEEN_VERBARG)
6881 r->intflags |= PREGf_VERBARG_SEEN;
6882 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6884 if (RExC_seen & REG_SEEN_CUTGROUP)
6885 r->intflags |= PREGf_CUTGROUP_SEEN;
6886 if (pm_flags & PMf_USE_RE_EVAL)
6887 r->intflags |= PREGf_USE_RE_EVAL;
6888 if (RExC_paren_names)
6889 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6891 RXp_PAREN_NAMES(r) = NULL;
6894 regnode *first = ri->program + 1;
6896 regnode *next = NEXTOPER(first);
6899 if (PL_regkind[fop] == NOTHING && nop == END)
6900 r->extflags |= RXf_NULL;
6901 else if (PL_regkind[fop] == BOL && nop == END)
6902 r->extflags |= RXf_START_ONLY;
6903 else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6904 r->extflags |= RXf_WHITE;
6905 else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6906 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6910 if (RExC_paren_names) {
6911 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6912 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6915 ri->name_list_idx = 0;
6917 if (RExC_recurse_count) {
6918 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6919 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6920 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6923 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6924 /* assume we don't need to swap parens around before we match */
6928 PerlIO_printf(Perl_debug_log,"Final program:\n");
6931 #ifdef RE_TRACK_PATTERN_OFFSETS
6932 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6933 const STRLEN len = ri->u.offsets[0];
6935 GET_RE_DEBUG_FLAGS_DECL;
6936 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6937 for (i = 1; i <= len; i++) {
6938 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6939 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6940 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6942 PerlIO_printf(Perl_debug_log, "\n");
6947 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6948 * by setting the regexp SV to readonly-only instead. If the
6949 * pattern's been recompiled, the USEDness should remain. */
6950 if (old_re && SvREADONLY(old_re))
6958 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6961 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6963 PERL_UNUSED_ARG(value);
6965 if (flags & RXapif_FETCH) {
6966 return reg_named_buff_fetch(rx, key, flags);
6967 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6968 Perl_croak_no_modify();
6970 } else if (flags & RXapif_EXISTS) {
6971 return reg_named_buff_exists(rx, key, flags)
6974 } else if (flags & RXapif_REGNAMES) {
6975 return reg_named_buff_all(rx, flags);
6976 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6977 return reg_named_buff_scalar(rx, flags);
6979 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6985 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6988 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6989 PERL_UNUSED_ARG(lastkey);
6991 if (flags & RXapif_FIRSTKEY)
6992 return reg_named_buff_firstkey(rx, flags);
6993 else if (flags & RXapif_NEXTKEY)
6994 return reg_named_buff_nextkey(rx, flags);
6996 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
7002 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7005 AV *retarray = NULL;
7007 struct regexp *const rx = ReANY(r);
7009 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7011 if (flags & RXapif_ALL)
7014 if (rx && RXp_PAREN_NAMES(rx)) {
7015 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7018 SV* sv_dat=HeVAL(he_str);
7019 I32 *nums=(I32*)SvPVX(sv_dat);
7020 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7021 if ((I32)(rx->nparens) >= nums[i]
7022 && rx->offs[nums[i]].start != -1
7023 && rx->offs[nums[i]].end != -1)
7026 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7031 ret = newSVsv(&PL_sv_undef);
7034 av_push(retarray, ret);
7037 return newRV_noinc(MUTABLE_SV(retarray));
7044 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7047 struct regexp *const rx = ReANY(r);
7049 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7051 if (rx && RXp_PAREN_NAMES(rx)) {
7052 if (flags & RXapif_ALL) {
7053 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7055 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7057 SvREFCNT_dec_NN(sv);
7069 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7071 struct regexp *const rx = ReANY(r);
7073 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7075 if ( rx && RXp_PAREN_NAMES(rx) ) {
7076 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7078 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7085 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7087 struct regexp *const rx = ReANY(r);
7088 GET_RE_DEBUG_FLAGS_DECL;
7090 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7092 if (rx && RXp_PAREN_NAMES(rx)) {
7093 HV *hv = RXp_PAREN_NAMES(rx);
7095 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7098 SV* sv_dat = HeVAL(temphe);
7099 I32 *nums = (I32*)SvPVX(sv_dat);
7100 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7101 if ((I32)(rx->lastparen) >= nums[i] &&
7102 rx->offs[nums[i]].start != -1 &&
7103 rx->offs[nums[i]].end != -1)
7109 if (parno || flags & RXapif_ALL) {
7110 return newSVhek(HeKEY_hek(temphe));
7118 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7123 struct regexp *const rx = ReANY(r);
7125 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7127 if (rx && RXp_PAREN_NAMES(rx)) {
7128 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7129 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7130 } else if (flags & RXapif_ONE) {
7131 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7132 av = MUTABLE_AV(SvRV(ret));
7133 length = av_len(av);
7134 SvREFCNT_dec_NN(ret);
7135 return newSViv(length + 1);
7137 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
7141 return &PL_sv_undef;
7145 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7147 struct regexp *const rx = ReANY(r);
7150 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7152 if (rx && RXp_PAREN_NAMES(rx)) {
7153 HV *hv= RXp_PAREN_NAMES(rx);
7155 (void)hv_iterinit(hv);
7156 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7159 SV* sv_dat = HeVAL(temphe);
7160 I32 *nums = (I32*)SvPVX(sv_dat);
7161 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7162 if ((I32)(rx->lastparen) >= nums[i] &&
7163 rx->offs[nums[i]].start != -1 &&
7164 rx->offs[nums[i]].end != -1)
7170 if (parno || flags & RXapif_ALL) {
7171 av_push(av, newSVhek(HeKEY_hek(temphe)));
7176 return newRV_noinc(MUTABLE_SV(av));
7180 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7183 struct regexp *const rx = ReANY(r);
7189 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7191 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7192 || n == RX_BUFF_IDX_CARET_FULLMATCH
7193 || n == RX_BUFF_IDX_CARET_POSTMATCH
7196 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7198 /* on something like
7201 * the KEEPCOPY is set on the PMOP rather than the regex */
7202 if (PL_curpm && r == PM_GETRE(PL_curpm))
7203 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7212 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7213 /* no need to distinguish between them any more */
7214 n = RX_BUFF_IDX_FULLMATCH;
7216 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7217 && rx->offs[0].start != -1)
7219 /* $`, ${^PREMATCH} */
7220 i = rx->offs[0].start;
7224 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7225 && rx->offs[0].end != -1)
7227 /* $', ${^POSTMATCH} */
7228 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7229 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7232 if ( 0 <= n && n <= (I32)rx->nparens &&
7233 (s1 = rx->offs[n].start) != -1 &&
7234 (t1 = rx->offs[n].end) != -1)
7236 /* $&, ${^MATCH}, $1 ... */
7238 s = rx->subbeg + s1 - rx->suboffset;
7243 assert(s >= rx->subbeg);
7244 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7246 #if NO_TAINT_SUPPORT
7247 sv_setpvn(sv, s, i);
7249 const int oldtainted = TAINT_get;
7251 sv_setpvn(sv, s, i);
7252 TAINT_set(oldtainted);
7254 if ( (rx->extflags & RXf_CANY_SEEN)
7255 ? (RXp_MATCH_UTF8(rx)
7256 && (!i || is_utf8_string((U8*)s, i)))
7257 : (RXp_MATCH_UTF8(rx)) )
7264 if (RXp_MATCH_TAINTED(rx)) {
7265 if (SvTYPE(sv) >= SVt_PVMG) {
7266 MAGIC* const mg = SvMAGIC(sv);
7269 SvMAGIC_set(sv, mg->mg_moremagic);
7271 if ((mgt = SvMAGIC(sv))) {
7272 mg->mg_moremagic = mgt;
7273 SvMAGIC_set(sv, mg);
7284 sv_setsv(sv,&PL_sv_undef);
7290 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7291 SV const * const value)
7293 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7295 PERL_UNUSED_ARG(rx);
7296 PERL_UNUSED_ARG(paren);
7297 PERL_UNUSED_ARG(value);
7300 Perl_croak_no_modify();
7304 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7307 struct regexp *const rx = ReANY(r);
7311 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7313 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7314 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7315 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7318 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7320 /* on something like
7323 * the KEEPCOPY is set on the PMOP rather than the regex */
7324 if (PL_curpm && r == PM_GETRE(PL_curpm))
7325 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7331 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7333 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7334 case RX_BUFF_IDX_PREMATCH: /* $` */
7335 if (rx->offs[0].start != -1) {
7336 i = rx->offs[0].start;
7345 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7346 case RX_BUFF_IDX_POSTMATCH: /* $' */
7347 if (rx->offs[0].end != -1) {
7348 i = rx->sublen - rx->offs[0].end;
7350 s1 = rx->offs[0].end;
7357 default: /* $& / ${^MATCH}, $1, $2, ... */
7358 if (paren <= (I32)rx->nparens &&
7359 (s1 = rx->offs[paren].start) != -1 &&
7360 (t1 = rx->offs[paren].end) != -1)
7366 if (ckWARN(WARN_UNINITIALIZED))
7367 report_uninit((const SV *)sv);
7372 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7373 const char * const s = rx->subbeg - rx->suboffset + s1;
7378 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7385 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7387 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7388 PERL_UNUSED_ARG(rx);
7392 return newSVpvs("Regexp");
7395 /* Scans the name of a named buffer from the pattern.
7396 * If flags is REG_RSN_RETURN_NULL returns null.
7397 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7398 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7399 * to the parsed name as looked up in the RExC_paren_names hash.
7400 * If there is an error throws a vFAIL().. type exception.
7403 #define REG_RSN_RETURN_NULL 0
7404 #define REG_RSN_RETURN_NAME 1
7405 #define REG_RSN_RETURN_DATA 2
7408 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7410 char *name_start = RExC_parse;
7412 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7414 assert (RExC_parse <= RExC_end);
7415 if (RExC_parse == RExC_end) NOOP;
7416 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7417 /* skip IDFIRST by using do...while */
7420 RExC_parse += UTF8SKIP(RExC_parse);
7421 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7425 } while (isWORDCHAR(*RExC_parse));
7427 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7428 vFAIL("Group name must start with a non-digit word character");
7432 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7433 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7434 if ( flags == REG_RSN_RETURN_NAME)
7436 else if (flags==REG_RSN_RETURN_DATA) {
7439 if ( ! sv_name ) /* should not happen*/
7440 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7441 if (RExC_paren_names)
7442 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7444 sv_dat = HeVAL(he_str);
7446 vFAIL("Reference to nonexistent named group");
7450 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7451 (unsigned long) flags);
7453 assert(0); /* NOT REACHED */
7458 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7459 int rem=(int)(RExC_end - RExC_parse); \
7468 if (RExC_lastparse!=RExC_parse) \
7469 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7472 iscut ? "..." : "<" \
7475 PerlIO_printf(Perl_debug_log,"%16s",""); \
7478 num = RExC_size + 1; \
7480 num=REG_NODE_NUM(RExC_emit); \
7481 if (RExC_lastnum!=num) \
7482 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7484 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7485 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7486 (int)((depth*2)), "", \
7490 RExC_lastparse=RExC_parse; \
7495 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7496 DEBUG_PARSE_MSG((funcname)); \
7497 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7499 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7500 DEBUG_PARSE_MSG((funcname)); \
7501 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7504 /* This section of code defines the inversion list object and its methods. The
7505 * interfaces are highly subject to change, so as much as possible is static to
7506 * this file. An inversion list is here implemented as a malloc'd C UV array
7507 * as an SVt_INVLIST scalar.
7509 * An inversion list for Unicode is an array of code points, sorted by ordinal
7510 * number. The zeroth element is the first code point in the list. The 1th
7511 * element is the first element beyond that not in the list. In other words,
7512 * the first range is
7513 * invlist[0]..(invlist[1]-1)
7514 * The other ranges follow. Thus every element whose index is divisible by two
7515 * marks the beginning of a range that is in the list, and every element not
7516 * divisible by two marks the beginning of a range not in the list. A single
7517 * element inversion list that contains the single code point N generally
7518 * consists of two elements
7521 * (The exception is when N is the highest representable value on the
7522 * machine, in which case the list containing just it would be a single
7523 * element, itself. By extension, if the last range in the list extends to
7524 * infinity, then the first element of that range will be in the inversion list
7525 * at a position that is divisible by two, and is the final element in the
7527 * Taking the complement (inverting) an inversion list is quite simple, if the
7528 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7529 * This implementation reserves an element at the beginning of each inversion
7530 * list to always contain 0; there is an additional flag in the header which
7531 * indicates if the list begins at the 0, or is offset to begin at the next
7534 * More about inversion lists can be found in "Unicode Demystified"
7535 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7536 * More will be coming when functionality is added later.
7538 * The inversion list data structure is currently implemented as an SV pointing
7539 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7540 * array of UV whose memory management is automatically handled by the existing
7541 * facilities for SV's.
7543 * Some of the methods should always be private to the implementation, and some
7544 * should eventually be made public */
7546 /* The header definitions are in F<inline_invlist.c> */
7548 PERL_STATIC_INLINE UV*
7549 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7551 /* Returns a pointer to the first element in the inversion list's array.
7552 * This is called upon initialization of an inversion list. Where the
7553 * array begins depends on whether the list has the code point U+0000 in it
7554 * or not. The other parameter tells it whether the code that follows this
7555 * call is about to put a 0 in the inversion list or not. The first
7556 * element is either the element reserved for 0, if TRUE, or the element
7557 * after it, if FALSE */
7559 bool* offset = get_invlist_offset_addr(invlist);
7560 UV* zero_addr = (UV *) SvPVX(invlist);
7562 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7565 assert(! _invlist_len(invlist));
7569 /* 1^1 = 0; 1^0 = 1 */
7570 *offset = 1 ^ will_have_0;
7571 return zero_addr + *offset;
7574 PERL_STATIC_INLINE UV*
7575 S_invlist_array(pTHX_ SV* const invlist)
7577 /* Returns the pointer to the inversion list's array. Every time the
7578 * length changes, this needs to be called in case malloc or realloc moved
7581 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7583 /* Must not be empty. If these fail, you probably didn't check for <len>
7584 * being non-zero before trying to get the array */
7585 assert(_invlist_len(invlist));
7587 /* The very first element always contains zero, The array begins either
7588 * there, or if the inversion list is offset, at the element after it.
7589 * The offset header field determines which; it contains 0 or 1 to indicate
7590 * how much additionally to add */
7591 assert(0 == *(SvPVX(invlist)));
7592 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7595 PERL_STATIC_INLINE void
7596 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7598 /* Sets the current number of elements stored in the inversion list.
7599 * Updates SvCUR correspondingly */
7601 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7603 assert(SvTYPE(invlist) == SVt_INVLIST);
7608 : TO_INTERNAL_SIZE(len + offset));
7609 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7612 PERL_STATIC_INLINE IV*
7613 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7615 /* Return the address of the IV that is reserved to hold the cached index
7618 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7620 assert(SvTYPE(invlist) == SVt_INVLIST);
7622 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7625 PERL_STATIC_INLINE IV
7626 S_invlist_previous_index(pTHX_ SV* const invlist)
7628 /* Returns cached index of previous search */
7630 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7632 return *get_invlist_previous_index_addr(invlist);
7635 PERL_STATIC_INLINE void
7636 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7638 /* Caches <index> for later retrieval */
7640 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7642 assert(index == 0 || index < (int) _invlist_len(invlist));
7644 *get_invlist_previous_index_addr(invlist) = index;
7647 PERL_STATIC_INLINE UV
7648 S_invlist_max(pTHX_ SV* const invlist)
7650 /* Returns the maximum number of elements storable in the inversion list's
7651 * array, without having to realloc() */
7653 PERL_ARGS_ASSERT_INVLIST_MAX;
7655 assert(SvTYPE(invlist) == SVt_INVLIST);
7657 /* Assumes worst case, in which the 0 element is not counted in the
7658 * inversion list, so subtracts 1 for that */
7659 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7660 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7661 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7664 #ifndef PERL_IN_XSUB_RE
7666 Perl__new_invlist(pTHX_ IV initial_size)
7669 /* Return a pointer to a newly constructed inversion list, with enough
7670 * space to store 'initial_size' elements. If that number is negative, a
7671 * system default is used instead */
7675 if (initial_size < 0) {
7679 /* Allocate the initial space */
7680 new_list = newSV_type(SVt_INVLIST);
7682 /* First 1 is in case the zero element isn't in the list; second 1 is for
7684 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7685 invlist_set_len(new_list, 0, 0);
7687 /* Force iterinit() to be used to get iteration to work */
7688 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7690 *get_invlist_previous_index_addr(new_list) = 0;
7697 S__new_invlist_C_array(pTHX_ const UV* const list)
7699 /* Return a pointer to a newly constructed inversion list, initialized to
7700 * point to <list>, which has to be in the exact correct inversion list
7701 * form, including internal fields. Thus this is a dangerous routine that
7702 * should not be used in the wrong hands. The passed in 'list' contains
7703 * several header fields at the beginning that are not part of the
7704 * inversion list body proper */
7706 const STRLEN length = (STRLEN) list[0];
7707 const UV version_id = list[1];
7708 const bool offset = cBOOL(list[2]);
7709 #define HEADER_LENGTH 3
7710 /* If any of the above changes in any way, you must change HEADER_LENGTH
7711 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7712 * perl -E 'say int(rand 2**31-1)'
7714 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7715 data structure type, so that one being
7716 passed in can be validated to be an
7717 inversion list of the correct vintage.
7720 SV* invlist = newSV_type(SVt_INVLIST);
7722 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7724 if (version_id != INVLIST_VERSION_ID) {
7725 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7728 /* The generated array passed in includes header elements that aren't part
7729 * of the list proper, so start it just after them */
7730 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7732 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7733 shouldn't touch it */
7735 *(get_invlist_offset_addr(invlist)) = offset;
7737 /* The 'length' passed to us is the physical number of elements in the
7738 * inversion list. But if there is an offset the logical number is one
7740 invlist_set_len(invlist, length - offset, offset);
7742 invlist_set_previous_index(invlist, 0);
7744 /* Initialize the iteration pointer. */
7745 invlist_iterfinish(invlist);
7751 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7753 /* Grow the maximum size of an inversion list */
7755 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7757 assert(SvTYPE(invlist) == SVt_INVLIST);
7759 /* Add one to account for the zero element at the beginning which may not
7760 * be counted by the calling parameters */
7761 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7764 PERL_STATIC_INLINE void
7765 S_invlist_trim(pTHX_ SV* const invlist)
7767 PERL_ARGS_ASSERT_INVLIST_TRIM;
7769 assert(SvTYPE(invlist) == SVt_INVLIST);
7771 /* Change the length of the inversion list to how many entries it currently
7773 SvPV_shrink_to_cur((SV *) invlist);
7777 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7779 /* Subject to change or removal. Append the range from 'start' to 'end' at
7780 * the end of the inversion list. The range must be above any existing
7784 UV max = invlist_max(invlist);
7785 UV len = _invlist_len(invlist);
7788 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7790 if (len == 0) { /* Empty lists must be initialized */
7791 offset = start != 0;
7792 array = _invlist_array_init(invlist, ! offset);
7795 /* Here, the existing list is non-empty. The current max entry in the
7796 * list is generally the first value not in the set, except when the
7797 * set extends to the end of permissible values, in which case it is
7798 * the first entry in that final set, and so this call is an attempt to
7799 * append out-of-order */
7801 UV final_element = len - 1;
7802 array = invlist_array(invlist);
7803 if (array[final_element] > start
7804 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7806 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7807 array[final_element], start,
7808 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7811 /* Here, it is a legal append. If the new range begins with the first
7812 * value not in the set, it is extending the set, so the new first
7813 * value not in the set is one greater than the newly extended range.
7815 offset = *get_invlist_offset_addr(invlist);
7816 if (array[final_element] == start) {
7817 if (end != UV_MAX) {
7818 array[final_element] = end + 1;
7821 /* But if the end is the maximum representable on the machine,
7822 * just let the range that this would extend to have no end */
7823 invlist_set_len(invlist, len - 1, offset);
7829 /* Here the new range doesn't extend any existing set. Add it */
7831 len += 2; /* Includes an element each for the start and end of range */
7833 /* If wll overflow the existing space, extend, which may cause the array to
7836 invlist_extend(invlist, len);
7838 /* Have to set len here to avoid assert failure in invlist_array() */
7839 invlist_set_len(invlist, len, offset);
7841 array = invlist_array(invlist);
7844 invlist_set_len(invlist, len, offset);
7847 /* The next item on the list starts the range, the one after that is
7848 * one past the new range. */
7849 array[len - 2] = start;
7850 if (end != UV_MAX) {
7851 array[len - 1] = end + 1;
7854 /* But if the end is the maximum representable on the machine, just let
7855 * the range have no end */
7856 invlist_set_len(invlist, len - 1, offset);
7860 #ifndef PERL_IN_XSUB_RE
7863 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7865 /* Searches the inversion list for the entry that contains the input code
7866 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7867 * return value is the index into the list's array of the range that
7872 IV high = _invlist_len(invlist);
7873 const IV highest_element = high - 1;
7876 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7878 /* If list is empty, return failure. */
7883 /* (We can't get the array unless we know the list is non-empty) */
7884 array = invlist_array(invlist);
7886 mid = invlist_previous_index(invlist);
7887 assert(mid >=0 && mid <= highest_element);
7889 /* <mid> contains the cache of the result of the previous call to this
7890 * function (0 the first time). See if this call is for the same result,
7891 * or if it is for mid-1. This is under the theory that calls to this
7892 * function will often be for related code points that are near each other.
7893 * And benchmarks show that caching gives better results. We also test
7894 * here if the code point is within the bounds of the list. These tests
7895 * replace others that would have had to be made anyway to make sure that
7896 * the array bounds were not exceeded, and these give us extra information
7897 * at the same time */
7898 if (cp >= array[mid]) {
7899 if (cp >= array[highest_element]) {
7900 return highest_element;
7903 /* Here, array[mid] <= cp < array[highest_element]. This means that
7904 * the final element is not the answer, so can exclude it; it also
7905 * means that <mid> is not the final element, so can refer to 'mid + 1'
7907 if (cp < array[mid + 1]) {
7913 else { /* cp < aray[mid] */
7914 if (cp < array[0]) { /* Fail if outside the array */
7918 if (cp >= array[mid - 1]) {
7923 /* Binary search. What we are looking for is <i> such that
7924 * array[i] <= cp < array[i+1]
7925 * The loop below converges on the i+1. Note that there may not be an
7926 * (i+1)th element in the array, and things work nonetheless */
7927 while (low < high) {
7928 mid = (low + high) / 2;
7929 assert(mid <= highest_element);
7930 if (array[mid] <= cp) { /* cp >= array[mid] */
7933 /* We could do this extra test to exit the loop early.
7934 if (cp < array[low]) {
7939 else { /* cp < array[mid] */
7946 invlist_set_previous_index(invlist, high);
7951 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7953 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7954 * but is used when the swash has an inversion list. This makes this much
7955 * faster, as it uses a binary search instead of a linear one. This is
7956 * intimately tied to that function, and perhaps should be in utf8.c,
7957 * except it is intimately tied to inversion lists as well. It assumes
7958 * that <swatch> is all 0's on input */
7961 const IV len = _invlist_len(invlist);
7965 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7967 if (len == 0) { /* Empty inversion list */
7971 array = invlist_array(invlist);
7973 /* Find which element it is */
7974 i = _invlist_search(invlist, start);
7976 /* We populate from <start> to <end> */
7977 while (current < end) {
7980 /* The inversion list gives the results for every possible code point
7981 * after the first one in the list. Only those ranges whose index is
7982 * even are ones that the inversion list matches. For the odd ones,
7983 * and if the initial code point is not in the list, we have to skip
7984 * forward to the next element */
7985 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7987 if (i >= len) { /* Finished if beyond the end of the array */
7991 if (current >= end) { /* Finished if beyond the end of what we
7993 if (LIKELY(end < UV_MAX)) {
7997 /* We get here when the upper bound is the maximum
7998 * representable on the machine, and we are looking for just
7999 * that code point. Have to special case it */
8001 goto join_end_of_list;
8004 assert(current >= start);
8006 /* The current range ends one below the next one, except don't go past
8009 upper = (i < len && array[i] < end) ? array[i] : end;
8011 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8012 * for each code point in it */
8013 for (; current < upper; current++) {
8014 const STRLEN offset = (STRLEN)(current - start);
8015 swatch[offset >> 3] |= 1 << (offset & 7);
8020 /* Quit if at the end of the list */
8023 /* But first, have to deal with the highest possible code point on
8024 * the platform. The previous code assumes that <end> is one
8025 * beyond where we want to populate, but that is impossible at the
8026 * platform's infinity, so have to handle it specially */
8027 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8029 const STRLEN offset = (STRLEN)(end - start);
8030 swatch[offset >> 3] |= 1 << (offset & 7);
8035 /* Advance to the next range, which will be for code points not in the
8044 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
8046 /* Take the union of two inversion lists and point <output> to it. *output
8047 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8048 * the reference count to that list will be decremented if not already a
8049 * temporary (mortal); otherwise *output will be made correspondingly
8050 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8051 * second list is returned. If <complement_b> is TRUE, the union is taken
8052 * of the complement (inversion) of <b> instead of b itself.
8054 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8055 * Richard Gillam, published by Addison-Wesley, and explained at some
8056 * length there. The preface says to incorporate its examples into your
8057 * code at your own risk.
8059 * The algorithm is like a merge sort.
8061 * XXX A potential performance improvement is to keep track as we go along
8062 * if only one of the inputs contributes to the result, meaning the other
8063 * is a subset of that one. In that case, we can skip the final copy and
8064 * return the larger of the input lists, but then outside code might need
8065 * to keep track of whether to free the input list or not */
8067 const UV* array_a; /* a's array */
8069 UV len_a; /* length of a's array */
8072 SV* u; /* the resulting union */
8076 UV i_a = 0; /* current index into a's array */
8080 /* running count, as explained in the algorithm source book; items are
8081 * stopped accumulating and are output when the count changes to/from 0.
8082 * The count is incremented when we start a range that's in the set, and
8083 * decremented when we start a range that's not in the set. So its range
8084 * is 0 to 2. Only when the count is zero is something not in the set.
8088 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8091 /* If either one is empty, the union is the other one */
8092 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8093 bool make_temp = FALSE; /* Should we mortalize the result? */
8097 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8103 *output = invlist_clone(b);
8105 _invlist_invert(*output);
8107 } /* else *output already = b; */
8110 sv_2mortal(*output);
8114 else if ((len_b = _invlist_len(b)) == 0) {
8115 bool make_temp = FALSE;
8117 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8122 /* The complement of an empty list is a list that has everything in it,
8123 * so the union with <a> includes everything too */
8126 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8130 *output = _new_invlist(1);
8131 _append_range_to_invlist(*output, 0, UV_MAX);
8133 else if (*output != a) {
8134 *output = invlist_clone(a);
8136 /* else *output already = a; */
8139 sv_2mortal(*output);
8144 /* Here both lists exist and are non-empty */
8145 array_a = invlist_array(a);
8146 array_b = invlist_array(b);
8148 /* If are to take the union of 'a' with the complement of b, set it
8149 * up so are looking at b's complement. */
8152 /* To complement, we invert: if the first element is 0, remove it. To
8153 * do this, we just pretend the array starts one later */
8154 if (array_b[0] == 0) {
8160 /* But if the first element is not zero, we pretend the list starts
8161 * at the 0 that is always stored immediately before the array. */
8167 /* Size the union for the worst case: that the sets are completely
8169 u = _new_invlist(len_a + len_b);
8171 /* Will contain U+0000 if either component does */
8172 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8173 || (len_b > 0 && array_b[0] == 0));
8175 /* Go through each list item by item, stopping when exhausted one of
8177 while (i_a < len_a && i_b < len_b) {
8178 UV cp; /* The element to potentially add to the union's array */
8179 bool cp_in_set; /* is it in the the input list's set or not */
8181 /* We need to take one or the other of the two inputs for the union.
8182 * Since we are merging two sorted lists, we take the smaller of the
8183 * next items. In case of a tie, we take the one that is in its set
8184 * first. If we took one not in the set first, it would decrement the
8185 * count, possibly to 0 which would cause it to be output as ending the
8186 * range, and the next time through we would take the same number, and
8187 * output it again as beginning the next range. By doing it the
8188 * opposite way, there is no possibility that the count will be
8189 * momentarily decremented to 0, and thus the two adjoining ranges will
8190 * be seamlessly merged. (In a tie and both are in the set or both not
8191 * in the set, it doesn't matter which we take first.) */
8192 if (array_a[i_a] < array_b[i_b]
8193 || (array_a[i_a] == array_b[i_b]
8194 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8196 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8200 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8201 cp = array_b[i_b++];
8204 /* Here, have chosen which of the two inputs to look at. Only output
8205 * if the running count changes to/from 0, which marks the
8206 * beginning/end of a range in that's in the set */
8209 array_u[i_u++] = cp;
8216 array_u[i_u++] = cp;
8221 /* Here, we are finished going through at least one of the lists, which
8222 * means there is something remaining in at most one. We check if the list
8223 * that hasn't been exhausted is positioned such that we are in the middle
8224 * of a range in its set or not. (i_a and i_b point to the element beyond
8225 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8226 * is potentially more to output.
8227 * There are four cases:
8228 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8229 * in the union is entirely from the non-exhausted set.
8230 * 2) Both were in their sets, count is 2. Nothing further should
8231 * be output, as everything that remains will be in the exhausted
8232 * list's set, hence in the union; decrementing to 1 but not 0 insures
8234 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8235 * Nothing further should be output because the union includes
8236 * everything from the exhausted set. Not decrementing ensures that.
8237 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8238 * decrementing to 0 insures that we look at the remainder of the
8239 * non-exhausted set */
8240 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8241 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8246 /* The final length is what we've output so far, plus what else is about to
8247 * be output. (If 'count' is non-zero, then the input list we exhausted
8248 * has everything remaining up to the machine's limit in its set, and hence
8249 * in the union, so there will be no further output. */
8252 /* At most one of the subexpressions will be non-zero */
8253 len_u += (len_a - i_a) + (len_b - i_b);
8256 /* Set result to final length, which can change the pointer to array_u, so
8258 if (len_u != _invlist_len(u)) {
8259 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8261 array_u = invlist_array(u);
8264 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8265 * the other) ended with everything above it not in its set. That means
8266 * that the remaining part of the union is precisely the same as the
8267 * non-exhausted list, so can just copy it unchanged. (If both list were
8268 * exhausted at the same time, then the operations below will be both 0.)
8271 IV copy_count; /* At most one will have a non-zero copy count */
8272 if ((copy_count = len_a - i_a) > 0) {
8273 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8275 else if ((copy_count = len_b - i_b) > 0) {
8276 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8280 /* We may be removing a reference to one of the inputs. If so, the output
8281 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8282 * count decremented) */
8283 if (a == *output || b == *output) {
8284 assert(! invlist_is_iterating(*output));
8285 if ((SvTEMP(*output))) {
8289 SvREFCNT_dec_NN(*output);
8299 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8301 /* Take the intersection of two inversion lists and point <i> to it. *i
8302 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8303 * the reference count to that list will be decremented if not already a
8304 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8305 * The first list, <a>, may be NULL, in which case an empty list is
8306 * returned. If <complement_b> is TRUE, the result will be the
8307 * intersection of <a> and the complement (or inversion) of <b> instead of
8310 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8311 * Richard Gillam, published by Addison-Wesley, and explained at some
8312 * length there. The preface says to incorporate its examples into your
8313 * code at your own risk. In fact, it had bugs
8315 * The algorithm is like a merge sort, and is essentially the same as the
8319 const UV* array_a; /* a's array */
8321 UV len_a; /* length of a's array */
8324 SV* r; /* the resulting intersection */
8328 UV i_a = 0; /* current index into a's array */
8332 /* running count, as explained in the algorithm source book; items are
8333 * stopped accumulating and are output when the count changes to/from 2.
8334 * The count is incremented when we start a range that's in the set, and
8335 * decremented when we start a range that's not in the set. So its range
8336 * is 0 to 2. Only when the count is 2 is something in the intersection.
8340 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8343 /* Special case if either one is empty */
8344 len_a = (a == NULL) ? 0 : _invlist_len(a);
8345 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8346 bool make_temp = FALSE;
8348 if (len_a != 0 && complement_b) {
8350 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8351 * be empty. Here, also we are using 'b's complement, which hence
8352 * must be every possible code point. Thus the intersection is
8356 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8361 *i = invlist_clone(a);
8363 /* else *i is already 'a' */
8371 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8372 * intersection must be empty */
8374 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8379 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8383 *i = _new_invlist(0);
8391 /* Here both lists exist and are non-empty */
8392 array_a = invlist_array(a);
8393 array_b = invlist_array(b);
8395 /* If are to take the intersection of 'a' with the complement of b, set it
8396 * up so are looking at b's complement. */
8399 /* To complement, we invert: if the first element is 0, remove it. To
8400 * do this, we just pretend the array starts one later */
8401 if (array_b[0] == 0) {
8407 /* But if the first element is not zero, we pretend the list starts
8408 * at the 0 that is always stored immediately before the array. */
8414 /* Size the intersection for the worst case: that the intersection ends up
8415 * fragmenting everything to be completely disjoint */
8416 r= _new_invlist(len_a + len_b);
8418 /* Will contain U+0000 iff both components do */
8419 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8420 && len_b > 0 && array_b[0] == 0);
8422 /* Go through each list item by item, stopping when exhausted one of
8424 while (i_a < len_a && i_b < len_b) {
8425 UV cp; /* The element to potentially add to the intersection's
8427 bool cp_in_set; /* Is it in the input list's set or not */
8429 /* We need to take one or the other of the two inputs for the
8430 * intersection. Since we are merging two sorted lists, we take the
8431 * smaller of the next items. In case of a tie, we take the one that
8432 * is not in its set first (a difference from the union algorithm). If
8433 * we took one in the set first, it would increment the count, possibly
8434 * to 2 which would cause it to be output as starting a range in the
8435 * intersection, and the next time through we would take that same
8436 * number, and output it again as ending the set. By doing it the
8437 * opposite of this, there is no possibility that the count will be
8438 * momentarily incremented to 2. (In a tie and both are in the set or
8439 * both not in the set, it doesn't matter which we take first.) */
8440 if (array_a[i_a] < array_b[i_b]
8441 || (array_a[i_a] == array_b[i_b]
8442 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8444 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8448 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8452 /* Here, have chosen which of the two inputs to look at. Only output
8453 * if the running count changes to/from 2, which marks the
8454 * beginning/end of a range that's in the intersection */
8458 array_r[i_r++] = cp;
8463 array_r[i_r++] = cp;
8469 /* Here, we are finished going through at least one of the lists, which
8470 * means there is something remaining in at most one. We check if the list
8471 * that has been exhausted is positioned such that we are in the middle
8472 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8473 * the ones we care about.) There are four cases:
8474 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8475 * nothing left in the intersection.
8476 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8477 * above 2. What should be output is exactly that which is in the
8478 * non-exhausted set, as everything it has is also in the intersection
8479 * set, and everything it doesn't have can't be in the intersection
8480 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8481 * gets incremented to 2. Like the previous case, the intersection is
8482 * everything that remains in the non-exhausted set.
8483 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8484 * remains 1. And the intersection has nothing more. */
8485 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8486 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8491 /* The final length is what we've output so far plus what else is in the
8492 * intersection. At most one of the subexpressions below will be non-zero */
8495 len_r += (len_a - i_a) + (len_b - i_b);
8498 /* Set result to final length, which can change the pointer to array_r, so
8500 if (len_r != _invlist_len(r)) {
8501 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8503 array_r = invlist_array(r);
8506 /* Finish outputting any remaining */
8507 if (count >= 2) { /* At most one will have a non-zero copy count */
8509 if ((copy_count = len_a - i_a) > 0) {
8510 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8512 else if ((copy_count = len_b - i_b) > 0) {
8513 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8517 /* We may be removing a reference to one of the inputs. If so, the output
8518 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8519 * count decremented) */
8520 if (a == *i || b == *i) {
8521 assert(! invlist_is_iterating(*i));
8526 SvREFCNT_dec_NN(*i);
8536 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8538 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8539 * set. A pointer to the inversion list is returned. This may actually be
8540 * a new list, in which case the passed in one has been destroyed. The
8541 * passed in inversion list can be NULL, in which case a new one is created
8542 * with just the one range in it */
8547 if (invlist == NULL) {
8548 invlist = _new_invlist(2);
8552 len = _invlist_len(invlist);
8555 /* If comes after the final entry actually in the list, can just append it
8558 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8559 && start >= invlist_array(invlist)[len - 1]))
8561 _append_range_to_invlist(invlist, start, end);
8565 /* Here, can't just append things, create and return a new inversion list
8566 * which is the union of this range and the existing inversion list */
8567 range_invlist = _new_invlist(2);
8568 _append_range_to_invlist(range_invlist, start, end);
8570 _invlist_union(invlist, range_invlist, &invlist);
8572 /* The temporary can be freed */
8573 SvREFCNT_dec_NN(range_invlist);
8579 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, UV** other_elements_ptr)
8581 /* Create and return an inversion list whose contents are to be populated
8582 * by the caller. The caller gives the number of elements (in 'size') and
8583 * the very first element ('element0'). This function will set
8584 * '*other_elements_ptr' to an array of UVs, where the remaining elements
8587 * Obviously there is some trust involved that the caller will properly
8588 * fill in the other elements of the array.
8590 * (The first element needs to be passed in, as the underlying code does
8591 * things differently depending on whether it is zero or non-zero) */
8593 SV* invlist = _new_invlist(size);
8596 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8598 _append_range_to_invlist(invlist, element0, element0);
8599 offset = *get_invlist_offset_addr(invlist);
8601 invlist_set_len(invlist, size, offset);
8602 *other_elements_ptr = invlist_array(invlist) + 1;
8608 PERL_STATIC_INLINE SV*
8609 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8610 return _add_range_to_invlist(invlist, cp, cp);
8613 #ifndef PERL_IN_XSUB_RE
8615 Perl__invlist_invert(pTHX_ SV* const invlist)
8617 /* Complement the input inversion list. This adds a 0 if the list didn't
8618 * have a zero; removes it otherwise. As described above, the data
8619 * structure is set up so that this is very efficient */
8621 PERL_ARGS_ASSERT__INVLIST_INVERT;
8623 assert(! invlist_is_iterating(invlist));
8625 /* The inverse of matching nothing is matching everything */
8626 if (_invlist_len(invlist) == 0) {
8627 _append_range_to_invlist(invlist, 0, UV_MAX);
8631 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8636 PERL_STATIC_INLINE SV*
8637 S_invlist_clone(pTHX_ SV* const invlist)
8640 /* Return a new inversion list that is a copy of the input one, which is
8641 * unchanged. The new list will not be mortal even if the old one was. */
8643 /* Need to allocate extra space to accommodate Perl's addition of a
8644 * trailing NUL to SvPV's, since it thinks they are always strings */
8645 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8646 STRLEN physical_length = SvCUR(invlist);
8647 bool offset = *(get_invlist_offset_addr(invlist));
8649 PERL_ARGS_ASSERT_INVLIST_CLONE;
8651 *(get_invlist_offset_addr(new_invlist)) = offset;
8652 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8653 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8658 PERL_STATIC_INLINE STRLEN*
8659 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8661 /* Return the address of the UV that contains the current iteration
8664 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8666 assert(SvTYPE(invlist) == SVt_INVLIST);
8668 return &(((XINVLIST*) SvANY(invlist))->iterator);
8671 PERL_STATIC_INLINE void
8672 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8674 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8676 *get_invlist_iter_addr(invlist) = 0;
8679 PERL_STATIC_INLINE void
8680 S_invlist_iterfinish(pTHX_ SV* invlist)
8682 /* Terminate iterator for invlist. This is to catch development errors.
8683 * Any iteration that is interrupted before completed should call this
8684 * function. Functions that add code points anywhere else but to the end
8685 * of an inversion list assert that they are not in the middle of an
8686 * iteration. If they were, the addition would make the iteration
8687 * problematical: if the iteration hadn't reached the place where things
8688 * were being added, it would be ok */
8690 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8692 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8696 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8698 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8699 * This call sets in <*start> and <*end>, the next range in <invlist>.
8700 * Returns <TRUE> if successful and the next call will return the next
8701 * range; <FALSE> if was already at the end of the list. If the latter,
8702 * <*start> and <*end> are unchanged, and the next call to this function
8703 * will start over at the beginning of the list */
8705 STRLEN* pos = get_invlist_iter_addr(invlist);
8706 UV len = _invlist_len(invlist);
8709 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8712 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8716 array = invlist_array(invlist);
8718 *start = array[(*pos)++];
8724 *end = array[(*pos)++] - 1;
8730 PERL_STATIC_INLINE bool
8731 S_invlist_is_iterating(pTHX_ SV* const invlist)
8733 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8735 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8738 PERL_STATIC_INLINE UV
8739 S_invlist_highest(pTHX_ SV* const invlist)
8741 /* Returns the highest code point that matches an inversion list. This API
8742 * has an ambiguity, as it returns 0 under either the highest is actually
8743 * 0, or if the list is empty. If this distinction matters to you, check
8744 * for emptiness before calling this function */
8746 UV len = _invlist_len(invlist);
8749 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8755 array = invlist_array(invlist);
8757 /* The last element in the array in the inversion list always starts a
8758 * range that goes to infinity. That range may be for code points that are
8759 * matched in the inversion list, or it may be for ones that aren't
8760 * matched. In the latter case, the highest code point in the set is one
8761 * less than the beginning of this range; otherwise it is the final element
8762 * of this range: infinity */
8763 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8765 : array[len - 1] - 1;
8768 #ifndef PERL_IN_XSUB_RE
8770 Perl__invlist_contents(pTHX_ SV* const invlist)
8772 /* Get the contents of an inversion list into a string SV so that they can
8773 * be printed out. It uses the format traditionally done for debug tracing
8777 SV* output = newSVpvs("\n");
8779 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8781 assert(! invlist_is_iterating(invlist));
8783 invlist_iterinit(invlist);
8784 while (invlist_iternext(invlist, &start, &end)) {
8785 if (end == UV_MAX) {
8786 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8788 else if (end != start) {
8789 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8793 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8801 #ifndef PERL_IN_XSUB_RE
8803 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8805 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8806 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8807 * the string 'indent'. The output looks like this:
8808 [0] 0x000A .. 0x000D
8810 [4] 0x2028 .. 0x2029
8811 [6] 0x3104 .. INFINITY
8812 * This means that the first range of code points matched by the list are
8813 * 0xA through 0xD; the second range contains only the single code point
8814 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8815 * are used to define each range (except if the final range extends to
8816 * infinity, only a single element is needed). The array index of the
8817 * first element for the corresponding range is given in brackets. */
8822 PERL_ARGS_ASSERT__INVLIST_DUMP;
8824 if (invlist_is_iterating(invlist)) {
8825 Perl_dump_indent(aTHX_ level, file,
8826 "%sCan't dump inversion list because is in middle of iterating\n",
8831 invlist_iterinit(invlist);
8832 while (invlist_iternext(invlist, &start, &end)) {
8833 if (end == UV_MAX) {
8834 Perl_dump_indent(aTHX_ level, file,
8835 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8836 indent, (UV)count, start);
8838 else if (end != start) {
8839 Perl_dump_indent(aTHX_ level, file,
8840 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8841 indent, (UV)count, start, end);
8844 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8845 indent, (UV)count, start);
8852 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8854 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8856 /* Return a boolean as to if the two passed in inversion lists are
8857 * identical. The final argument, if TRUE, says to take the complement of
8858 * the second inversion list before doing the comparison */
8860 const UV* array_a = invlist_array(a);
8861 const UV* array_b = invlist_array(b);
8862 UV len_a = _invlist_len(a);
8863 UV len_b = _invlist_len(b);
8865 UV i = 0; /* current index into the arrays */
8866 bool retval = TRUE; /* Assume are identical until proven otherwise */
8868 PERL_ARGS_ASSERT__INVLISTEQ;
8870 /* If are to compare 'a' with the complement of b, set it
8871 * up so are looking at b's complement. */
8874 /* The complement of nothing is everything, so <a> would have to have
8875 * just one element, starting at zero (ending at infinity) */
8877 return (len_a == 1 && array_a[0] == 0);
8879 else if (array_b[0] == 0) {
8881 /* Otherwise, to complement, we invert. Here, the first element is
8882 * 0, just remove it. To do this, we just pretend the array starts
8890 /* But if the first element is not zero, we pretend the list starts
8891 * at the 0 that is always stored immediately before the array. */
8897 /* Make sure that the lengths are the same, as well as the final element
8898 * before looping through the remainder. (Thus we test the length, final,
8899 * and first elements right off the bat) */
8900 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8903 else for (i = 0; i < len_a - 1; i++) {
8904 if (array_a[i] != array_b[i]) {
8914 #undef HEADER_LENGTH
8915 #undef TO_INTERNAL_SIZE
8916 #undef FROM_INTERNAL_SIZE
8917 #undef INVLIST_VERSION_ID
8919 /* End of inversion list object */
8922 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8924 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8925 * constructs, and updates RExC_flags with them. On input, RExC_parse
8926 * should point to the first flag; it is updated on output to point to the
8927 * final ')' or ':'. There needs to be at least one flag, or this will
8930 /* for (?g), (?gc), and (?o) warnings; warning
8931 about (?c) will warn about (?g) -- japhy */
8933 #define WASTED_O 0x01
8934 #define WASTED_G 0x02
8935 #define WASTED_C 0x04
8936 #define WASTED_GC (WASTED_G|WASTED_C)
8937 I32 wastedflags = 0x00;
8938 U32 posflags = 0, negflags = 0;
8939 U32 *flagsp = &posflags;
8940 char has_charset_modifier = '\0';
8942 bool has_use_defaults = FALSE;
8943 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8945 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8947 /* '^' as an initial flag sets certain defaults */
8948 if (UCHARAT(RExC_parse) == '^') {
8950 has_use_defaults = TRUE;
8951 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8952 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8953 ? REGEX_UNICODE_CHARSET
8954 : REGEX_DEPENDS_CHARSET);
8957 cs = get_regex_charset(RExC_flags);
8958 if (cs == REGEX_DEPENDS_CHARSET
8959 && (RExC_utf8 || RExC_uni_semantics))
8961 cs = REGEX_UNICODE_CHARSET;
8964 while (*RExC_parse) {
8965 /* && strchr("iogcmsx", *RExC_parse) */
8966 /* (?g), (?gc) and (?o) are useless here
8967 and must be globally applied -- japhy */
8968 switch (*RExC_parse) {
8970 /* Code for the imsx flags */
8971 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8973 case LOCALE_PAT_MOD:
8974 if (has_charset_modifier) {
8975 goto excess_modifier;
8977 else if (flagsp == &negflags) {
8980 cs = REGEX_LOCALE_CHARSET;
8981 has_charset_modifier = LOCALE_PAT_MOD;
8982 RExC_contains_locale = 1;
8984 case UNICODE_PAT_MOD:
8985 if (has_charset_modifier) {
8986 goto excess_modifier;
8988 else if (flagsp == &negflags) {
8991 cs = REGEX_UNICODE_CHARSET;
8992 has_charset_modifier = UNICODE_PAT_MOD;
8994 case ASCII_RESTRICT_PAT_MOD:
8995 if (flagsp == &negflags) {
8998 if (has_charset_modifier) {
8999 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9000 goto excess_modifier;
9002 /* Doubled modifier implies more restricted */
9003 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9006 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9008 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9010 case DEPENDS_PAT_MOD:
9011 if (has_use_defaults) {
9012 goto fail_modifiers;
9014 else if (flagsp == &negflags) {
9017 else if (has_charset_modifier) {
9018 goto excess_modifier;
9021 /* The dual charset means unicode semantics if the
9022 * pattern (or target, not known until runtime) are
9023 * utf8, or something in the pattern indicates unicode
9025 cs = (RExC_utf8 || RExC_uni_semantics)
9026 ? REGEX_UNICODE_CHARSET
9027 : REGEX_DEPENDS_CHARSET;
9028 has_charset_modifier = DEPENDS_PAT_MOD;
9032 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9033 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9035 else if (has_charset_modifier == *(RExC_parse - 1)) {
9036 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9039 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9044 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9046 case ONCE_PAT_MOD: /* 'o' */
9047 case GLOBAL_PAT_MOD: /* 'g' */
9048 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9049 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9050 if (! (wastedflags & wflagbit) ) {
9051 wastedflags |= wflagbit;
9052 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9055 "Useless (%s%c) - %suse /%c modifier",
9056 flagsp == &negflags ? "?-" : "?",
9058 flagsp == &negflags ? "don't " : "",
9065 case CONTINUE_PAT_MOD: /* 'c' */
9066 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9067 if (! (wastedflags & WASTED_C) ) {
9068 wastedflags |= WASTED_GC;
9069 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9072 "Useless (%sc) - %suse /gc modifier",
9073 flagsp == &negflags ? "?-" : "?",
9074 flagsp == &negflags ? "don't " : ""
9079 case KEEPCOPY_PAT_MOD: /* 'p' */
9080 if (flagsp == &negflags) {
9082 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9084 *flagsp |= RXf_PMf_KEEPCOPY;
9088 /* A flag is a default iff it is following a minus, so
9089 * if there is a minus, it means will be trying to
9090 * re-specify a default which is an error */
9091 if (has_use_defaults || flagsp == &negflags) {
9092 goto fail_modifiers;
9095 wastedflags = 0; /* reset so (?g-c) warns twice */
9099 RExC_flags |= posflags;
9100 RExC_flags &= ~negflags;
9101 set_regex_charset(&RExC_flags, cs);
9102 if (RExC_flags & RXf_PMf_FOLD) {
9103 RExC_contains_i = 1;
9109 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9110 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9111 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9112 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9121 - reg - regular expression, i.e. main body or parenthesized thing
9123 * Caller must absorb opening parenthesis.
9125 * Combining parenthesis handling with the base level of regular expression
9126 * is a trifle forced, but the need to tie the tails of the branches to what
9127 * follows makes it hard to avoid.
9129 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9131 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9133 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9136 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9137 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9138 needs to be restarted.
9139 Otherwise would only return NULL if regbranch() returns NULL, which
9142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9143 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9144 * 2 is like 1, but indicates that nextchar() has been called to advance
9145 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9146 * this flag alerts us to the need to check for that */
9149 regnode *ret; /* Will be the head of the group. */
9152 regnode *ender = NULL;
9155 U32 oregflags = RExC_flags;
9156 bool have_branch = 0;
9158 I32 freeze_paren = 0;
9159 I32 after_freeze = 0;
9161 char * parse_start = RExC_parse; /* MJD */
9162 char * const oregcomp_parse = RExC_parse;
9164 GET_RE_DEBUG_FLAGS_DECL;
9166 PERL_ARGS_ASSERT_REG;
9167 DEBUG_PARSE("reg ");
9169 *flagp = 0; /* Tentatively. */
9172 /* Make an OPEN node, if parenthesized. */
9175 /* Under /x, space and comments can be gobbled up between the '(' and
9176 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9177 * intervening space, as the sequence is a token, and a token should be
9179 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9181 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9182 char *start_verb = RExC_parse;
9183 STRLEN verb_len = 0;
9184 char *start_arg = NULL;
9185 unsigned char op = 0;
9187 int internal_argval = 0; /* internal_argval is only useful if !argok */
9189 if (has_intervening_patws && SIZE_ONLY) {
9190 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9192 while ( *RExC_parse && *RExC_parse != ')' ) {
9193 if ( *RExC_parse == ':' ) {
9194 start_arg = RExC_parse + 1;
9200 verb_len = RExC_parse - start_verb;
9203 while ( *RExC_parse && *RExC_parse != ')' )
9205 if ( *RExC_parse != ')' )
9206 vFAIL("Unterminated verb pattern argument");
9207 if ( RExC_parse == start_arg )
9210 if ( *RExC_parse != ')' )
9211 vFAIL("Unterminated verb pattern");
9214 switch ( *start_verb ) {
9215 case 'A': /* (*ACCEPT) */
9216 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9218 internal_argval = RExC_nestroot;
9221 case 'C': /* (*COMMIT) */
9222 if ( memEQs(start_verb,verb_len,"COMMIT") )
9225 case 'F': /* (*FAIL) */
9226 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9231 case ':': /* (*:NAME) */
9232 case 'M': /* (*MARK:NAME) */
9233 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9238 case 'P': /* (*PRUNE) */
9239 if ( memEQs(start_verb,verb_len,"PRUNE") )
9242 case 'S': /* (*SKIP) */
9243 if ( memEQs(start_verb,verb_len,"SKIP") )
9246 case 'T': /* (*THEN) */
9247 /* [19:06] <TimToady> :: is then */
9248 if ( memEQs(start_verb,verb_len,"THEN") ) {
9250 RExC_seen |= REG_SEEN_CUTGROUP;
9255 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9257 "Unknown verb pattern '%"UTF8f"'",
9258 UTF8fARG(UTF, verb_len, start_verb));
9261 if ( start_arg && internal_argval ) {
9262 vFAIL3("Verb pattern '%.*s' may not have an argument",
9263 verb_len, start_verb);
9264 } else if ( argok < 0 && !start_arg ) {
9265 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9266 verb_len, start_verb);
9268 ret = reganode(pRExC_state, op, internal_argval);
9269 if ( ! internal_argval && ! SIZE_ONLY ) {
9271 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9272 ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9273 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9280 if (!internal_argval)
9281 RExC_seen |= REG_SEEN_VERBARG;
9282 } else if ( start_arg ) {
9283 vFAIL3("Verb pattern '%.*s' may not have an argument",
9284 verb_len, start_verb);
9286 ret = reg_node(pRExC_state, op);
9288 nextchar(pRExC_state);
9291 else if (*RExC_parse == '?') { /* (?...) */
9292 bool is_logical = 0;
9293 const char * const seqstart = RExC_parse;
9294 if (has_intervening_patws && SIZE_ONLY) {
9295 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9299 paren = *RExC_parse++;
9300 ret = NULL; /* For look-ahead/behind. */
9303 case 'P': /* (?P...) variants for those used to PCRE/Python */
9304 paren = *RExC_parse++;
9305 if ( paren == '<') /* (?P<...>) named capture */
9307 else if (paren == '>') { /* (?P>name) named recursion */
9308 goto named_recursion;
9310 else if (paren == '=') { /* (?P=...) named backref */
9311 /* this pretty much dupes the code for \k<NAME> in regatom(), if
9312 you change this make sure you change that */
9313 char* name_start = RExC_parse;
9315 SV *sv_dat = reg_scan_name(pRExC_state,
9316 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9317 if (RExC_parse == name_start || *RExC_parse != ')')
9318 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9319 vFAIL2("Sequence %.3s... not terminated",parse_start);
9322 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9323 RExC_rxi->data->data[num]=(void*)sv_dat;
9324 SvREFCNT_inc_simple_void(sv_dat);
9327 ret = reganode(pRExC_state,
9330 : (ASCII_FOLD_RESTRICTED)
9332 : (AT_LEAST_UNI_SEMANTICS)
9340 Set_Node_Offset(ret, parse_start+1);
9341 Set_Node_Cur_Length(ret, parse_start);
9343 nextchar(pRExC_state);
9347 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9348 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9350 case '<': /* (?<...) */
9351 if (*RExC_parse == '!')
9353 else if (*RExC_parse != '=')
9359 case '\'': /* (?'...') */
9360 name_start= RExC_parse;
9361 svname = reg_scan_name(pRExC_state,
9362 SIZE_ONLY /* reverse test from the others */
9363 ? REG_RSN_RETURN_NAME
9364 : REG_RSN_RETURN_NULL);
9365 if (RExC_parse == name_start || *RExC_parse != paren)
9366 vFAIL2("Sequence (?%c... not terminated",
9367 paren=='>' ? '<' : paren);
9371 if (!svname) /* shouldn't happen */
9373 "panic: reg_scan_name returned NULL");
9374 if (!RExC_paren_names) {
9375 RExC_paren_names= newHV();
9376 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9378 RExC_paren_name_list= newAV();
9379 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9382 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9384 sv_dat = HeVAL(he_str);
9386 /* croak baby croak */
9388 "panic: paren_name hash element allocation failed");
9389 } else if ( SvPOK(sv_dat) ) {
9390 /* (?|...) can mean we have dupes so scan to check
9391 its already been stored. Maybe a flag indicating
9392 we are inside such a construct would be useful,
9393 but the arrays are likely to be quite small, so
9394 for now we punt -- dmq */
9395 IV count = SvIV(sv_dat);
9396 I32 *pv = (I32*)SvPVX(sv_dat);
9398 for ( i = 0 ; i < count ; i++ ) {
9399 if ( pv[i] == RExC_npar ) {
9405 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9406 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9407 pv[count] = RExC_npar;
9408 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9411 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9412 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9414 SvIV_set(sv_dat, 1);
9417 /* Yes this does cause a memory leak in debugging Perls */
9418 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9419 SvREFCNT_dec_NN(svname);
9422 /*sv_dump(sv_dat);*/
9424 nextchar(pRExC_state);
9426 goto capturing_parens;
9428 RExC_seen |= REG_SEEN_LOOKBEHIND;
9429 RExC_in_lookbehind++;
9431 case '=': /* (?=...) */
9432 RExC_seen_zerolen++;
9434 case '!': /* (?!...) */
9435 RExC_seen_zerolen++;
9436 if (*RExC_parse == ')') {
9437 ret=reg_node(pRExC_state, OPFAIL);
9438 nextchar(pRExC_state);
9442 case '|': /* (?|...) */
9443 /* branch reset, behave like a (?:...) except that
9444 buffers in alternations share the same numbers */
9446 after_freeze = freeze_paren = RExC_npar;
9448 case ':': /* (?:...) */
9449 case '>': /* (?>...) */
9451 case '$': /* (?$...) */
9452 case '@': /* (?@...) */
9453 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9455 case '#': /* (?#...) */
9456 /* XXX As soon as we disallow separating the '?' and '*' (by
9457 * spaces or (?#...) comment), it is believed that this case
9458 * will be unreachable and can be removed. See
9460 while (*RExC_parse && *RExC_parse != ')')
9462 if (*RExC_parse != ')')
9463 FAIL("Sequence (?#... not terminated");
9464 nextchar(pRExC_state);
9467 case '0' : /* (?0) */
9468 case 'R' : /* (?R) */
9469 if (*RExC_parse != ')')
9470 FAIL("Sequence (?R) not terminated");
9471 ret = reg_node(pRExC_state, GOSTART);
9472 RExC_seen |= REG_SEEN_GOSTART;
9473 *flagp |= POSTPONED;
9474 nextchar(pRExC_state);
9477 { /* named and numeric backreferences */
9479 case '&': /* (?&NAME) */
9480 parse_start = RExC_parse - 1;
9483 SV *sv_dat = reg_scan_name(pRExC_state,
9484 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9485 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9487 if (RExC_parse == RExC_end || *RExC_parse != ')')
9488 vFAIL("Sequence (?&... not terminated");
9489 goto gen_recurse_regop;
9490 assert(0); /* NOT REACHED */
9492 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9494 vFAIL("Illegal pattern");
9496 goto parse_recursion;
9498 case '-': /* (?-1) */
9499 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9500 RExC_parse--; /* rewind to let it be handled later */
9504 case '1': case '2': case '3': case '4': /* (?1) */
9505 case '5': case '6': case '7': case '8': case '9':
9508 num = atoi(RExC_parse);
9509 parse_start = RExC_parse - 1; /* MJD */
9510 if (*RExC_parse == '-')
9512 while (isDIGIT(*RExC_parse))
9514 if (*RExC_parse!=')')
9515 vFAIL("Expecting close bracket");
9518 if ( paren == '-' ) {
9520 Diagram of capture buffer numbering.
9521 Top line is the normal capture buffer numbers
9522 Bottom line is the negative indexing as from
9526 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9530 num = RExC_npar + num;
9533 vFAIL("Reference to nonexistent group");
9535 } else if ( paren == '+' ) {
9536 num = RExC_npar + num - 1;
9539 ret = reganode(pRExC_state, GOSUB, num);
9541 if (num > (I32)RExC_rx->nparens) {
9543 vFAIL("Reference to nonexistent group");
9545 ARG2L_SET( ret, RExC_recurse_count++);
9547 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9548 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9552 RExC_seen |= REG_SEEN_RECURSE;
9553 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9554 Set_Node_Offset(ret, parse_start); /* MJD */
9556 *flagp |= POSTPONED;
9557 nextchar(pRExC_state);
9559 } /* named and numeric backreferences */
9560 assert(0); /* NOT REACHED */
9562 case '?': /* (??...) */
9564 if (*RExC_parse != '{') {
9566 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9568 "Sequence (%"UTF8f"...) not recognized",
9569 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9572 *flagp |= POSTPONED;
9573 paren = *RExC_parse++;
9575 case '{': /* (?{...}) */
9578 struct reg_code_block *cb;
9580 RExC_seen_zerolen++;
9582 if ( !pRExC_state->num_code_blocks
9583 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9584 || pRExC_state->code_blocks[pRExC_state->code_index].start
9585 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9588 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9589 FAIL("panic: Sequence (?{...}): no code block found\n");
9590 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9592 /* this is a pre-compiled code block (?{...}) */
9593 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9594 RExC_parse = RExC_start + cb->end;
9597 if (cb->src_regex) {
9598 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9599 RExC_rxi->data->data[n] =
9600 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9601 RExC_rxi->data->data[n+1] = (void*)o;
9604 n = add_data(pRExC_state,
9605 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9606 RExC_rxi->data->data[n] = (void*)o;
9609 pRExC_state->code_index++;
9610 nextchar(pRExC_state);
9614 ret = reg_node(pRExC_state, LOGICAL);
9615 eval = reganode(pRExC_state, EVAL, n);
9618 /* for later propagation into (??{}) return value */
9619 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9621 REGTAIL(pRExC_state, ret, eval);
9622 /* deal with the length of this later - MJD */
9625 ret = reganode(pRExC_state, EVAL, n);
9626 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9627 Set_Node_Offset(ret, parse_start);
9630 case '(': /* (?(?{...})...) and (?(?=...)...) */
9633 if (RExC_parse[0] == '?') { /* (?(?...)) */
9634 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9635 || RExC_parse[1] == '<'
9636 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9640 ret = reg_node(pRExC_state, LOGICAL);
9644 tail = reg(pRExC_state, 1, &flag, depth+1);
9645 if (flag & RESTART_UTF8) {
9646 *flagp = RESTART_UTF8;
9649 REGTAIL(pRExC_state, ret, tail);
9653 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9654 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9656 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9657 char *name_start= RExC_parse++;
9659 SV *sv_dat=reg_scan_name(pRExC_state,
9660 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9661 if (RExC_parse == name_start || *RExC_parse != ch)
9662 vFAIL2("Sequence (?(%c... not terminated",
9663 (ch == '>' ? '<' : ch));
9666 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9667 RExC_rxi->data->data[num]=(void*)sv_dat;
9668 SvREFCNT_inc_simple_void(sv_dat);
9670 ret = reganode(pRExC_state,NGROUPP,num);
9671 goto insert_if_check_paren;
9673 else if (RExC_parse[0] == 'D' &&
9674 RExC_parse[1] == 'E' &&
9675 RExC_parse[2] == 'F' &&
9676 RExC_parse[3] == 'I' &&
9677 RExC_parse[4] == 'N' &&
9678 RExC_parse[5] == 'E')
9680 ret = reganode(pRExC_state,DEFINEP,0);
9683 goto insert_if_check_paren;
9685 else if (RExC_parse[0] == 'R') {
9688 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9689 parno = atoi(RExC_parse++);
9690 while (isDIGIT(*RExC_parse))
9692 } else if (RExC_parse[0] == '&') {
9695 sv_dat = reg_scan_name(pRExC_state,
9696 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9697 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9699 ret = reganode(pRExC_state,INSUBP,parno);
9700 goto insert_if_check_paren;
9702 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9706 parno = atoi(RExC_parse++);
9708 while (isDIGIT(*RExC_parse))
9710 ret = reganode(pRExC_state, GROUPP, parno);
9712 insert_if_check_paren:
9713 if (*(tmp = nextchar(pRExC_state)) != ')') {
9714 /* nextchar also skips comments, so undo its work
9715 * and skip over the the next character.
9718 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9719 vFAIL("Switch condition not recognized");
9722 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9723 br = regbranch(pRExC_state, &flags, 1,depth+1);
9725 if (flags & RESTART_UTF8) {
9726 *flagp = RESTART_UTF8;
9729 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9732 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9733 c = *nextchar(pRExC_state);
9738 vFAIL("(?(DEFINE)....) does not allow branches");
9739 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9740 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9741 if (flags & RESTART_UTF8) {
9742 *flagp = RESTART_UTF8;
9745 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9748 REGTAIL(pRExC_state, ret, lastbr);
9751 c = *nextchar(pRExC_state);
9756 vFAIL("Switch (?(condition)... contains too many branches");
9757 ender = reg_node(pRExC_state, TAIL);
9758 REGTAIL(pRExC_state, br, ender);
9760 REGTAIL(pRExC_state, lastbr, ender);
9761 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9764 REGTAIL(pRExC_state, ret, ender);
9765 RExC_size++; /* XXX WHY do we need this?!!
9766 For large programs it seems to be required
9767 but I can't figure out why. -- dmq*/
9771 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9772 vFAIL("Unknown switch condition (?(...))");
9775 case '[': /* (?[ ... ]) */
9776 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9779 RExC_parse--; /* for vFAIL to print correctly */
9780 vFAIL("Sequence (? incomplete");
9782 default: /* e.g., (?i) */
9785 parse_lparen_question_flags(pRExC_state);
9786 if (UCHARAT(RExC_parse) != ':') {
9787 nextchar(pRExC_state);
9792 nextchar(pRExC_state);
9802 ret = reganode(pRExC_state, OPEN, parno);
9805 RExC_nestroot = parno;
9806 if (RExC_seen & REG_SEEN_RECURSE
9807 && !RExC_open_parens[parno-1])
9809 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9810 "Setting open paren #%"IVdf" to %d\n",
9811 (IV)parno, REG_NODE_NUM(ret)));
9812 RExC_open_parens[parno-1]= ret;
9815 Set_Node_Length(ret, 1); /* MJD */
9816 Set_Node_Offset(ret, RExC_parse); /* MJD */
9824 /* Pick up the branches, linking them together. */
9825 parse_start = RExC_parse; /* MJD */
9826 br = regbranch(pRExC_state, &flags, 1,depth+1);
9828 /* branch_len = (paren != 0); */
9831 if (flags & RESTART_UTF8) {
9832 *flagp = RESTART_UTF8;
9835 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9837 if (*RExC_parse == '|') {
9838 if (!SIZE_ONLY && RExC_extralen) {
9839 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9842 reginsert(pRExC_state, BRANCH, br, depth+1);
9843 Set_Node_Length(br, paren != 0);
9844 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9848 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9850 else if (paren == ':') {
9851 *flagp |= flags&SIMPLE;
9853 if (is_open) { /* Starts with OPEN. */
9854 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9856 else if (paren != '?') /* Not Conditional */
9858 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9860 while (*RExC_parse == '|') {
9861 if (!SIZE_ONLY && RExC_extralen) {
9862 ender = reganode(pRExC_state, LONGJMP,0);
9863 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9866 RExC_extralen += 2; /* Account for LONGJMP. */
9867 nextchar(pRExC_state);
9869 if (RExC_npar > after_freeze)
9870 after_freeze = RExC_npar;
9871 RExC_npar = freeze_paren;
9873 br = regbranch(pRExC_state, &flags, 0, depth+1);
9876 if (flags & RESTART_UTF8) {
9877 *flagp = RESTART_UTF8;
9880 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9882 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9884 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9887 if (have_branch || paren != ':') {
9888 /* Make a closing node, and hook it on the end. */
9891 ender = reg_node(pRExC_state, TAIL);
9894 ender = reganode(pRExC_state, CLOSE, parno);
9895 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9896 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9897 "Setting close paren #%"IVdf" to %d\n",
9898 (IV)parno, REG_NODE_NUM(ender)));
9899 RExC_close_parens[parno-1]= ender;
9900 if (RExC_nestroot == parno)
9903 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9904 Set_Node_Length(ender,1); /* MJD */
9910 *flagp &= ~HASWIDTH;
9913 ender = reg_node(pRExC_state, SUCCEED);
9916 ender = reg_node(pRExC_state, END);
9918 assert(!RExC_opend); /* there can only be one! */
9923 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9924 SV * const mysv_val1=sv_newmortal();
9925 SV * const mysv_val2=sv_newmortal();
9926 DEBUG_PARSE_MSG("lsbr");
9927 regprop(RExC_rx, mysv_val1, lastbr);
9928 regprop(RExC_rx, mysv_val2, ender);
9929 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9930 SvPV_nolen_const(mysv_val1),
9931 (IV)REG_NODE_NUM(lastbr),
9932 SvPV_nolen_const(mysv_val2),
9933 (IV)REG_NODE_NUM(ender),
9934 (IV)(ender - lastbr)
9937 REGTAIL(pRExC_state, lastbr, ender);
9939 if (have_branch && !SIZE_ONLY) {
9942 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9944 /* Hook the tails of the branches to the closing node. */
9945 for (br = ret; br; br = regnext(br)) {
9946 const U8 op = PL_regkind[OP(br)];
9948 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9949 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9952 else if (op == BRANCHJ) {
9953 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9954 /* for now we always disable this optimisation * /
9955 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9961 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9962 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9963 SV * const mysv_val1=sv_newmortal();
9964 SV * const mysv_val2=sv_newmortal();
9965 DEBUG_PARSE_MSG("NADA");
9966 regprop(RExC_rx, mysv_val1, ret);
9967 regprop(RExC_rx, mysv_val2, ender);
9968 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9969 SvPV_nolen_const(mysv_val1),
9970 (IV)REG_NODE_NUM(ret),
9971 SvPV_nolen_const(mysv_val2),
9972 (IV)REG_NODE_NUM(ender),
9977 if (OP(ender) == TAIL) {
9982 for ( opt= br + 1; opt < ender ; opt++ )
9984 NEXT_OFF(br)= ender - br;
9992 static const char parens[] = "=!<,>";
9994 if (paren && (p = strchr(parens, paren))) {
9995 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9996 int flag = (p - parens) > 1;
9999 node = SUSPEND, flag = 0;
10000 reginsert(pRExC_state, node,ret, depth+1);
10001 Set_Node_Cur_Length(ret, parse_start);
10002 Set_Node_Offset(ret, parse_start + 1);
10004 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10008 /* Check for proper termination. */
10010 /* restore original flags, but keep (?p) */
10011 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10012 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10013 RExC_parse = oregcomp_parse;
10014 vFAIL("Unmatched (");
10017 else if (!paren && RExC_parse < RExC_end) {
10018 if (*RExC_parse == ')') {
10020 vFAIL("Unmatched )");
10023 FAIL("Junk on end of regexp"); /* "Can't happen". */
10024 assert(0); /* NOTREACHED */
10027 if (RExC_in_lookbehind) {
10028 RExC_in_lookbehind--;
10030 if (after_freeze > RExC_npar)
10031 RExC_npar = after_freeze;
10036 - regbranch - one alternative of an | operator
10038 * Implements the concatenation operator.
10040 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10044 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10048 regnode *chain = NULL;
10050 I32 flags = 0, c = 0;
10051 GET_RE_DEBUG_FLAGS_DECL;
10053 PERL_ARGS_ASSERT_REGBRANCH;
10055 DEBUG_PARSE("brnc");
10060 if (!SIZE_ONLY && RExC_extralen)
10061 ret = reganode(pRExC_state, BRANCHJ,0);
10063 ret = reg_node(pRExC_state, BRANCH);
10064 Set_Node_Length(ret, 1);
10068 if (!first && SIZE_ONLY)
10069 RExC_extralen += 1; /* BRANCHJ */
10071 *flagp = WORST; /* Tentatively. */
10074 nextchar(pRExC_state);
10075 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10076 flags &= ~TRYAGAIN;
10077 latest = regpiece(pRExC_state, &flags,depth+1);
10078 if (latest == NULL) {
10079 if (flags & TRYAGAIN)
10081 if (flags & RESTART_UTF8) {
10082 *flagp = RESTART_UTF8;
10085 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10087 else if (ret == NULL)
10089 *flagp |= flags&(HASWIDTH|POSTPONED);
10090 if (chain == NULL) /* First piece. */
10091 *flagp |= flags&SPSTART;
10094 REGTAIL(pRExC_state, chain, latest);
10099 if (chain == NULL) { /* Loop ran zero times. */
10100 chain = reg_node(pRExC_state, NOTHING);
10105 *flagp |= flags&SIMPLE;
10112 - regpiece - something followed by possible [*+?]
10114 * Note that the branching code sequences used for ? and the general cases
10115 * of * and + are somewhat optimized: they use the same NOTHING node as
10116 * both the endmarker for their branch list and the body of the last branch.
10117 * It might seem that this node could be dispensed with entirely, but the
10118 * endmarker role is not redundant.
10120 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10122 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10126 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10133 const char * const origparse = RExC_parse;
10135 I32 max = REG_INFTY;
10136 #ifdef RE_TRACK_PATTERN_OFFSETS
10139 const char *maxpos = NULL;
10141 /* Save the original in case we change the emitted regop to a FAIL. */
10142 regnode * const orig_emit = RExC_emit;
10144 GET_RE_DEBUG_FLAGS_DECL;
10146 PERL_ARGS_ASSERT_REGPIECE;
10148 DEBUG_PARSE("piec");
10150 ret = regatom(pRExC_state, &flags,depth+1);
10152 if (flags & (TRYAGAIN|RESTART_UTF8))
10153 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10155 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10161 if (op == '{' && regcurly(RExC_parse, FALSE)) {
10163 #ifdef RE_TRACK_PATTERN_OFFSETS
10164 parse_start = RExC_parse; /* MJD */
10166 next = RExC_parse + 1;
10167 while (isDIGIT(*next) || *next == ',') {
10168 if (*next == ',') {
10176 if (*next == '}') { /* got one */
10180 min = atoi(RExC_parse);
10181 if (*maxpos == ',')
10184 maxpos = RExC_parse;
10185 max = atoi(maxpos);
10186 if (!max && *maxpos != '0')
10187 max = REG_INFTY; /* meaning "infinity" */
10188 else if (max >= REG_INFTY)
10189 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10191 nextchar(pRExC_state);
10192 if (max < min) { /* If can't match, warn and optimize to fail
10195 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10197 /* We can't back off the size because we have to reserve
10198 * enough space for all the things we are about to throw
10199 * away, but we can shrink it by the ammount we are about
10200 * to re-use here */
10201 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10204 RExC_emit = orig_emit;
10206 ret = reg_node(pRExC_state, OPFAIL);
10209 else if (min == max
10210 && RExC_parse < RExC_end
10211 && (*RExC_parse == '?' || *RExC_parse == '+'))
10214 ckWARN2reg(RExC_parse + 1,
10215 "Useless use of greediness modifier '%c'",
10218 /* Absorb the modifier, so later code doesn't see nor use
10220 nextchar(pRExC_state);
10224 if ((flags&SIMPLE)) {
10225 RExC_naughty += 2 + RExC_naughty / 2;
10226 reginsert(pRExC_state, CURLY, ret, depth+1);
10227 Set_Node_Offset(ret, parse_start+1); /* MJD */
10228 Set_Node_Cur_Length(ret, parse_start);
10231 regnode * const w = reg_node(pRExC_state, WHILEM);
10234 REGTAIL(pRExC_state, ret, w);
10235 if (!SIZE_ONLY && RExC_extralen) {
10236 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10237 reginsert(pRExC_state, NOTHING,ret, depth+1);
10238 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10240 reginsert(pRExC_state, CURLYX,ret, depth+1);
10242 Set_Node_Offset(ret, parse_start+1);
10243 Set_Node_Length(ret,
10244 op == '{' ? (RExC_parse - parse_start) : 1);
10246 if (!SIZE_ONLY && RExC_extralen)
10247 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10248 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10250 RExC_whilem_seen++, RExC_extralen += 3;
10251 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10258 *flagp |= HASWIDTH;
10260 ARG1_SET(ret, (U16)min);
10261 ARG2_SET(ret, (U16)max);
10268 if (!ISMULT1(op)) {
10273 #if 0 /* Now runtime fix should be reliable. */
10275 /* if this is reinstated, don't forget to put this back into perldiag:
10277 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10279 (F) The part of the regexp subject to either the * or + quantifier
10280 could match an empty string. The {#} shows in the regular
10281 expression about where the problem was discovered.
10285 if (!(flags&HASWIDTH) && op != '?')
10286 vFAIL("Regexp *+ operand could be empty");
10289 #ifdef RE_TRACK_PATTERN_OFFSETS
10290 parse_start = RExC_parse;
10292 nextchar(pRExC_state);
10294 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10296 if (op == '*' && (flags&SIMPLE)) {
10297 reginsert(pRExC_state, STAR, ret, depth+1);
10301 else if (op == '*') {
10305 else if (op == '+' && (flags&SIMPLE)) {
10306 reginsert(pRExC_state, PLUS, ret, depth+1);
10310 else if (op == '+') {
10314 else if (op == '?') {
10319 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10320 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10321 ckWARN2reg(RExC_parse,
10322 "%"UTF8f" matches null string many times",
10323 UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10325 (void)ReREFCNT_inc(RExC_rx_sv);
10328 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10329 nextchar(pRExC_state);
10330 reginsert(pRExC_state, MINMOD, ret, depth+1);
10331 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10334 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10336 nextchar(pRExC_state);
10337 ender = reg_node(pRExC_state, SUCCEED);
10338 REGTAIL(pRExC_state, ret, ender);
10339 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10341 ender = reg_node(pRExC_state, TAIL);
10342 REGTAIL(pRExC_state, ret, ender);
10345 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10347 vFAIL("Nested quantifiers");
10354 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10355 const bool strict /* Apply stricter parsing rules? */
10359 /* This is expected to be called by a parser routine that has recognized '\N'
10360 and needs to handle the rest. RExC_parse is expected to point at the first
10361 char following the N at the time of the call. On successful return,
10362 RExC_parse has been updated to point to just after the sequence identified
10363 by this routine, and <*flagp> has been updated.
10365 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10368 \N may begin either a named sequence, or if outside a character class, mean
10369 to match a non-newline. For non single-quoted regexes, the tokenizer has
10370 attempted to decide which, and in the case of a named sequence, converted it
10371 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10372 where c1... are the characters in the sequence. For single-quoted regexes,
10373 the tokenizer passes the \N sequence through unchanged; this code will not
10374 attempt to determine this nor expand those, instead raising a syntax error.
10375 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10376 or there is no '}', it signals that this \N occurrence means to match a
10379 Only the \N{U+...} form should occur in a character class, for the same
10380 reason that '.' inside a character class means to just match a period: it
10381 just doesn't make sense.
10383 The function raises an error (via vFAIL), and doesn't return for various
10384 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10385 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10386 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10387 only possible if node_p is non-NULL.
10390 If <valuep> is non-null, it means the caller can accept an input sequence
10391 consisting of a just a single code point; <*valuep> is set to that value
10392 if the input is such.
10394 If <node_p> is non-null it signifies that the caller can accept any other
10395 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10397 1) \N means not-a-NL: points to a newly created REG_ANY node;
10398 2) \N{}: points to a new NOTHING node;
10399 3) otherwise: points to a new EXACT node containing the resolved
10401 Note that FALSE is returned for single code point sequences if <valuep> is
10405 char * endbrace; /* '}' following the name */
10407 char *endchar; /* Points to '.' or '}' ending cur char in the input
10409 bool has_multiple_chars; /* true if the input stream contains a sequence of
10410 more than one character */
10412 GET_RE_DEBUG_FLAGS_DECL;
10414 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10416 GET_RE_DEBUG_FLAGS;
10418 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10420 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10421 * modifier. The other meaning does not, so use a temporary until we find
10422 * out which we are being called with */
10423 p = (RExC_flags & RXf_PMf_EXTENDED)
10424 ? regwhite( pRExC_state, RExC_parse )
10427 /* Disambiguate between \N meaning a named character versus \N meaning
10428 * [^\n]. The former is assumed when it can't be the latter. */
10429 if (*p != '{' || regcurly(p, FALSE)) {
10432 /* no bare \N allowed in a charclass */
10433 if (in_char_class) {
10434 vFAIL("\\N in a character class must be a named character: \\N{...}");
10438 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10440 nextchar(pRExC_state);
10441 *node_p = reg_node(pRExC_state, REG_ANY);
10442 *flagp |= HASWIDTH|SIMPLE;
10444 Set_Node_Length(*node_p, 1); /* MJD */
10448 /* Here, we have decided it should be a named character or sequence */
10450 /* The test above made sure that the next real character is a '{', but
10451 * under the /x modifier, it could be separated by space (or a comment and
10452 * \n) and this is not allowed (for consistency with \x{...} and the
10453 * tokenizer handling of \N{NAME}). */
10454 if (*RExC_parse != '{') {
10455 vFAIL("Missing braces on \\N{}");
10458 RExC_parse++; /* Skip past the '{' */
10460 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10461 || ! (endbrace == RExC_parse /* nothing between the {} */
10462 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
10463 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10465 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10466 vFAIL("\\N{NAME} must be resolved by the lexer");
10469 if (endbrace == RExC_parse) { /* empty: \N{} */
10472 *node_p = reg_node(pRExC_state,NOTHING);
10474 else if (in_char_class) {
10475 if (SIZE_ONLY && in_char_class) {
10477 RExC_parse++; /* Position after the "}" */
10478 vFAIL("Zero length \\N{}");
10481 ckWARNreg(RExC_parse,
10482 "Ignoring zero length \\N{} in character class");
10490 nextchar(pRExC_state);
10494 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10495 RExC_parse += 2; /* Skip past the 'U+' */
10497 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10499 /* Code points are separated by dots. If none, there is only one code
10500 * point, and is terminated by the brace */
10501 has_multiple_chars = (endchar < endbrace);
10503 if (valuep && (! has_multiple_chars || in_char_class)) {
10504 /* We only pay attention to the first char of
10505 multichar strings being returned in char classes. I kinda wonder
10506 if this makes sense as it does change the behaviour
10507 from earlier versions, OTOH that behaviour was broken
10508 as well. XXX Solution is to recharacterize as
10509 [rest-of-class]|multi1|multi2... */
10511 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10512 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10513 | PERL_SCAN_DISALLOW_PREFIX
10514 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10516 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10518 /* The tokenizer should have guaranteed validity, but it's possible to
10519 * bypass it by using single quoting, so check */
10520 if (length_of_hex == 0
10521 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10523 RExC_parse += length_of_hex; /* Includes all the valid */
10524 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10525 ? UTF8SKIP(RExC_parse)
10527 /* Guard against malformed utf8 */
10528 if (RExC_parse >= endchar) {
10529 RExC_parse = endchar;
10531 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10534 if (in_char_class && has_multiple_chars) {
10536 RExC_parse = endbrace;
10537 vFAIL("\\N{} in character class restricted to one character");
10540 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10544 RExC_parse = endbrace + 1;
10546 else if (! node_p || ! has_multiple_chars) {
10548 /* Here, the input is legal, but not according to the caller's
10549 * options. We fail without advancing the parse, so that the
10550 * caller can try again */
10556 /* What is done here is to convert this to a sub-pattern of the form
10557 * (?:\x{char1}\x{char2}...)
10558 * and then call reg recursively. That way, it retains its atomicness,
10559 * while not having to worry about special handling that some code
10560 * points may have. toke.c has converted the original Unicode values
10561 * to native, so that we can just pass on the hex values unchanged. We
10562 * do have to set a flag to keep recoding from happening in the
10565 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10567 char *orig_end = RExC_end;
10570 while (RExC_parse < endbrace) {
10572 /* Convert to notation the rest of the code understands */
10573 sv_catpv(substitute_parse, "\\x{");
10574 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10575 sv_catpv(substitute_parse, "}");
10577 /* Point to the beginning of the next character in the sequence. */
10578 RExC_parse = endchar + 1;
10579 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10581 sv_catpv(substitute_parse, ")");
10583 RExC_parse = SvPV(substitute_parse, len);
10585 /* Don't allow empty number */
10587 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10589 RExC_end = RExC_parse + len;
10591 /* The values are Unicode, and therefore not subject to recoding */
10592 RExC_override_recoding = 1;
10594 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10595 if (flags & RESTART_UTF8) {
10596 *flagp = RESTART_UTF8;
10599 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10602 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10604 RExC_parse = endbrace;
10605 RExC_end = orig_end;
10606 RExC_override_recoding = 0;
10608 nextchar(pRExC_state);
10618 * It returns the code point in utf8 for the value in *encp.
10619 * value: a code value in the source encoding
10620 * encp: a pointer to an Encode object
10622 * If the result from Encode is not a single character,
10623 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10626 S_reg_recode(pTHX_ const char value, SV **encp)
10629 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10630 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10631 const STRLEN newlen = SvCUR(sv);
10632 UV uv = UNICODE_REPLACEMENT;
10634 PERL_ARGS_ASSERT_REG_RECODE;
10638 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10641 if (!newlen || numlen != newlen) {
10642 uv = UNICODE_REPLACEMENT;
10648 PERL_STATIC_INLINE U8
10649 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10653 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10659 op = get_regex_charset(RExC_flags);
10660 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10661 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10662 been, so there is no hole */
10665 return op + EXACTF;
10668 PERL_STATIC_INLINE void
10669 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10671 /* This knows the details about sizing an EXACTish node, setting flags for
10672 * it (by setting <*flagp>, and potentially populating it with a single
10675 * If <len> (the length in bytes) is non-zero, this function assumes that
10676 * the node has already been populated, and just does the sizing. In this
10677 * case <code_point> should be the final code point that has already been
10678 * placed into the node. This value will be ignored except that under some
10679 * circumstances <*flagp> is set based on it.
10681 * If <len> is zero, the function assumes that the node is to contain only
10682 * the single character given by <code_point> and calculates what <len>
10683 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10684 * additionally will populate the node's STRING with <code_point>, if <len>
10685 * is 0. In both cases <*flagp> is appropriately set
10687 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10688 * 255, must be folded (the former only when the rules indicate it can
10691 bool len_passed_in = cBOOL(len != 0);
10692 U8 character[UTF8_MAXBYTES_CASE+1];
10694 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10696 if (! len_passed_in) {
10698 if (FOLD && (! LOC || code_point > 255)) {
10699 _to_uni_fold_flags(code_point,
10702 FOLD_FLAGS_FULL | ((LOC)
10703 ? FOLD_FLAGS_LOCALE
10704 : (ASCII_FOLD_RESTRICTED)
10705 ? FOLD_FLAGS_NOMIX_ASCII
10709 uvchr_to_utf8( character, code_point);
10710 len = UTF8SKIP(character);
10714 || code_point != LATIN_SMALL_LETTER_SHARP_S
10715 || ASCII_FOLD_RESTRICTED
10716 || ! AT_LEAST_UNI_SEMANTICS)
10718 *character = (U8) code_point;
10723 *(character + 1) = 's';
10729 RExC_size += STR_SZ(len);
10732 RExC_emit += STR_SZ(len);
10733 STR_LEN(node) = len;
10734 if (! len_passed_in) {
10735 Copy((char *) character, STRING(node), len, char);
10739 *flagp |= HASWIDTH;
10741 /* A single character node is SIMPLE, except for the special-cased SHARP S
10743 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10744 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10745 || ! FOLD || ! DEPENDS_SEMANTICS))
10752 /* return atoi(p), unless it's too big to sensibly be a backref,
10753 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10756 S_backref_value(char *p)
10760 for (;isDIGIT(*q); q++); /* calculate length of num */
10761 if (q - p == 0 || q - p > 9)
10768 - regatom - the lowest level
10770 Try to identify anything special at the start of the pattern. If there
10771 is, then handle it as required. This may involve generating a single regop,
10772 such as for an assertion; or it may involve recursing, such as to
10773 handle a () structure.
10775 If the string doesn't start with something special then we gobble up
10776 as much literal text as we can.
10778 Once we have been able to handle whatever type of thing started the
10779 sequence, we return.
10781 Note: we have to be careful with escapes, as they can be both literal
10782 and special, and in the case of \10 and friends, context determines which.
10784 A summary of the code structure is:
10786 switch (first_byte) {
10787 cases for each special:
10788 handle this special;
10791 switch (2nd byte) {
10792 cases for each unambiguous special:
10793 handle this special;
10795 cases for each ambigous special/literal:
10797 if (special) handle here
10799 default: // unambiguously literal:
10802 default: // is a literal char
10805 create EXACTish node for literal;
10806 while (more input and node isn't full) {
10807 switch (input_byte) {
10808 cases for each special;
10809 make sure parse pointer is set so that the next call to
10810 regatom will see this special first
10811 goto loopdone; // EXACTish node terminated by prev. char
10813 append char to EXACTISH node;
10815 get next input byte;
10819 return the generated node;
10821 Specifically there are two separate switches for handling
10822 escape sequences, with the one for handling literal escapes requiring
10823 a dummy entry for all of the special escapes that are actually handled
10826 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10828 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10830 Otherwise does not return NULL.
10834 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10837 regnode *ret = NULL;
10839 char *parse_start = RExC_parse;
10843 GET_RE_DEBUG_FLAGS_DECL;
10845 *flagp = WORST; /* Tentatively. */
10847 DEBUG_PARSE("atom");
10849 PERL_ARGS_ASSERT_REGATOM;
10852 switch ((U8)*RExC_parse) {
10854 RExC_seen_zerolen++;
10855 nextchar(pRExC_state);
10856 if (RExC_flags & RXf_PMf_MULTILINE)
10857 ret = reg_node(pRExC_state, MBOL);
10858 else if (RExC_flags & RXf_PMf_SINGLELINE)
10859 ret = reg_node(pRExC_state, SBOL);
10861 ret = reg_node(pRExC_state, BOL);
10862 Set_Node_Length(ret, 1); /* MJD */
10865 nextchar(pRExC_state);
10867 RExC_seen_zerolen++;
10868 if (RExC_flags & RXf_PMf_MULTILINE)
10869 ret = reg_node(pRExC_state, MEOL);
10870 else if (RExC_flags & RXf_PMf_SINGLELINE)
10871 ret = reg_node(pRExC_state, SEOL);
10873 ret = reg_node(pRExC_state, EOL);
10874 Set_Node_Length(ret, 1); /* MJD */
10877 nextchar(pRExC_state);
10878 if (RExC_flags & RXf_PMf_SINGLELINE)
10879 ret = reg_node(pRExC_state, SANY);
10881 ret = reg_node(pRExC_state, REG_ANY);
10882 *flagp |= HASWIDTH|SIMPLE;
10884 Set_Node_Length(ret, 1); /* MJD */
10888 char * const oregcomp_parse = ++RExC_parse;
10889 ret = regclass(pRExC_state, flagp,depth+1,
10890 FALSE, /* means parse the whole char class */
10891 TRUE, /* allow multi-char folds */
10892 FALSE, /* don't silence non-portable warnings. */
10894 if (*RExC_parse != ']') {
10895 RExC_parse = oregcomp_parse;
10896 vFAIL("Unmatched [");
10899 if (*flagp & RESTART_UTF8)
10901 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10904 nextchar(pRExC_state);
10905 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10909 nextchar(pRExC_state);
10910 ret = reg(pRExC_state, 2, &flags,depth+1);
10912 if (flags & TRYAGAIN) {
10913 if (RExC_parse == RExC_end) {
10914 /* Make parent create an empty node if needed. */
10915 *flagp |= TRYAGAIN;
10920 if (flags & RESTART_UTF8) {
10921 *flagp = RESTART_UTF8;
10924 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10926 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10930 if (flags & TRYAGAIN) {
10931 *flagp |= TRYAGAIN;
10934 vFAIL("Internal urp");
10935 /* Supposed to be caught earlier. */
10938 if (!regcurly(RExC_parse, FALSE)) {
10947 vFAIL("Quantifier follows nothing");
10952 This switch handles escape sequences that resolve to some kind
10953 of special regop and not to literal text. Escape sequnces that
10954 resolve to literal text are handled below in the switch marked
10957 Every entry in this switch *must* have a corresponding entry
10958 in the literal escape switch. However, the opposite is not
10959 required, as the default for this switch is to jump to the
10960 literal text handling code.
10962 switch ((U8)*++RExC_parse) {
10964 /* Special Escapes */
10966 RExC_seen_zerolen++;
10967 ret = reg_node(pRExC_state, SBOL);
10969 goto finish_meta_pat;
10971 ret = reg_node(pRExC_state, GPOS);
10972 RExC_seen |= REG_SEEN_GPOS;
10974 goto finish_meta_pat;
10976 RExC_seen_zerolen++;
10977 ret = reg_node(pRExC_state, KEEPS);
10979 /* XXX:dmq : disabling in-place substitution seems to
10980 * be necessary here to avoid cases of memory corruption, as
10981 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10983 RExC_seen |= REG_SEEN_LOOKBEHIND;
10984 goto finish_meta_pat;
10986 ret = reg_node(pRExC_state, SEOL);
10988 RExC_seen_zerolen++; /* Do not optimize RE away */
10989 goto finish_meta_pat;
10991 ret = reg_node(pRExC_state, EOS);
10993 RExC_seen_zerolen++; /* Do not optimize RE away */
10994 goto finish_meta_pat;
10996 ret = reg_node(pRExC_state, CANY);
10997 RExC_seen |= REG_SEEN_CANY;
10998 *flagp |= HASWIDTH|SIMPLE;
10999 goto finish_meta_pat;
11001 ret = reg_node(pRExC_state, CLUMP);
11002 *flagp |= HASWIDTH;
11003 goto finish_meta_pat;
11009 arg = ANYOF_WORDCHAR;
11013 RExC_seen_zerolen++;
11014 RExC_seen |= REG_SEEN_LOOKBEHIND;
11015 op = BOUND + get_regex_charset(RExC_flags);
11016 if (op > BOUNDA) { /* /aa is same as /a */
11019 ret = reg_node(pRExC_state, op);
11020 FLAGS(ret) = get_regex_charset(RExC_flags);
11022 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11023 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
11025 goto finish_meta_pat;
11027 RExC_seen_zerolen++;
11028 RExC_seen |= REG_SEEN_LOOKBEHIND;
11029 op = NBOUND + get_regex_charset(RExC_flags);
11030 if (op > NBOUNDA) { /* /aa is same as /a */
11033 ret = reg_node(pRExC_state, op);
11034 FLAGS(ret) = get_regex_charset(RExC_flags);
11036 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11037 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
11039 goto finish_meta_pat;
11049 ret = reg_node(pRExC_state, LNBREAK);
11050 *flagp |= HASWIDTH|SIMPLE;
11051 goto finish_meta_pat;
11059 goto join_posix_op_known;
11065 arg = ANYOF_VERTWS;
11067 goto join_posix_op_known;
11077 op = POSIXD + get_regex_charset(RExC_flags);
11078 if (op > POSIXA) { /* /aa is same as /a */
11082 join_posix_op_known:
11085 op += NPOSIXD - POSIXD;
11088 ret = reg_node(pRExC_state, op);
11090 FLAGS(ret) = namedclass_to_classnum(arg);
11093 *flagp |= HASWIDTH|SIMPLE;
11097 nextchar(pRExC_state);
11098 Set_Node_Length(ret, 2); /* MJD */
11104 char* parse_start = RExC_parse - 2;
11109 ret = regclass(pRExC_state, flagp,depth+1,
11110 TRUE, /* means just parse this element */
11111 FALSE, /* don't allow multi-char folds */
11112 FALSE, /* don't silence non-portable warnings.
11113 It would be a bug if these returned
11116 /* regclass() can only return RESTART_UTF8 if multi-char folds
11119 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11124 Set_Node_Offset(ret, parse_start + 2);
11125 Set_Node_Cur_Length(ret, parse_start);
11126 nextchar(pRExC_state);
11130 /* Handle \N and \N{NAME} with multiple code points here and not
11131 * below because it can be multicharacter. join_exact() will join
11132 * them up later on. Also this makes sure that things like
11133 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11134 * The options to the grok function call causes it to fail if the
11135 * sequence is just a single code point. We then go treat it as
11136 * just another character in the current EXACT node, and hence it
11137 * gets uniform treatment with all the other characters. The
11138 * special treatment for quantifiers is not needed for such single
11139 * character sequences */
11141 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11142 FALSE /* not strict */ )) {
11143 if (*flagp & RESTART_UTF8)
11149 case 'k': /* Handle \k<NAME> and \k'NAME' */
11152 char ch= RExC_parse[1];
11153 if (ch != '<' && ch != '\'' && ch != '{') {
11155 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11156 vFAIL2("Sequence %.2s... not terminated",parse_start);
11158 /* this pretty much dupes the code for (?P=...) in reg(), if
11159 you change this make sure you change that */
11160 char* name_start = (RExC_parse += 2);
11162 SV *sv_dat = reg_scan_name(pRExC_state,
11163 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11164 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11165 if (RExC_parse == name_start || *RExC_parse != ch)
11166 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11167 vFAIL2("Sequence %.3s... not terminated",parse_start);
11170 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11171 RExC_rxi->data->data[num]=(void*)sv_dat;
11172 SvREFCNT_inc_simple_void(sv_dat);
11176 ret = reganode(pRExC_state,
11179 : (ASCII_FOLD_RESTRICTED)
11181 : (AT_LEAST_UNI_SEMANTICS)
11187 *flagp |= HASWIDTH;
11189 /* override incorrect value set in reganode MJD */
11190 Set_Node_Offset(ret, parse_start+1);
11191 Set_Node_Cur_Length(ret, parse_start);
11192 nextchar(pRExC_state);
11198 case '1': case '2': case '3': case '4':
11199 case '5': case '6': case '7': case '8': case '9':
11204 if (*RExC_parse == 'g') {
11208 if (*RExC_parse == '{') {
11212 if (*RExC_parse == '-') {
11216 if (hasbrace && !isDIGIT(*RExC_parse)) {
11217 if (isrel) RExC_parse--;
11219 goto parse_named_seq;
11222 num = S_backref_value(RExC_parse);
11224 vFAIL("Reference to invalid group 0");
11225 else if (num == I32_MAX) {
11226 if (isDIGIT(*RExC_parse))
11227 vFAIL("Reference to nonexistent group");
11229 vFAIL("Unterminated \\g... pattern");
11233 num = RExC_npar - num;
11235 vFAIL("Reference to nonexistent or unclosed group");
11239 num = S_backref_value(RExC_parse);
11240 /* bare \NNN might be backref or octal */
11241 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11242 && *RExC_parse != '8' && *RExC_parse != '9'))
11243 /* Probably a character specified in octal, e.g. \35 */
11247 /* at this point RExC_parse definitely points to a backref
11250 #ifdef RE_TRACK_PATTERN_OFFSETS
11251 char * const parse_start = RExC_parse - 1; /* MJD */
11253 while (isDIGIT(*RExC_parse))
11256 if (*RExC_parse != '}')
11257 vFAIL("Unterminated \\g{...} pattern");
11261 if (num > (I32)RExC_rx->nparens)
11262 vFAIL("Reference to nonexistent group");
11265 ret = reganode(pRExC_state,
11268 : (ASCII_FOLD_RESTRICTED)
11270 : (AT_LEAST_UNI_SEMANTICS)
11276 *flagp |= HASWIDTH;
11278 /* override incorrect value set in reganode MJD */
11279 Set_Node_Offset(ret, parse_start+1);
11280 Set_Node_Cur_Length(ret, parse_start);
11282 nextchar(pRExC_state);
11287 if (RExC_parse >= RExC_end)
11288 FAIL("Trailing \\");
11291 /* Do not generate "unrecognized" warnings here, we fall
11292 back into the quick-grab loop below */
11299 if (RExC_flags & RXf_PMf_EXTENDED) {
11300 if ( reg_skipcomment( pRExC_state ) )
11307 parse_start = RExC_parse - 1;
11316 #define MAX_NODE_STRING_SIZE 127
11317 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11319 U8 upper_parse = MAX_NODE_STRING_SIZE;
11321 U8 node_type = compute_EXACTish(pRExC_state);
11322 bool next_is_quantifier;
11323 char * oldp = NULL;
11325 /* We can convert EXACTF nodes to EXACTFU if they contain only
11326 * characters that match identically regardless of the target
11327 * string's UTF8ness. The reason to do this is that EXACTF is not
11328 * trie-able, EXACTFU is. (We don't need to figure this out until
11330 bool maybe_exactfu = node_type == EXACTF && PASS2;
11332 /* If a folding node contains only code points that don't
11333 * participate in folds, it can be changed into an EXACT node,
11334 * which allows the optimizer more things to look for */
11337 ret = reg_node(pRExC_state, node_type);
11339 /* In pass1, folded, we use a temporary buffer instead of the
11340 * actual node, as the node doesn't exist yet */
11341 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11347 /* We do the EXACTFish to EXACT node only if folding, and not if in
11348 * locale, as whether a character folds or not isn't known until
11349 * runtime. (And we don't need to figure this out until pass 2) */
11350 maybe_exact = FOLD && ! LOC && PASS2;
11352 /* XXX The node can hold up to 255 bytes, yet this only goes to
11353 * 127. I (khw) do not know why. Keeping it somewhat less than
11354 * 255 allows us to not have to worry about overflow due to
11355 * converting to utf8 and fold expansion, but that value is
11356 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11357 * split up by this limit into a single one using the real max of
11358 * 255. Even at 127, this breaks under rare circumstances. If
11359 * folding, we do not want to split a node at a character that is a
11360 * non-final in a multi-char fold, as an input string could just
11361 * happen to want to match across the node boundary. The join
11362 * would solve that problem if the join actually happens. But a
11363 * series of more than two nodes in a row each of 127 would cause
11364 * the first join to succeed to get to 254, but then there wouldn't
11365 * be room for the next one, which could at be one of those split
11366 * multi-char folds. I don't know of any fool-proof solution. One
11367 * could back off to end with only a code point that isn't such a
11368 * non-final, but it is possible for there not to be any in the
11370 for (p = RExC_parse - 1;
11371 len < upper_parse && p < RExC_end;
11376 if (RExC_flags & RXf_PMf_EXTENDED)
11377 p = regwhite( pRExC_state, p );
11388 /* Literal Escapes Switch
11390 This switch is meant to handle escape sequences that
11391 resolve to a literal character.
11393 Every escape sequence that represents something
11394 else, like an assertion or a char class, is handled
11395 in the switch marked 'Special Escapes' above in this
11396 routine, but also has an entry here as anything that
11397 isn't explicitly mentioned here will be treated as
11398 an unescaped equivalent literal.
11401 switch ((U8)*++p) {
11402 /* These are all the special escapes. */
11403 case 'A': /* Start assertion */
11404 case 'b': case 'B': /* Word-boundary assertion*/
11405 case 'C': /* Single char !DANGEROUS! */
11406 case 'd': case 'D': /* digit class */
11407 case 'g': case 'G': /* generic-backref, pos assertion */
11408 case 'h': case 'H': /* HORIZWS */
11409 case 'k': case 'K': /* named backref, keep marker */
11410 case 'p': case 'P': /* Unicode property */
11411 case 'R': /* LNBREAK */
11412 case 's': case 'S': /* space class */
11413 case 'v': case 'V': /* VERTWS */
11414 case 'w': case 'W': /* word class */
11415 case 'X': /* eXtended Unicode "combining character sequence" */
11416 case 'z': case 'Z': /* End of line/string assertion */
11420 /* Anything after here is an escape that resolves to a
11421 literal. (Except digits, which may or may not)
11427 case 'N': /* Handle a single-code point named character. */
11428 /* The options cause it to fail if a multiple code
11429 * point sequence. Handle those in the switch() above
11431 RExC_parse = p + 1;
11432 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11433 flagp, depth, FALSE,
11434 FALSE /* not strict */ ))
11436 if (*flagp & RESTART_UTF8)
11437 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11438 RExC_parse = p = oldp;
11442 if (ender > 0xff) {
11459 ender = ASCII_TO_NATIVE('\033');
11469 const char* error_msg;
11471 bool valid = grok_bslash_o(&p,
11474 TRUE, /* out warnings */
11475 FALSE, /* not strict */
11476 TRUE, /* Output warnings
11481 RExC_parse = p; /* going to die anyway; point
11482 to exact spot of failure */
11486 if (PL_encoding && ender < 0x100) {
11487 goto recode_encoding;
11489 if (ender > 0xff) {
11496 UV result = UV_MAX; /* initialize to erroneous
11498 const char* error_msg;
11500 bool valid = grok_bslash_x(&p,
11503 TRUE, /* out warnings */
11504 FALSE, /* not strict */
11505 TRUE, /* Output warnings
11510 RExC_parse = p; /* going to die anyway; point
11511 to exact spot of failure */
11516 if (PL_encoding && ender < 0x100) {
11517 goto recode_encoding;
11519 if (ender > 0xff) {
11526 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11528 case '8': case '9': /* must be a backreference */
11531 case '1': case '2': case '3':case '4':
11532 case '5': case '6': case '7':
11533 /* When we parse backslash escapes there is ambiguity
11534 * between backreferences and octal escapes. Any escape
11535 * from \1 - \9 is a backreference, any multi-digit
11536 * escape which does not start with 0 and which when
11537 * evaluated as decimal could refer to an already
11538 * parsed capture buffer is a backslash. Anything else
11541 * Note this implies that \118 could be interpreted as
11542 * 118 OR as "\11" . "8" depending on whether there
11543 * were 118 capture buffers defined already in the
11545 if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11546 { /* Not to be treated as an octal constant, go
11553 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11555 ender = grok_oct(p, &numlen, &flags, NULL);
11556 if (ender > 0xff) {
11560 if (SIZE_ONLY /* like \08, \178 */
11563 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11565 reg_warn_non_literal_string(
11567 form_short_octal_warning(p, numlen));
11570 if (PL_encoding && ender < 0x100)
11571 goto recode_encoding;
11574 if (! RExC_override_recoding) {
11575 SV* enc = PL_encoding;
11576 ender = reg_recode((const char)(U8)ender, &enc);
11577 if (!enc && SIZE_ONLY)
11578 ckWARNreg(p, "Invalid escape in the specified encoding");
11584 FAIL("Trailing \\");
11587 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11588 /* Include any { following the alpha to emphasize
11589 * that it could be part of an escape at some point
11591 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11592 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11594 goto normal_default;
11595 } /* End of switch on '\' */
11597 default: /* A literal character */
11600 && RExC_flags & RXf_PMf_EXTENDED
11601 && ckWARN_d(WARN_DEPRECATED)
11602 && is_PATWS_non_low(p, UTF))
11604 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11605 "Escape literal pattern white space under /x");
11609 if (UTF8_IS_START(*p) && UTF) {
11611 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11612 &numlen, UTF8_ALLOW_DEFAULT);
11618 } /* End of switch on the literal */
11620 /* Here, have looked at the literal character and <ender>
11621 * contains its ordinal, <p> points to the character after it
11624 if ( RExC_flags & RXf_PMf_EXTENDED)
11625 p = regwhite( pRExC_state, p );
11627 /* If the next thing is a quantifier, it applies to this
11628 * character only, which means that this character has to be in
11629 * its own node and can't just be appended to the string in an
11630 * existing node, so if there are already other characters in
11631 * the node, close the node with just them, and set up to do
11632 * this character again next time through, when it will be the
11633 * only thing in its new node */
11634 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11642 const STRLEN unilen = reguni(pRExC_state, ender, s);
11648 /* The loop increments <len> each time, as all but this
11649 * path (and one other) through it add a single byte to
11650 * the EXACTish node. But this one has changed len to
11651 * be the correct final value, so subtract one to
11652 * cancel out the increment that follows */
11656 REGC((char)ender, s++);
11659 else /* FOLD */ if (! ( UTF
11660 /* See comments for join_exact() as to why we fold this
11661 * non-UTF at compile time */
11662 || (node_type == EXACTFU
11663 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11665 if (IS_IN_SOME_FOLD_L1(ender)) {
11666 maybe_exact = FALSE;
11668 /* See if the character's fold differs between /d and
11669 * /u. This includes the multi-char fold SHARP S to
11672 && (PL_fold[ender] != PL_fold_latin1[ender]
11673 || ender == LATIN_SMALL_LETTER_SHARP_S
11675 && isARG2_lower_or_UPPER_ARG1('s', ender)
11676 && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11678 maybe_exactfu = FALSE;
11681 *(s++) = (char) ender;
11685 /* Prime the casefolded buffer. Locale rules, which apply
11686 * only to code points < 256, aren't known until execution,
11687 * so for them, just output the original character using
11688 * utf8. If we start to fold non-UTF patterns, be sure to
11689 * update join_exact() */
11690 if (LOC && ender < 256) {
11691 if (UVCHR_IS_INVARIANT(ender)) {
11695 *s = UTF8_TWO_BYTE_HI(ender);
11696 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11701 UV folded = _to_uni_fold_flags(
11706 | ((LOC) ? FOLD_FLAGS_LOCALE
11707 : (ASCII_FOLD_RESTRICTED)
11708 ? FOLD_FLAGS_NOMIX_ASCII
11712 /* If this node only contains non-folding code points
11713 * so far, see if this new one is also non-folding */
11715 if (folded != ender) {
11716 maybe_exact = FALSE;
11719 /* Here the fold is the original; we have
11720 * to check further to see if anything
11722 if (! PL_utf8_foldable) {
11723 SV* swash = swash_init("utf8",
11725 &PL_sv_undef, 1, 0);
11727 _get_swash_invlist(swash);
11728 SvREFCNT_dec_NN(swash);
11730 if (_invlist_contains_cp(PL_utf8_foldable,
11733 maybe_exact = FALSE;
11741 /* The loop increments <len> each time, as all but this
11742 * path (and one other) through it add a single byte to the
11743 * EXACTish node. But this one has changed len to be the
11744 * correct final value, so subtract one to cancel out the
11745 * increment that follows */
11746 len += foldlen - 1;
11749 if (next_is_quantifier) {
11751 /* Here, the next input is a quantifier, and to get here,
11752 * the current character is the only one in the node.
11753 * Also, here <len> doesn't include the final byte for this
11759 } /* End of loop through literal characters */
11761 /* Here we have either exhausted the input or ran out of room in
11762 * the node. (If we encountered a character that can't be in the
11763 * node, transfer is made directly to <loopdone>, and so we
11764 * wouldn't have fallen off the end of the loop.) In the latter
11765 * case, we artificially have to split the node into two, because
11766 * we just don't have enough space to hold everything. This
11767 * creates a problem if the final character participates in a
11768 * multi-character fold in the non-final position, as a match that
11769 * should have occurred won't, due to the way nodes are matched,
11770 * and our artificial boundary. So back off until we find a non-
11771 * problematic character -- one that isn't at the beginning or
11772 * middle of such a fold. (Either it doesn't participate in any
11773 * folds, or appears only in the final position of all the folds it
11774 * does participate in.) A better solution with far fewer false
11775 * positives, and that would fill the nodes more completely, would
11776 * be to actually have available all the multi-character folds to
11777 * test against, and to back-off only far enough to be sure that
11778 * this node isn't ending with a partial one. <upper_parse> is set
11779 * further below (if we need to reparse the node) to include just
11780 * up through that final non-problematic character that this code
11781 * identifies, so when it is set to less than the full node, we can
11782 * skip the rest of this */
11783 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11785 const STRLEN full_len = len;
11787 assert(len >= MAX_NODE_STRING_SIZE);
11789 /* Here, <s> points to the final byte of the final character.
11790 * Look backwards through the string until find a non-
11791 * problematic character */
11795 /* These two have no multi-char folds to non-UTF characters
11797 if (ASCII_FOLD_RESTRICTED || LOC) {
11801 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11805 if (! PL_NonL1NonFinalFold) {
11806 PL_NonL1NonFinalFold = _new_invlist_C_array(
11807 NonL1_Perl_Non_Final_Folds_invlist);
11810 /* Point to the first byte of the final character */
11811 s = (char *) utf8_hop((U8 *) s, -1);
11813 while (s >= s0) { /* Search backwards until find
11814 non-problematic char */
11815 if (UTF8_IS_INVARIANT(*s)) {
11817 /* There are no ascii characters that participate
11818 * in multi-char folds under /aa. In EBCDIC, the
11819 * non-ascii invariants are all control characters,
11820 * so don't ever participate in any folds. */
11821 if (ASCII_FOLD_RESTRICTED
11822 || ! IS_NON_FINAL_FOLD(*s))
11827 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11829 /* No Latin1 characters participate in multi-char
11830 * folds under /l */
11832 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11838 else if (! _invlist_contains_cp(
11839 PL_NonL1NonFinalFold,
11840 valid_utf8_to_uvchr((U8 *) s, NULL)))
11845 /* Here, the current character is problematic in that
11846 * it does occur in the non-final position of some
11847 * fold, so try the character before it, but have to
11848 * special case the very first byte in the string, so
11849 * we don't read outside the string */
11850 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11851 } /* End of loop backwards through the string */
11853 /* If there were only problematic characters in the string,
11854 * <s> will point to before s0, in which case the length
11855 * should be 0, otherwise include the length of the
11856 * non-problematic character just found */
11857 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11860 /* Here, have found the final character, if any, that is
11861 * non-problematic as far as ending the node without splitting
11862 * it across a potential multi-char fold. <len> contains the
11863 * number of bytes in the node up-to and including that
11864 * character, or is 0 if there is no such character, meaning
11865 * the whole node contains only problematic characters. In
11866 * this case, give up and just take the node as-is. We can't
11871 /* If the node ends in an 's' we make sure it stays EXACTF,
11872 * as if it turns into an EXACTFU, it could later get
11873 * joined with another 's' that would then wrongly match
11875 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11877 maybe_exactfu = FALSE;
11881 /* Here, the node does contain some characters that aren't
11882 * problematic. If one such is the final character in the
11883 * node, we are done */
11884 if (len == full_len) {
11887 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11889 /* If the final character is problematic, but the
11890 * penultimate is not, back-off that last character to
11891 * later start a new node with it */
11896 /* Here, the final non-problematic character is earlier
11897 * in the input than the penultimate character. What we do
11898 * is reparse from the beginning, going up only as far as
11899 * this final ok one, thus guaranteeing that the node ends
11900 * in an acceptable character. The reason we reparse is
11901 * that we know how far in the character is, but we don't
11902 * know how to correlate its position with the input parse.
11903 * An alternate implementation would be to build that
11904 * correlation as we go along during the original parse,
11905 * but that would entail extra work for every node, whereas
11906 * this code gets executed only when the string is too
11907 * large for the node, and the final two characters are
11908 * problematic, an infrequent occurrence. Yet another
11909 * possible strategy would be to save the tail of the
11910 * string, and the next time regatom is called, initialize
11911 * with that. The problem with this is that unless you
11912 * back off one more character, you won't be guaranteed
11913 * regatom will get called again, unless regbranch,
11914 * regpiece ... are also changed. If you do back off that
11915 * extra character, so that there is input guaranteed to
11916 * force calling regatom, you can't handle the case where
11917 * just the first character in the node is acceptable. I
11918 * (khw) decided to try this method which doesn't have that
11919 * pitfall; if performance issues are found, we can do a
11920 * combination of the current approach plus that one */
11926 } /* End of verifying node ends with an appropriate char */
11928 loopdone: /* Jumped to when encounters something that shouldn't be in
11931 /* I (khw) don't know if you can get here with zero length, but the
11932 * old code handled this situation by creating a zero-length EXACT
11933 * node. Might as well be NOTHING instead */
11939 /* If 'maybe_exact' is still set here, means there are no
11940 * code points in the node that participate in folds;
11941 * similarly for 'maybe_exactfu' and code points that match
11942 * differently depending on UTF8ness of the target string
11947 else if (maybe_exactfu) {
11951 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11954 RExC_parse = p - 1;
11955 Set_Node_Cur_Length(ret, parse_start);
11956 nextchar(pRExC_state);
11958 /* len is STRLEN which is unsigned, need to copy to signed */
11961 vFAIL("Internal disaster");
11964 } /* End of label 'defchar:' */
11966 } /* End of giant switch on input character */
11972 S_regwhite( RExC_state_t *pRExC_state, char *p )
11974 const char *e = RExC_end;
11976 PERL_ARGS_ASSERT_REGWHITE;
11981 else if (*p == '#') {
11984 if (*p++ == '\n') {
11990 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11999 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12001 /* Returns the next non-pattern-white space, non-comment character (the
12002 * latter only if 'recognize_comment is true) in the string p, which is
12003 * ended by RExC_end. If there is no line break ending a comment,
12004 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
12005 const char *e = RExC_end;
12007 PERL_ARGS_ASSERT_REGPATWS;
12011 if ((len = is_PATWS_safe(p, e, UTF))) {
12014 else if (recognize_comment && *p == '#') {
12018 if (is_LNBREAK_safe(p, e, UTF)) {
12024 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12033 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12035 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12036 * sets up the bitmap and any flags, removing those code points from the
12037 * inversion list, setting it to NULL should it become completely empty */
12039 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12040 assert(PL_regkind[OP(node)] == ANYOF);
12042 ANYOF_BITMAP_ZERO(node);
12043 if (*invlist_ptr) {
12045 /* This gets set if we actually need to modify things */
12046 bool change_invlist = FALSE;
12050 /* Start looking through *invlist_ptr */
12051 invlist_iterinit(*invlist_ptr);
12052 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12056 if (end == UV_MAX && start <= 256) {
12057 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12060 /* Quit if are above what we should change */
12065 change_invlist = TRUE;
12067 /* Set all the bits in the range, up to the max that we are doing */
12068 high = (end < 255) ? end : 255;
12069 for (i = start; i <= (int) high; i++) {
12070 if (! ANYOF_BITMAP_TEST(node, i)) {
12071 ANYOF_BITMAP_SET(node, i);
12075 invlist_iterfinish(*invlist_ptr);
12077 /* Done with loop; remove any code points that are in the bitmap from
12078 * *invlist_ptr; similarly for code points above latin1 if we have a flag
12079 * to match all of them anyways */
12080 if (change_invlist) {
12081 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12083 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12084 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12087 /* If have completely emptied it, remove it completely */
12088 if (_invlist_len(*invlist_ptr) == 0) {
12089 SvREFCNT_dec_NN(*invlist_ptr);
12090 *invlist_ptr = NULL;
12095 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12096 Character classes ([:foo:]) can also be negated ([:^foo:]).
12097 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12098 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12099 but trigger failures because they are currently unimplemented. */
12101 #define POSIXCC_DONE(c) ((c) == ':')
12102 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12103 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12105 PERL_STATIC_INLINE I32
12106 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12109 I32 namedclass = OOB_NAMEDCLASS;
12111 PERL_ARGS_ASSERT_REGPPOSIXCC;
12113 if (value == '[' && RExC_parse + 1 < RExC_end &&
12114 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12115 POSIXCC(UCHARAT(RExC_parse)))
12117 const char c = UCHARAT(RExC_parse);
12118 char* const s = RExC_parse++;
12120 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12122 if (RExC_parse == RExC_end) {
12125 /* Try to give a better location for the error (than the end of
12126 * the string) by looking for the matching ']' */
12128 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12131 vFAIL2("Unmatched '%c' in POSIX class", c);
12133 /* Grandfather lone [:, [=, [. */
12137 const char* const t = RExC_parse++; /* skip over the c */
12140 if (UCHARAT(RExC_parse) == ']') {
12141 const char *posixcc = s + 1;
12142 RExC_parse++; /* skip over the ending ] */
12145 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12146 const I32 skip = t - posixcc;
12148 /* Initially switch on the length of the name. */
12151 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12152 this is the Perl \w
12154 namedclass = ANYOF_WORDCHAR;
12157 /* Names all of length 5. */
12158 /* alnum alpha ascii blank cntrl digit graph lower
12159 print punct space upper */
12160 /* Offset 4 gives the best switch position. */
12161 switch (posixcc[4]) {
12163 if (memEQ(posixcc, "alph", 4)) /* alpha */
12164 namedclass = ANYOF_ALPHA;
12167 if (memEQ(posixcc, "spac", 4)) /* space */
12168 namedclass = ANYOF_PSXSPC;
12171 if (memEQ(posixcc, "grap", 4)) /* graph */
12172 namedclass = ANYOF_GRAPH;
12175 if (memEQ(posixcc, "asci", 4)) /* ascii */
12176 namedclass = ANYOF_ASCII;
12179 if (memEQ(posixcc, "blan", 4)) /* blank */
12180 namedclass = ANYOF_BLANK;
12183 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12184 namedclass = ANYOF_CNTRL;
12187 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12188 namedclass = ANYOF_ALPHANUMERIC;
12191 if (memEQ(posixcc, "lowe", 4)) /* lower */
12192 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12193 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12194 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12197 if (memEQ(posixcc, "digi", 4)) /* digit */
12198 namedclass = ANYOF_DIGIT;
12199 else if (memEQ(posixcc, "prin", 4)) /* print */
12200 namedclass = ANYOF_PRINT;
12201 else if (memEQ(posixcc, "punc", 4)) /* punct */
12202 namedclass = ANYOF_PUNCT;
12207 if (memEQ(posixcc, "xdigit", 6))
12208 namedclass = ANYOF_XDIGIT;
12212 if (namedclass == OOB_NAMEDCLASS)
12214 "POSIX class [:%"UTF8f":] unknown",
12215 UTF8fARG(UTF, t - s - 1, s + 1));
12217 /* The #defines are structured so each complement is +1 to
12218 * the normal one */
12222 assert (posixcc[skip] == ':');
12223 assert (posixcc[skip+1] == ']');
12224 } else if (!SIZE_ONLY) {
12225 /* [[=foo=]] and [[.foo.]] are still future. */
12227 /* adjust RExC_parse so the warning shows after
12228 the class closes */
12229 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12231 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12234 /* Maternal grandfather:
12235 * "[:" ending in ":" but not in ":]" */
12237 vFAIL("Unmatched '[' in POSIX class");
12240 /* Grandfather lone [:, [=, [. */
12250 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12252 /* This applies some heuristics at the current parse position (which should
12253 * be at a '[') to see if what follows might be intended to be a [:posix:]
12254 * class. It returns true if it really is a posix class, of course, but it
12255 * also can return true if it thinks that what was intended was a posix
12256 * class that didn't quite make it.
12258 * It will return true for
12260 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12261 * ')' indicating the end of the (?[
12262 * [:any garbage including %^&$ punctuation:]
12264 * This is designed to be called only from S_handle_regex_sets; it could be
12265 * easily adapted to be called from the spot at the beginning of regclass()
12266 * that checks to see in a normal bracketed class if the surrounding []
12267 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12268 * change long-standing behavior, so I (khw) didn't do that */
12269 char* p = RExC_parse + 1;
12270 char first_char = *p;
12272 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12274 assert(*(p - 1) == '[');
12276 if (! POSIXCC(first_char)) {
12281 while (p < RExC_end && isWORDCHAR(*p)) p++;
12283 if (p >= RExC_end) {
12287 if (p - RExC_parse > 2 /* Got at least 1 word character */
12288 && (*p == first_char
12289 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12294 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12297 && p - RExC_parse > 2 /* [:] evaluates to colon;
12298 [::] is a bad posix class. */
12299 && first_char == *(p - 1));
12303 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12304 char * const oregcomp_parse)
12306 /* Handle the (?[...]) construct to do set operations */
12309 UV start, end; /* End points of code point ranges */
12311 char *save_end, *save_parse;
12316 const bool save_fold = FOLD;
12318 GET_RE_DEBUG_FLAGS_DECL;
12320 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12323 vFAIL("(?[...]) not valid in locale");
12325 RExC_uni_semantics = 1;
12327 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12328 * (such as EXACT). Thus we can skip most everything if just sizing. We
12329 * call regclass to handle '[]' so as to not have to reinvent its parsing
12330 * rules here (throwing away the size it computes each time). And, we exit
12331 * upon an unescaped ']' that isn't one ending a regclass. To do both
12332 * these things, we need to realize that something preceded by a backslash
12333 * is escaped, so we have to keep track of backslashes */
12335 UV depth = 0; /* how many nested (?[...]) constructs */
12337 Perl_ck_warner_d(aTHX_
12338 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12339 "The regex_sets feature is experimental" REPORT_LOCATION,
12340 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12341 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12343 while (RExC_parse < RExC_end) {
12344 SV* current = NULL;
12345 RExC_parse = regpatws(pRExC_state, RExC_parse,
12346 TRUE); /* means recognize comments */
12347 switch (*RExC_parse) {
12349 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12354 /* Skip the next byte (which could cause us to end up in
12355 * the middle of a UTF-8 character, but since none of those
12356 * are confusable with anything we currently handle in this
12357 * switch (invariants all), it's safe. We'll just hit the
12358 * default: case next time and keep on incrementing until
12359 * we find one of the invariants we do handle. */
12364 /* If this looks like it is a [:posix:] class, leave the
12365 * parse pointer at the '[' to fool regclass() into
12366 * thinking it is part of a '[[:posix:]]'. That function
12367 * will use strict checking to force a syntax error if it
12368 * doesn't work out to a legitimate class */
12369 bool is_posix_class
12370 = could_it_be_a_POSIX_class(pRExC_state);
12371 if (! is_posix_class) {
12375 /* regclass() can only return RESTART_UTF8 if multi-char
12376 folds are allowed. */
12377 if (!regclass(pRExC_state, flagp,depth+1,
12378 is_posix_class, /* parse the whole char
12379 class only if not a
12381 FALSE, /* don't allow multi-char folds */
12382 TRUE, /* silence non-portable warnings. */
12384 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12387 /* function call leaves parse pointing to the ']', except
12388 * if we faked it */
12389 if (is_posix_class) {
12393 SvREFCNT_dec(current); /* In case it returned something */
12398 if (depth--) break;
12400 if (RExC_parse < RExC_end
12401 && *RExC_parse == ')')
12403 node = reganode(pRExC_state, ANYOF, 0);
12404 RExC_size += ANYOF_SKIP;
12405 nextchar(pRExC_state);
12406 Set_Node_Length(node,
12407 RExC_parse - oregcomp_parse + 1); /* MJD */
12416 FAIL("Syntax error in (?[...])");
12419 /* Pass 2 only after this. Everything in this construct is a
12420 * metacharacter. Operands begin with either a '\' (for an escape
12421 * sequence), or a '[' for a bracketed character class. Any other
12422 * character should be an operator, or parenthesis for grouping. Both
12423 * types of operands are handled by calling regclass() to parse them. It
12424 * is called with a parameter to indicate to return the computed inversion
12425 * list. The parsing here is implemented via a stack. Each entry on the
12426 * stack is a single character representing one of the operators, or the
12427 * '('; or else a pointer to an operand inversion list. */
12429 #define IS_OPERAND(a) (! SvIOK(a))
12431 /* The stack starts empty. It is a syntax error if the first thing parsed
12432 * is a binary operator; everything else is pushed on the stack. When an
12433 * operand is parsed, the top of the stack is examined. If it is a binary
12434 * operator, the item before it should be an operand, and both are replaced
12435 * by the result of doing that operation on the new operand and the one on
12436 * the stack. Thus a sequence of binary operands is reduced to a single
12437 * one before the next one is parsed.
12439 * A unary operator may immediately follow a binary in the input, for
12442 * When an operand is parsed and the top of the stack is a unary operator,
12443 * the operation is performed, and then the stack is rechecked to see if
12444 * this new operand is part of a binary operation; if so, it is handled as
12447 * A '(' is simply pushed on the stack; it is valid only if the stack is
12448 * empty, or the top element of the stack is an operator or another '('
12449 * (for which the parenthesized expression will become an operand). By the
12450 * time the corresponding ')' is parsed everything in between should have
12451 * been parsed and evaluated to a single operand (or else is a syntax
12452 * error), and is handled as a regular operand */
12454 sv_2mortal((SV *)(stack = newAV()));
12456 while (RExC_parse < RExC_end) {
12457 I32 top_index = av_tindex(stack);
12459 SV* current = NULL;
12461 /* Skip white space */
12462 RExC_parse = regpatws(pRExC_state, RExC_parse,
12463 TRUE); /* means recognize comments */
12464 if (RExC_parse >= RExC_end) {
12465 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12467 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12474 if (av_tindex(stack) >= 0 /* This makes sure that we can
12475 safely subtract 1 from
12476 RExC_parse in the next clause.
12477 If we have something on the
12478 stack, we have parsed something
12480 && UCHARAT(RExC_parse - 1) == '('
12481 && RExC_parse < RExC_end)
12483 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12484 * This happens when we have some thing like
12486 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12488 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12490 * Here we would be handling the interpolated
12491 * '$thai_or_lao'. We handle this by a recursive call to
12492 * ourselves which returns the inversion list the
12493 * interpolated expression evaluates to. We use the flags
12494 * from the interpolated pattern. */
12495 U32 save_flags = RExC_flags;
12496 const char * const save_parse = ++RExC_parse;
12498 parse_lparen_question_flags(pRExC_state);
12500 if (RExC_parse == save_parse /* Makes sure there was at
12501 least one flag (or this
12502 embedding wasn't compiled)
12504 || RExC_parse >= RExC_end - 4
12505 || UCHARAT(RExC_parse) != ':'
12506 || UCHARAT(++RExC_parse) != '('
12507 || UCHARAT(++RExC_parse) != '?'
12508 || UCHARAT(++RExC_parse) != '[')
12511 /* In combination with the above, this moves the
12512 * pointer to the point just after the first erroneous
12513 * character (or if there are no flags, to where they
12514 * should have been) */
12515 if (RExC_parse >= RExC_end - 4) {
12516 RExC_parse = RExC_end;
12518 else if (RExC_parse != save_parse) {
12519 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12521 vFAIL("Expecting '(?flags:(?[...'");
12524 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12525 depth+1, oregcomp_parse);
12527 /* Here, 'current' contains the embedded expression's
12528 * inversion list, and RExC_parse points to the trailing
12529 * ']'; the next character should be the ')' which will be
12530 * paired with the '(' that has been put on the stack, so
12531 * the whole embedded expression reduces to '(operand)' */
12534 RExC_flags = save_flags;
12535 goto handle_operand;
12540 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12541 vFAIL("Unexpected character");
12544 /* regclass() can only return RESTART_UTF8 if multi-char
12545 folds are allowed. */
12546 if (!regclass(pRExC_state, flagp,depth+1,
12547 TRUE, /* means parse just the next thing */
12548 FALSE, /* don't allow multi-char folds */
12549 FALSE, /* don't silence non-portable warnings. */
12551 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12553 /* regclass() will return with parsing just the \ sequence,
12554 * leaving the parse pointer at the next thing to parse */
12556 goto handle_operand;
12558 case '[': /* Is a bracketed character class */
12560 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12562 if (! is_posix_class) {
12566 /* regclass() can only return RESTART_UTF8 if multi-char
12567 folds are allowed. */
12568 if(!regclass(pRExC_state, flagp,depth+1,
12569 is_posix_class, /* parse the whole char class
12570 only if not a posix class */
12571 FALSE, /* don't allow multi-char folds */
12572 FALSE, /* don't silence non-portable warnings. */
12574 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12576 /* function call leaves parse pointing to the ']', except if we
12578 if (is_posix_class) {
12582 goto handle_operand;
12591 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12592 || ! IS_OPERAND(*top_ptr))
12595 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12597 av_push(stack, newSVuv(curchar));
12601 av_push(stack, newSVuv(curchar));
12605 if (top_index >= 0) {
12606 top_ptr = av_fetch(stack, top_index, FALSE);
12608 if (IS_OPERAND(*top_ptr)) {
12610 vFAIL("Unexpected '(' with no preceding operator");
12613 av_push(stack, newSVuv(curchar));
12620 || ! (current = av_pop(stack))
12621 || ! IS_OPERAND(current)
12622 || ! (lparen = av_pop(stack))
12623 || IS_OPERAND(lparen)
12624 || SvUV(lparen) != '(')
12626 SvREFCNT_dec(current);
12628 vFAIL("Unexpected ')'");
12631 SvREFCNT_dec_NN(lparen);
12638 /* Here, we have an operand to process, in 'current' */
12640 if (top_index < 0) { /* Just push if stack is empty */
12641 av_push(stack, current);
12644 SV* top = av_pop(stack);
12646 char current_operator;
12648 if (IS_OPERAND(top)) {
12649 SvREFCNT_dec_NN(top);
12650 SvREFCNT_dec_NN(current);
12651 vFAIL("Operand with no preceding operator");
12653 current_operator = (char) SvUV(top);
12654 switch (current_operator) {
12655 case '(': /* Push the '(' back on followed by the new
12657 av_push(stack, top);
12658 av_push(stack, current);
12659 SvREFCNT_inc(top); /* Counters the '_dec' done
12660 just after the 'break', so
12661 it doesn't get wrongly freed
12666 _invlist_invert(current);
12668 /* Unlike binary operators, the top of the stack,
12669 * now that this unary one has been popped off, may
12670 * legally be an operator, and we now have operand
12673 SvREFCNT_dec_NN(top);
12674 goto handle_operand;
12677 prev = av_pop(stack);
12678 _invlist_intersection(prev,
12681 av_push(stack, current);
12686 prev = av_pop(stack);
12687 _invlist_union(prev, current, ¤t);
12688 av_push(stack, current);
12692 prev = av_pop(stack);;
12693 _invlist_subtract(prev, current, ¤t);
12694 av_push(stack, current);
12697 case '^': /* The union minus the intersection */
12703 prev = av_pop(stack);
12704 _invlist_union(prev, current, &u);
12705 _invlist_intersection(prev, current, &i);
12706 /* _invlist_subtract will overwrite current
12707 without freeing what it already contains */
12709 _invlist_subtract(u, i, ¤t);
12710 av_push(stack, current);
12711 SvREFCNT_dec_NN(i);
12712 SvREFCNT_dec_NN(u);
12713 SvREFCNT_dec_NN(element);
12718 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12720 SvREFCNT_dec_NN(top);
12721 SvREFCNT_dec(prev);
12725 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12728 if (av_tindex(stack) < 0 /* Was empty */
12729 || ((final = av_pop(stack)) == NULL)
12730 || ! IS_OPERAND(final)
12731 || av_tindex(stack) >= 0) /* More left on stack */
12733 vFAIL("Incomplete expression within '(?[ ])'");
12736 /* Here, 'final' is the resultant inversion list from evaluating the
12737 * expression. Return it if so requested */
12738 if (return_invlist) {
12739 *return_invlist = final;
12743 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12744 * expecting a string of ranges and individual code points */
12745 invlist_iterinit(final);
12746 result_string = newSVpvs("");
12747 while (invlist_iternext(final, &start, &end)) {
12748 if (start == end) {
12749 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12752 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12757 save_parse = RExC_parse;
12758 RExC_parse = SvPV(result_string, len);
12759 save_end = RExC_end;
12760 RExC_end = RExC_parse + len;
12762 /* We turn off folding around the call, as the class we have constructed
12763 * already has all folding taken into consideration, and we don't want
12764 * regclass() to add to that */
12765 RExC_flags &= ~RXf_PMf_FOLD;
12766 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12768 node = regclass(pRExC_state, flagp,depth+1,
12769 FALSE, /* means parse the whole char class */
12770 FALSE, /* don't allow multi-char folds */
12771 TRUE, /* silence non-portable warnings. The above may very
12772 well have generated non-portable code points, but
12773 they're valid on this machine */
12776 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12779 RExC_flags |= RXf_PMf_FOLD;
12781 RExC_parse = save_parse + 1;
12782 RExC_end = save_end;
12783 SvREFCNT_dec_NN(final);
12784 SvREFCNT_dec_NN(result_string);
12786 nextchar(pRExC_state);
12787 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12792 /* The names of properties whose definitions are not known at compile time are
12793 * stored in this SV, after a constant heading. So if the length has been
12794 * changed since initialization, then there is a run-time definition. */
12795 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12798 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12799 const bool stop_at_1, /* Just parse the next thing, don't
12800 look for a full character class */
12801 bool allow_multi_folds,
12802 const bool silence_non_portable, /* Don't output warnings
12805 SV** ret_invlist) /* Return an inversion list, not a node */
12807 /* parse a bracketed class specification. Most of these will produce an
12808 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12809 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12810 * under /i with multi-character folds: it will be rewritten following the
12811 * paradigm of this example, where the <multi-fold>s are characters which
12812 * fold to multiple character sequences:
12813 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12814 * gets effectively rewritten as:
12815 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12816 * reg() gets called (recursively) on the rewritten version, and this
12817 * function will return what it constructs. (Actually the <multi-fold>s
12818 * aren't physically removed from the [abcdefghi], it's just that they are
12819 * ignored in the recursion by means of a flag:
12820 * <RExC_in_multi_char_class>.)
12822 * ANYOF nodes contain a bit map for the first 256 characters, with the
12823 * corresponding bit set if that character is in the list. For characters
12824 * above 255, a range list or swash is used. There are extra bits for \w,
12825 * etc. in locale ANYOFs, as what these match is not determinable at
12828 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12829 * to be restarted. This can only happen if ret_invlist is non-NULL.
12833 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12835 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12838 IV namedclass = OOB_NAMEDCLASS;
12839 char *rangebegin = NULL;
12840 bool need_class = 0;
12842 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12843 than just initialized. */
12844 SV* properties = NULL; /* Code points that match \p{} \P{} */
12845 SV* posixes = NULL; /* Code points that match classes like [:word:],
12846 extended beyond the Latin1 range. These have to
12847 be kept separate from other code points for much
12848 of this function because their handling is
12849 different under /i, and for most classes under
12851 UV element_count = 0; /* Number of distinct elements in the class.
12852 Optimizations may be possible if this is tiny */
12853 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12854 character; used under /i */
12856 char * stop_ptr = RExC_end; /* where to stop parsing */
12857 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12859 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12861 /* Unicode properties are stored in a swash; this holds the current one
12862 * being parsed. If this swash is the only above-latin1 component of the
12863 * character class, an optimization is to pass it directly on to the
12864 * execution engine. Otherwise, it is set to NULL to indicate that there
12865 * are other things in the class that have to be dealt with at execution
12867 SV* swash = NULL; /* Code points that match \p{} \P{} */
12869 /* Set if a component of this character class is user-defined; just passed
12870 * on to the engine */
12871 bool has_user_defined_property = FALSE;
12873 /* inversion list of code points this node matches only when the target
12874 * string is in UTF-8. (Because is under /d) */
12875 SV* depends_list = NULL;
12877 /* inversion list of code points this node matches. For much of the
12878 * function, it includes only those that match regardless of the utf8ness
12879 * of the target string */
12880 SV* cp_list = NULL;
12883 /* In a range, counts how many 0-2 of the ends of it came from literals,
12884 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12885 UV literal_endpoint = 0;
12887 bool invert = FALSE; /* Is this class to be complemented */
12889 /* Is there any thing like \W or [:^digit:] that matches above the legal
12890 * Unicode range? */
12891 bool runtime_posix_matches_above_Unicode = FALSE;
12893 bool warn_super = ALWAYS_WARN_SUPER;
12895 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12896 case we need to change the emitted regop to an EXACT. */
12897 const char * orig_parse = RExC_parse;
12898 const SSize_t orig_size = RExC_size;
12899 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12900 GET_RE_DEBUG_FLAGS_DECL;
12902 PERL_ARGS_ASSERT_REGCLASS;
12904 PERL_UNUSED_ARG(depth);
12907 DEBUG_PARSE("clas");
12909 /* Assume we are going to generate an ANYOF node. */
12910 ret = reganode(pRExC_state, ANYOF, 0);
12913 RExC_size += ANYOF_SKIP;
12914 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12917 ANYOF_FLAGS(ret) = 0;
12919 RExC_emit += ANYOF_SKIP;
12921 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12923 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12924 initial_listsv_len = SvCUR(listsv);
12925 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12929 RExC_parse = regpatws(pRExC_state, RExC_parse,
12930 FALSE /* means don't recognize comments */);
12933 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12936 allow_multi_folds = FALSE;
12939 RExC_parse = regpatws(pRExC_state, RExC_parse,
12940 FALSE /* means don't recognize comments */);
12944 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12945 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12946 const char *s = RExC_parse;
12947 const char c = *s++;
12949 while (isWORDCHAR(*s))
12951 if (*s && c == *s && s[1] == ']') {
12952 SAVEFREESV(RExC_rx_sv);
12954 "POSIX syntax [%c %c] belongs inside character classes",
12956 (void)ReREFCNT_inc(RExC_rx_sv);
12960 /* If the caller wants us to just parse a single element, accomplish this
12961 * by faking the loop ending condition */
12962 if (stop_at_1 && RExC_end > RExC_parse) {
12963 stop_ptr = RExC_parse + 1;
12966 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12967 if (UCHARAT(RExC_parse) == ']')
12968 goto charclassloop;
12972 if (RExC_parse >= stop_ptr) {
12977 RExC_parse = regpatws(pRExC_state, RExC_parse,
12978 FALSE /* means don't recognize comments */);
12981 if (UCHARAT(RExC_parse) == ']') {
12987 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12988 save_value = value;
12989 save_prevvalue = prevvalue;
12992 rangebegin = RExC_parse;
12996 value = utf8n_to_uvchr((U8*)RExC_parse,
12997 RExC_end - RExC_parse,
12998 &numlen, UTF8_ALLOW_DEFAULT);
12999 RExC_parse += numlen;
13002 value = UCHARAT(RExC_parse++);
13005 && RExC_parse < RExC_end
13006 && POSIXCC(UCHARAT(RExC_parse)))
13008 namedclass = regpposixcc(pRExC_state, value, strict);
13010 else if (value == '\\') {
13012 value = utf8n_to_uvchr((U8*)RExC_parse,
13013 RExC_end - RExC_parse,
13014 &numlen, UTF8_ALLOW_DEFAULT);
13015 RExC_parse += numlen;
13018 value = UCHARAT(RExC_parse++);
13020 /* Some compilers cannot handle switching on 64-bit integer
13021 * values, therefore value cannot be an UV. Yes, this will
13022 * be a problem later if we want switch on Unicode.
13023 * A similar issue a little bit later when switching on
13024 * namedclass. --jhi */
13026 /* If the \ is escaping white space when white space is being
13027 * skipped, it means that that white space is wanted literally, and
13028 * is already in 'value'. Otherwise, need to translate the escape
13029 * into what it signifies. */
13030 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13032 case 'w': namedclass = ANYOF_WORDCHAR; break;
13033 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13034 case 's': namedclass = ANYOF_SPACE; break;
13035 case 'S': namedclass = ANYOF_NSPACE; break;
13036 case 'd': namedclass = ANYOF_DIGIT; break;
13037 case 'D': namedclass = ANYOF_NDIGIT; break;
13038 case 'v': namedclass = ANYOF_VERTWS; break;
13039 case 'V': namedclass = ANYOF_NVERTWS; break;
13040 case 'h': namedclass = ANYOF_HORIZWS; break;
13041 case 'H': namedclass = ANYOF_NHORIZWS; break;
13042 case 'N': /* Handle \N{NAME} in class */
13044 /* We only pay attention to the first char of
13045 multichar strings being returned. I kinda wonder
13046 if this makes sense as it does change the behaviour
13047 from earlier versions, OTOH that behaviour was broken
13049 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13050 TRUE, /* => charclass */
13053 if (*flagp & RESTART_UTF8)
13054 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13064 /* We will handle any undefined properties ourselves */
13065 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13066 /* And we actually would prefer to get
13067 * the straight inversion list of the
13068 * swash, since we will be accessing it
13069 * anyway, to save a little time */
13070 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13072 if (RExC_parse >= RExC_end)
13073 vFAIL2("Empty \\%c{}", (U8)value);
13074 if (*RExC_parse == '{') {
13075 const U8 c = (U8)value;
13076 e = strchr(RExC_parse++, '}');
13078 vFAIL2("Missing right brace on \\%c{}", c);
13079 while (isSPACE(UCHARAT(RExC_parse)))
13081 if (e == RExC_parse)
13082 vFAIL2("Empty \\%c{}", c);
13083 n = e - RExC_parse;
13084 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13096 if (UCHARAT(RExC_parse) == '^') {
13099 /* toggle. (The rhs xor gets the single bit that
13100 * differs between P and p; the other xor inverts just
13102 value ^= 'P' ^ 'p';
13104 while (isSPACE(UCHARAT(RExC_parse))) {
13109 /* Try to get the definition of the property into
13110 * <invlist>. If /i is in effect, the effective property
13111 * will have its name be <__NAME_i>. The design is
13112 * discussed in commit
13113 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13114 formatted = Perl_form(aTHX_
13116 (FOLD) ? "__" : "",
13121 name = savepvn(formatted, strlen(formatted));
13123 /* Look up the property name, and get its swash and
13124 * inversion list, if the property is found */
13126 SvREFCNT_dec_NN(swash);
13128 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13131 NULL, /* No inversion list */
13134 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13136 SvREFCNT_dec_NN(swash);
13140 /* Here didn't find it. It could be a user-defined
13141 * property that will be available at run-time. If we
13142 * accept only compile-time properties, is an error;
13143 * otherwise add it to the list for run-time look up */
13145 RExC_parse = e + 1;
13147 "Property '%"UTF8f"' is unknown",
13148 UTF8fARG(UTF, n, name));
13150 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13151 (value == 'p' ? '+' : '!'),
13152 UTF8fARG(UTF, n, name));
13153 has_user_defined_property = TRUE;
13155 /* We don't know yet, so have to assume that the
13156 * property could match something in the Latin1 range,
13157 * hence something that isn't utf8. Note that this
13158 * would cause things in <depends_list> to match
13159 * inappropriately, except that any \p{}, including
13160 * this one forces Unicode semantics, which means there
13161 * is no <depends_list> */
13162 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13166 /* Here, did get the swash and its inversion list. If
13167 * the swash is from a user-defined property, then this
13168 * whole character class should be regarded as such */
13169 if (swash_init_flags
13170 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13172 has_user_defined_property = TRUE;
13175 /* We warn on matching an above-Unicode code point
13176 * if the match would return true, except don't
13177 * warn for \p{All}, which has exactly one element
13179 (_invlist_contains_cp(invlist, 0x110000)
13180 && (! (_invlist_len(invlist) == 1
13181 && *invlist_array(invlist) == 0)))
13187 /* Invert if asking for the complement */
13188 if (value == 'P') {
13189 _invlist_union_complement_2nd(properties,
13193 /* The swash can't be used as-is, because we've
13194 * inverted things; delay removing it to here after
13195 * have copied its invlist above */
13196 SvREFCNT_dec_NN(swash);
13200 _invlist_union(properties, invlist, &properties);
13205 RExC_parse = e + 1;
13206 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13209 /* \p means they want Unicode semantics */
13210 RExC_uni_semantics = 1;
13213 case 'n': value = '\n'; break;
13214 case 'r': value = '\r'; break;
13215 case 't': value = '\t'; break;
13216 case 'f': value = '\f'; break;
13217 case 'b': value = '\b'; break;
13218 case 'e': value = ASCII_TO_NATIVE('\033');break;
13219 case 'a': value = '\a'; break;
13221 RExC_parse--; /* function expects to be pointed at the 'o' */
13223 const char* error_msg;
13224 bool valid = grok_bslash_o(&RExC_parse,
13227 SIZE_ONLY, /* warnings in pass
13230 silence_non_portable,
13236 if (PL_encoding && value < 0x100) {
13237 goto recode_encoding;
13241 RExC_parse--; /* function expects to be pointed at the 'x' */
13243 const char* error_msg;
13244 bool valid = grok_bslash_x(&RExC_parse,
13247 TRUE, /* Output warnings */
13249 silence_non_portable,
13255 if (PL_encoding && value < 0x100)
13256 goto recode_encoding;
13259 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13261 case '0': case '1': case '2': case '3': case '4':
13262 case '5': case '6': case '7':
13264 /* Take 1-3 octal digits */
13265 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13266 numlen = (strict) ? 4 : 3;
13267 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13268 RExC_parse += numlen;
13271 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13272 vFAIL("Need exactly 3 octal digits");
13274 else if (! SIZE_ONLY /* like \08, \178 */
13276 && RExC_parse < RExC_end
13277 && isDIGIT(*RExC_parse)
13278 && ckWARN(WARN_REGEXP))
13280 SAVEFREESV(RExC_rx_sv);
13281 reg_warn_non_literal_string(
13283 form_short_octal_warning(RExC_parse, numlen));
13284 (void)ReREFCNT_inc(RExC_rx_sv);
13287 if (PL_encoding && value < 0x100)
13288 goto recode_encoding;
13292 if (! RExC_override_recoding) {
13293 SV* enc = PL_encoding;
13294 value = reg_recode((const char)(U8)value, &enc);
13297 vFAIL("Invalid escape in the specified encoding");
13299 else if (SIZE_ONLY) {
13300 ckWARNreg(RExC_parse,
13301 "Invalid escape in the specified encoding");
13307 /* Allow \_ to not give an error */
13308 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13310 vFAIL2("Unrecognized escape \\%c in character class",
13314 SAVEFREESV(RExC_rx_sv);
13315 ckWARN2reg(RExC_parse,
13316 "Unrecognized escape \\%c in character class passed through",
13318 (void)ReREFCNT_inc(RExC_rx_sv);
13322 } /* End of switch on char following backslash */
13323 } /* end of handling backslash escape sequences */
13326 literal_endpoint++;
13329 /* Here, we have the current token in 'value' */
13331 /* What matches in a locale is not known until runtime. This includes
13332 * what the Posix classes (like \w, [:space:]) match. Room must be
13333 * reserved (one time per outer bracketed class) to store such classes,
13334 * either if Perl is compiled so that locale nodes always should have
13335 * this space, or if there is such posix class info to be stored. The
13336 * space will contain a bit for each named class that is to be matched
13337 * against. This isn't needed for \p{} and pseudo-classes, as they are
13338 * not affected by locale, and hence are dealt with separately */
13341 && (ANYOF_LOCALE == ANYOF_POSIXL
13342 || (namedclass > OOB_NAMEDCLASS
13343 && namedclass < ANYOF_POSIXL_MAX)))
13347 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13350 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13352 ANYOF_POSIXL_ZERO(ret);
13353 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13356 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13359 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13360 * literal, as is the character that began the false range, i.e.
13361 * the 'a' in the examples */
13364 const int w = (RExC_parse >= rangebegin)
13365 ? RExC_parse - rangebegin
13369 "False [] range \"%"UTF8f"\"",
13370 UTF8fARG(UTF, w, rangebegin));
13373 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13374 ckWARN2reg(RExC_parse,
13375 "False [] range \"%"UTF8f"\"",
13376 UTF8fARG(UTF, w, rangebegin));
13377 (void)ReREFCNT_inc(RExC_rx_sv);
13378 cp_list = add_cp_to_invlist(cp_list, '-');
13379 cp_list = add_cp_to_invlist(cp_list, prevvalue);
13383 range = 0; /* this was not a true range */
13384 element_count += 2; /* So counts for three values */
13387 classnum = namedclass_to_classnum(namedclass);
13389 if (LOC && namedclass < ANYOF_POSIXL_MAX
13390 #ifndef HAS_ISASCII
13391 && classnum != _CC_ASCII
13393 #ifndef HAS_ISBLANK
13394 && classnum != _CC_BLANK
13397 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13398 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13402 posixl_matches_all = TRUE;
13405 ANYOF_POSIXL_SET(ret, namedclass);
13407 /* XXX After have made all the posix classes known at compile time
13408 * we can move the LOC handling below to above */
13411 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13412 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13414 /* Here, should be \h, \H, \v, or \V. Neither /d nor
13415 * /l make a difference in what these match. There
13416 * would be problems if these characters had folds
13417 * other than themselves, as cp_list is subject to
13419 if (classnum != _CC_VERTSPACE) {
13420 assert( namedclass == ANYOF_HORIZWS
13421 || namedclass == ANYOF_NHORIZWS);
13423 /* It turns out that \h is just a synonym for
13425 classnum = _CC_BLANK;
13428 _invlist_union_maybe_complement_2nd(
13430 PL_XPosix_ptrs[classnum],
13431 cBOOL(namedclass % 2), /* Complement if odd
13432 (NHORIZWS, NVERTWS)
13437 else if (classnum == _CC_ASCII) {
13440 ANYOF_POSIXL_SET(ret, namedclass);
13443 #endif /* Not isascii(); just use the hard-coded definition for it */
13445 _invlist_union_maybe_complement_2nd(
13447 PL_Posix_ptrs[_CC_ASCII],
13448 cBOOL(namedclass % 2), /* Complement if odd
13452 /* The code points 128-255 added above will be
13453 * subtracted out below under /d, so the flag needs to
13455 if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) {
13456 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13460 else { /* Garden variety class */
13462 /* The ascii range inversion list */
13463 SV* ascii_source = PL_Posix_ptrs[classnum];
13465 /* The full Latin1 range inversion list */
13466 SV* l1_source = PL_L1Posix_ptrs[classnum];
13468 /* This code is structured into two major clauses. The
13469 * first is for classes whose complete definitions may not
13470 * already be known. If not, the Latin1 definition
13471 * (guaranteed to already known) is used plus code is
13472 * generated to load the rest at run-time (only if needed).
13473 * If the complete definition is known, it drops down to
13474 * the second clause, where the complete definition is
13477 if (classnum < _FIRST_NON_SWASH_CC) {
13479 /* Here, the class has a swash, which may or not
13480 * already be loaded */
13482 /* The name of the property to use to match the full
13483 * eXtended Unicode range swash for this character
13485 const char *Xname = swash_property_names[classnum];
13487 /* If returning the inversion list, we can't defer
13488 * getting this until runtime */
13489 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
13490 PL_utf8_swash_ptrs[classnum] =
13491 _core_swash_init("utf8", Xname, &PL_sv_undef,
13494 NULL, /* No inversion list */
13495 NULL /* No flags */
13497 assert(PL_utf8_swash_ptrs[classnum]);
13499 if ( ! PL_utf8_swash_ptrs[classnum]) {
13500 if (namedclass % 2 == 0) { /* A non-complemented
13502 /* If not /a matching, there are code points we
13503 * don't know at compile time. Arrange for the
13504 * unknown matches to be loaded at run-time, if
13506 if (! AT_LEAST_ASCII_RESTRICTED) {
13507 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13510 if (LOC) { /* Under locale, set run-time
13512 ANYOF_POSIXL_SET(ret, namedclass);
13515 /* Add the current class's code points to
13516 * the running total */
13517 _invlist_union(posixes,
13518 (AT_LEAST_ASCII_RESTRICTED)
13524 else { /* A complemented class */
13525 if (AT_LEAST_ASCII_RESTRICTED) {
13526 /* Under /a should match everything above
13527 * ASCII, plus the complement of the set's
13529 _invlist_union_complement_2nd(posixes,
13534 /* Arrange for the unknown matches to be
13535 * loaded at run-time, if needed */
13536 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13538 runtime_posix_matches_above_Unicode = TRUE;
13540 ANYOF_POSIXL_SET(ret, namedclass);
13544 /* We want to match everything in
13545 * Latin1, except those things that
13546 * l1_source matches */
13547 SV* scratch_list = NULL;
13548 _invlist_subtract(PL_Latin1, l1_source,
13551 /* Add the list from this class to the
13554 posixes = scratch_list;
13557 _invlist_union(posixes,
13560 SvREFCNT_dec_NN(scratch_list);
13562 if (DEPENDS_SEMANTICS) {
13564 |= ANYOF_NON_UTF8_LATIN1_ALL;
13569 goto namedclass_done;
13572 /* Here, there is a swash loaded for the class. If no
13573 * inversion list for it yet, get it */
13574 if (! PL_XPosix_ptrs[classnum]) {
13575 PL_XPosix_ptrs[classnum]
13576 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13580 /* Here there is an inversion list already loaded for the
13583 if (namedclass % 2 == 0) { /* A non-complemented class,
13584 like ANYOF_PUNCT */
13586 /* For non-locale, just add it to any existing list
13588 _invlist_union(posixes,
13589 (AT_LEAST_ASCII_RESTRICTED)
13591 : PL_XPosix_ptrs[classnum],
13594 else { /* Locale */
13595 SV* scratch_list = NULL;
13597 /* For above Latin1 code points, we use the full
13599 _invlist_intersection(PL_AboveLatin1,
13600 PL_XPosix_ptrs[classnum],
13602 /* And set the output to it, adding instead if
13603 * there already is an output. Checking if
13604 * 'posixes' is NULL first saves an extra clone.
13605 * Its reference count will be decremented at the
13606 * next union, etc, or if this is the only
13607 * instance, at the end of the routine */
13609 posixes = scratch_list;
13612 _invlist_union(posixes, scratch_list, &posixes);
13613 SvREFCNT_dec_NN(scratch_list);
13616 #ifndef HAS_ISBLANK
13617 if (namedclass != ANYOF_BLANK) {
13619 /* Set this class in the node for runtime
13621 ANYOF_POSIXL_SET(ret, namedclass);
13622 #ifndef HAS_ISBLANK
13625 /* No isblank(), use the hard-coded ASCII-range
13626 * blanks, adding them to the running total. */
13628 _invlist_union(posixes, ascii_source, &posixes);
13633 else { /* A complemented class, like ANYOF_NPUNCT */
13635 _invlist_union_complement_2nd(
13637 (AT_LEAST_ASCII_RESTRICTED)
13639 : PL_XPosix_ptrs[classnum],
13641 /* Under /d, everything in the upper half of the
13642 * Latin1 range matches this complement */
13643 if (DEPENDS_SEMANTICS) {
13644 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13647 else { /* Locale */
13648 SV* scratch_list = NULL;
13649 _invlist_subtract(PL_AboveLatin1,
13650 PL_XPosix_ptrs[classnum],
13653 posixes = scratch_list;
13656 _invlist_union(posixes, scratch_list, &posixes);
13657 SvREFCNT_dec_NN(scratch_list);
13659 #ifndef HAS_ISBLANK
13660 if (namedclass != ANYOF_NBLANK) {
13662 ANYOF_POSIXL_SET(ret, namedclass);
13663 #ifndef HAS_ISBLANK
13666 /* Get the list of all code points in Latin1
13667 * that are not ASCII blanks, and add them to
13668 * the running total */
13669 _invlist_subtract(PL_Latin1, ascii_source,
13671 _invlist_union(posixes, scratch_list, &posixes);
13672 SvREFCNT_dec_NN(scratch_list);
13679 continue; /* Go get next character */
13681 } /* end of namedclass \blah */
13683 /* Here, we have a single value. If 'range' is set, it is the ending
13684 * of a range--check its validity. Later, we will handle each
13685 * individual code point in the range. If 'range' isn't set, this
13686 * could be the beginning of a range, so check for that by looking
13687 * ahead to see if the next real character to be processed is the range
13688 * indicator--the minus sign */
13691 RExC_parse = regpatws(pRExC_state, RExC_parse,
13692 FALSE /* means don't recognize comments */);
13696 if (prevvalue > value) /* b-a */ {
13697 const int w = RExC_parse - rangebegin;
13699 "Invalid [] range \"%"UTF8f"\"",
13700 UTF8fARG(UTF, w, rangebegin));
13701 range = 0; /* not a valid range */
13705 prevvalue = value; /* save the beginning of the potential range */
13706 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13707 && *RExC_parse == '-')
13709 char* next_char_ptr = RExC_parse + 1;
13710 if (skip_white) { /* Get the next real char after the '-' */
13711 next_char_ptr = regpatws(pRExC_state,
13713 FALSE); /* means don't recognize
13717 /* If the '-' is at the end of the class (just before the ']',
13718 * it is a literal minus; otherwise it is a range */
13719 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13720 RExC_parse = next_char_ptr;
13722 /* a bad range like \w-, [:word:]- ? */
13723 if (namedclass > OOB_NAMEDCLASS) {
13724 if (strict || ckWARN(WARN_REGEXP)) {
13726 RExC_parse >= rangebegin ?
13727 RExC_parse - rangebegin : 0;
13729 vFAIL4("False [] range \"%*.*s\"",
13734 "False [] range \"%*.*s\"",
13739 cp_list = add_cp_to_invlist(cp_list, '-');
13743 range = 1; /* yeah, it's a range! */
13744 continue; /* but do it the next time */
13749 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13752 /* non-Latin1 code point implies unicode semantics. Must be set in
13753 * pass1 so is there for the whole of pass 2 */
13755 RExC_uni_semantics = 1;
13758 /* Ready to process either the single value, or the completed range.
13759 * For single-valued non-inverted ranges, we consider the possibility
13760 * of multi-char folds. (We made a conscious decision to not do this
13761 * for the other cases because it can often lead to non-intuitive
13762 * results. For example, you have the peculiar case that:
13763 * "s s" =~ /^[^\xDF]+$/i => Y
13764 * "ss" =~ /^[^\xDF]+$/i => N
13766 * See [perl #89750] */
13767 if (FOLD && allow_multi_folds && value == prevvalue) {
13768 if (value == LATIN_SMALL_LETTER_SHARP_S
13769 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13772 /* Here <value> is indeed a multi-char fold. Get what it is */
13774 U8 foldbuf[UTF8_MAXBYTES_CASE];
13777 UV folded = _to_uni_fold_flags(
13782 | ((LOC) ? FOLD_FLAGS_LOCALE
13783 : (ASCII_FOLD_RESTRICTED)
13784 ? FOLD_FLAGS_NOMIX_ASCII
13788 /* Here, <folded> should be the first character of the
13789 * multi-char fold of <value>, with <foldbuf> containing the
13790 * whole thing. But, if this fold is not allowed (because of
13791 * the flags), <fold> will be the same as <value>, and should
13792 * be processed like any other character, so skip the special
13794 if (folded != value) {
13796 /* Skip if we are recursed, currently parsing the class
13797 * again. Otherwise add this character to the list of
13798 * multi-char folds. */
13799 if (! RExC_in_multi_char_class) {
13800 AV** this_array_ptr;
13802 STRLEN cp_count = utf8_length(foldbuf,
13803 foldbuf + foldlen);
13804 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13806 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13809 if (! multi_char_matches) {
13810 multi_char_matches = newAV();
13813 /* <multi_char_matches> is actually an array of arrays.
13814 * There will be one or two top-level elements: [2],
13815 * and/or [3]. The [2] element is an array, each
13816 * element thereof is a character which folds to TWO
13817 * characters; [3] is for folds to THREE characters.
13818 * (Unicode guarantees a maximum of 3 characters in any
13819 * fold.) When we rewrite the character class below,
13820 * we will do so such that the longest folds are
13821 * written first, so that it prefers the longest
13822 * matching strings first. This is done even if it
13823 * turns out that any quantifier is non-greedy, out of
13824 * programmer laziness. Tom Christiansen has agreed
13825 * that this is ok. This makes the test for the
13826 * ligature 'ffi' come before the test for 'ff' */
13827 if (av_exists(multi_char_matches, cp_count)) {
13828 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13830 this_array = *this_array_ptr;
13833 this_array = newAV();
13834 av_store(multi_char_matches, cp_count,
13837 av_push(this_array, multi_fold);
13840 /* This element should not be processed further in this
13843 value = save_value;
13844 prevvalue = save_prevvalue;
13850 /* Deal with this element of the class */
13853 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13855 SV* this_range = _new_invlist(1);
13856 _append_range_to_invlist(this_range, prevvalue, value);
13858 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13859 * If this range was specified using something like 'i-j', we want
13860 * to include only the 'i' and the 'j', and not anything in
13861 * between, so exclude non-ASCII, non-alphabetics from it.
13862 * However, if the range was specified with something like
13863 * [\x89-\x91] or [\x89-j], all code points within it should be
13864 * included. literal_endpoint==2 means both ends of the range used
13865 * a literal character, not \x{foo} */
13866 if (literal_endpoint == 2
13867 && ((prevvalue >= 'a' && value <= 'z')
13868 || (prevvalue >= 'A' && value <= 'Z')))
13870 _invlist_intersection(this_range, PL_ASCII,
13872 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13875 _invlist_union(cp_list, this_range, &cp_list);
13876 literal_endpoint = 0;
13880 range = 0; /* this range (if it was one) is done now */
13881 } /* End of loop through all the text within the brackets */
13883 /* If anything in the class expands to more than one character, we have to
13884 * deal with them by building up a substitute parse string, and recursively
13885 * calling reg() on it, instead of proceeding */
13886 if (multi_char_matches) {
13887 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13890 char *save_end = RExC_end;
13891 char *save_parse = RExC_parse;
13892 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13897 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13898 because too confusing */
13900 sv_catpv(substitute_parse, "(?:");
13904 /* Look at the longest folds first */
13905 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13907 if (av_exists(multi_char_matches, cp_count)) {
13908 AV** this_array_ptr;
13911 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13913 while ((this_sequence = av_pop(*this_array_ptr)) !=
13916 if (! first_time) {
13917 sv_catpv(substitute_parse, "|");
13919 first_time = FALSE;
13921 sv_catpv(substitute_parse, SvPVX(this_sequence));
13926 /* If the character class contains anything else besides these
13927 * multi-character folds, have to include it in recursive parsing */
13928 if (element_count) {
13929 sv_catpv(substitute_parse, "|[");
13930 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13931 sv_catpv(substitute_parse, "]");
13934 sv_catpv(substitute_parse, ")");
13937 /* This is a way to get the parse to skip forward a whole named
13938 * sequence instead of matching the 2nd character when it fails the
13940 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13944 RExC_parse = SvPV(substitute_parse, len);
13945 RExC_end = RExC_parse + len;
13946 RExC_in_multi_char_class = 1;
13947 RExC_emit = (regnode *)orig_emit;
13949 ret = reg(pRExC_state, 1, ®_flags, depth+1);
13951 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13953 RExC_parse = save_parse;
13954 RExC_end = save_end;
13955 RExC_in_multi_char_class = 0;
13956 SvREFCNT_dec_NN(multi_char_matches);
13960 /* Here, we've gone through the entire class and dealt with multi-char
13961 * folds. We are now in a position that we can do some checks to see if we
13962 * can optimize this ANYOF node into a simpler one, even in Pass 1.
13963 * Currently we only do two checks:
13964 * 1) is in the unlikely event that the user has specified both, eg. \w and
13965 * \W under /l, then the class matches everything. (This optimization
13966 * is done only to make the optimizer code run later work.)
13967 * 2) if the character class contains only a single element (including a
13968 * single range), we see if there is an equivalent node for it.
13969 * Other checks are possible */
13970 if (! ret_invlist /* Can't optimize if returning the constructed
13972 && (UNLIKELY(posixl_matches_all) || element_count == 1))
13977 if (UNLIKELY(posixl_matches_all)) {
13980 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13981 \w or [:digit:] or \p{foo}
13984 /* All named classes are mapped into POSIXish nodes, with its FLAG
13985 * argument giving which class it is */
13986 switch ((I32)namedclass) {
13987 case ANYOF_UNIPROP:
13990 /* These don't depend on the charset modifiers. They always
13991 * match under /u rules */
13992 case ANYOF_NHORIZWS:
13993 case ANYOF_HORIZWS:
13994 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13997 case ANYOF_NVERTWS:
14002 /* The actual POSIXish node for all the rest depends on the
14003 * charset modifier. The ones in the first set depend only on
14004 * ASCII or, if available on this platform, locale */
14008 op = (LOC) ? POSIXL : POSIXA;
14019 /* under /a could be alpha */
14021 if (ASCII_RESTRICTED) {
14022 namedclass = ANYOF_ALPHA + (namedclass % 2);
14030 /* The rest have more possibilities depending on the charset.
14031 * We take advantage of the enum ordering of the charset
14032 * modifiers to get the exact node type, */
14034 op = POSIXD + get_regex_charset(RExC_flags);
14035 if (op > POSIXA) { /* /aa is same as /a */
14038 #ifndef HAS_ISBLANK
14040 && (namedclass == ANYOF_BLANK
14041 || namedclass == ANYOF_NBLANK))
14048 /* The odd numbered ones are the complements of the
14049 * next-lower even number one */
14050 if (namedclass % 2 == 1) {
14054 arg = namedclass_to_classnum(namedclass);
14058 else if (value == prevvalue) {
14060 /* Here, the class consists of just a single code point */
14063 if (! LOC && value == '\n') {
14064 op = REG_ANY; /* Optimize [^\n] */
14065 *flagp |= HASWIDTH|SIMPLE;
14069 else if (value < 256 || UTF) {
14071 /* Optimize a single value into an EXACTish node, but not if it
14072 * would require converting the pattern to UTF-8. */
14073 op = compute_EXACTish(pRExC_state);
14075 } /* Otherwise is a range */
14076 else if (! LOC) { /* locale could vary these */
14077 if (prevvalue == '0') {
14078 if (value == '9') {
14085 /* Here, we have changed <op> away from its initial value iff we found
14086 * an optimization */
14089 /* Throw away this ANYOF regnode, and emit the calculated one,
14090 * which should correspond to the beginning, not current, state of
14092 const char * cur_parse = RExC_parse;
14093 RExC_parse = (char *)orig_parse;
14097 /* To get locale nodes to not use the full ANYOF size would
14098 * require moving the code above that writes the portions
14099 * of it that aren't in other nodes to after this point.
14100 * e.g. ANYOF_POSIXL_SET */
14101 RExC_size = orig_size;
14105 RExC_emit = (regnode *)orig_emit;
14106 if (PL_regkind[op] == POSIXD) {
14108 op += NPOSIXD - POSIXD;
14113 ret = reg_node(pRExC_state, op);
14115 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14119 *flagp |= HASWIDTH|SIMPLE;
14121 else if (PL_regkind[op] == EXACT) {
14122 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14125 RExC_parse = (char *) cur_parse;
14127 SvREFCNT_dec(posixes);
14128 SvREFCNT_dec(cp_list);
14135 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14137 /* If folding, we calculate all characters that could fold to or from the
14138 * ones already on the list */
14139 if (FOLD && cp_list) {
14140 UV start, end; /* End points of code point ranges */
14142 SV* fold_intersection = NULL;
14144 /* If the highest code point is within Latin1, we can use the
14145 * compiled-in Alphas list, and not have to go out to disk. This
14146 * yields two false positives, the masculine and feminine ordinal
14147 * indicators, which are weeded out below using the
14148 * IS_IN_SOME_FOLD_L1() macro */
14149 if (invlist_highest(cp_list) < 256) {
14150 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
14151 &fold_intersection);
14155 /* Here, there are non-Latin1 code points, so we will have to go
14156 * fetch the list of all the characters that participate in folds
14158 if (! PL_utf8_foldable) {
14159 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14160 &PL_sv_undef, 1, 0);
14161 PL_utf8_foldable = _get_swash_invlist(swash);
14162 SvREFCNT_dec_NN(swash);
14165 /* This is a hash that for a particular fold gives all characters
14166 * that are involved in it */
14167 if (! PL_utf8_foldclosures) {
14169 /* If we were unable to find any folds, then we likely won't be
14170 * able to find the closures. So just create an empty list.
14171 * Folding will effectively be restricted to the non-Unicode
14172 * rules hard-coded into Perl. (This case happens legitimately
14173 * during compilation of Perl itself before the Unicode tables
14174 * are generated) */
14175 if (_invlist_len(PL_utf8_foldable) == 0) {
14176 PL_utf8_foldclosures = newHV();
14179 /* If the folds haven't been read in, call a fold function
14181 if (! PL_utf8_tofold) {
14182 U8 dummy[UTF8_MAXBYTES_CASE+1];
14184 /* This string is just a short named one above \xff */
14185 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14186 assert(PL_utf8_tofold); /* Verify that worked */
14188 PL_utf8_foldclosures =
14189 _swash_inversion_hash(PL_utf8_tofold);
14193 /* Only the characters in this class that participate in folds need
14194 * be checked. Get the intersection of this class and all the
14195 * possible characters that are foldable. This can quickly narrow
14196 * down a large class */
14197 _invlist_intersection(PL_utf8_foldable, cp_list,
14198 &fold_intersection);
14201 /* Now look at the foldable characters in this class individually */
14202 invlist_iterinit(fold_intersection);
14203 while (invlist_iternext(fold_intersection, &start, &end)) {
14206 /* Locale folding for Latin1 characters is deferred until runtime */
14207 if (LOC && start < 256) {
14211 /* Look at every character in the range */
14212 for (j = start; j <= end; j++) {
14214 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14220 /* We have the latin1 folding rules hard-coded here so that
14221 * an innocent-looking character class, like /[ks]/i won't
14222 * have to go out to disk to find the possible matches.
14223 * XXX It would be better to generate these via regen, in
14224 * case a new version of the Unicode standard adds new
14225 * mappings, though that is not really likely, and may be
14226 * caught by the default: case of the switch below. */
14228 if (IS_IN_SOME_FOLD_L1(j)) {
14230 /* ASCII is always matched; non-ASCII is matched only
14231 * under Unicode rules */
14232 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14234 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14238 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14242 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14243 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14245 /* Certain Latin1 characters have matches outside
14246 * Latin1. To get here, <j> is one of those
14247 * characters. None of these matches is valid for
14248 * ASCII characters under /aa, which is why the 'if'
14249 * just above excludes those. These matches only
14250 * happen when the target string is utf8. The code
14251 * below adds the single fold closures for <j> to the
14252 * inversion list. */
14257 add_cp_to_invlist(cp_list, KELVIN_SIGN);
14261 cp_list = add_cp_to_invlist(cp_list,
14262 LATIN_SMALL_LETTER_LONG_S);
14265 cp_list = add_cp_to_invlist(cp_list,
14266 GREEK_CAPITAL_LETTER_MU);
14267 cp_list = add_cp_to_invlist(cp_list,
14268 GREEK_SMALL_LETTER_MU);
14270 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14271 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14273 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14275 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14276 cp_list = add_cp_to_invlist(cp_list,
14277 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14279 case LATIN_SMALL_LETTER_SHARP_S:
14280 cp_list = add_cp_to_invlist(cp_list,
14281 LATIN_CAPITAL_LETTER_SHARP_S);
14283 case 'F': case 'f':
14284 case 'I': case 'i':
14285 case 'L': case 'l':
14286 case 'T': case 't':
14287 case 'A': case 'a':
14288 case 'H': case 'h':
14289 case 'J': case 'j':
14290 case 'N': case 'n':
14291 case 'W': case 'w':
14292 case 'Y': case 'y':
14293 /* These all are targets of multi-character
14294 * folds from code points that require UTF8 to
14295 * express, so they can't match unless the
14296 * target string is in UTF-8, so no action here
14297 * is necessary, as regexec.c properly handles
14298 * the general case for UTF-8 matching and
14299 * multi-char folds */
14302 /* Use deprecated warning to increase the
14303 * chances of this being output */
14304 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14311 /* Here is an above Latin1 character. We don't have the rules
14312 * hard-coded for it. First, get its fold. This is the simple
14313 * fold, as the multi-character folds have been handled earlier
14314 * and separated out */
14315 _to_uni_fold_flags(j, foldbuf, &foldlen,
14317 ? FOLD_FLAGS_LOCALE
14318 : (ASCII_FOLD_RESTRICTED)
14319 ? FOLD_FLAGS_NOMIX_ASCII
14322 /* Single character fold of above Latin1. Add everything in
14323 * its fold closure to the list that this node should match.
14324 * The fold closures data structure is a hash with the keys
14325 * being the UTF-8 of every character that is folded to, like
14326 * 'k', and the values each an array of all code points that
14327 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14328 * Multi-character folds are not included */
14329 if ((listp = hv_fetch(PL_utf8_foldclosures,
14330 (char *) foldbuf, foldlen, FALSE)))
14332 AV* list = (AV*) *listp;
14334 for (k = 0; k <= av_len(list); k++) {
14335 SV** c_p = av_fetch(list, k, FALSE);
14338 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14342 /* /aa doesn't allow folds between ASCII and non-; /l
14343 * doesn't allow them between above and below 256 */
14344 if ((ASCII_FOLD_RESTRICTED
14345 && (isASCII(c) != isASCII(j)))
14346 || (LOC && c < 256)) {
14350 /* Folds involving non-ascii Latin1 characters
14351 * under /d are added to a separate list */
14352 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14354 cp_list = add_cp_to_invlist(cp_list, c);
14357 depends_list = add_cp_to_invlist(depends_list, c);
14363 SvREFCNT_dec_NN(fold_intersection);
14366 /* And combine the result (if any) with any inversion list from posix
14367 * classes. The lists are kept separate up to now because we don't want to
14368 * fold the classes (folding of those is automatically handled by the swash
14369 * fetching code) */
14371 if (! DEPENDS_SEMANTICS) {
14373 _invlist_union(cp_list, posixes, &cp_list);
14374 SvREFCNT_dec_NN(posixes);
14381 /* Under /d, we put into a separate list the Latin1 things that
14382 * match only when the target string is utf8 */
14383 SV* nonascii_but_latin1_properties = NULL;
14384 _invlist_intersection(posixes, PL_UpperLatin1,
14385 &nonascii_but_latin1_properties);
14386 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14389 _invlist_union(cp_list, posixes, &cp_list);
14390 SvREFCNT_dec_NN(posixes);
14396 if (depends_list) {
14397 _invlist_union(depends_list, nonascii_but_latin1_properties,
14399 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14402 depends_list = nonascii_but_latin1_properties;
14407 /* And combine the result (if any) with any inversion list from properties.
14408 * The lists are kept separate up to now so that we can distinguish the two
14409 * in regards to matching above-Unicode. A run-time warning is generated
14410 * if a Unicode property is matched against a non-Unicode code point. But,
14411 * we allow user-defined properties to match anything, without any warning,
14412 * and we also suppress the warning if there is a portion of the character
14413 * class that isn't a Unicode property, and which matches above Unicode, \W
14414 * or [\x{110000}] for example.
14415 * (Note that in this case, unlike the Posix one above, there is no
14416 * <depends_list>, because having a Unicode property forces Unicode
14421 /* If it matters to the final outcome, see if a non-property
14422 * component of the class matches above Unicode. If so, the
14423 * warning gets suppressed. This is true even if just a single
14424 * such code point is specified, as though not strictly correct if
14425 * another such code point is matched against, the fact that they
14426 * are using above-Unicode code points indicates they should know
14427 * the issues involved */
14429 bool non_prop_matches_above_Unicode =
14430 runtime_posix_matches_above_Unicode
14431 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14433 non_prop_matches_above_Unicode =
14434 ! non_prop_matches_above_Unicode;
14436 warn_super = ! non_prop_matches_above_Unicode;
14439 _invlist_union(properties, cp_list, &cp_list);
14440 SvREFCNT_dec_NN(properties);
14443 cp_list = properties;
14447 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14451 /* Here, we have calculated what code points should be in the character
14454 * Now we can see about various optimizations. Fold calculation (which we
14455 * did above) needs to take place before inversion. Otherwise /[^k]/i
14456 * would invert to include K, which under /i would match k, which it
14457 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14458 * folded until runtime */
14460 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14461 * at compile time. Besides not inverting folded locale now, we can't
14462 * invert if there are things such as \w, which aren't known until runtime
14465 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14467 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14469 _invlist_invert(cp_list);
14471 /* Any swash can't be used as-is, because we've inverted things */
14473 SvREFCNT_dec_NN(swash);
14477 /* Clear the invert flag since have just done it here */
14482 *ret_invlist = cp_list;
14483 SvREFCNT_dec(swash);
14485 /* Discard the generated node */
14487 RExC_size = orig_size;
14490 RExC_emit = orig_emit;
14495 /* If we didn't do folding, it's because some information isn't available
14496 * until runtime; set the run-time fold flag for these. (We don't have to
14497 * worry about properties folding, as that is taken care of by the swash
14501 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14504 /* Some character classes are equivalent to other nodes. Such nodes take
14505 * up less room and generally fewer operations to execute than ANYOF nodes.
14506 * Above, we checked for and optimized into some such equivalents for
14507 * certain common classes that are easy to test. Getting to this point in
14508 * the code means that the class didn't get optimized there. Since this
14509 * code is only executed in Pass 2, it is too late to save space--it has
14510 * been allocated in Pass 1, and currently isn't given back. But turning
14511 * things into an EXACTish node can allow the optimizer to join it to any
14512 * adjacent such nodes. And if the class is equivalent to things like /./,
14513 * expensive run-time swashes can be avoided. Now that we have more
14514 * complete information, we can find things necessarily missed by the
14515 * earlier code. I (khw) am not sure how much to look for here. It would
14516 * be easy, but perhaps too slow, to check any candidates against all the
14517 * node types they could possibly match using _invlistEQ(). */
14522 && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14523 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14525 /* We don't optimize if we are supposed to make sure all non-Unicode
14526 * code points raise a warning, as only ANYOF nodes have this check.
14528 && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14531 U8 op = END; /* The optimzation node-type */
14532 const char * cur_parse= RExC_parse;
14534 invlist_iterinit(cp_list);
14535 if (! invlist_iternext(cp_list, &start, &end)) {
14537 /* Here, the list is empty. This happens, for example, when a
14538 * Unicode property is the only thing in the character class, and
14539 * it doesn't match anything. (perluniprops.pod notes such
14542 *flagp |= HASWIDTH|SIMPLE;
14544 else if (start == end) { /* The range is a single code point */
14545 if (! invlist_iternext(cp_list, &start, &end)
14547 /* Don't do this optimization if it would require changing
14548 * the pattern to UTF-8 */
14549 && (start < 256 || UTF))
14551 /* Here, the list contains a single code point. Can optimize
14552 * into an EXACT node */
14561 /* A locale node under folding with one code point can be
14562 * an EXACTFL, as its fold won't be calculated until
14568 /* Here, we are generally folding, but there is only one
14569 * code point to match. If we have to, we use an EXACT
14570 * node, but it would be better for joining with adjacent
14571 * nodes in the optimization pass if we used the same
14572 * EXACTFish node that any such are likely to be. We can
14573 * do this iff the code point doesn't participate in any
14574 * folds. For example, an EXACTF of a colon is the same as
14575 * an EXACT one, since nothing folds to or from a colon. */
14577 if (IS_IN_SOME_FOLD_L1(value)) {
14582 if (! PL_utf8_foldable) {
14583 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14584 &PL_sv_undef, 1, 0);
14585 PL_utf8_foldable = _get_swash_invlist(swash);
14586 SvREFCNT_dec_NN(swash);
14588 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14593 /* If we haven't found the node type, above, it means we
14594 * can use the prevailing one */
14596 op = compute_EXACTish(pRExC_state);
14601 else if (start == 0) {
14602 if (end == UV_MAX) {
14604 *flagp |= HASWIDTH|SIMPLE;
14607 else if (end == '\n' - 1
14608 && invlist_iternext(cp_list, &start, &end)
14609 && start == '\n' + 1 && end == UV_MAX)
14612 *flagp |= HASWIDTH|SIMPLE;
14616 invlist_iterfinish(cp_list);
14619 RExC_parse = (char *)orig_parse;
14620 RExC_emit = (regnode *)orig_emit;
14622 ret = reg_node(pRExC_state, op);
14624 RExC_parse = (char *)cur_parse;
14626 if (PL_regkind[op] == EXACT) {
14627 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14630 SvREFCNT_dec_NN(cp_list);
14635 /* Here, <cp_list> contains all the code points we can determine at
14636 * compile time that match under all conditions. Go through it, and
14637 * for things that belong in the bitmap, put them there, and delete from
14638 * <cp_list>. While we are at it, see if everything above 255 is in the
14639 * list, and if so, set a flag to speed up execution */
14641 populate_ANYOF_from_invlist(ret, &cp_list);
14644 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14647 /* Here, the bitmap has been populated with all the Latin1 code points that
14648 * always match. Can now add to the overall list those that match only
14649 * when the target string is UTF-8 (<depends_list>). */
14650 if (depends_list) {
14652 _invlist_union(cp_list, depends_list, &cp_list);
14653 SvREFCNT_dec_NN(depends_list);
14656 cp_list = depends_list;
14660 /* If there is a swash and more than one element, we can't use the swash in
14661 * the optimization below. */
14662 if (swash && element_count > 1) {
14663 SvREFCNT_dec_NN(swash);
14667 set_ANYOF_arg(pRExC_state, ret, cp_list,
14668 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14670 swash, has_user_defined_property);
14672 *flagp |= HASWIDTH|SIMPLE;
14676 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14679 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14680 regnode* const node,
14682 SV* const runtime_defns,
14684 const bool has_user_defined_property)
14686 /* Sets the arg field of an ANYOF-type node 'node', using information about
14687 * the node passed-in. If there is nothing outside the node's bitmap, the
14688 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14689 * the count returned by add_data(), having allocated and stored an array,
14690 * av, that that count references, as follows:
14691 * av[0] stores the character class description in its textual form.
14692 * This is used later (regexec.c:Perl_regclass_swash()) to
14693 * initialize the appropriate swash, and is also useful for dumping
14694 * the regnode. This is set to &PL_sv_undef if the textual
14695 * description is not needed at run-time (as happens if the other
14696 * elements completely define the class)
14697 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14698 * computed from av[0]. But if no further computation need be done,
14699 * the swash is stored here now (and av[0] is &PL_sv_undef).
14700 * av[2] stores the cp_list inversion list for use in addition or instead
14701 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14702 * (Otherwise everything needed is already in av[0] and av[1])
14703 * av[3] is set if any component of the class is from a user-defined
14704 * property; used only if av[2] exists */
14708 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14710 if (! cp_list && ! runtime_defns) {
14711 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14714 AV * const av = newAV();
14717 av_store(av, 0, (runtime_defns)
14718 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14720 av_store(av, 1, swash);
14721 SvREFCNT_dec_NN(cp_list);
14724 av_store(av, 1, &PL_sv_undef);
14726 av_store(av, 2, cp_list);
14727 av_store(av, 3, newSVuv(has_user_defined_property));
14731 rv = newRV_noinc(MUTABLE_SV(av));
14732 n = add_data(pRExC_state, STR_WITH_LEN("s"));
14733 RExC_rxi->data->data[n] = (void*)rv;
14739 /* reg_skipcomment()
14741 Absorbs an /x style # comments from the input stream.
14742 Returns true if there is more text remaining in the stream.
14743 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14744 terminates the pattern without including a newline.
14746 Note its the callers responsibility to ensure that we are
14747 actually in /x mode
14752 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14756 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14758 while (RExC_parse < RExC_end)
14759 if (*RExC_parse++ == '\n') {
14764 /* we ran off the end of the pattern without ending
14765 the comment, so we have to add an \n when wrapping */
14766 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14774 Advances the parse position, and optionally absorbs
14775 "whitespace" from the inputstream.
14777 Without /x "whitespace" means (?#...) style comments only,
14778 with /x this means (?#...) and # comments and whitespace proper.
14780 Returns the RExC_parse point from BEFORE the scan occurs.
14782 This is the /x friendly way of saying RExC_parse++.
14786 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14788 char* const retval = RExC_parse++;
14790 PERL_ARGS_ASSERT_NEXTCHAR;
14793 if (RExC_end - RExC_parse >= 3
14794 && *RExC_parse == '('
14795 && RExC_parse[1] == '?'
14796 && RExC_parse[2] == '#')
14798 while (*RExC_parse != ')') {
14799 if (RExC_parse == RExC_end)
14800 FAIL("Sequence (?#... not terminated");
14806 if (RExC_flags & RXf_PMf_EXTENDED) {
14807 if (isSPACE(*RExC_parse)) {
14811 else if (*RExC_parse == '#') {
14812 if ( reg_skipcomment( pRExC_state ) )
14821 - reg_node - emit a node
14823 STATIC regnode * /* Location. */
14824 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14828 regnode * const ret = RExC_emit;
14829 GET_RE_DEBUG_FLAGS_DECL;
14831 PERL_ARGS_ASSERT_REG_NODE;
14834 SIZE_ALIGN(RExC_size);
14838 if (RExC_emit >= RExC_emit_bound)
14839 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14840 op, RExC_emit, RExC_emit_bound);
14842 NODE_ALIGN_FILL(ret);
14844 FILL_ADVANCE_NODE(ptr, op);
14845 #ifdef RE_TRACK_PATTERN_OFFSETS
14846 if (RExC_offsets) { /* MJD */
14847 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14848 "reg_node", __LINE__,
14850 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14851 ? "Overwriting end of array!\n" : "OK",
14852 (UV)(RExC_emit - RExC_emit_start),
14853 (UV)(RExC_parse - RExC_start),
14854 (UV)RExC_offsets[0]));
14855 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14863 - reganode - emit a node with an argument
14865 STATIC regnode * /* Location. */
14866 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14870 regnode * const ret = RExC_emit;
14871 GET_RE_DEBUG_FLAGS_DECL;
14873 PERL_ARGS_ASSERT_REGANODE;
14876 SIZE_ALIGN(RExC_size);
14881 assert(2==regarglen[op]+1);
14883 Anything larger than this has to allocate the extra amount.
14884 If we changed this to be:
14886 RExC_size += (1 + regarglen[op]);
14888 then it wouldn't matter. Its not clear what side effect
14889 might come from that so its not done so far.
14894 if (RExC_emit >= RExC_emit_bound)
14895 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14896 op, RExC_emit, RExC_emit_bound);
14898 NODE_ALIGN_FILL(ret);
14900 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14901 #ifdef RE_TRACK_PATTERN_OFFSETS
14902 if (RExC_offsets) { /* MJD */
14903 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14907 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14908 "Overwriting end of array!\n" : "OK",
14909 (UV)(RExC_emit - RExC_emit_start),
14910 (UV)(RExC_parse - RExC_start),
14911 (UV)RExC_offsets[0]));
14912 Set_Cur_Node_Offset;
14920 - reguni - emit (if appropriate) a Unicode character
14923 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14927 PERL_ARGS_ASSERT_REGUNI;
14929 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14933 - reginsert - insert an operator in front of already-emitted operand
14935 * Means relocating the operand.
14938 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14944 const int offset = regarglen[(U8)op];
14945 const int size = NODE_STEP_REGNODE + offset;
14946 GET_RE_DEBUG_FLAGS_DECL;
14948 PERL_ARGS_ASSERT_REGINSERT;
14949 PERL_UNUSED_ARG(depth);
14950 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14951 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14960 if (RExC_open_parens) {
14962 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14963 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14964 if ( RExC_open_parens[paren] >= opnd ) {
14965 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14966 RExC_open_parens[paren] += size;
14968 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14970 if ( RExC_close_parens[paren] >= opnd ) {
14971 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14972 RExC_close_parens[paren] += size;
14974 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14979 while (src > opnd) {
14980 StructCopy(--src, --dst, regnode);
14981 #ifdef RE_TRACK_PATTERN_OFFSETS
14982 if (RExC_offsets) { /* MJD 20010112 */
14983 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14987 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14988 ? "Overwriting end of array!\n" : "OK",
14989 (UV)(src - RExC_emit_start),
14990 (UV)(dst - RExC_emit_start),
14991 (UV)RExC_offsets[0]));
14992 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14993 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14999 place = opnd; /* Op node, where operand used to be. */
15000 #ifdef RE_TRACK_PATTERN_OFFSETS
15001 if (RExC_offsets) { /* MJD */
15002 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15006 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15007 ? "Overwriting end of array!\n" : "OK",
15008 (UV)(place - RExC_emit_start),
15009 (UV)(RExC_parse - RExC_start),
15010 (UV)RExC_offsets[0]));
15011 Set_Node_Offset(place, RExC_parse);
15012 Set_Node_Length(place, 1);
15015 src = NEXTOPER(place);
15016 FILL_ADVANCE_NODE(place, op);
15017 Zero(src, offset, regnode);
15021 - regtail - set the next-pointer at the end of a node chain of p to val.
15022 - SEE ALSO: regtail_study
15024 /* TODO: All three parms should be const */
15026 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15030 GET_RE_DEBUG_FLAGS_DECL;
15032 PERL_ARGS_ASSERT_REGTAIL;
15034 PERL_UNUSED_ARG(depth);
15040 /* Find last node. */
15043 regnode * const temp = regnext(scan);
15045 SV * const mysv=sv_newmortal();
15046 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15047 regprop(RExC_rx, mysv, scan);
15048 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15049 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15050 (temp == NULL ? "->" : ""),
15051 (temp == NULL ? PL_reg_name[OP(val)] : "")
15059 if (reg_off_by_arg[OP(scan)]) {
15060 ARG_SET(scan, val - scan);
15063 NEXT_OFF(scan) = val - scan;
15069 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15070 - Look for optimizable sequences at the same time.
15071 - currently only looks for EXACT chains.
15073 This is experimental code. The idea is to use this routine to perform
15074 in place optimizations on branches and groups as they are constructed,
15075 with the long term intention of removing optimization from study_chunk so
15076 that it is purely analytical.
15078 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15079 to control which is which.
15082 /* TODO: All four parms should be const */
15085 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15090 #ifdef EXPERIMENTAL_INPLACESCAN
15093 GET_RE_DEBUG_FLAGS_DECL;
15095 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15101 /* Find last node. */
15105 regnode * const temp = regnext(scan);
15106 #ifdef EXPERIMENTAL_INPLACESCAN
15107 if (PL_regkind[OP(scan)] == EXACT) {
15108 bool has_exactf_sharp_s; /* Unexamined in this routine */
15109 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
15114 switch (OP(scan)) {
15117 case EXACTFA_NO_TRIE:
15122 if( exact == PSEUDO )
15124 else if ( exact != OP(scan) )
15133 SV * const mysv=sv_newmortal();
15134 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15135 regprop(RExC_rx, mysv, scan);
15136 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15137 SvPV_nolen_const(mysv),
15138 REG_NODE_NUM(scan),
15139 PL_reg_name[exact]);
15146 SV * const mysv_val=sv_newmortal();
15147 DEBUG_PARSE_MSG("");
15148 regprop(RExC_rx, mysv_val, val);
15149 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15150 SvPV_nolen_const(mysv_val),
15151 (IV)REG_NODE_NUM(val),
15155 if (reg_off_by_arg[OP(scan)]) {
15156 ARG_SET(scan, val - scan);
15159 NEXT_OFF(scan) = val - scan;
15167 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15172 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15177 for (bit=0; bit<32; bit++) {
15178 if (flags & (1<<bit)) {
15179 if (!set++ && lead)
15180 PerlIO_printf(Perl_debug_log, "%s",lead);
15181 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15186 PerlIO_printf(Perl_debug_log, "\n");
15188 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15193 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15199 for (bit=0; bit<32; bit++) {
15200 if (flags & (1<<bit)) {
15201 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15204 if (!set++ && lead)
15205 PerlIO_printf(Perl_debug_log, "%s",lead);
15206 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15209 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15210 if (!set++ && lead) {
15211 PerlIO_printf(Perl_debug_log, "%s",lead);
15214 case REGEX_UNICODE_CHARSET:
15215 PerlIO_printf(Perl_debug_log, "UNICODE");
15217 case REGEX_LOCALE_CHARSET:
15218 PerlIO_printf(Perl_debug_log, "LOCALE");
15220 case REGEX_ASCII_RESTRICTED_CHARSET:
15221 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15223 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15224 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15227 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15233 PerlIO_printf(Perl_debug_log, "\n");
15235 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15241 Perl_regdump(pTHX_ const regexp *r)
15245 SV * const sv = sv_newmortal();
15246 SV *dsv= sv_newmortal();
15247 RXi_GET_DECL(r,ri);
15248 GET_RE_DEBUG_FLAGS_DECL;
15250 PERL_ARGS_ASSERT_REGDUMP;
15252 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15254 /* Header fields of interest. */
15255 if (r->anchored_substr) {
15256 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15257 RE_SV_DUMPLEN(r->anchored_substr), 30);
15258 PerlIO_printf(Perl_debug_log,
15259 "anchored %s%s at %"IVdf" ",
15260 s, RE_SV_TAIL(r->anchored_substr),
15261 (IV)r->anchored_offset);
15262 } else if (r->anchored_utf8) {
15263 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15264 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15265 PerlIO_printf(Perl_debug_log,
15266 "anchored utf8 %s%s at %"IVdf" ",
15267 s, RE_SV_TAIL(r->anchored_utf8),
15268 (IV)r->anchored_offset);
15270 if (r->float_substr) {
15271 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15272 RE_SV_DUMPLEN(r->float_substr), 30);
15273 PerlIO_printf(Perl_debug_log,
15274 "floating %s%s at %"IVdf"..%"UVuf" ",
15275 s, RE_SV_TAIL(r->float_substr),
15276 (IV)r->float_min_offset, (UV)r->float_max_offset);
15277 } else if (r->float_utf8) {
15278 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15279 RE_SV_DUMPLEN(r->float_utf8), 30);
15280 PerlIO_printf(Perl_debug_log,
15281 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15282 s, RE_SV_TAIL(r->float_utf8),
15283 (IV)r->float_min_offset, (UV)r->float_max_offset);
15285 if (r->check_substr || r->check_utf8)
15286 PerlIO_printf(Perl_debug_log,
15288 (r->check_substr == r->float_substr
15289 && r->check_utf8 == r->float_utf8
15290 ? "(checking floating" : "(checking anchored"));
15291 if (r->extflags & RXf_NOSCAN)
15292 PerlIO_printf(Perl_debug_log, " noscan");
15293 if (r->extflags & RXf_CHECK_ALL)
15294 PerlIO_printf(Perl_debug_log, " isall");
15295 if (r->check_substr || r->check_utf8)
15296 PerlIO_printf(Perl_debug_log, ") ");
15298 if (ri->regstclass) {
15299 regprop(r, sv, ri->regstclass);
15300 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15302 if (r->extflags & RXf_ANCH) {
15303 PerlIO_printf(Perl_debug_log, "anchored");
15304 if (r->extflags & RXf_ANCH_BOL)
15305 PerlIO_printf(Perl_debug_log, "(BOL)");
15306 if (r->extflags & RXf_ANCH_MBOL)
15307 PerlIO_printf(Perl_debug_log, "(MBOL)");
15308 if (r->extflags & RXf_ANCH_SBOL)
15309 PerlIO_printf(Perl_debug_log, "(SBOL)");
15310 if (r->extflags & RXf_ANCH_GPOS)
15311 PerlIO_printf(Perl_debug_log, "(GPOS)");
15312 PerlIO_putc(Perl_debug_log, ' ');
15314 if (r->extflags & RXf_GPOS_SEEN)
15315 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15316 if (r->intflags & PREGf_SKIP)
15317 PerlIO_printf(Perl_debug_log, "plus ");
15318 if (r->intflags & PREGf_IMPLICIT)
15319 PerlIO_printf(Perl_debug_log, "implicit ");
15320 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15321 if (r->extflags & RXf_EVAL_SEEN)
15322 PerlIO_printf(Perl_debug_log, "with eval ");
15323 PerlIO_printf(Perl_debug_log, "\n");
15325 regdump_extflags("r->extflags: ",r->extflags);
15326 regdump_intflags("r->intflags: ",r->intflags);
15329 PERL_ARGS_ASSERT_REGDUMP;
15330 PERL_UNUSED_CONTEXT;
15331 PERL_UNUSED_ARG(r);
15332 #endif /* DEBUGGING */
15336 - regprop - printable representation of opcode
15340 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15346 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15347 static const char * const anyofs[] = {
15348 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15349 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15350 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15351 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15352 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15353 || _CC_VERTSPACE != 16
15354 #error Need to adjust order of anyofs[]
15391 RXi_GET_DECL(prog,progi);
15392 GET_RE_DEBUG_FLAGS_DECL;
15394 PERL_ARGS_ASSERT_REGPROP;
15398 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15399 /* It would be nice to FAIL() here, but this may be called from
15400 regexec.c, and it would be hard to supply pRExC_state. */
15401 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15402 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15404 k = PL_regkind[OP(o)];
15407 sv_catpvs(sv, " ");
15408 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15409 * is a crude hack but it may be the best for now since
15410 * we have no flag "this EXACTish node was UTF-8"
15412 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15413 PERL_PV_ESCAPE_UNI_DETECT |
15414 PERL_PV_ESCAPE_NONASCII |
15415 PERL_PV_PRETTY_ELLIPSES |
15416 PERL_PV_PRETTY_LTGT |
15417 PERL_PV_PRETTY_NOCLEAR
15419 } else if (k == TRIE) {
15420 /* print the details of the trie in dumpuntil instead, as
15421 * progi->data isn't available here */
15422 const char op = OP(o);
15423 const U32 n = ARG(o);
15424 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15425 (reg_ac_data *)progi->data->data[n] :
15427 const reg_trie_data * const trie
15428 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15430 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15431 DEBUG_TRIE_COMPILE_r(
15432 Perl_sv_catpvf(aTHX_ sv,
15433 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15434 (UV)trie->startstate,
15435 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15436 (UV)trie->wordcount,
15439 (UV)TRIE_CHARCOUNT(trie),
15440 (UV)trie->uniquecharcount
15443 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15444 sv_catpvs(sv, "[");
15445 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15447 : TRIE_BITMAP(trie));
15448 sv_catpvs(sv, "]");
15451 } else if (k == CURLY) {
15452 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15453 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15454 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15456 else if (k == WHILEM && o->flags) /* Ordinal/of */
15457 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15458 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15459 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15460 if ( RXp_PAREN_NAMES(prog) ) {
15461 if ( k != REF || (OP(o) < NREF)) {
15462 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15463 SV **name= av_fetch(list, ARG(o), 0 );
15465 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15468 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15469 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15470 I32 *nums=(I32*)SvPVX(sv_dat);
15471 SV **name= av_fetch(list, nums[0], 0 );
15474 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15475 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15476 (n ? "," : ""), (IV)nums[n]);
15478 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15482 } else if (k == GOSUB)
15483 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15484 else if (k == VERB) {
15486 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15487 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15488 } else if (k == LOGICAL)
15489 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
15490 else if (k == ANYOF) {
15491 const U8 flags = ANYOF_FLAGS(o);
15495 if (flags & ANYOF_LOCALE)
15496 sv_catpvs(sv, "{loc}");
15497 if (flags & ANYOF_LOC_FOLD)
15498 sv_catpvs(sv, "{i}");
15499 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15500 if (flags & ANYOF_INVERT)
15501 sv_catpvs(sv, "^");
15503 /* output what the standard cp 0-255 bitmap matches */
15504 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15506 /* output any special charclass tests (used entirely under use
15508 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15510 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15511 if (ANYOF_POSIXL_TEST(o,i)) {
15512 sv_catpv(sv, anyofs[i]);
15518 if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15519 || ANYOF_NONBITMAP(o))
15522 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15523 if (flags & ANYOF_INVERT)
15524 /*make sure the invert info is in each */
15525 sv_catpvs(sv, "^");
15528 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15529 sv_catpvs(sv, "{non-utf8-latin1-all}");
15532 /* output information about the unicode matching */
15533 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15534 sv_catpvs(sv, "{unicode_all}");
15535 else if (ANYOF_NONBITMAP(o)) {
15536 SV *lv; /* Set if there is something outside the bit map. */
15537 bool byte_output = FALSE; /* If something in the bitmap has been
15540 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15541 sv_catpvs(sv, "{outside bitmap}");
15544 sv_catpvs(sv, "{utf8}");
15547 /* Get the stuff that wasn't in the bitmap */
15548 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15549 if (lv && lv != &PL_sv_undef) {
15550 char *s = savesvpv(lv);
15551 char * const origs = s;
15553 while (*s && *s != '\n')
15557 const char * const t = ++s;
15560 sv_catpvs(sv, " ");
15566 /* Truncate very long output */
15567 if (s - origs > 256) {
15568 Perl_sv_catpvf(aTHX_ sv,
15570 (int) (s - origs - 1),
15576 else if (*s == '\t') {
15590 SvREFCNT_dec_NN(lv);
15595 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15597 else if (k == POSIXD || k == NPOSIXD) {
15598 U8 index = FLAGS(o) * 2;
15599 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15600 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15603 if (*anyofs[index] != '[') {
15606 sv_catpv(sv, anyofs[index]);
15607 if (*anyofs[index] != '[') {
15612 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15613 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15615 PERL_UNUSED_CONTEXT;
15616 PERL_UNUSED_ARG(sv);
15617 PERL_UNUSED_ARG(o);
15618 PERL_UNUSED_ARG(prog);
15619 #endif /* DEBUGGING */
15623 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15624 { /* Assume that RE_INTUIT is set */
15626 struct regexp *const prog = ReANY(r);
15627 GET_RE_DEBUG_FLAGS_DECL;
15629 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15630 PERL_UNUSED_CONTEXT;
15634 const char * const s = SvPV_nolen_const(prog->check_substr
15635 ? prog->check_substr : prog->check_utf8);
15637 if (!PL_colorset) reginitcolors();
15638 PerlIO_printf(Perl_debug_log,
15639 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15641 prog->check_substr ? "" : "utf8 ",
15642 PL_colors[5],PL_colors[0],
15645 (strlen(s) > 60 ? "..." : ""));
15648 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15654 handles refcounting and freeing the perl core regexp structure. When
15655 it is necessary to actually free the structure the first thing it
15656 does is call the 'free' method of the regexp_engine associated to
15657 the regexp, allowing the handling of the void *pprivate; member
15658 first. (This routine is not overridable by extensions, which is why
15659 the extensions free is called first.)
15661 See regdupe and regdupe_internal if you change anything here.
15663 #ifndef PERL_IN_XSUB_RE
15665 Perl_pregfree(pTHX_ REGEXP *r)
15671 Perl_pregfree2(pTHX_ REGEXP *rx)
15674 struct regexp *const r = ReANY(rx);
15675 GET_RE_DEBUG_FLAGS_DECL;
15677 PERL_ARGS_ASSERT_PREGFREE2;
15679 if (r->mother_re) {
15680 ReREFCNT_dec(r->mother_re);
15682 CALLREGFREE_PVT(rx); /* free the private data */
15683 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15684 Safefree(r->xpv_len_u.xpvlenu_pv);
15687 SvREFCNT_dec(r->anchored_substr);
15688 SvREFCNT_dec(r->anchored_utf8);
15689 SvREFCNT_dec(r->float_substr);
15690 SvREFCNT_dec(r->float_utf8);
15691 Safefree(r->substrs);
15693 RX_MATCH_COPY_FREE(rx);
15694 #ifdef PERL_ANY_COW
15695 SvREFCNT_dec(r->saved_copy);
15698 SvREFCNT_dec(r->qr_anoncv);
15699 rx->sv_u.svu_rx = 0;
15704 This is a hacky workaround to the structural issue of match results
15705 being stored in the regexp structure which is in turn stored in
15706 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15707 could be PL_curpm in multiple contexts, and could require multiple
15708 result sets being associated with the pattern simultaneously, such
15709 as when doing a recursive match with (??{$qr})
15711 The solution is to make a lightweight copy of the regexp structure
15712 when a qr// is returned from the code executed by (??{$qr}) this
15713 lightweight copy doesn't actually own any of its data except for
15714 the starp/end and the actual regexp structure itself.
15720 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15722 struct regexp *ret;
15723 struct regexp *const r = ReANY(rx);
15724 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15726 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15729 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15731 SvOK_off((SV *)ret_x);
15733 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15734 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15735 made both spots point to the same regexp body.) */
15736 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15737 assert(!SvPVX(ret_x));
15738 ret_x->sv_u.svu_rx = temp->sv_any;
15739 temp->sv_any = NULL;
15740 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15741 SvREFCNT_dec_NN(temp);
15742 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15743 ing below will not set it. */
15744 SvCUR_set(ret_x, SvCUR(rx));
15747 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15748 sv_force_normal(sv) is called. */
15750 ret = ReANY(ret_x);
15752 SvFLAGS(ret_x) |= SvUTF8(rx);
15753 /* We share the same string buffer as the original regexp, on which we
15754 hold a reference count, incremented when mother_re is set below.
15755 The string pointer is copied here, being part of the regexp struct.
15757 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15758 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15760 const I32 npar = r->nparens+1;
15761 Newx(ret->offs, npar, regexp_paren_pair);
15762 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15765 Newx(ret->substrs, 1, struct reg_substr_data);
15766 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15768 SvREFCNT_inc_void(ret->anchored_substr);
15769 SvREFCNT_inc_void(ret->anchored_utf8);
15770 SvREFCNT_inc_void(ret->float_substr);
15771 SvREFCNT_inc_void(ret->float_utf8);
15773 /* check_substr and check_utf8, if non-NULL, point to either their
15774 anchored or float namesakes, and don't hold a second reference. */
15776 RX_MATCH_COPIED_off(ret_x);
15777 #ifdef PERL_ANY_COW
15778 ret->saved_copy = NULL;
15780 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15781 SvREFCNT_inc_void(ret->qr_anoncv);
15787 /* regfree_internal()
15789 Free the private data in a regexp. This is overloadable by
15790 extensions. Perl takes care of the regexp structure in pregfree(),
15791 this covers the *pprivate pointer which technically perl doesn't
15792 know about, however of course we have to handle the
15793 regexp_internal structure when no extension is in use.
15795 Note this is called before freeing anything in the regexp
15800 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15803 struct regexp *const r = ReANY(rx);
15804 RXi_GET_DECL(r,ri);
15805 GET_RE_DEBUG_FLAGS_DECL;
15807 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15813 SV *dsv= sv_newmortal();
15814 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15815 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15816 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15817 PL_colors[4],PL_colors[5],s);
15820 #ifdef RE_TRACK_PATTERN_OFFSETS
15822 Safefree(ri->u.offsets); /* 20010421 MJD */
15824 if (ri->code_blocks) {
15826 for (n = 0; n < ri->num_code_blocks; n++)
15827 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15828 Safefree(ri->code_blocks);
15832 int n = ri->data->count;
15835 /* If you add a ->what type here, update the comment in regcomp.h */
15836 switch (ri->data->what[n]) {
15842 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15845 Safefree(ri->data->data[n]);
15851 { /* Aho Corasick add-on structure for a trie node.
15852 Used in stclass optimization only */
15854 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15856 refcount = --aho->refcount;
15859 PerlMemShared_free(aho->states);
15860 PerlMemShared_free(aho->fail);
15861 /* do this last!!!! */
15862 PerlMemShared_free(ri->data->data[n]);
15863 PerlMemShared_free(ri->regstclass);
15869 /* trie structure. */
15871 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15873 refcount = --trie->refcount;
15876 PerlMemShared_free(trie->charmap);
15877 PerlMemShared_free(trie->states);
15878 PerlMemShared_free(trie->trans);
15880 PerlMemShared_free(trie->bitmap);
15882 PerlMemShared_free(trie->jump);
15883 PerlMemShared_free(trie->wordinfo);
15884 /* do this last!!!! */
15885 PerlMemShared_free(ri->data->data[n]);
15890 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15893 Safefree(ri->data->what);
15894 Safefree(ri->data);
15900 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15901 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15902 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15905 re_dup - duplicate a regexp.
15907 This routine is expected to clone a given regexp structure. It is only
15908 compiled under USE_ITHREADS.
15910 After all of the core data stored in struct regexp is duplicated
15911 the regexp_engine.dupe method is used to copy any private data
15912 stored in the *pprivate pointer. This allows extensions to handle
15913 any duplication it needs to do.
15915 See pregfree() and regfree_internal() if you change anything here.
15917 #if defined(USE_ITHREADS)
15918 #ifndef PERL_IN_XSUB_RE
15920 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15924 const struct regexp *r = ReANY(sstr);
15925 struct regexp *ret = ReANY(dstr);
15927 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15929 npar = r->nparens+1;
15930 Newx(ret->offs, npar, regexp_paren_pair);
15931 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15933 if (ret->substrs) {
15934 /* Do it this way to avoid reading from *r after the StructCopy().
15935 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15936 cache, it doesn't matter. */
15937 const bool anchored = r->check_substr
15938 ? r->check_substr == r->anchored_substr
15939 : r->check_utf8 == r->anchored_utf8;
15940 Newx(ret->substrs, 1, struct reg_substr_data);
15941 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15943 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15944 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15945 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15946 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15948 /* check_substr and check_utf8, if non-NULL, point to either their
15949 anchored or float namesakes, and don't hold a second reference. */
15951 if (ret->check_substr) {
15953 assert(r->check_utf8 == r->anchored_utf8);
15954 ret->check_substr = ret->anchored_substr;
15955 ret->check_utf8 = ret->anchored_utf8;
15957 assert(r->check_substr == r->float_substr);
15958 assert(r->check_utf8 == r->float_utf8);
15959 ret->check_substr = ret->float_substr;
15960 ret->check_utf8 = ret->float_utf8;
15962 } else if (ret->check_utf8) {
15964 ret->check_utf8 = ret->anchored_utf8;
15966 ret->check_utf8 = ret->float_utf8;
15971 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15972 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15975 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15977 if (RX_MATCH_COPIED(dstr))
15978 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15980 ret->subbeg = NULL;
15981 #ifdef PERL_ANY_COW
15982 ret->saved_copy = NULL;
15985 /* Whether mother_re be set or no, we need to copy the string. We
15986 cannot refrain from copying it when the storage points directly to
15987 our mother regexp, because that's
15988 1: a buffer in a different thread
15989 2: something we no longer hold a reference on
15990 so we need to copy it locally. */
15991 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15992 ret->mother_re = NULL;
15994 #endif /* PERL_IN_XSUB_RE */
15999 This is the internal complement to regdupe() which is used to copy
16000 the structure pointed to by the *pprivate pointer in the regexp.
16001 This is the core version of the extension overridable cloning hook.
16002 The regexp structure being duplicated will be copied by perl prior
16003 to this and will be provided as the regexp *r argument, however
16004 with the /old/ structures pprivate pointer value. Thus this routine
16005 may override any copying normally done by perl.
16007 It returns a pointer to the new regexp_internal structure.
16011 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16014 struct regexp *const r = ReANY(rx);
16015 regexp_internal *reti;
16017 RXi_GET_DECL(r,ri);
16019 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16023 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
16024 Copy(ri->program, reti->program, len+1, regnode);
16026 reti->num_code_blocks = ri->num_code_blocks;
16027 if (ri->code_blocks) {
16029 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16030 struct reg_code_block);
16031 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16032 struct reg_code_block);
16033 for (n = 0; n < ri->num_code_blocks; n++)
16034 reti->code_blocks[n].src_regex = (REGEXP*)
16035 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16038 reti->code_blocks = NULL;
16040 reti->regstclass = NULL;
16043 struct reg_data *d;
16044 const int count = ri->data->count;
16047 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16048 char, struct reg_data);
16049 Newx(d->what, count, U8);
16052 for (i = 0; i < count; i++) {
16053 d->what[i] = ri->data->what[i];
16054 switch (d->what[i]) {
16055 /* see also regcomp.h and regfree_internal() */
16056 case 'a': /* actually an AV, but the dup function is identical. */
16060 case 'u': /* actually an HV, but the dup function is identical. */
16061 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16064 /* This is cheating. */
16065 Newx(d->data[i], 1, regnode_ssc);
16066 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16067 reti->regstclass = (regnode*)d->data[i];
16070 /* Trie stclasses are readonly and can thus be shared
16071 * without duplication. We free the stclass in pregfree
16072 * when the corresponding reg_ac_data struct is freed.
16074 reti->regstclass= ri->regstclass;
16078 ((reg_trie_data*)ri->data->data[i])->refcount++;
16083 d->data[i] = ri->data->data[i];
16086 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
16095 reti->name_list_idx = ri->name_list_idx;
16097 #ifdef RE_TRACK_PATTERN_OFFSETS
16098 if (ri->u.offsets) {
16099 Newx(reti->u.offsets, 2*len+1, U32);
16100 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16103 SetProgLen(reti,len);
16106 return (void*)reti;
16109 #endif /* USE_ITHREADS */
16111 #ifndef PERL_IN_XSUB_RE
16114 - regnext - dig the "next" pointer out of a node
16117 Perl_regnext(pTHX_ regnode *p)
16125 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16126 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
16129 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16138 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16141 STRLEN l1 = strlen(pat1);
16142 STRLEN l2 = strlen(pat2);
16145 const char *message;
16147 PERL_ARGS_ASSERT_RE_CROAK2;
16153 Copy(pat1, buf, l1 , char);
16154 Copy(pat2, buf + l1, l2 , char);
16155 buf[l1 + l2] = '\n';
16156 buf[l1 + l2 + 1] = '\0';
16157 va_start(args, pat2);
16158 msv = vmess(buf, &args);
16160 message = SvPV_const(msv,l1);
16163 Copy(message, buf, l1 , char);
16164 /* l1-1 to avoid \n */
16165 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16168 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16170 #ifndef PERL_IN_XSUB_RE
16172 Perl_save_re_context(pTHX)
16176 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16178 const REGEXP * const rx = PM_GETRE(PL_curpm);
16181 for (i = 1; i <= RX_NPARENS(rx); i++) {
16182 char digits[TYPE_CHARS(long)];
16183 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
16184 GV *const *const gvp
16185 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16188 GV * const gv = *gvp;
16189 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16201 S_put_byte(pTHX_ SV *sv, int c)
16203 PERL_ARGS_ASSERT_PUT_BYTE;
16207 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16208 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16209 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16210 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16211 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16214 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16219 const char string = c;
16220 if (c == '-' || c == ']' || c == '\\' || c == '^')
16221 sv_catpvs(sv, "\\");
16222 sv_catpvn(sv, &string, 1);
16227 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16229 /* Appends to 'sv' a displayable version of the innards of the bracketed
16230 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16231 * output anything */
16234 int rangestart = -1;
16235 bool has_output_anything = FALSE;
16237 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16239 for (i = 0; i <= 256; i++) {
16240 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16241 if (rangestart == -1)
16243 } else if (rangestart != -1) {
16245 if (i <= rangestart + 3) { /* Individual chars in short ranges */
16246 for (; rangestart < i; rangestart++)
16247 put_byte(sv, rangestart);
16250 || ! isALPHANUMERIC(rangestart)
16251 || ! isALPHANUMERIC(j)
16252 || isDIGIT(rangestart) != isDIGIT(j)
16253 || isUPPER(rangestart) != isUPPER(j)
16254 || isLOWER(rangestart) != isLOWER(j)
16256 /* This final test should get optimized out except
16257 * on EBCDIC platforms, where it causes ranges that
16258 * cross discontinuities like i/j to be shown as hex
16259 * instead of the misleading, e.g. H-K (since that
16260 * range includes more than H, I, J, K). */
16261 || (j - rangestart)
16262 != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16264 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16266 (j < 256) ? j : 255);
16268 else { /* Here, the ends of the range are both digits, or both
16269 uppercase, or both lowercase; and there's no
16270 discontinuity in the range (which could happen on EBCDIC
16272 put_byte(sv, rangestart);
16273 sv_catpvs(sv, "-");
16277 has_output_anything = TRUE;
16281 return has_output_anything;
16284 #define CLEAR_OPTSTART \
16285 if (optstart) STMT_START { \
16286 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16290 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16292 STATIC const regnode *
16293 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16294 const regnode *last, const regnode *plast,
16295 SV* sv, I32 indent, U32 depth)
16298 U8 op = PSEUDO; /* Arbitrary non-END op. */
16299 const regnode *next;
16300 const regnode *optstart= NULL;
16302 RXi_GET_DECL(r,ri);
16303 GET_RE_DEBUG_FLAGS_DECL;
16305 PERL_ARGS_ASSERT_DUMPUNTIL;
16307 #ifdef DEBUG_DUMPUNTIL
16308 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16309 last ? last-start : 0,plast ? plast-start : 0);
16312 if (plast && plast < last)
16315 while (PL_regkind[op] != END && (!last || node < last)) {
16316 /* While that wasn't END last time... */
16319 if (op == CLOSE || op == WHILEM)
16321 next = regnext((regnode *)node);
16324 if (OP(node) == OPTIMIZED) {
16325 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16332 regprop(r, sv, node);
16333 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16334 (int)(2*indent + 1), "", SvPVX_const(sv));
16336 if (OP(node) != OPTIMIZED) {
16337 if (next == NULL) /* Next ptr. */
16338 PerlIO_printf(Perl_debug_log, " (0)");
16339 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16340 PerlIO_printf(Perl_debug_log, " (FAIL)");
16342 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16343 (void)PerlIO_putc(Perl_debug_log, '\n');
16347 if (PL_regkind[(U8)op] == BRANCHJ) {
16350 const regnode *nnode = (OP(next) == LONGJMP
16351 ? regnext((regnode *)next)
16353 if (last && nnode > last)
16355 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16358 else if (PL_regkind[(U8)op] == BRANCH) {
16360 DUMPUNTIL(NEXTOPER(node), next);
16362 else if ( PL_regkind[(U8)op] == TRIE ) {
16363 const regnode *this_trie = node;
16364 const char op = OP(node);
16365 const U32 n = ARG(node);
16366 const reg_ac_data * const ac = op>=AHOCORASICK ?
16367 (reg_ac_data *)ri->data->data[n] :
16369 const reg_trie_data * const trie =
16370 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16372 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16374 const regnode *nextbranch= NULL;
16377 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16378 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16380 PerlIO_printf(Perl_debug_log, "%*s%s ",
16381 (int)(2*(indent+3)), "",
16382 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16383 PL_colors[0], PL_colors[1],
16384 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16385 PERL_PV_PRETTY_ELLIPSES |
16386 PERL_PV_PRETTY_LTGT
16391 U16 dist= trie->jump[word_idx+1];
16392 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16393 (UV)((dist ? this_trie + dist : next) - start));
16396 nextbranch= this_trie + trie->jump[0];
16397 DUMPUNTIL(this_trie + dist, nextbranch);
16399 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16400 nextbranch= regnext((regnode *)nextbranch);
16402 PerlIO_printf(Perl_debug_log, "\n");
16405 if (last && next > last)
16410 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16411 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16412 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16414 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16416 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16418 else if ( op == PLUS || op == STAR) {
16419 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16421 else if (PL_regkind[(U8)op] == ANYOF) {
16422 /* arglen 1 + class block */
16423 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16424 ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16425 node = NEXTOPER(node);
16427 else if (PL_regkind[(U8)op] == EXACT) {
16428 /* Literal string, where present. */
16429 node += NODE_SZ_STR(node) - 1;
16430 node = NEXTOPER(node);
16433 node = NEXTOPER(node);
16434 node += regarglen[(U8)op];
16436 if (op == CURLYX || op == OPEN)
16440 #ifdef DEBUG_DUMPUNTIL
16441 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16446 #endif /* DEBUGGING */
16450 * c-indentation-style: bsd
16451 * c-basic-offset: 4
16452 * indent-tabs-mode: nil
16455 * ex: set ts=8 sts=4 sw=4 et: