5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 struct RExC_state_t {
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
106 char *precomp; /* uncompiled string. */
107 REGEXP *rx_sv; /* The SV that is the regexp. */
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
110 char *start; /* Start of input for compile */
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
113 SSize_t whilem_seen; /* number of WHILEM in this expr */
114 regnode *emit_start; /* Start of emitted-code area */
115 regnode *emit_bound; /* First regnode outside of the allocated space */
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
125 SSize_t size; /* Code size. */
126 I32 npar; /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
127 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
140 HV *paren_names; /* Paren names */
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
144 U8 *study_chunk_recursed; /* bitmap of which parens we have moved through */
145 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
149 I32 override_recoding;
150 I32 in_multi_char_class;
151 struct reg_code_block *code_blocks; /* positions of literal (?{})
153 int num_code_blocks; /* size of code_blocks[] */
154 int code_index; /* next code_blocks[] slot */
156 char *starttry; /* -Dr: where regtry was called. */
157 #define RExC_starttry (pRExC_state->starttry)
159 SV *runtime_code_qr; /* qr with the runtime code blocks */
161 const char *lastparse;
163 AV *paren_name_list; /* idx -> name */
164 #define RExC_lastparse (pRExC_state->lastparse)
165 #define RExC_lastnum (pRExC_state->lastnum)
166 #define RExC_paren_name_list (pRExC_state->paren_name_list)
170 #define RExC_flags (pRExC_state->flags)
171 #define RExC_pm_flags (pRExC_state->pm_flags)
172 #define RExC_precomp (pRExC_state->precomp)
173 #define RExC_rx_sv (pRExC_state->rx_sv)
174 #define RExC_rx (pRExC_state->rx)
175 #define RExC_rxi (pRExC_state->rxi)
176 #define RExC_start (pRExC_state->start)
177 #define RExC_end (pRExC_state->end)
178 #define RExC_parse (pRExC_state->parse)
179 #define RExC_whilem_seen (pRExC_state->whilem_seen)
180 #ifdef RE_TRACK_PATTERN_OFFSETS
181 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
183 #define RExC_emit (pRExC_state->emit)
184 #define RExC_emit_dummy (pRExC_state->emit_dummy)
185 #define RExC_emit_start (pRExC_state->emit_start)
186 #define RExC_emit_bound (pRExC_state->emit_bound)
187 #define RExC_naughty (pRExC_state->naughty)
188 #define RExC_sawback (pRExC_state->sawback)
189 #define RExC_seen (pRExC_state->seen)
190 #define RExC_size (pRExC_state->size)
191 #define RExC_npar (pRExC_state->npar)
192 #define RExC_nestroot (pRExC_state->nestroot)
193 #define RExC_extralen (pRExC_state->extralen)
194 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
195 #define RExC_utf8 (pRExC_state->utf8)
196 #define RExC_uni_semantics (pRExC_state->uni_semantics)
197 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
198 #define RExC_open_parens (pRExC_state->open_parens)
199 #define RExC_close_parens (pRExC_state->close_parens)
200 #define RExC_opend (pRExC_state->opend)
201 #define RExC_paren_names (pRExC_state->paren_names)
202 #define RExC_recurse (pRExC_state->recurse)
203 #define RExC_recurse_count (pRExC_state->recurse_count)
204 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
205 #define RExC_study_chunk_recursed_bytes (pRExC_state->study_chunk_recursed_bytes)
206 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
207 #define RExC_contains_locale (pRExC_state->contains_locale)
208 #define RExC_contains_i (pRExC_state->contains_i)
209 #define RExC_override_recoding (pRExC_state->override_recoding)
210 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
213 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
214 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
215 ((*s) == '{' && regcurly(s, FALSE)))
218 * Flags to be passed up and down.
220 #define WORST 0 /* Worst case. */
221 #define HASWIDTH 0x01 /* Known to match non-null strings. */
223 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
224 * character. (There needs to be a case: in the switch statement in regexec.c
225 * for any node marked SIMPLE.) Note that this is not the same thing as
228 #define SPSTART 0x04 /* Starts with * or + */
229 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
230 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
231 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
233 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
235 /* whether trie related optimizations are enabled */
236 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
237 #define TRIE_STUDY_OPT
238 #define FULL_TRIE_STUDY
244 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
245 #define PBITVAL(paren) (1 << ((paren) & 7))
246 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
247 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
248 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
250 #define REQUIRE_UTF8 STMT_START { \
252 *flagp = RESTART_UTF8; \
257 /* This converts the named class defined in regcomp.h to its equivalent class
258 * number defined in handy.h. */
259 #define namedclass_to_classnum(class) ((int) ((class) / 2))
260 #define classnum_to_namedclass(classnum) ((classnum) * 2)
262 #define _invlist_union_complement_2nd(a, b, output) \
263 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
264 #define _invlist_intersection_complement_2nd(a, b, output) \
265 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
267 /* About scan_data_t.
269 During optimisation we recurse through the regexp program performing
270 various inplace (keyhole style) optimisations. In addition study_chunk
271 and scan_commit populate this data structure with information about
272 what strings MUST appear in the pattern. We look for the longest
273 string that must appear at a fixed location, and we look for the
274 longest string that may appear at a floating location. So for instance
279 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
280 strings (because they follow a .* construct). study_chunk will identify
281 both FOO and BAR as being the longest fixed and floating strings respectively.
283 The strings can be composites, for instance
287 will result in a composite fixed substring 'foo'.
289 For each string some basic information is maintained:
291 - offset or min_offset
292 This is the position the string must appear at, or not before.
293 It also implicitly (when combined with minlenp) tells us how many
294 characters must match before the string we are searching for.
295 Likewise when combined with minlenp and the length of the string it
296 tells us how many characters must appear after the string we have
300 Only used for floating strings. This is the rightmost point that
301 the string can appear at. If set to SSize_t_MAX it indicates that the
302 string can occur infinitely far to the right.
305 A pointer to the minimum number of characters of the pattern that the
306 string was found inside. This is important as in the case of positive
307 lookahead or positive lookbehind we can have multiple patterns
312 The minimum length of the pattern overall is 3, the minimum length
313 of the lookahead part is 3, but the minimum length of the part that
314 will actually match is 1. So 'FOO's minimum length is 3, but the
315 minimum length for the F is 1. This is important as the minimum length
316 is used to determine offsets in front of and behind the string being
317 looked for. Since strings can be composites this is the length of the
318 pattern at the time it was committed with a scan_commit. Note that
319 the length is calculated by study_chunk, so that the minimum lengths
320 are not known until the full pattern has been compiled, thus the
321 pointer to the value.
325 In the case of lookbehind the string being searched for can be
326 offset past the start point of the final matching string.
327 If this value was just blithely removed from the min_offset it would
328 invalidate some of the calculations for how many chars must match
329 before or after (as they are derived from min_offset and minlen and
330 the length of the string being searched for).
331 When the final pattern is compiled and the data is moved from the
332 scan_data_t structure into the regexp structure the information
333 about lookbehind is factored in, with the information that would
334 have been lost precalculated in the end_shift field for the
337 The fields pos_min and pos_delta are used to store the minimum offset
338 and the delta to the maximum offset at the current point in the pattern.
342 typedef struct scan_data_t {
343 /*I32 len_min; unused */
344 /*I32 len_delta; unused */
348 SSize_t last_end; /* min value, <0 unless valid. */
349 SSize_t last_start_min;
350 SSize_t last_start_max;
351 SV **longest; /* Either &l_fixed, or &l_float. */
352 SV *longest_fixed; /* longest fixed string found in pattern */
353 SSize_t offset_fixed; /* offset where it starts */
354 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
355 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
356 SV *longest_float; /* longest floating string found in pattern */
357 SSize_t offset_float_min; /* earliest point in string it can appear */
358 SSize_t offset_float_max; /* latest point in string it can appear */
359 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
360 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
363 SSize_t *last_closep;
364 regnode_ssc *start_class;
367 /* The below is perhaps overboard, but this allows us to save a test at the
368 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
369 * and 'a' differ by a single bit; the same with the upper and lower case of
370 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
371 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
372 * then inverts it to form a mask, with just a single 0, in the bit position
373 * where the upper- and lowercase differ. XXX There are about 40 other
374 * instances in the Perl core where this micro-optimization could be used.
375 * Should decide if maintenance cost is worse, before changing those
377 * Returns a boolean as to whether or not 'v' is either a lowercase or
378 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
379 * compile-time constant, the generated code is better than some optimizing
380 * compilers figure out, amounting to a mask and test. The results are
381 * meaningless if 'c' is not one of [A-Za-z] */
382 #define isARG2_lower_or_UPPER_ARG1(c, v) \
383 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
386 * Forward declarations for pregcomp()'s friends.
389 static const scan_data_t zero_scan_data =
390 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
392 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
393 #define SF_BEFORE_SEOL 0x0001
394 #define SF_BEFORE_MEOL 0x0002
395 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
396 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
398 #define SF_FIX_SHIFT_EOL (+2)
399 #define SF_FL_SHIFT_EOL (+4)
401 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
402 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
404 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
405 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
406 #define SF_IS_INF 0x0040
407 #define SF_HAS_PAR 0x0080
408 #define SF_IN_PAR 0x0100
409 #define SF_HAS_EVAL 0x0200
410 #define SCF_DO_SUBSTR 0x0400
411 #define SCF_DO_STCLASS_AND 0x0800
412 #define SCF_DO_STCLASS_OR 0x1000
413 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
414 #define SCF_WHILEM_VISITED_POS 0x2000
416 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
417 #define SCF_SEEN_ACCEPT 0x8000
418 #define SCF_TRIE_DOING_RESTUDY 0x10000
420 #define UTF cBOOL(RExC_utf8)
422 /* The enums for all these are ordered so things work out correctly */
423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
425 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
426 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
427 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
428 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
429 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
431 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
433 #define OOB_NAMEDCLASS -1
435 /* There is no code point that is out-of-bounds, so this is problematic. But
436 * its only current use is to initialize a variable that is always set before
438 #define OOB_UNICODE 0xDEADBEEF
440 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
441 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
444 /* length of regex to show in messages that don't mark a position within */
445 #define RegexLengthToShowInErrorMessages 127
448 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
449 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
450 * op/pragma/warn/regcomp.
452 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
453 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
455 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
457 #define REPORT_LOCATION_ARGS(offset) \
458 UTF8fARG(UTF, offset, RExC_precomp), \
459 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
462 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
463 * arg. Show regex, up to a maximum length. If it's too long, chop and add
466 #define _FAIL(code) STMT_START { \
467 const char *ellipses = ""; \
468 IV len = RExC_end - RExC_precomp; \
471 SAVEFREESV(RExC_rx_sv); \
472 if (len > RegexLengthToShowInErrorMessages) { \
473 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
474 len = RegexLengthToShowInErrorMessages - 10; \
480 #define FAIL(msg) _FAIL( \
481 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
482 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
484 #define FAIL2(msg,arg) _FAIL( \
485 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
486 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
489 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
491 #define Simple_vFAIL(m) STMT_START { \
492 const IV offset = RExC_parse - RExC_precomp; \
493 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
494 m, REPORT_LOCATION_ARGS(offset)); \
498 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
500 #define vFAIL(m) STMT_START { \
502 SAVEFREESV(RExC_rx_sv); \
507 * Like Simple_vFAIL(), but accepts two arguments.
509 #define Simple_vFAIL2(m,a1) STMT_START { \
510 const IV offset = RExC_parse - RExC_precomp; \
511 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
512 REPORT_LOCATION_ARGS(offset)); \
516 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
518 #define vFAIL2(m,a1) STMT_START { \
520 SAVEFREESV(RExC_rx_sv); \
521 Simple_vFAIL2(m, a1); \
526 * Like Simple_vFAIL(), but accepts three arguments.
528 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
529 const IV offset = RExC_parse - RExC_precomp; \
530 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
531 REPORT_LOCATION_ARGS(offset)); \
535 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
537 #define vFAIL3(m,a1,a2) STMT_START { \
539 SAVEFREESV(RExC_rx_sv); \
540 Simple_vFAIL3(m, a1, a2); \
544 * Like Simple_vFAIL(), but accepts four arguments.
546 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
547 const IV offset = RExC_parse - RExC_precomp; \
548 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
549 REPORT_LOCATION_ARGS(offset)); \
552 #define vFAIL4(m,a1,a2,a3) STMT_START { \
554 SAVEFREESV(RExC_rx_sv); \
555 Simple_vFAIL4(m, a1, a2, a3); \
558 /* A specialized version of vFAIL2 that works with UTF8f */
559 #define vFAIL2utf8f(m, a1) STMT_START { \
560 const IV offset = RExC_parse - RExC_precomp; \
562 SAVEFREESV(RExC_rx_sv); \
563 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
564 REPORT_LOCATION_ARGS(offset)); \
568 /* m is not necessarily a "literal string", in this macro */
569 #define reg_warn_non_literal_string(loc, m) STMT_START { \
570 const IV offset = loc - RExC_precomp; \
571 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
572 m, REPORT_LOCATION_ARGS(offset)); \
575 #define ckWARNreg(loc,m) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
578 REPORT_LOCATION_ARGS(offset)); \
581 #define vWARN_dep(loc, m) STMT_START { \
582 const IV offset = loc - RExC_precomp; \
583 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
584 REPORT_LOCATION_ARGS(offset)); \
587 #define ckWARNdep(loc,m) STMT_START { \
588 const IV offset = loc - RExC_precomp; \
589 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
591 REPORT_LOCATION_ARGS(offset)); \
594 #define ckWARNregdep(loc,m) STMT_START { \
595 const IV offset = loc - RExC_precomp; \
596 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
598 REPORT_LOCATION_ARGS(offset)); \
601 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
602 const IV offset = loc - RExC_precomp; \
603 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
605 a1, REPORT_LOCATION_ARGS(offset)); \
608 #define ckWARN2reg(loc, m, a1) STMT_START { \
609 const IV offset = loc - RExC_precomp; \
610 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611 a1, REPORT_LOCATION_ARGS(offset)); \
614 #define vWARN3(loc, m, a1, a2) STMT_START { \
615 const IV offset = loc - RExC_precomp; \
616 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
617 a1, a2, REPORT_LOCATION_ARGS(offset)); \
620 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
621 const IV offset = loc - RExC_precomp; \
622 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
623 a1, a2, REPORT_LOCATION_ARGS(offset)); \
626 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
627 const IV offset = loc - RExC_precomp; \
628 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
629 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
632 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
638 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
639 const IV offset = loc - RExC_precomp; \
640 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
641 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
645 /* Allow for side effects in s */
646 #define REGC(c,s) STMT_START { \
647 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
650 /* Macros for recording node offsets. 20001227 mjd@plover.com
651 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
652 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
653 * Element 0 holds the number n.
654 * Position is 1 indexed.
656 #ifndef RE_TRACK_PATTERN_OFFSETS
657 #define Set_Node_Offset_To_R(node,byte)
658 #define Set_Node_Offset(node,byte)
659 #define Set_Cur_Node_Offset
660 #define Set_Node_Length_To_R(node,len)
661 #define Set_Node_Length(node,len)
662 #define Set_Node_Cur_Length(node,start)
663 #define Node_Offset(n)
664 #define Node_Length(n)
665 #define Set_Node_Offset_Length(node,offset,len)
666 #define ProgLen(ri) ri->u.proglen
667 #define SetProgLen(ri,x) ri->u.proglen = x
669 #define ProgLen(ri) ri->u.offsets[0]
670 #define SetProgLen(ri,x) ri->u.offsets[0] = x
671 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
673 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
674 __LINE__, (int)(node), (int)(byte))); \
676 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
678 RExC_offsets[2*(node)-1] = (byte); \
683 #define Set_Node_Offset(node,byte) \
684 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
685 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
687 #define Set_Node_Length_To_R(node,len) STMT_START { \
689 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
690 __LINE__, (int)(node), (int)(len))); \
692 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
694 RExC_offsets[2*(node)] = (len); \
699 #define Set_Node_Length(node,len) \
700 Set_Node_Length_To_R((node)-RExC_emit_start, len)
701 #define Set_Node_Cur_Length(node, start) \
702 Set_Node_Length(node, RExC_parse - start)
704 /* Get offsets and lengths */
705 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
706 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
708 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
709 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
710 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
714 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
715 #define EXPERIMENTAL_INPLACESCAN
716 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
718 #define DEBUG_RExC_seen() \
719 DEBUG_OPTIMISE_MORE_r({ \
720 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
722 if (RExC_seen & REG_SEEN_ZERO_LEN) \
723 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \
725 if (RExC_seen & REG_SEEN_LOOKBEHIND) \
726 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \
728 if (RExC_seen & REG_SEEN_GPOS) \
729 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \
731 if (RExC_seen & REG_SEEN_CANY) \
732 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \
734 if (RExC_seen & REG_SEEN_RECURSE) \
735 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \
737 if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \
738 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \
740 if (RExC_seen & REG_SEEN_VERBARG) \
741 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \
743 if (RExC_seen & REG_SEEN_CUTGROUP) \
744 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \
746 if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \
747 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \
749 if (RExC_seen & REG_SEEN_EXACTF_SHARP_S) \
750 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S "); \
752 if (RExC_seen & REG_SEEN_GOSTART) \
753 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \
755 PerlIO_printf(Perl_debug_log,"\n"); \
758 #define DEBUG_STUDYDATA(str,data,depth) \
759 DEBUG_OPTIMISE_MORE_r(if(data){ \
760 PerlIO_printf(Perl_debug_log, \
761 "%*s" str "Pos:%"IVdf"/%"IVdf \
762 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
763 (int)(depth)*2, "", \
764 (IV)((data)->pos_min), \
765 (IV)((data)->pos_delta), \
766 (UV)((data)->flags), \
767 (IV)((data)->whilem_c), \
768 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
769 is_inf ? "INF " : "" \
771 if ((data)->last_found) \
772 PerlIO_printf(Perl_debug_log, \
773 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
774 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
775 SvPVX_const((data)->last_found), \
776 (IV)((data)->last_end), \
777 (IV)((data)->last_start_min), \
778 (IV)((data)->last_start_max), \
779 ((data)->longest && \
780 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
781 SvPVX_const((data)->longest_fixed), \
782 (IV)((data)->offset_fixed), \
783 ((data)->longest && \
784 (data)->longest==&((data)->longest_float)) ? "*" : "", \
785 SvPVX_const((data)->longest_float), \
786 (IV)((data)->offset_float_min), \
787 (IV)((data)->offset_float_max) \
789 PerlIO_printf(Perl_debug_log,"\n"); \
792 /* Mark that we cannot extend a found fixed substring at this point.
793 Update the longest found anchored substring and the longest found
794 floating substrings if needed. */
797 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
798 SSize_t *minlenp, int is_inf)
800 const STRLEN l = CHR_SVLEN(data->last_found);
801 const STRLEN old_l = CHR_SVLEN(*data->longest);
802 GET_RE_DEBUG_FLAGS_DECL;
804 PERL_ARGS_ASSERT_SCAN_COMMIT;
806 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
807 SvSetMagicSV(*data->longest, data->last_found);
808 if (*data->longest == data->longest_fixed) {
809 data->offset_fixed = l ? data->last_start_min : data->pos_min;
810 if (data->flags & SF_BEFORE_EOL)
812 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
814 data->flags &= ~SF_FIX_BEFORE_EOL;
815 data->minlen_fixed=minlenp;
816 data->lookbehind_fixed=0;
818 else { /* *data->longest == data->longest_float */
819 data->offset_float_min = l ? data->last_start_min : data->pos_min;
820 data->offset_float_max = (l
821 ? data->last_start_max
822 : (data->pos_delta == SSize_t_MAX
824 : data->pos_min + data->pos_delta));
826 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
827 data->offset_float_max = SSize_t_MAX;
828 if (data->flags & SF_BEFORE_EOL)
830 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
832 data->flags &= ~SF_FL_BEFORE_EOL;
833 data->minlen_float=minlenp;
834 data->lookbehind_float=0;
837 SvCUR_set(data->last_found, 0);
839 SV * const sv = data->last_found;
840 if (SvUTF8(sv) && SvMAGICAL(sv)) {
841 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
847 data->flags &= ~SF_BEFORE_EOL;
848 DEBUG_STUDYDATA("commit: ",data,0);
851 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
852 * list that describes which code points it matches */
855 S_ssc_anything(pTHX_ regnode_ssc *ssc)
857 /* Set the SSC 'ssc' to match an empty string or any code point */
859 PERL_ARGS_ASSERT_SSC_ANYTHING;
861 assert(OP(ssc) == ANYOF_SYNTHETIC);
863 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
864 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
865 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
869 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
871 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
872 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
873 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
874 * in any way, so there's no point in using it */
879 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
881 assert(OP(ssc) == ANYOF_SYNTHETIC);
883 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
887 /* See if the list consists solely of the range 0 - Infinity */
888 invlist_iterinit(ssc->invlist);
889 ret = invlist_iternext(ssc->invlist, &start, &end)
893 invlist_iterfinish(ssc->invlist);
899 /* If e.g., both \w and \W are set, matches everything */
900 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
902 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
903 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
913 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
915 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
916 * string, any code point, or any posix class under locale */
918 PERL_ARGS_ASSERT_SSC_INIT;
920 Zero(ssc, 1, regnode_ssc);
921 OP(ssc) = ANYOF_SYNTHETIC;
922 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
925 /* If any portion of the regex is to operate under locale rules,
926 * initialization includes it. The reason this isn't done for all regexes
927 * is that the optimizer was written under the assumption that locale was
928 * all-or-nothing. Given the complexity and lack of documentation in the
929 * optimizer, and that there are inadequate test cases for locale, many
930 * parts of it may not work properly, it is safest to avoid locale unless
932 if (RExC_contains_locale) {
933 ANYOF_POSIXL_SETALL(ssc);
934 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
935 if (RExC_contains_i) {
936 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
940 ANYOF_POSIXL_ZERO(ssc);
945 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
946 const regnode_ssc *ssc)
948 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
949 * to the list of code points matched, and locale posix classes; hence does
950 * not check its flags) */
955 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
957 assert(OP(ssc) == ANYOF_SYNTHETIC);
959 invlist_iterinit(ssc->invlist);
960 ret = invlist_iternext(ssc->invlist, &start, &end)
964 invlist_iterfinish(ssc->invlist);
970 if (RExC_contains_locale) {
971 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
972 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
973 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
977 if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
986 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
987 const regnode_charclass_posixl* const node)
989 /* Returns a mortal inversion list defining which code points are matched
990 * by 'node', which is of type ANYOF. Handles complementing the result if
991 * appropriate. If some code points aren't knowable at this time, the
992 * returned list must, and will, contain every possible code point. */
994 SV* invlist = sv_2mortal(_new_invlist(0));
996 const U32 n = ARG(node);
998 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1000 /* Look at the data structure created by S_set_ANYOF_arg() */
1001 if (n != ANYOF_NONBITMAP_EMPTY) {
1002 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1003 AV * const av = MUTABLE_AV(SvRV(rv));
1004 SV **const ary = AvARRAY(av);
1005 assert(RExC_rxi->data->what[n] == 's');
1007 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1008 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1010 else if (ary[0] && ary[0] != &PL_sv_undef) {
1012 /* Here, no compile-time swash, and there are things that won't be
1013 * known until runtime -- we have to assume it could be anything */
1014 return _add_range_to_invlist(invlist, 0, UV_MAX);
1018 /* Here no compile-time swash, and no run-time only data. Use the
1019 * node's inversion list */
1020 invlist = sv_2mortal(invlist_clone(ary[2]));
1024 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1025 * inversion list for the others, but if there are code points that should
1026 * match only conditionally on the target string being UTF-8, those are
1027 * placed in the inversion list, and not the bitmap. Since there are
1028 * circumstances under which they could match, they are included in the
1029 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1030 * here, so that when we invert below, the end result actually does include
1031 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1032 * before we add the unconditionally matched code points */
1033 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1034 _invlist_intersection_complement_2nd(invlist,
1039 /* Add in the points from the bit map */
1040 for (i = 0; i < 256; i++) {
1041 if (ANYOF_BITMAP_TEST(node, i)) {
1042 invlist = add_cp_to_invlist(invlist, i);
1046 /* If this can match all upper Latin1 code points, have to add them
1048 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1049 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1052 /* Similarly for these */
1053 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1054 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1057 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1058 _invlist_invert(invlist);
1064 /* These two functions currently do the exact same thing */
1065 #define ssc_init_zero ssc_init
1067 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1068 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1071 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1073 /* Take the flags 'and_with' and accumulate them anded into the flags for
1074 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1075 * The flags 'and_with' should not come from another SSC (otherwise the
1076 * EMPTY_STRING flag won't work) */
1078 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
1080 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1082 /* Use just the SSC-related flags from 'and_with' */
1083 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
1084 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1087 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1088 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1089 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1092 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1093 const regnode_ssc *and_with)
1095 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1096 * another SSC or a regular ANYOF class. Can create false positives. */
1101 PERL_ARGS_ASSERT_SSC_AND;
1103 assert(OP(ssc) == ANYOF_SYNTHETIC);
1105 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1106 * the code point inversion list and just the relevant flags */
1107 if (OP(and_with) == ANYOF_SYNTHETIC) {
1108 anded_cp_list = and_with->invlist;
1109 anded_flags = ANYOF_FLAGS(and_with);
1112 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1113 (regnode_charclass_posixl*) and_with);
1114 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
1117 ANYOF_FLAGS(ssc) &= anded_flags;
1119 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1120 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1121 * 'and_with' may be inverted. When not inverted, we have the situation of
1123 * (C1 | P1) & (C2 | P2)
1124 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1125 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1126 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1127 * <= ((C1 & C2) | P1 | P2)
1128 * Alternatively, the last few steps could be:
1129 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1130 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1131 * <= (C1 | C2 | (P1 & P2))
1132 * We favor the second approach if either P1 or P2 is non-empty. This is
1133 * because these components are a barrier to doing optimizations, as what
1134 * they match cannot be known until the moment of matching as they are
1135 * dependent on the current locale, 'AND"ing them likely will reduce or
1137 * But we can do better if we know that C1,P1 are in their initial state (a
1138 * frequent occurrence), each matching everything:
1139 * (<everything>) & (C2 | P2) = C2 | P2
1140 * Similarly, if C2,P2 are in their initial state (again a frequent
1141 * occurrence), the result is a no-op
1142 * (C1 | P1) & (<everything>) = C1 | P1
1145 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1146 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1147 * <= (C1 & ~C2) | (P1 & ~P2)
1150 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1151 && OP(and_with) != ANYOF_SYNTHETIC)
1155 ssc_intersection(ssc,
1157 FALSE /* Has already been inverted */
1160 /* If either P1 or P2 is empty, the intersection will be also; can skip
1162 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1163 ANYOF_POSIXL_ZERO(ssc);
1165 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1167 /* Note that the Posix class component P from 'and_with' actually
1169 * P = Pa | Pb | ... | Pn
1170 * where each component is one posix class, such as in [\w\s].
1172 * ~P = ~(Pa | Pb | ... | Pn)
1173 * = ~Pa & ~Pb & ... & ~Pn
1174 * <= ~Pa | ~Pb | ... | ~Pn
1175 * The last is something we can easily calculate, but unfortunately
1176 * is likely to have many false positives. We could do better
1177 * in some (but certainly not all) instances if two classes in
1178 * P have known relationships. For example
1179 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1181 * :lower: & :print: = :lower:
1182 * And similarly for classes that must be disjoint. For example,
1183 * since \s and \w can have no elements in common based on rules in
1184 * the POSIX standard,
1185 * \w & ^\S = nothing
1186 * Unfortunately, some vendor locales do not meet the Posix
1187 * standard, in particular almost everything by Microsoft.
1188 * The loop below just changes e.g., \w into \W and vice versa */
1190 regnode_charclass_posixl temp;
1191 int add = 1; /* To calculate the index of the complement */
1193 ANYOF_POSIXL_ZERO(&temp);
1194 for (i = 0; i < ANYOF_MAX; i++) {
1196 || ! ANYOF_POSIXL_TEST(and_with, i)
1197 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1199 if (ANYOF_POSIXL_TEST(and_with, i)) {
1200 ANYOF_POSIXL_SET(&temp, i + add);
1202 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1204 ANYOF_POSIXL_AND(&temp, ssc);
1206 } /* else ssc already has no posixes */
1207 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1208 in its initial state */
1209 else if (OP(and_with) != ANYOF_SYNTHETIC
1210 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1212 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1213 * copy it over 'ssc' */
1214 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1215 if (OP(and_with) == ANYOF_SYNTHETIC) {
1216 StructCopy(and_with, ssc, regnode_ssc);
1219 ssc->invlist = anded_cp_list;
1220 ANYOF_POSIXL_ZERO(ssc);
1221 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1222 ANYOF_POSIXL_OR(and_with, ssc);
1226 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1227 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1229 /* One or the other of P1, P2 is non-empty. */
1230 ANYOF_POSIXL_AND(and_with, ssc);
1231 ssc_union(ssc, anded_cp_list, FALSE);
1233 else { /* P1 = P2 = empty */
1234 ssc_intersection(ssc, anded_cp_list, FALSE);
1240 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1241 const regnode_ssc *or_with)
1243 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1244 * another SSC or a regular ANYOF class. Can create false positives if
1245 * 'or_with' is to be inverted. */
1250 PERL_ARGS_ASSERT_SSC_OR;
1252 assert(OP(ssc) == ANYOF_SYNTHETIC);
1254 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1255 * the code point inversion list and just the relevant flags */
1256 if (OP(or_with) == ANYOF_SYNTHETIC) {
1257 ored_cp_list = or_with->invlist;
1258 ored_flags = ANYOF_FLAGS(or_with);
1261 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1262 (regnode_charclass_posixl*) or_with);
1263 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
1266 ANYOF_FLAGS(ssc) |= ored_flags;
1268 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1269 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1270 * 'or_with' may be inverted. When not inverted, we have the simple
1271 * situation of computing:
1272 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1273 * If P1|P2 yields a situation with both a class and its complement are
1274 * set, like having both \w and \W, this matches all code points, and we
1275 * can delete these from the P component of the ssc going forward. XXX We
1276 * might be able to delete all the P components, but I (khw) am not certain
1277 * about this, and it is better to be safe.
1280 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1281 * <= (C1 | P1) | ~C2
1282 * <= (C1 | ~C2) | P1
1283 * (which results in actually simpler code than the non-inverted case)
1286 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1287 && OP(or_with) != ANYOF_SYNTHETIC)
1289 /* We ignore P2, leaving P1 going forward */
1291 else { /* Not inverted */
1292 ANYOF_POSIXL_OR(or_with, ssc);
1293 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1295 for (i = 0; i < ANYOF_MAX; i += 2) {
1296 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1298 ssc_match_all_cp(ssc);
1299 ANYOF_POSIXL_CLEAR(ssc, i);
1300 ANYOF_POSIXL_CLEAR(ssc, i+1);
1301 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1302 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1311 FALSE /* Already has been inverted */
1315 PERL_STATIC_INLINE void
1316 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1318 PERL_ARGS_ASSERT_SSC_UNION;
1320 assert(OP(ssc) == ANYOF_SYNTHETIC);
1322 _invlist_union_maybe_complement_2nd(ssc->invlist,
1328 PERL_STATIC_INLINE void
1329 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1331 const bool invert2nd)
1333 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1335 assert(OP(ssc) == ANYOF_SYNTHETIC);
1337 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1343 PERL_STATIC_INLINE void
1344 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1346 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1348 assert(OP(ssc) == ANYOF_SYNTHETIC);
1350 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1353 PERL_STATIC_INLINE void
1354 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1356 /* AND just the single code point 'cp' into the SSC 'ssc' */
1358 SV* cp_list = _new_invlist(2);
1360 PERL_ARGS_ASSERT_SSC_CP_AND;
1362 assert(OP(ssc) == ANYOF_SYNTHETIC);
1364 cp_list = add_cp_to_invlist(cp_list, cp);
1365 ssc_intersection(ssc, cp_list,
1366 FALSE /* Not inverted */
1368 SvREFCNT_dec_NN(cp_list);
1371 PERL_STATIC_INLINE void
1372 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1374 /* Set the SSC 'ssc' to not match any locale things */
1376 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1378 assert(OP(ssc) == ANYOF_SYNTHETIC);
1380 ANYOF_POSIXL_ZERO(ssc);
1381 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1385 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1387 /* The inversion list in the SSC is marked mortal; now we need a more
1388 * permanent copy, which is stored the same way that is done in a regular
1389 * ANYOF node, with the first 256 code points in a bit map */
1391 SV* invlist = invlist_clone(ssc->invlist);
1393 PERL_ARGS_ASSERT_SSC_FINALIZE;
1395 assert(OP(ssc) == ANYOF_SYNTHETIC);
1397 /* The code in this file assumes that all but these flags aren't relevant
1398 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1399 * time we reach here */
1400 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
1402 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1404 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1406 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1409 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1410 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1411 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1412 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1417 dump_trie(trie,widecharmap,revcharmap)
1418 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1419 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1421 These routines dump out a trie in a somewhat readable format.
1422 The _interim_ variants are used for debugging the interim
1423 tables that are used to generate the final compressed
1424 representation which is what dump_trie expects.
1426 Part of the reason for their existence is to provide a form
1427 of documentation as to how the different representations function.
1432 Dumps the final compressed table form of the trie to Perl_debug_log.
1433 Used for debugging make_trie().
1437 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1438 AV *revcharmap, U32 depth)
1441 SV *sv=sv_newmortal();
1442 int colwidth= widecharmap ? 6 : 4;
1444 GET_RE_DEBUG_FLAGS_DECL;
1446 PERL_ARGS_ASSERT_DUMP_TRIE;
1448 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1449 (int)depth * 2 + 2,"",
1450 "Match","Base","Ofs" );
1452 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1453 SV ** const tmp = av_fetch( revcharmap, state, 0);
1455 PerlIO_printf( Perl_debug_log, "%*s",
1457 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1458 PL_colors[0], PL_colors[1],
1459 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1460 PERL_PV_ESCAPE_FIRSTCHAR
1465 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1466 (int)depth * 2 + 2,"");
1468 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1469 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1470 PerlIO_printf( Perl_debug_log, "\n");
1472 for( state = 1 ; state < trie->statecount ; state++ ) {
1473 const U32 base = trie->states[ state ].trans.base;
1475 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1477 if ( trie->states[ state ].wordnum ) {
1478 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1480 PerlIO_printf( Perl_debug_log, "%6s", "" );
1483 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1488 while( ( base + ofs < trie->uniquecharcount ) ||
1489 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1490 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1493 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1495 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1496 if ( ( base + ofs >= trie->uniquecharcount ) &&
1497 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1498 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1500 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1502 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1504 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1508 PerlIO_printf( Perl_debug_log, "]");
1511 PerlIO_printf( Perl_debug_log, "\n" );
1513 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1514 for (word=1; word <= trie->wordcount; word++) {
1515 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1516 (int)word, (int)(trie->wordinfo[word].prev),
1517 (int)(trie->wordinfo[word].len));
1519 PerlIO_printf(Perl_debug_log, "\n" );
1522 Dumps a fully constructed but uncompressed trie in list form.
1523 List tries normally only are used for construction when the number of
1524 possible chars (trie->uniquecharcount) is very high.
1525 Used for debugging make_trie().
1528 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1529 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1533 SV *sv=sv_newmortal();
1534 int colwidth= widecharmap ? 6 : 4;
1535 GET_RE_DEBUG_FLAGS_DECL;
1537 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1539 /* print out the table precompression. */
1540 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1541 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1542 "------:-----+-----------------\n" );
1544 for( state=1 ; state < next_alloc ; state ++ ) {
1547 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1548 (int)depth * 2 + 2,"", (UV)state );
1549 if ( ! trie->states[ state ].wordnum ) {
1550 PerlIO_printf( Perl_debug_log, "%5s| ","");
1552 PerlIO_printf( Perl_debug_log, "W%4x| ",
1553 trie->states[ state ].wordnum
1556 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1557 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1559 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1561 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1562 PL_colors[0], PL_colors[1],
1563 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1564 PERL_PV_ESCAPE_FIRSTCHAR
1566 TRIE_LIST_ITEM(state,charid).forid,
1567 (UV)TRIE_LIST_ITEM(state,charid).newstate
1570 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1571 (int)((depth * 2) + 14), "");
1574 PerlIO_printf( Perl_debug_log, "\n");
1579 Dumps a fully constructed but uncompressed trie in table form.
1580 This is the normal DFA style state transition table, with a few
1581 twists to facilitate compression later.
1582 Used for debugging make_trie().
1585 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1586 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1591 SV *sv=sv_newmortal();
1592 int colwidth= widecharmap ? 6 : 4;
1593 GET_RE_DEBUG_FLAGS_DECL;
1595 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1598 print out the table precompression so that we can do a visual check
1599 that they are identical.
1602 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1604 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1605 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1607 PerlIO_printf( Perl_debug_log, "%*s",
1609 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1610 PL_colors[0], PL_colors[1],
1611 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1612 PERL_PV_ESCAPE_FIRSTCHAR
1618 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1620 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1621 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1624 PerlIO_printf( Perl_debug_log, "\n" );
1626 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1628 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1629 (int)depth * 2 + 2,"",
1630 (UV)TRIE_NODENUM( state ) );
1632 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1633 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1635 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1637 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1639 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1640 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1642 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1643 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1651 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1652 startbranch: the first branch in the whole branch sequence
1653 first : start branch of sequence of branch-exact nodes.
1654 May be the same as startbranch
1655 last : Thing following the last branch.
1656 May be the same as tail.
1657 tail : item following the branch sequence
1658 count : words in the sequence
1659 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1660 depth : indent depth
1662 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1664 A trie is an N'ary tree where the branches are determined by digital
1665 decomposition of the key. IE, at the root node you look up the 1st character and
1666 follow that branch repeat until you find the end of the branches. Nodes can be
1667 marked as "accepting" meaning they represent a complete word. Eg:
1671 would convert into the following structure. Numbers represent states, letters
1672 following numbers represent valid transitions on the letter from that state, if
1673 the number is in square brackets it represents an accepting state, otherwise it
1674 will be in parenthesis.
1676 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1680 (1) +-i->(6)-+-s->[7]
1682 +-s->(3)-+-h->(4)-+-e->[5]
1684 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1686 This shows that when matching against the string 'hers' we will begin at state 1
1687 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1688 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1689 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1690 single traverse. We store a mapping from accepting to state to which word was
1691 matched, and then when we have multiple possibilities we try to complete the
1692 rest of the regex in the order in which they occured in the alternation.
1694 The only prior NFA like behaviour that would be changed by the TRIE support is
1695 the silent ignoring of duplicate alternations which are of the form:
1697 / (DUPE|DUPE) X? (?{ ... }) Y /x
1699 Thus EVAL blocks following a trie may be called a different number of times with
1700 and without the optimisation. With the optimisations dupes will be silently
1701 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1702 the following demonstrates:
1704 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1706 which prints out 'word' three times, but
1708 'words'=~/(word|word|word)(?{ print $1 })S/
1710 which doesnt print it out at all. This is due to other optimisations kicking in.
1712 Example of what happens on a structural level:
1714 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1716 1: CURLYM[1] {1,32767}(18)
1727 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1728 and should turn into:
1730 1: CURLYM[1] {1,32767}(18)
1732 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1740 Cases where tail != last would be like /(?foo|bar)baz/:
1750 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1751 and would end up looking like:
1754 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1761 d = uvchr_to_utf8_flags(d, uv, 0);
1763 is the recommended Unicode-aware way of saying
1768 #define TRIE_STORE_REVCHAR(val) \
1771 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1772 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1773 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1774 SvCUR_set(zlopp, kapow - flrbbbbb); \
1777 av_push(revcharmap, zlopp); \
1779 char ooooff = (char)val; \
1780 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1784 /* This gets the next character from the input, folding it if not already
1786 #define TRIE_READ_CHAR STMT_START { \
1789 /* if it is UTF then it is either already folded, or does not need \
1791 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1793 else if (folder == PL_fold_latin1) { \
1794 /* This folder implies Unicode rules, which in the range expressible \
1795 * by not UTF is the lower case, with the two exceptions, one of \
1796 * which should have been taken care of before calling this */ \
1797 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1798 uvc = toLOWER_L1(*uc); \
1799 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1802 /* raw data, will be folded later if needed */ \
1810 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1811 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1812 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1813 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1815 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1816 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1817 TRIE_LIST_CUR( state )++; \
1820 #define TRIE_LIST_NEW(state) STMT_START { \
1821 Newxz( trie->states[ state ].trans.list, \
1822 4, reg_trie_trans_le ); \
1823 TRIE_LIST_CUR( state ) = 1; \
1824 TRIE_LIST_LEN( state ) = 4; \
1827 #define TRIE_HANDLE_WORD(state) STMT_START { \
1828 U16 dupe= trie->states[ state ].wordnum; \
1829 regnode * const noper_next = regnext( noper ); \
1832 /* store the word for dumping */ \
1834 if (OP(noper) != NOTHING) \
1835 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1837 tmp = newSVpvn_utf8( "", 0, UTF ); \
1838 av_push( trie_words, tmp ); \
1842 trie->wordinfo[curword].prev = 0; \
1843 trie->wordinfo[curword].len = wordlen; \
1844 trie->wordinfo[curword].accept = state; \
1846 if ( noper_next < tail ) { \
1848 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1849 trie->jump[curword] = (U16)(noper_next - convert); \
1851 jumper = noper_next; \
1853 nextbranch= regnext(cur); \
1857 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1858 /* chain, so that when the bits of chain are later */\
1859 /* linked together, the dups appear in the chain */\
1860 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1861 trie->wordinfo[dupe].prev = curword; \
1863 /* we haven't inserted this word yet. */ \
1864 trie->states[ state ].wordnum = curword; \
1869 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1870 ( ( base + charid >= ucharcount \
1871 && base + charid < ubound \
1872 && state == trie->trans[ base - ucharcount + charid ].check \
1873 && trie->trans[ base - ucharcount + charid ].next ) \
1874 ? trie->trans[ base - ucharcount + charid ].next \
1875 : ( state==1 ? special : 0 ) \
1879 #define MADE_JUMP_TRIE 2
1880 #define MADE_EXACT_TRIE 4
1883 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1886 /* first pass, loop through and scan words */
1887 reg_trie_data *trie;
1888 HV *widecharmap = NULL;
1889 AV *revcharmap = newAV();
1895 regnode *jumper = NULL;
1896 regnode *nextbranch = NULL;
1897 regnode *convert = NULL;
1898 U32 *prev_states; /* temp array mapping each state to previous one */
1899 /* we just use folder as a flag in utf8 */
1900 const U8 * folder = NULL;
1903 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1904 AV *trie_words = NULL;
1905 /* along with revcharmap, this only used during construction but both are
1906 * useful during debugging so we store them in the struct when debugging.
1909 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1910 STRLEN trie_charcount=0;
1912 SV *re_trie_maxbuff;
1913 GET_RE_DEBUG_FLAGS_DECL;
1915 PERL_ARGS_ASSERT_MAKE_TRIE;
1917 PERL_UNUSED_ARG(depth);
1924 case EXACTFU: folder = PL_fold_latin1; break;
1925 case EXACTF: folder = PL_fold; break;
1926 case EXACTFL: folder = PL_fold_locale; break;
1927 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1930 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1932 trie->startstate = 1;
1933 trie->wordcount = word_count;
1934 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1935 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1937 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1938 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1939 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1942 trie_words = newAV();
1945 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1946 if (!SvIOK(re_trie_maxbuff)) {
1947 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1949 DEBUG_TRIE_COMPILE_r({
1950 PerlIO_printf( Perl_debug_log,
1951 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1952 (int)depth * 2 + 2, "",
1953 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1954 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1958 /* Find the node we are going to overwrite */
1959 if ( first == startbranch && OP( last ) != BRANCH ) {
1960 /* whole branch chain */
1963 /* branch sub-chain */
1964 convert = NEXTOPER( first );
1967 /* -- First loop and Setup --
1969 We first traverse the branches and scan each word to determine if it
1970 contains widechars, and how many unique chars there are, this is
1971 important as we have to build a table with at least as many columns as we
1974 We use an array of integers to represent the character codes 0..255
1975 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1976 native representation of the character value as the key and IV's for the
1979 *TODO* If we keep track of how many times each character is used we can
1980 remap the columns so that the table compression later on is more
1981 efficient in terms of memory by ensuring the most common value is in the
1982 middle and the least common are on the outside. IMO this would be better
1983 than a most to least common mapping as theres a decent chance the most
1984 common letter will share a node with the least common, meaning the node
1985 will not be compressible. With a middle is most common approach the worst
1986 case is when we have the least common nodes twice.
1990 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1991 regnode *noper = NEXTOPER( cur );
1992 const U8 *uc = (U8*)STRING( noper );
1993 const U8 *e = uc + STR_LEN( noper );
1995 U32 wordlen = 0; /* required init */
1996 STRLEN minbytes = 0;
1997 STRLEN maxbytes = 0;
1998 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
2000 if (OP(noper) == NOTHING) {
2001 regnode *noper_next= regnext(noper);
2002 if (noper_next != tail && OP(noper_next) == flags) {
2004 uc= (U8*)STRING(noper);
2005 e= uc + STR_LEN(noper);
2006 trie->minlen= STR_LEN(noper);
2013 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2014 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2015 regardless of encoding */
2016 if (OP( noper ) == EXACTFU_SS) {
2017 /* false positives are ok, so just set this */
2018 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2021 for ( ; uc < e ; uc += len ) {
2022 TRIE_CHARCOUNT(trie)++;
2025 /* Acummulate to the current values, the range in the number of
2026 * bytes that this character could match. The max is presumed to
2027 * be the same as the folded input (which TRIE_READ_CHAR returns),
2028 * except that when this is not in UTF-8, it could be matched
2029 * against a string which is UTF-8, and the variant characters
2030 * could be 2 bytes instead of the 1 here. Likewise, for the
2031 * minimum number of bytes when not folded. When folding, the min
2032 * is assumed to be 1 byte could fold to match the single character
2033 * here, or in the case of a multi-char fold, 1 byte can fold to
2034 * the whole sequence. 'foldlen' is used to denote whether we are
2035 * in such a sequence, skipping the min setting if so. XXX TODO
2036 * Use the exact list of what folds to each character, from
2037 * PL_utf8_foldclosures */
2039 maxbytes += UTF8SKIP(uc);
2041 /* A non-UTF-8 string could be 1 byte to match our 2 */
2042 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2048 foldlen -= UTF8SKIP(uc);
2051 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2057 maxbytes += (UNI_IS_INVARIANT(*uc))
2068 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2075 U8 folded= folder[ (U8) uvc ];
2076 if ( !trie->charmap[ folded ] ) {
2077 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2078 TRIE_STORE_REVCHAR( folded );
2081 if ( !trie->charmap[ uvc ] ) {
2082 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2083 TRIE_STORE_REVCHAR( uvc );
2086 /* store the codepoint in the bitmap, and its folded
2088 TRIE_BITMAP_SET(trie, uvc);
2090 /* store the folded codepoint */
2091 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2094 /* store first byte of utf8 representation of
2095 variant codepoints */
2096 if (! UVCHR_IS_INVARIANT(uvc)) {
2097 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2100 set_bit = 0; /* We've done our bit :-) */
2105 widecharmap = newHV();
2107 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2110 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2112 if ( !SvTRUE( *svpp ) ) {
2113 sv_setiv( *svpp, ++trie->uniquecharcount );
2114 TRIE_STORE_REVCHAR(uvc);
2118 if( cur == first ) {
2119 trie->minlen = minbytes;
2120 trie->maxlen = maxbytes;
2121 } else if (minbytes < trie->minlen) {
2122 trie->minlen = minbytes;
2123 } else if (maxbytes > trie->maxlen) {
2124 trie->maxlen = maxbytes;
2126 } /* end first pass */
2127 DEBUG_TRIE_COMPILE_r(
2128 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2129 (int)depth * 2 + 2,"",
2130 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2131 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2132 (int)trie->minlen, (int)trie->maxlen )
2136 We now know what we are dealing with in terms of unique chars and
2137 string sizes so we can calculate how much memory a naive
2138 representation using a flat table will take. If it's over a reasonable
2139 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2140 conservative but potentially much slower representation using an array
2143 At the end we convert both representations into the same compressed
2144 form that will be used in regexec.c for matching with. The latter
2145 is a form that cannot be used to construct with but has memory
2146 properties similar to the list form and access properties similar
2147 to the table form making it both suitable for fast searches and
2148 small enough that its feasable to store for the duration of a program.
2150 See the comment in the code where the compressed table is produced
2151 inplace from the flat tabe representation for an explanation of how
2152 the compression works.
2157 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2160 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2162 Second Pass -- Array Of Lists Representation
2164 Each state will be represented by a list of charid:state records
2165 (reg_trie_trans_le) the first such element holds the CUR and LEN
2166 points of the allocated array. (See defines above).
2168 We build the initial structure using the lists, and then convert
2169 it into the compressed table form which allows faster lookups
2170 (but cant be modified once converted).
2173 STRLEN transcount = 1;
2175 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2176 "%*sCompiling trie using list compiler\n",
2177 (int)depth * 2 + 2, ""));
2179 trie->states = (reg_trie_state *)
2180 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2181 sizeof(reg_trie_state) );
2185 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2187 regnode *noper = NEXTOPER( cur );
2188 U8 *uc = (U8*)STRING( noper );
2189 const U8 *e = uc + STR_LEN( noper );
2190 U32 state = 1; /* required init */
2191 U16 charid = 0; /* sanity init */
2192 U32 wordlen = 0; /* required init */
2194 if (OP(noper) == NOTHING) {
2195 regnode *noper_next= regnext(noper);
2196 if (noper_next != tail && OP(noper_next) == flags) {
2198 uc= (U8*)STRING(noper);
2199 e= uc + STR_LEN(noper);
2203 if (OP(noper) != NOTHING) {
2204 for ( ; uc < e ; uc += len ) {
2209 charid = trie->charmap[ uvc ];
2211 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2215 charid=(U16)SvIV( *svpp );
2218 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2225 if ( !trie->states[ state ].trans.list ) {
2226 TRIE_LIST_NEW( state );
2228 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2229 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2230 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2235 newstate = next_alloc++;
2236 prev_states[newstate] = state;
2237 TRIE_LIST_PUSH( state, charid, newstate );
2242 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2246 TRIE_HANDLE_WORD(state);
2248 } /* end second pass */
2250 /* next alloc is the NEXT state to be allocated */
2251 trie->statecount = next_alloc;
2252 trie->states = (reg_trie_state *)
2253 PerlMemShared_realloc( trie->states,
2255 * sizeof(reg_trie_state) );
2257 /* and now dump it out before we compress it */
2258 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2259 revcharmap, next_alloc,
2263 trie->trans = (reg_trie_trans *)
2264 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2271 for( state=1 ; state < next_alloc ; state ++ ) {
2275 DEBUG_TRIE_COMPILE_MORE_r(
2276 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2280 if (trie->states[state].trans.list) {
2281 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2285 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2286 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2287 if ( forid < minid ) {
2289 } else if ( forid > maxid ) {
2293 if ( transcount < tp + maxid - minid + 1) {
2295 trie->trans = (reg_trie_trans *)
2296 PerlMemShared_realloc( trie->trans,
2298 * sizeof(reg_trie_trans) );
2299 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2301 base = trie->uniquecharcount + tp - minid;
2302 if ( maxid == minid ) {
2304 for ( ; zp < tp ; zp++ ) {
2305 if ( ! trie->trans[ zp ].next ) {
2306 base = trie->uniquecharcount + zp - minid;
2307 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2308 trie->trans[ zp ].check = state;
2314 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2315 trie->trans[ tp ].check = state;
2320 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2321 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2322 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2323 trie->trans[ tid ].check = state;
2325 tp += ( maxid - minid + 1 );
2327 Safefree(trie->states[ state ].trans.list);
2330 DEBUG_TRIE_COMPILE_MORE_r(
2331 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2334 trie->states[ state ].trans.base=base;
2336 trie->lasttrans = tp + 1;
2340 Second Pass -- Flat Table Representation.
2342 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2343 each. We know that we will need Charcount+1 trans at most to store
2344 the data (one row per char at worst case) So we preallocate both
2345 structures assuming worst case.
2347 We then construct the trie using only the .next slots of the entry
2350 We use the .check field of the first entry of the node temporarily
2351 to make compression both faster and easier by keeping track of how
2352 many non zero fields are in the node.
2354 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2357 There are two terms at use here: state as a TRIE_NODEIDX() which is
2358 a number representing the first entry of the node, and state as a
2359 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2360 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2361 if there are 2 entrys per node. eg:
2369 The table is internally in the right hand, idx form. However as we
2370 also have to deal with the states array which is indexed by nodenum
2371 we have to use TRIE_NODENUM() to convert.
2374 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2375 "%*sCompiling trie using table compiler\n",
2376 (int)depth * 2 + 2, ""));
2378 trie->trans = (reg_trie_trans *)
2379 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2380 * trie->uniquecharcount + 1,
2381 sizeof(reg_trie_trans) );
2382 trie->states = (reg_trie_state *)
2383 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2384 sizeof(reg_trie_state) );
2385 next_alloc = trie->uniquecharcount + 1;
2388 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2390 regnode *noper = NEXTOPER( cur );
2391 const U8 *uc = (U8*)STRING( noper );
2392 const U8 *e = uc + STR_LEN( noper );
2394 U32 state = 1; /* required init */
2396 U16 charid = 0; /* sanity init */
2397 U32 accept_state = 0; /* sanity init */
2399 U32 wordlen = 0; /* required init */
2401 if (OP(noper) == NOTHING) {
2402 regnode *noper_next= regnext(noper);
2403 if (noper_next != tail && OP(noper_next) == flags) {
2405 uc= (U8*)STRING(noper);
2406 e= uc + STR_LEN(noper);
2410 if ( OP(noper) != NOTHING ) {
2411 for ( ; uc < e ; uc += len ) {
2416 charid = trie->charmap[ uvc ];
2418 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2419 charid = svpp ? (U16)SvIV(*svpp) : 0;
2423 if ( !trie->trans[ state + charid ].next ) {
2424 trie->trans[ state + charid ].next = next_alloc;
2425 trie->trans[ state ].check++;
2426 prev_states[TRIE_NODENUM(next_alloc)]
2427 = TRIE_NODENUM(state);
2428 next_alloc += trie->uniquecharcount;
2430 state = trie->trans[ state + charid ].next;
2432 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2434 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2437 accept_state = TRIE_NODENUM( state );
2438 TRIE_HANDLE_WORD(accept_state);
2440 } /* end second pass */
2442 /* and now dump it out before we compress it */
2443 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2445 next_alloc, depth+1));
2449 * Inplace compress the table.*
2451 For sparse data sets the table constructed by the trie algorithm will
2452 be mostly 0/FAIL transitions or to put it another way mostly empty.
2453 (Note that leaf nodes will not contain any transitions.)
2455 This algorithm compresses the tables by eliminating most such
2456 transitions, at the cost of a modest bit of extra work during lookup:
2458 - Each states[] entry contains a .base field which indicates the
2459 index in the state[] array wheres its transition data is stored.
2461 - If .base is 0 there are no valid transitions from that node.
2463 - If .base is nonzero then charid is added to it to find an entry in
2466 -If trans[states[state].base+charid].check!=state then the
2467 transition is taken to be a 0/Fail transition. Thus if there are fail
2468 transitions at the front of the node then the .base offset will point
2469 somewhere inside the previous nodes data (or maybe even into a node
2470 even earlier), but the .check field determines if the transition is
2474 The following process inplace converts the table to the compressed
2475 table: We first do not compress the root node 1,and mark all its
2476 .check pointers as 1 and set its .base pointer as 1 as well. This
2477 allows us to do a DFA construction from the compressed table later,
2478 and ensures that any .base pointers we calculate later are greater
2481 - We set 'pos' to indicate the first entry of the second node.
2483 - We then iterate over the columns of the node, finding the first and
2484 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2485 and set the .check pointers accordingly, and advance pos
2486 appropriately and repreat for the next node. Note that when we copy
2487 the next pointers we have to convert them from the original
2488 NODEIDX form to NODENUM form as the former is not valid post
2491 - If a node has no transitions used we mark its base as 0 and do not
2492 advance the pos pointer.
2494 - If a node only has one transition we use a second pointer into the
2495 structure to fill in allocated fail transitions from other states.
2496 This pointer is independent of the main pointer and scans forward
2497 looking for null transitions that are allocated to a state. When it
2498 finds one it writes the single transition into the "hole". If the
2499 pointer doesnt find one the single transition is appended as normal.
2501 - Once compressed we can Renew/realloc the structures to release the
2504 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2505 specifically Fig 3.47 and the associated pseudocode.
2509 const U32 laststate = TRIE_NODENUM( next_alloc );
2512 trie->statecount = laststate;
2514 for ( state = 1 ; state < laststate ; state++ ) {
2516 const U32 stateidx = TRIE_NODEIDX( state );
2517 const U32 o_used = trie->trans[ stateidx ].check;
2518 U32 used = trie->trans[ stateidx ].check;
2519 trie->trans[ stateidx ].check = 0;
2521 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2522 if ( flag || trie->trans[ stateidx + charid ].next ) {
2523 if ( trie->trans[ stateidx + charid ].next ) {
2525 for ( ; zp < pos ; zp++ ) {
2526 if ( ! trie->trans[ zp ].next ) {
2530 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2531 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2532 trie->trans[ zp ].check = state;
2533 if ( ++zp > pos ) pos = zp;
2540 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2542 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2543 trie->trans[ pos ].check = state;
2548 trie->lasttrans = pos + 1;
2549 trie->states = (reg_trie_state *)
2550 PerlMemShared_realloc( trie->states, laststate
2551 * sizeof(reg_trie_state) );
2552 DEBUG_TRIE_COMPILE_MORE_r(
2553 PerlIO_printf( Perl_debug_log,
2554 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2555 (int)depth * 2 + 2,"",
2556 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2559 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2562 } /* end table compress */
2564 DEBUG_TRIE_COMPILE_MORE_r(
2565 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2566 (int)depth * 2 + 2, "",
2567 (UV)trie->statecount,
2568 (UV)trie->lasttrans)
2570 /* resize the trans array to remove unused space */
2571 trie->trans = (reg_trie_trans *)
2572 PerlMemShared_realloc( trie->trans, trie->lasttrans
2573 * sizeof(reg_trie_trans) );
2575 { /* Modify the program and insert the new TRIE node */
2576 U8 nodetype =(U8)(flags & 0xFF);
2580 regnode *optimize = NULL;
2581 #ifdef RE_TRACK_PATTERN_OFFSETS
2584 U32 mjd_nodelen = 0;
2585 #endif /* RE_TRACK_PATTERN_OFFSETS */
2586 #endif /* DEBUGGING */
2588 This means we convert either the first branch or the first Exact,
2589 depending on whether the thing following (in 'last') is a branch
2590 or not and whther first is the startbranch (ie is it a sub part of
2591 the alternation or is it the whole thing.)
2592 Assuming its a sub part we convert the EXACT otherwise we convert
2593 the whole branch sequence, including the first.
2595 /* Find the node we are going to overwrite */
2596 if ( first != startbranch || OP( last ) == BRANCH ) {
2597 /* branch sub-chain */
2598 NEXT_OFF( first ) = (U16)(last - first);
2599 #ifdef RE_TRACK_PATTERN_OFFSETS
2601 mjd_offset= Node_Offset((convert));
2602 mjd_nodelen= Node_Length((convert));
2605 /* whole branch chain */
2607 #ifdef RE_TRACK_PATTERN_OFFSETS
2610 const regnode *nop = NEXTOPER( convert );
2611 mjd_offset= Node_Offset((nop));
2612 mjd_nodelen= Node_Length((nop));
2616 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2617 (int)depth * 2 + 2, "",
2618 (UV)mjd_offset, (UV)mjd_nodelen)
2621 /* But first we check to see if there is a common prefix we can
2622 split out as an EXACT and put in front of the TRIE node. */
2623 trie->startstate= 1;
2624 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2626 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2630 const U32 base = trie->states[ state ].trans.base;
2632 if ( trie->states[state].wordnum )
2635 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2636 if ( ( base + ofs >= trie->uniquecharcount ) &&
2637 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2638 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2640 if ( ++count > 1 ) {
2641 SV **tmp = av_fetch( revcharmap, ofs, 0);
2642 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2643 if ( state == 1 ) break;
2645 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2647 PerlIO_printf(Perl_debug_log,
2648 "%*sNew Start State=%"UVuf" Class: [",
2649 (int)depth * 2 + 2, "",
2652 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2653 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2655 TRIE_BITMAP_SET(trie,*ch);
2657 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2659 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2663 TRIE_BITMAP_SET(trie,*ch);
2665 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2666 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2672 SV **tmp = av_fetch( revcharmap, idx, 0);
2674 char *ch = SvPV( *tmp, len );
2676 SV *sv=sv_newmortal();
2677 PerlIO_printf( Perl_debug_log,
2678 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2679 (int)depth * 2 + 2, "",
2681 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2682 PL_colors[0], PL_colors[1],
2683 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2684 PERL_PV_ESCAPE_FIRSTCHAR
2689 OP( convert ) = nodetype;
2690 str=STRING(convert);
2693 STR_LEN(convert) += len;
2699 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2704 trie->prefixlen = (state-1);
2706 regnode *n = convert+NODE_SZ_STR(convert);
2707 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2708 trie->startstate = state;
2709 trie->minlen -= (state - 1);
2710 trie->maxlen -= (state - 1);
2712 /* At least the UNICOS C compiler choked on this
2713 * being argument to DEBUG_r(), so let's just have
2716 #ifdef PERL_EXT_RE_BUILD
2722 regnode *fix = convert;
2723 U32 word = trie->wordcount;
2725 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2726 while( ++fix < n ) {
2727 Set_Node_Offset_Length(fix, 0, 0);
2730 SV ** const tmp = av_fetch( trie_words, word, 0 );
2732 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2733 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2735 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2743 NEXT_OFF(convert) = (U16)(tail - convert);
2744 DEBUG_r(optimize= n);
2750 if ( trie->maxlen ) {
2751 NEXT_OFF( convert ) = (U16)(tail - convert);
2752 ARG_SET( convert, data_slot );
2753 /* Store the offset to the first unabsorbed branch in
2754 jump[0], which is otherwise unused by the jump logic.
2755 We use this when dumping a trie and during optimisation. */
2757 trie->jump[0] = (U16)(nextbranch - convert);
2759 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2760 * and there is a bitmap
2761 * and the first "jump target" node we found leaves enough room
2762 * then convert the TRIE node into a TRIEC node, with the bitmap
2763 * embedded inline in the opcode - this is hypothetically faster.
2765 if ( !trie->states[trie->startstate].wordnum
2767 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2769 OP( convert ) = TRIEC;
2770 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2771 PerlMemShared_free(trie->bitmap);
2774 OP( convert ) = TRIE;
2776 /* store the type in the flags */
2777 convert->flags = nodetype;
2781 + regarglen[ OP( convert ) ];
2783 /* XXX We really should free up the resource in trie now,
2784 as we won't use them - (which resources?) dmq */
2786 /* needed for dumping*/
2787 DEBUG_r(if (optimize) {
2788 regnode *opt = convert;
2790 while ( ++opt < optimize) {
2791 Set_Node_Offset_Length(opt,0,0);
2794 Try to clean up some of the debris left after the
2797 while( optimize < jumper ) {
2798 mjd_nodelen += Node_Length((optimize));
2799 OP( optimize ) = OPTIMIZED;
2800 Set_Node_Offset_Length(optimize,0,0);
2803 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2805 } /* end node insert */
2807 /* Finish populating the prev field of the wordinfo array. Walk back
2808 * from each accept state until we find another accept state, and if
2809 * so, point the first word's .prev field at the second word. If the
2810 * second already has a .prev field set, stop now. This will be the
2811 * case either if we've already processed that word's accept state,
2812 * or that state had multiple words, and the overspill words were
2813 * already linked up earlier.
2820 for (word=1; word <= trie->wordcount; word++) {
2822 if (trie->wordinfo[word].prev)
2824 state = trie->wordinfo[word].accept;
2826 state = prev_states[state];
2829 prev = trie->states[state].wordnum;
2833 trie->wordinfo[word].prev = prev;
2835 Safefree(prev_states);
2839 /* and now dump out the compressed format */
2840 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2842 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2844 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2845 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2847 SvREFCNT_dec_NN(revcharmap);
2851 : trie->startstate>1
2857 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2859 /* The Trie is constructed and compressed now so we can build a fail array if
2862 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2864 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2868 We find the fail state for each state in the trie, this state is the longest
2869 proper suffix of the current state's 'word' that is also a proper prefix of
2870 another word in our trie. State 1 represents the word '' and is thus the
2871 default fail state. This allows the DFA not to have to restart after its
2872 tried and failed a word at a given point, it simply continues as though it
2873 had been matching the other word in the first place.
2875 'abcdgu'=~/abcdefg|cdgu/
2876 When we get to 'd' we are still matching the first word, we would encounter
2877 'g' which would fail, which would bring us to the state representing 'd' in
2878 the second word where we would try 'g' and succeed, proceeding to match
2881 /* add a fail transition */
2882 const U32 trie_offset = ARG(source);
2883 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2885 const U32 ucharcount = trie->uniquecharcount;
2886 const U32 numstates = trie->statecount;
2887 const U32 ubound = trie->lasttrans + ucharcount;
2891 U32 base = trie->states[ 1 ].trans.base;
2894 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2895 GET_RE_DEBUG_FLAGS_DECL;
2897 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2899 PERL_UNUSED_ARG(depth);
2903 ARG_SET( stclass, data_slot );
2904 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2905 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2906 aho->trie=trie_offset;
2907 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2908 Copy( trie->states, aho->states, numstates, reg_trie_state );
2909 Newxz( q, numstates, U32);
2910 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2913 /* initialize fail[0..1] to be 1 so that we always have
2914 a valid final fail state */
2915 fail[ 0 ] = fail[ 1 ] = 1;
2917 for ( charid = 0; charid < ucharcount ; charid++ ) {
2918 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2920 q[ q_write ] = newstate;
2921 /* set to point at the root */
2922 fail[ q[ q_write++ ] ]=1;
2925 while ( q_read < q_write) {
2926 const U32 cur = q[ q_read++ % numstates ];
2927 base = trie->states[ cur ].trans.base;
2929 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2930 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2932 U32 fail_state = cur;
2935 fail_state = fail[ fail_state ];
2936 fail_base = aho->states[ fail_state ].trans.base;
2937 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2939 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2940 fail[ ch_state ] = fail_state;
2941 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2943 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2945 q[ q_write++ % numstates] = ch_state;
2949 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2950 when we fail in state 1, this allows us to use the
2951 charclass scan to find a valid start char. This is based on the principle
2952 that theres a good chance the string being searched contains lots of stuff
2953 that cant be a start char.
2955 fail[ 0 ] = fail[ 1 ] = 0;
2956 DEBUG_TRIE_COMPILE_r({
2957 PerlIO_printf(Perl_debug_log,
2958 "%*sStclass Failtable (%"UVuf" states): 0",
2959 (int)(depth * 2), "", (UV)numstates
2961 for( q_read=1; q_read<numstates; q_read++ ) {
2962 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2964 PerlIO_printf(Perl_debug_log, "\n");
2967 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2971 #define DEBUG_PEEP(str,scan,depth) \
2972 DEBUG_OPTIMISE_r({if (scan){ \
2973 SV * const mysv=sv_newmortal(); \
2974 regnode *Next = regnext(scan); \
2975 regprop(RExC_rx, mysv, scan); \
2976 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2977 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2978 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2982 /* The below joins as many adjacent EXACTish nodes as possible into a single
2983 * one. The regop may be changed if the node(s) contain certain sequences that
2984 * require special handling. The joining is only done if:
2985 * 1) there is room in the current conglomerated node to entirely contain the
2987 * 2) they are the exact same node type
2989 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2990 * these get optimized out
2992 * If a node is to match under /i (folded), the number of characters it matches
2993 * can be different than its character length if it contains a multi-character
2994 * fold. *min_subtract is set to the total delta of the input nodes.
2996 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2997 * and contains LATIN SMALL LETTER SHARP S
2999 * This is as good a place as any to discuss the design of handling these
3000 * multi-character fold sequences. It's been wrong in Perl for a very long
3001 * time. There are three code points in Unicode whose multi-character folds
3002 * were long ago discovered to mess things up. The previous designs for
3003 * dealing with these involved assigning a special node for them. This
3004 * approach doesn't work, as evidenced by this example:
3005 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3006 * Both these fold to "sss", but if the pattern is parsed to create a node that
3007 * would match just the \xDF, it won't be able to handle the case where a
3008 * successful match would have to cross the node's boundary. The new approach
3009 * that hopefully generally solves the problem generates an EXACTFU_SS node
3012 * It turns out that there are problems with all multi-character folds, and not
3013 * just these three. Now the code is general, for all such cases. The
3014 * approach taken is:
3015 * 1) This routine examines each EXACTFish node that could contain multi-
3016 * character fold sequences. It returns in *min_subtract how much to
3017 * subtract from the the actual length of the string to get a real minimum
3018 * match length; it is 0 if there are no multi-char folds. This delta is
3019 * used by the caller to adjust the min length of the match, and the delta
3020 * between min and max, so that the optimizer doesn't reject these
3021 * possibilities based on size constraints.
3022 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3023 * is used for an EXACTFU node that contains at least one "ss" sequence in
3024 * it. For non-UTF-8 patterns and strings, this is the only case where
3025 * there is a possible fold length change. That means that a regular
3026 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3027 * with length changes, and so can be processed faster. regexec.c takes
3028 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3029 * pre-folded by regcomp.c. This saves effort in regex matching.
3030 * However, the pre-folding isn't done for non-UTF8 patterns because the
3031 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3032 * down by forcing the pattern into UTF8 unless necessary. Also what
3033 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
3034 * possibilities for the non-UTF8 patterns are quite simple, except for
3035 * the sharp s. All the ones that don't involve a UTF-8 target string are
3036 * members of a fold-pair, and arrays are set up for all of them so that
3037 * the other member of the pair can be found quickly. Code elsewhere in
3038 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3039 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3040 * described in the next item.
3041 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
3042 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3043 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
3044 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
3045 * character in the pattern corresponds to at most a single character in
3046 * the target string. (And I do mean character, and not byte here, unlike
3047 * other parts of the documentation that have never been updated to
3048 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
3049 * two character string 'ss'; in EXACTFA nodes it can match
3050 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
3051 * instances where it is violated. I'm reluctant to try to change the
3052 * assumption, as the code involved is impenetrable to me (khw), so
3053 * instead the code here punts. This routine examines (when the pattern
3054 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3055 * boolean indicating whether or not the node contains a sharp s. When it
3056 * is true, the caller sets a flag that later causes the optimizer in this
3057 * file to not set values for the floating and fixed string lengths, and
3058 * thus avoids the optimizer code in regexec.c that makes the invalid
3059 * assumption. Thus, there is no optimization based on string lengths for
3060 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3061 * (The reason the assumption is wrong only in these two cases is that all
3062 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3063 * other folds to their expanded versions. We can't prefold sharp s to
3064 * 'ss' in EXACTF nodes because we don't know at compile time if it
3065 * actually matches 'ss' or not. It will match iff the target string is
3066 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3067 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3068 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3069 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3070 * require the pattern to be forced into UTF-8, the overhead of which we
3073 * Similarly, the code that generates tries doesn't currently handle
3074 * not-already-folded multi-char folds, and it looks like a pain to change
3075 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3076 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3077 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3078 * using /iaa matching will be doing so almost entirely with ASCII
3079 * strings, so this should rarely be encountered in practice */
3081 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3082 if (PL_regkind[OP(scan)] == EXACT) \
3083 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3086 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3087 /* Merge several consecutive EXACTish nodes into one. */
3088 regnode *n = regnext(scan);
3090 regnode *next = scan + NODE_SZ_STR(scan);
3094 regnode *stop = scan;
3095 GET_RE_DEBUG_FLAGS_DECL;
3097 PERL_UNUSED_ARG(depth);
3100 PERL_ARGS_ASSERT_JOIN_EXACT;
3101 #ifndef EXPERIMENTAL_INPLACESCAN
3102 PERL_UNUSED_ARG(flags);
3103 PERL_UNUSED_ARG(val);
3105 DEBUG_PEEP("join",scan,depth);
3107 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3108 * EXACT ones that are mergeable to the current one. */
3110 && (PL_regkind[OP(n)] == NOTHING
3111 || (stringok && OP(n) == OP(scan)))
3113 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3116 if (OP(n) == TAIL || n > next)
3118 if (PL_regkind[OP(n)] == NOTHING) {
3119 DEBUG_PEEP("skip:",n,depth);
3120 NEXT_OFF(scan) += NEXT_OFF(n);
3121 next = n + NODE_STEP_REGNODE;
3128 else if (stringok) {
3129 const unsigned int oldl = STR_LEN(scan);
3130 regnode * const nnext = regnext(n);
3132 /* XXX I (khw) kind of doubt that this works on platforms where
3133 * U8_MAX is above 255 because of lots of other assumptions */
3134 /* Don't join if the sum can't fit into a single node */
3135 if (oldl + STR_LEN(n) > U8_MAX)
3138 DEBUG_PEEP("merg",n,depth);
3141 NEXT_OFF(scan) += NEXT_OFF(n);
3142 STR_LEN(scan) += STR_LEN(n);
3143 next = n + NODE_SZ_STR(n);
3144 /* Now we can overwrite *n : */
3145 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3153 #ifdef EXPERIMENTAL_INPLACESCAN
3154 if (flags && !NEXT_OFF(n)) {
3155 DEBUG_PEEP("atch", val, depth);
3156 if (reg_off_by_arg[OP(n)]) {
3157 ARG_SET(n, val - n);
3160 NEXT_OFF(n) = val - n;
3168 *has_exactf_sharp_s = FALSE;
3170 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3171 * can now analyze for sequences of problematic code points. (Prior to
3172 * this final joining, sequences could have been split over boundaries, and
3173 * hence missed). The sequences only happen in folding, hence for any
3174 * non-EXACT EXACTish node */
3175 if (OP(scan) != EXACT) {
3176 const U8 * const s0 = (U8*) STRING(scan);
3178 const U8 * const s_end = s0 + STR_LEN(scan);
3180 /* One pass is made over the node's string looking for all the
3181 * possibilities. to avoid some tests in the loop, there are two main
3182 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3186 /* Examine the string for a multi-character fold sequence. UTF-8
3187 * patterns have all characters pre-folded by the time this code is
3189 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3190 length sequence we are looking for is 2 */
3193 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3194 if (! len) { /* Not a multi-char fold: get next char */
3199 /* Nodes with 'ss' require special handling, except for EXACTFL
3200 * and EXACTFA-ish for which there is no multi-char fold to
3202 if (len == 2 && *s == 's' && *(s+1) == 's'
3203 && OP(scan) != EXACTFL
3204 && OP(scan) != EXACTFA
3205 && OP(scan) != EXACTFA_NO_TRIE)
3208 OP(scan) = EXACTFU_SS;
3211 else { /* Here is a generic multi-char fold. */
3212 const U8* multi_end = s + len;
3214 /* Count how many characters in it. In the case of /l and
3215 * /aa, no folds which contain ASCII code points are
3216 * allowed, so check for those, and skip if found. (In
3217 * EXACTFL, no folds are allowed to any Latin1 code point,
3218 * not just ASCII. But there aren't any of these
3219 * currently, nor ever likely, so don't take the time to
3220 * test for them. The code that generates the
3221 * is_MULTI_foo() macros croaks should one actually get put
3222 * into Unicode .) */
3223 if (OP(scan) != EXACTFL
3224 && OP(scan) != EXACTFA
3225 && OP(scan) != EXACTFA_NO_TRIE)
3227 count = utf8_length(s, multi_end);
3231 while (s < multi_end) {
3234 goto next_iteration;
3244 /* The delta is how long the sequence is minus 1 (1 is how long
3245 * the character that folds to the sequence is) */
3246 *min_subtract += count - 1;
3250 else if (OP(scan) == EXACTFA) {
3252 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3253 * fold to the ASCII range (and there are no existing ones in the
3254 * upper latin1 range). But, as outlined in the comments preceding
3255 * this function, we need to flag any occurrences of the sharp s.
3256 * This character forbids trie formation (because of added
3259 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3260 OP(scan) = EXACTFA_NO_TRIE;
3261 *has_exactf_sharp_s = TRUE;
3268 else if (OP(scan) != EXACTFL) {
3270 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3271 * multi-char folds that are all Latin1. (This code knows that
3272 * there are no current multi-char folds possible with EXACTFL,
3273 * relying on fold_grind.t to catch any errors if the very unlikely
3274 * event happens that some get added in future Unicode versions.)
3275 * As explained in the comments preceding this function, we look
3276 * also for the sharp s in EXACTF nodes; it can be in the final
3277 * position. Otherwise we can stop looking 1 byte earlier because
3278 * have to find at least two characters for a multi-fold */
3279 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3282 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3283 if (! len) { /* Not a multi-char fold. */
3284 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3286 *has_exactf_sharp_s = TRUE;
3293 && isARG2_lower_or_UPPER_ARG1('s', *s)
3294 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3297 /* EXACTF nodes need to know that the minimum length
3298 * changed so that a sharp s in the string can match this
3299 * ss in the pattern, but they remain EXACTF nodes, as they
3300 * won't match this unless the target string is is UTF-8,
3301 * which we don't know until runtime */
3302 if (OP(scan) != EXACTF) {
3303 OP(scan) = EXACTFU_SS;
3307 *min_subtract += len - 1;
3314 /* Allow dumping but overwriting the collection of skipped
3315 * ops and/or strings with fake optimized ops */
3316 n = scan + NODE_SZ_STR(scan);
3324 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3328 /* REx optimizer. Converts nodes into quicker variants "in place".
3329 Finds fixed substrings. */
3331 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3332 to the position after last scanned or to NULL. */
3334 #define INIT_AND_WITHP \
3335 assert(!and_withp); \
3336 Newx(and_withp,1, regnode_ssc); \
3337 SAVEFREEPV(and_withp)
3339 /* this is a chain of data about sub patterns we are processing that
3340 need to be handled separately/specially in study_chunk. Its so
3341 we can simulate recursion without losing state. */
3343 typedef struct scan_frame {
3344 regnode *last; /* last node to process in this frame */
3345 regnode *next; /* next node to process when last is reached */
3346 struct scan_frame *prev; /*previous frame*/
3347 U32 prev_recursed_depth;
3348 I32 stop; /* what stopparen do we use */
3352 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3355 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3356 SSize_t *minlenp, SSize_t *deltap,
3361 regnode_ssc *and_withp,
3362 U32 flags, U32 depth)
3363 /* scanp: Start here (read-write). */
3364 /* deltap: Write maxlen-minlen here. */
3365 /* last: Stop before this one. */
3366 /* data: string data about the pattern */
3367 /* stopparen: treat close N as END */
3368 /* recursed: which subroutines have we recursed into */
3369 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3372 /* There must be at least this number of characters to match */
3375 regnode *scan = *scanp, *next;
3377 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3378 int is_inf_internal = 0; /* The studied chunk is infinite */
3379 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3380 scan_data_t data_fake;
3381 SV *re_trie_maxbuff = NULL;
3382 regnode *first_non_open = scan;
3383 SSize_t stopmin = SSize_t_MAX;
3384 scan_frame *frame = NULL;
3385 GET_RE_DEBUG_FLAGS_DECL;
3387 PERL_ARGS_ASSERT_STUDY_CHUNK;
3390 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3393 while (first_non_open && OP(first_non_open) == OPEN)
3394 first_non_open=regnext(first_non_open);
3399 while ( scan && OP(scan) != END && scan < last ){
3400 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3401 node length to get a real minimum (because
3402 the folded version may be shorter) */
3403 bool has_exactf_sharp_s = FALSE;
3404 /* Peephole optimizer: */
3405 DEBUG_OPTIMISE_MORE_r(
3407 PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3408 ((int) depth*2), "", (long)stopparen,
3409 (unsigned long)depth, (unsigned long)recursed_depth);
3410 if (recursed_depth) {
3413 for ( j = 0 ; j < recursed_depth ; j++ ) {
3414 PerlIO_printf(Perl_debug_log,"[");
3415 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3416 PerlIO_printf(Perl_debug_log,"%d",
3417 PAREN_TEST(RExC_study_chunk_recursed +
3418 (j * RExC_study_chunk_recursed_bytes), i)
3421 PerlIO_printf(Perl_debug_log,"]");
3424 PerlIO_printf(Perl_debug_log,"\n");
3427 DEBUG_STUDYDATA("Peep:", data, depth);
3428 DEBUG_PEEP("Peep", scan, depth);
3431 /* Its not clear to khw or hv why this is done here, and not in the
3432 * clauses that deal with EXACT nodes. khw's guess is that it's
3433 * because of a previous design */
3434 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3436 /* Follow the next-chain of the current node and optimize
3437 away all the NOTHINGs from it. */
3438 if (OP(scan) != CURLYX) {
3439 const int max = (reg_off_by_arg[OP(scan)]
3441 /* I32 may be smaller than U16 on CRAYs! */
3442 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3443 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3447 /* Skip NOTHING and LONGJMP. */
3448 while ((n = regnext(n))
3449 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3450 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3451 && off + noff < max)
3453 if (reg_off_by_arg[OP(scan)])
3456 NEXT_OFF(scan) = off;
3461 /* The principal pseudo-switch. Cannot be a switch, since we
3462 look into several different things. */
3463 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3464 || OP(scan) == IFTHEN) {
3465 next = regnext(scan);
3467 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3469 if (OP(next) == code || code == IFTHEN) {
3470 /* NOTE - There is similar code to this block below for
3471 * handling TRIE nodes on a re-study. If you change stuff here
3472 * check there too. */
3473 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3475 regnode * const startbranch=scan;
3477 if (flags & SCF_DO_SUBSTR)
3478 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3479 if (flags & SCF_DO_STCLASS)
3480 ssc_init_zero(pRExC_state, &accum);
3482 while (OP(scan) == code) {
3483 SSize_t deltanext, minnext, fake;
3485 regnode_ssc this_class;
3488 data_fake.flags = 0;
3490 data_fake.whilem_c = data->whilem_c;
3491 data_fake.last_closep = data->last_closep;
3494 data_fake.last_closep = &fake;
3496 data_fake.pos_delta = delta;
3497 next = regnext(scan);
3498 scan = NEXTOPER(scan);
3500 scan = NEXTOPER(scan);
3501 if (flags & SCF_DO_STCLASS) {
3502 ssc_init(pRExC_state, &this_class);
3503 data_fake.start_class = &this_class;
3504 f = SCF_DO_STCLASS_AND;
3506 if (flags & SCF_WHILEM_VISITED_POS)
3507 f |= SCF_WHILEM_VISITED_POS;
3509 /* we suppose the run is continuous, last=next...*/
3510 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3512 stopparen, recursed_depth, NULL, f,depth+1);
3515 if (deltanext == SSize_t_MAX) {
3516 is_inf = is_inf_internal = 1;
3518 } else if (max1 < minnext + deltanext)
3519 max1 = minnext + deltanext;
3521 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3523 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3524 if ( stopmin > minnext)
3525 stopmin = min + min1;
3526 flags &= ~SCF_DO_SUBSTR;
3528 data->flags |= SCF_SEEN_ACCEPT;
3531 if (data_fake.flags & SF_HAS_EVAL)
3532 data->flags |= SF_HAS_EVAL;
3533 data->whilem_c = data_fake.whilem_c;
3535 if (flags & SCF_DO_STCLASS)
3536 ssc_or(pRExC_state, &accum, &this_class);
3538 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3540 if (flags & SCF_DO_SUBSTR) {
3541 data->pos_min += min1;
3542 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3543 data->pos_delta = SSize_t_MAX;
3545 data->pos_delta += max1 - min1;
3546 if (max1 != min1 || is_inf)
3547 data->longest = &(data->longest_float);
3550 if (delta == SSize_t_MAX
3551 || SSize_t_MAX - delta - (max1 - min1) < 0)
3552 delta = SSize_t_MAX;
3554 delta += max1 - min1;
3555 if (flags & SCF_DO_STCLASS_OR) {
3556 ssc_or(pRExC_state, data->start_class, &accum);
3558 ssc_and(pRExC_state, data->start_class, and_withp);
3559 flags &= ~SCF_DO_STCLASS;
3562 else if (flags & SCF_DO_STCLASS_AND) {
3564 ssc_and(pRExC_state, data->start_class, &accum);
3565 flags &= ~SCF_DO_STCLASS;
3568 /* Switch to OR mode: cache the old value of
3569 * data->start_class */
3571 StructCopy(data->start_class, and_withp, regnode_ssc);
3572 flags &= ~SCF_DO_STCLASS_AND;
3573 StructCopy(&accum, data->start_class, regnode_ssc);
3574 flags |= SCF_DO_STCLASS_OR;
3578 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3581 Assuming this was/is a branch we are dealing with: 'scan'
3582 now points at the item that follows the branch sequence,
3583 whatever it is. We now start at the beginning of the
3584 sequence and look for subsequences of
3590 which would be constructed from a pattern like
3593 If we can find such a subsequence we need to turn the first
3594 element into a trie and then add the subsequent branch exact
3595 strings to the trie.
3599 1. patterns where the whole set of branches can be
3602 2. patterns where only a subset can be converted.
3604 In case 1 we can replace the whole set with a single regop
3605 for the trie. In case 2 we need to keep the start and end
3608 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3609 becomes BRANCH TRIE; BRANCH X;
3611 There is an additional case, that being where there is a
3612 common prefix, which gets split out into an EXACT like node
3613 preceding the TRIE node.
3615 If x(1..n)==tail then we can do a simple trie, if not we make
3616 a "jump" trie, such that when we match the appropriate word
3617 we "jump" to the appropriate tail node. Essentially we turn
3618 a nested if into a case structure of sorts.
3623 if (!re_trie_maxbuff) {
3624 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3625 if (!SvIOK(re_trie_maxbuff))
3626 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3628 if ( SvIV(re_trie_maxbuff)>=0 ) {
3630 regnode *first = (regnode *)NULL;
3631 regnode *last = (regnode *)NULL;
3632 regnode *tail = scan;
3637 SV * const mysv = sv_newmortal(); /* for dumping */
3639 /* var tail is used because there may be a TAIL
3640 regop in the way. Ie, the exacts will point to the
3641 thing following the TAIL, but the last branch will
3642 point at the TAIL. So we advance tail. If we
3643 have nested (?:) we may have to move through several
3647 while ( OP( tail ) == TAIL ) {
3648 /* this is the TAIL generated by (?:) */
3649 tail = regnext( tail );
3653 DEBUG_TRIE_COMPILE_r({
3654 regprop(RExC_rx, mysv, tail );
3655 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3656 (int)depth * 2 + 2, "",
3657 "Looking for TRIE'able sequences. Tail node is: ",
3658 SvPV_nolen_const( mysv )
3664 Step through the branches
3665 cur represents each branch,
3666 noper is the first thing to be matched as part
3668 noper_next is the regnext() of that node.
3670 We normally handle a case like this
3671 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3672 support building with NOJUMPTRIE, which restricts
3673 the trie logic to structures like /FOO|BAR/.
3675 If noper is a trieable nodetype then the branch is
3676 a possible optimization target. If we are building
3677 under NOJUMPTRIE then we require that noper_next is
3678 the same as scan (our current position in the regex
3681 Once we have two or more consecutive such branches
3682 we can create a trie of the EXACT's contents and
3683 stitch it in place into the program.
3685 If the sequence represents all of the branches in
3686 the alternation we replace the entire thing with a
3689 Otherwise when it is a subsequence we need to
3690 stitch it in place and replace only the relevant
3691 branches. This means the first branch has to remain
3692 as it is used by the alternation logic, and its
3693 next pointer, and needs to be repointed at the item
3694 on the branch chain following the last branch we
3695 have optimized away.
3697 This could be either a BRANCH, in which case the
3698 subsequence is internal, or it could be the item
3699 following the branch sequence in which case the
3700 subsequence is at the end (which does not
3701 necessarily mean the first node is the start of the
3704 TRIE_TYPE(X) is a define which maps the optype to a
3708 ----------------+-----------
3712 EXACTFU_SS | EXACTFU
3717 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3718 ( EXACT == (X) ) ? EXACT : \
3719 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3720 ( EXACTFA == (X) ) ? EXACTFA : \
3723 /* dont use tail as the end marker for this traverse */
3724 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3725 regnode * const noper = NEXTOPER( cur );
3726 U8 noper_type = OP( noper );
3727 U8 noper_trietype = TRIE_TYPE( noper_type );
3728 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3729 regnode * const noper_next = regnext( noper );
3730 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3731 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3734 DEBUG_TRIE_COMPILE_r({
3735 regprop(RExC_rx, mysv, cur);
3736 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3737 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3739 regprop(RExC_rx, mysv, noper);
3740 PerlIO_printf( Perl_debug_log, " -> %s",
3741 SvPV_nolen_const(mysv));
3744 regprop(RExC_rx, mysv, noper_next );
3745 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3746 SvPV_nolen_const(mysv));
3748 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3749 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3750 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3754 /* Is noper a trieable nodetype that can be merged
3755 * with the current trie (if there is one)? */
3759 ( noper_trietype == NOTHING)
3760 || ( trietype == NOTHING )
3761 || ( trietype == noper_trietype )
3764 && noper_next == tail
3768 /* Handle mergable triable node Either we are
3769 * the first node in a new trieable sequence,
3770 * in which case we do some bookkeeping,
3771 * otherwise we update the end pointer. */
3774 if ( noper_trietype == NOTHING ) {
3775 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3776 regnode * const noper_next = regnext( noper );
3777 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3778 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3781 if ( noper_next_trietype ) {
3782 trietype = noper_next_trietype;
3783 } else if (noper_next_type) {
3784 /* a NOTHING regop is 1 regop wide.
3785 * We need at least two for a trie
3786 * so we can't merge this in */
3790 trietype = noper_trietype;
3793 if ( trietype == NOTHING )
3794 trietype = noper_trietype;
3799 } /* end handle mergable triable node */
3801 /* handle unmergable node -
3802 * noper may either be a triable node which can
3803 * not be tried together with the current trie,
3804 * or a non triable node */
3806 /* If last is set and trietype is not
3807 * NOTHING then we have found at least two
3808 * triable branch sequences in a row of a
3809 * similar trietype so we can turn them
3810 * into a trie. If/when we allow NOTHING to
3811 * start a trie sequence this condition
3812 * will be required, and it isn't expensive
3813 * so we leave it in for now. */
3814 if ( trietype && trietype != NOTHING )
3815 make_trie( pRExC_state,
3816 startbranch, first, cur, tail, count,
3817 trietype, depth+1 );
3818 last = NULL; /* note: we clear/update
3819 first, trietype etc below,
3820 so we dont do it here */
3824 && noper_next == tail
3827 /* noper is triable, so we can start a new
3831 trietype = noper_trietype;
3833 /* if we already saw a first but the
3834 * current node is not triable then we have
3835 * to reset the first information. */
3840 } /* end handle unmergable node */
3841 } /* loop over branches */
3842 DEBUG_TRIE_COMPILE_r({
3843 regprop(RExC_rx, mysv, cur);
3844 PerlIO_printf( Perl_debug_log,
3845 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3846 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3849 if ( last && trietype ) {
3850 if ( trietype != NOTHING ) {
3851 /* the last branch of the sequence was part of
3852 * a trie, so we have to construct it here
3853 * outside of the loop */
3854 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3855 #ifdef TRIE_STUDY_OPT
3856 if ( ((made == MADE_EXACT_TRIE &&
3857 startbranch == first)
3858 || ( first_non_open == first )) &&
3860 flags |= SCF_TRIE_RESTUDY;
3861 if ( startbranch == first
3864 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3869 /* at this point we know whatever we have is a
3870 * NOTHING sequence/branch AND if 'startbranch'
3871 * is 'first' then we can turn the whole thing
3874 if ( startbranch == first ) {
3876 /* the entire thing is a NOTHING sequence,
3877 * something like this: (?:|) So we can
3878 * turn it into a plain NOTHING op. */
3879 DEBUG_TRIE_COMPILE_r({
3880 regprop(RExC_rx, mysv, cur);
3881 PerlIO_printf( Perl_debug_log,
3882 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3883 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3886 OP(startbranch)= NOTHING;
3887 NEXT_OFF(startbranch)= tail - startbranch;
3888 for ( opt= startbranch + 1; opt < tail ; opt++ )
3892 } /* end if ( last) */
3893 } /* TRIE_MAXBUF is non zero */
3898 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3899 scan = NEXTOPER(NEXTOPER(scan));
3900 } else /* single branch is optimized. */
3901 scan = NEXTOPER(scan);
3903 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3904 scan_frame *newframe = NULL;
3908 U32 my_recursed_depth= recursed_depth;
3910 if (OP(scan) != SUSPEND) {
3911 /* set the pointer */
3912 if (OP(scan) == GOSUB) {
3914 RExC_recurse[ARG2L(scan)] = scan;
3915 start = RExC_open_parens[paren-1];
3916 end = RExC_close_parens[paren-1];
3919 start = RExC_rxi->program + 1;
3924 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
3926 if (!recursed_depth) {
3927 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
3929 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
3930 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
3931 RExC_study_chunk_recursed_bytes, U8);
3933 /* we havent recursed into this paren yet, so recurse into it */
3934 DEBUG_STUDYDATA("set:", data,depth);
3935 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
3936 my_recursed_depth= recursed_depth + 1;
3937 Newx(newframe,1,scan_frame);
3939 DEBUG_STUDYDATA("inf:", data,depth);
3940 /* some form of infinite recursion, assume infinite length */
3941 if (flags & SCF_DO_SUBSTR) {
3942 SCAN_COMMIT(pRExC_state,data,minlenp);
3943 data->longest = &(data->longest_float);
3945 is_inf = is_inf_internal = 1;
3946 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3947 ssc_anything(data->start_class);
3948 flags &= ~SCF_DO_STCLASS;
3951 Newx(newframe,1,scan_frame);
3954 end = regnext(scan);
3959 SAVEFREEPV(newframe);
3960 newframe->next = regnext(scan);
3961 newframe->last = last;
3962 newframe->stop = stopparen;
3963 newframe->prev = frame;
3964 newframe->prev_recursed_depth = recursed_depth;
3966 DEBUG_STUDYDATA("frame-new:",data,depth);
3967 DEBUG_PEEP("fnew", scan, depth);
3974 recursed_depth= my_recursed_depth;
3979 else if (OP(scan) == EXACT) {
3980 SSize_t l = STR_LEN(scan);
3983 const U8 * const s = (U8*)STRING(scan);
3984 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3985 l = utf8_length(s, s + l);
3987 uc = *((U8*)STRING(scan));
3990 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3991 /* The code below prefers earlier match for fixed
3992 offset, later match for variable offset. */
3993 if (data->last_end == -1) { /* Update the start info. */
3994 data->last_start_min = data->pos_min;
3995 data->last_start_max = is_inf
3996 ? SSize_t_MAX : data->pos_min + data->pos_delta;
3998 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4000 SvUTF8_on(data->last_found);
4002 SV * const sv = data->last_found;
4003 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4004 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4005 if (mg && mg->mg_len >= 0)
4006 mg->mg_len += utf8_length((U8*)STRING(scan),
4007 (U8*)STRING(scan)+STR_LEN(scan));
4009 data->last_end = data->pos_min + l;
4010 data->pos_min += l; /* As in the first entry. */
4011 data->flags &= ~SF_BEFORE_EOL;
4014 /* ANDing the code point leaves at most it, and not in locale, and
4015 * can't match null string */
4016 if (flags & SCF_DO_STCLASS_AND) {
4017 ssc_cp_and(data->start_class, uc);
4018 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4019 ssc_clear_locale(data->start_class);
4021 else if (flags & SCF_DO_STCLASS_OR) {
4022 ssc_add_cp(data->start_class, uc);
4023 ssc_and(pRExC_state, data->start_class, and_withp);
4025 flags &= ~SCF_DO_STCLASS;
4027 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4028 SSize_t l = STR_LEN(scan);
4029 UV uc = *((U8*)STRING(scan));
4030 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4031 separate code points */
4033 /* Search for fixed substrings supports EXACT only. */
4034 if (flags & SCF_DO_SUBSTR) {
4036 SCAN_COMMIT(pRExC_state, data, minlenp);
4039 const U8 * const s = (U8 *)STRING(scan);
4040 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4041 l = utf8_length(s, s + l);
4043 if (has_exactf_sharp_s) {
4044 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
4046 min += l - min_subtract;
4048 delta += min_subtract;
4049 if (flags & SCF_DO_SUBSTR) {
4050 data->pos_min += l - min_subtract;
4051 if (data->pos_min < 0) {
4054 data->pos_delta += min_subtract;
4056 data->longest = &(data->longest_float);
4059 if (OP(scan) == EXACTFL) {
4060 if (flags & SCF_DO_STCLASS_AND) {
4061 ssc_flags_and(data->start_class,
4062 ANYOF_LOCALE|ANYOF_LOC_FOLD);
4064 else if (flags & SCF_DO_STCLASS_OR) {
4065 ANYOF_FLAGS(data->start_class)
4066 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
4069 /* We don't know what the folds are; it could be anything. XXX
4070 * Actually, we only support UTF-8 encoding for code points
4071 * above Latin1, so we could know what those folds are. */
4072 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4076 else { /* Non-locale EXACTFish */
4077 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4078 if (flags & SCF_DO_STCLASS_AND) {
4079 ssc_clear_locale(data->start_class);
4081 if (uc < 256) { /* We know what the Latin1 folds are ... */
4082 if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
4083 know if anything folds
4085 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4086 PL_fold_latin1[uc]);
4087 if (OP(scan) != EXACTFA) { /* The folds below aren't
4089 if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4091 = add_cp_to_invlist(EXACTF_invlist,
4092 LATIN_SMALL_LETTER_SHARP_S);
4094 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4096 = add_cp_to_invlist(EXACTF_invlist, 's');
4098 = add_cp_to_invlist(EXACTF_invlist, 'S');
4102 /* We also know if there are above-Latin1 code points
4103 * that fold to this (none legal for ASCII and /iaa) */
4104 if ((! isASCII(uc) || OP(scan) != EXACTFA)
4105 && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4107 /* XXX We could know exactly what does fold to this
4108 * if the reverse folds are loaded, as currently in
4110 _invlist_union(EXACTF_invlist,
4116 else { /* Non-locale, above Latin1. XXX We don't currently
4117 know what participates in folds with this, so have
4118 to assume anything could */
4120 /* XXX We could know exactly what does fold to this if the
4121 * reverse folds are loaded, as currently in S_regclass().
4122 * But we do know that under /iaa nothing in the ASCII
4123 * range can participate */
4124 if (OP(scan) == EXACTFA) {
4125 _invlist_union_complement_2nd(EXACTF_invlist,
4126 PL_Posix_ptrs[_CC_ASCII],
4130 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4135 if (flags & SCF_DO_STCLASS_AND) {
4136 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4137 ANYOF_POSIXL_ZERO(data->start_class);
4138 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4140 else if (flags & SCF_DO_STCLASS_OR) {
4141 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4142 ssc_and(pRExC_state, data->start_class, and_withp);
4144 flags &= ~SCF_DO_STCLASS;
4145 SvREFCNT_dec(EXACTF_invlist);
4147 else if (REGNODE_VARIES(OP(scan))) {
4148 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4149 I32 fl = 0, f = flags;
4150 regnode * const oscan = scan;
4151 regnode_ssc this_class;
4152 regnode_ssc *oclass = NULL;
4153 I32 next_is_eval = 0;
4155 switch (PL_regkind[OP(scan)]) {
4156 case WHILEM: /* End of (?:...)* . */
4157 scan = NEXTOPER(scan);
4160 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4161 next = NEXTOPER(scan);
4162 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4164 maxcount = REG_INFTY;
4165 next = regnext(scan);
4166 scan = NEXTOPER(scan);
4170 if (flags & SCF_DO_SUBSTR)
4175 if (flags & SCF_DO_STCLASS) {
4177 maxcount = REG_INFTY;
4178 next = regnext(scan);
4179 scan = NEXTOPER(scan);
4182 is_inf = is_inf_internal = 1;
4183 scan = regnext(scan);
4184 if (flags & SCF_DO_SUBSTR) {
4185 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4186 data->longest = &(data->longest_float);
4188 goto optimize_curly_tail;
4190 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4191 && (scan->flags == stopparen))
4196 mincount = ARG1(scan);
4197 maxcount = ARG2(scan);
4199 next = regnext(scan);
4200 if (OP(scan) == CURLYX) {
4201 I32 lp = (data ? *(data->last_closep) : 0);
4202 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4204 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4205 next_is_eval = (OP(scan) == EVAL);
4207 if (flags & SCF_DO_SUBSTR) {
4208 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4209 pos_before = data->pos_min;
4213 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4215 data->flags |= SF_IS_INF;
4217 if (flags & SCF_DO_STCLASS) {
4218 ssc_init(pRExC_state, &this_class);
4219 oclass = data->start_class;
4220 data->start_class = &this_class;
4221 f |= SCF_DO_STCLASS_AND;
4222 f &= ~SCF_DO_STCLASS_OR;
4224 /* Exclude from super-linear cache processing any {n,m}
4225 regops for which the combination of input pos and regex
4226 pos is not enough information to determine if a match
4229 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4230 regex pos at the \s*, the prospects for a match depend not
4231 only on the input position but also on how many (bar\s*)
4232 repeats into the {4,8} we are. */
4233 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4234 f &= ~SCF_WHILEM_VISITED_POS;
4236 /* This will finish on WHILEM, setting scan, or on NULL: */
4237 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4238 last, data, stopparen, recursed_depth, NULL,
4240 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4242 if (flags & SCF_DO_STCLASS)
4243 data->start_class = oclass;
4244 if (mincount == 0 || minnext == 0) {
4245 if (flags & SCF_DO_STCLASS_OR) {
4246 ssc_or(pRExC_state, data->start_class, &this_class);
4248 else if (flags & SCF_DO_STCLASS_AND) {
4249 /* Switch to OR mode: cache the old value of
4250 * data->start_class */
4252 StructCopy(data->start_class, and_withp, regnode_ssc);
4253 flags &= ~SCF_DO_STCLASS_AND;
4254 StructCopy(&this_class, data->start_class, regnode_ssc);
4255 flags |= SCF_DO_STCLASS_OR;
4256 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4258 } else { /* Non-zero len */
4259 if (flags & SCF_DO_STCLASS_OR) {
4260 ssc_or(pRExC_state, data->start_class, &this_class);
4261 ssc_and(pRExC_state, data->start_class, and_withp);
4263 else if (flags & SCF_DO_STCLASS_AND)
4264 ssc_and(pRExC_state, data->start_class, &this_class);
4265 flags &= ~SCF_DO_STCLASS;
4267 if (!scan) /* It was not CURLYX, but CURLY. */
4269 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4270 /* ? quantifier ok, except for (?{ ... }) */
4271 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4272 && (minnext == 0) && (deltanext == 0)
4273 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4274 && maxcount <= REG_INFTY/3) /* Complement check for big count */
4276 /* Fatal warnings may leak the regexp without this: */
4277 SAVEFREESV(RExC_rx_sv);
4278 ckWARNreg(RExC_parse,
4279 "Quantifier unexpected on zero-length expression");
4280 (void)ReREFCNT_inc(RExC_rx_sv);
4283 min += minnext * mincount;
4284 is_inf_internal |= deltanext == SSize_t_MAX
4285 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4286 is_inf |= is_inf_internal;
4288 delta = SSize_t_MAX;
4290 delta += (minnext + deltanext) * maxcount - minnext * mincount;
4292 /* Try powerful optimization CURLYX => CURLYN. */
4293 if ( OP(oscan) == CURLYX && data
4294 && data->flags & SF_IN_PAR
4295 && !(data->flags & SF_HAS_EVAL)
4296 && !deltanext && minnext == 1 ) {
4297 /* Try to optimize to CURLYN. */
4298 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4299 regnode * const nxt1 = nxt;
4306 if (!REGNODE_SIMPLE(OP(nxt))
4307 && !(PL_regkind[OP(nxt)] == EXACT
4308 && STR_LEN(nxt) == 1))
4314 if (OP(nxt) != CLOSE)
4316 if (RExC_open_parens) {
4317 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4318 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4320 /* Now we know that nxt2 is the only contents: */
4321 oscan->flags = (U8)ARG(nxt);
4323 OP(nxt1) = NOTHING; /* was OPEN. */
4326 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4327 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4328 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4329 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4330 OP(nxt + 1) = OPTIMIZED; /* was count. */
4331 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4336 /* Try optimization CURLYX => CURLYM. */
4337 if ( OP(oscan) == CURLYX && data
4338 && !(data->flags & SF_HAS_PAR)
4339 && !(data->flags & SF_HAS_EVAL)
4340 && !deltanext /* atom is fixed width */
4341 && minnext != 0 /* CURLYM can't handle zero width */
4342 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4344 /* XXXX How to optimize if data == 0? */
4345 /* Optimize to a simpler form. */
4346 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4350 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4351 && (OP(nxt2) != WHILEM))
4353 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4354 /* Need to optimize away parenths. */
4355 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4356 /* Set the parenth number. */
4357 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4359 oscan->flags = (U8)ARG(nxt);
4360 if (RExC_open_parens) {
4361 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4362 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4364 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4365 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4368 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4369 OP(nxt + 1) = OPTIMIZED; /* was count. */
4370 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4371 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4374 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4375 regnode *nnxt = regnext(nxt1);
4377 if (reg_off_by_arg[OP(nxt1)])
4378 ARG_SET(nxt1, nxt2 - nxt1);
4379 else if (nxt2 - nxt1 < U16_MAX)
4380 NEXT_OFF(nxt1) = nxt2 - nxt1;
4382 OP(nxt) = NOTHING; /* Cannot beautify */
4387 /* Optimize again: */
4388 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4389 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4394 else if ((OP(oscan) == CURLYX)
4395 && (flags & SCF_WHILEM_VISITED_POS)
4396 /* See the comment on a similar expression above.
4397 However, this time it's not a subexpression
4398 we care about, but the expression itself. */
4399 && (maxcount == REG_INFTY)
4400 && data && ++data->whilem_c < 16) {
4401 /* This stays as CURLYX, we can put the count/of pair. */
4402 /* Find WHILEM (as in regexec.c) */
4403 regnode *nxt = oscan + NEXT_OFF(oscan);
4405 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4407 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4408 | (RExC_whilem_seen << 4)); /* On WHILEM */
4410 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4412 if (flags & SCF_DO_SUBSTR) {
4413 SV *last_str = NULL;
4414 int counted = mincount != 0;
4416 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4417 SSize_t b = pos_before >= data->last_start_min
4418 ? pos_before : data->last_start_min;
4420 const char * const s = SvPV_const(data->last_found, l);
4421 SSize_t old = b - data->last_start_min;
4424 old = utf8_hop((U8*)s, old) - (U8*)s;
4426 /* Get the added string: */
4427 last_str = newSVpvn_utf8(s + old, l, UTF);
4428 if (deltanext == 0 && pos_before == b) {
4429 /* What was added is a constant string */
4431 SvGROW(last_str, (mincount * l) + 1);
4432 repeatcpy(SvPVX(last_str) + l,
4433 SvPVX_const(last_str), l, mincount - 1);
4434 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4435 /* Add additional parts. */
4436 SvCUR_set(data->last_found,
4437 SvCUR(data->last_found) - l);
4438 sv_catsv(data->last_found, last_str);
4440 SV * sv = data->last_found;
4442 SvUTF8(sv) && SvMAGICAL(sv) ?
4443 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4444 if (mg && mg->mg_len >= 0)
4445 mg->mg_len += CHR_SVLEN(last_str) - l;
4447 data->last_end += l * (mincount - 1);
4450 /* start offset must point into the last copy */
4451 data->last_start_min += minnext * (mincount - 1);
4452 data->last_start_max += is_inf ? SSize_t_MAX
4453 : (maxcount - 1) * (minnext + data->pos_delta);
4456 /* It is counted once already... */
4457 data->pos_min += minnext * (mincount - counted);
4459 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4460 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4461 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4462 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4464 if (deltanext != SSize_t_MAX)
4465 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4466 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4467 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4469 if (deltanext == SSize_t_MAX ||
4470 -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4471 data->pos_delta = SSize_t_MAX;
4473 data->pos_delta += - counted * deltanext +
4474 (minnext + deltanext) * maxcount - minnext * mincount;
4475 if (mincount != maxcount) {
4476 /* Cannot extend fixed substrings found inside
4478 SCAN_COMMIT(pRExC_state,data,minlenp);
4479 if (mincount && last_str) {
4480 SV * const sv = data->last_found;
4481 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4482 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4486 sv_setsv(sv, last_str);
4487 data->last_end = data->pos_min;
4488 data->last_start_min =
4489 data->pos_min - CHR_SVLEN(last_str);
4490 data->last_start_max = is_inf
4492 : data->pos_min + data->pos_delta
4493 - CHR_SVLEN(last_str);
4495 data->longest = &(data->longest_float);
4497 SvREFCNT_dec(last_str);
4499 if (data && (fl & SF_HAS_EVAL))
4500 data->flags |= SF_HAS_EVAL;
4501 optimize_curly_tail:
4502 if (OP(oscan) != CURLYX) {
4503 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4505 NEXT_OFF(oscan) += NEXT_OFF(next);
4511 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4516 if (flags & SCF_DO_SUBSTR) {
4517 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4518 data->longest = &(data->longest_float);
4520 is_inf = is_inf_internal = 1;
4521 if (flags & SCF_DO_STCLASS_OR) {
4522 if (OP(scan) == CLUMP) {
4523 /* Actually is any start char, but very few code points
4524 * aren't start characters */
4525 ssc_match_all_cp(data->start_class);
4528 ssc_anything(data->start_class);
4531 flags &= ~SCF_DO_STCLASS;
4535 else if (OP(scan) == LNBREAK) {
4536 if (flags & SCF_DO_STCLASS) {
4537 if (flags & SCF_DO_STCLASS_AND) {
4538 ssc_intersection(data->start_class,
4539 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4540 ssc_clear_locale(data->start_class);
4541 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4543 else if (flags & SCF_DO_STCLASS_OR) {
4544 ssc_union(data->start_class,
4545 PL_XPosix_ptrs[_CC_VERTSPACE],
4547 ssc_and(pRExC_state, data->start_class, and_withp);
4549 flags &= ~SCF_DO_STCLASS;
4552 delta++; /* Because of the 2 char string cr-lf */
4553 if (flags & SCF_DO_SUBSTR) {
4554 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4556 data->pos_delta += 1;
4557 data->longest = &(data->longest_float);
4560 else if (REGNODE_SIMPLE(OP(scan))) {
4562 if (flags & SCF_DO_SUBSTR) {
4563 SCAN_COMMIT(pRExC_state,data,minlenp);
4567 if (flags & SCF_DO_STCLASS) {
4569 SV* my_invlist = sv_2mortal(_new_invlist(0));
4573 if (flags & SCF_DO_STCLASS_AND) {
4574 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4577 /* Some of the logic below assumes that switching
4578 locale on will only add false positives. */
4583 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4587 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4588 ssc_match_all_cp(data->start_class);
4593 SV* REG_ANY_invlist = _new_invlist(2);
4594 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4596 if (flags & SCF_DO_STCLASS_OR) {
4597 ssc_union(data->start_class,
4599 TRUE /* TRUE => invert, hence all but \n
4603 else if (flags & SCF_DO_STCLASS_AND) {
4604 ssc_intersection(data->start_class,
4606 TRUE /* TRUE => invert */
4608 ssc_clear_locale(data->start_class);
4610 SvREFCNT_dec_NN(REG_ANY_invlist);
4614 case ANYOF_WARN_SUPER:
4616 if (flags & SCF_DO_STCLASS_AND)
4617 ssc_and(pRExC_state, data->start_class,
4618 (regnode_ssc*) scan);
4620 ssc_or(pRExC_state, data->start_class,
4621 (regnode_ssc*)scan);
4629 classnum = FLAGS(scan);
4630 namedclass = classnum_to_namedclass(classnum) + invert;
4631 if (flags & SCF_DO_STCLASS_AND) {
4632 bool was_there = cBOOL(
4633 ANYOF_POSIXL_TEST(data->start_class,
4635 ANYOF_POSIXL_ZERO(data->start_class);
4636 if (was_there) { /* Do an AND */
4637 ANYOF_POSIXL_SET(data->start_class, namedclass);
4639 /* No individual code points can now match */
4640 data->start_class->invlist
4641 = sv_2mortal(_new_invlist(0));
4644 int complement = namedclass + ((invert) ? -1 : 1);
4646 assert(flags & SCF_DO_STCLASS_OR);
4648 /* If the complement of this class was already there,
4649 * the result is that they match all code points,
4650 * (\d + \D == everything). Remove the classes from
4651 * future consideration. Locale is not relevant in
4653 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4654 ssc_match_all_cp(data->start_class);
4655 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4656 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4657 if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4659 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4662 else { /* The usual case; just add this class to the
4664 ANYOF_POSIXL_SET(data->start_class, namedclass);
4665 ANYOF_FLAGS(data->start_class)
4666 |= ANYOF_LOCALE|ANYOF_POSIXL;
4671 case NPOSIXA: /* For these, we always know the exact set of
4676 classnum = FLAGS(scan);
4677 my_invlist = PL_Posix_ptrs[classnum];
4686 classnum = FLAGS(scan);
4688 /* If we know all the code points that match the class, use
4689 * that; otherwise use the Latin1 code points, plus we have
4690 * to assume that it could match anything above Latin1 */
4691 if (PL_XPosix_ptrs[classnum]) {
4692 my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4695 _invlist_union(PL_L1Posix_ptrs[classnum],
4696 PL_AboveLatin1, &my_invlist);
4699 /* NPOSIXD matches all upper Latin1 code points unless the
4700 * target string being matched is UTF-8, which is
4701 * unknowable until match time */
4702 if (PL_regkind[OP(scan)] == NPOSIXD) {
4703 _invlist_union_complement_2nd(my_invlist,
4704 PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4709 if (flags & SCF_DO_STCLASS_AND) {
4710 ssc_intersection(data->start_class, my_invlist, invert);
4711 ssc_clear_locale(data->start_class);
4714 assert(flags & SCF_DO_STCLASS_OR);
4715 ssc_union(data->start_class, my_invlist, invert);
4718 if (flags & SCF_DO_STCLASS_OR)
4719 ssc_and(pRExC_state, data->start_class, and_withp);
4720 flags &= ~SCF_DO_STCLASS;
4723 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4724 data->flags |= (OP(scan) == MEOL
4727 SCAN_COMMIT(pRExC_state, data, minlenp);
4730 else if ( PL_regkind[OP(scan)] == BRANCHJ
4731 /* Lookbehind, or need to calculate parens/evals/stclass: */
4732 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4733 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4734 if ( OP(scan) == UNLESSM &&
4736 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4737 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4740 regnode *upto= regnext(scan);
4742 SV * const mysv_val=sv_newmortal();
4743 DEBUG_STUDYDATA("OPFAIL",data,depth);
4745 /*DEBUG_PARSE_MSG("opfail");*/
4746 regprop(RExC_rx, mysv_val, upto);
4747 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4748 SvPV_nolen_const(mysv_val),
4749 (IV)REG_NODE_NUM(upto),
4754 NEXT_OFF(scan) = upto - scan;
4755 for (opt= scan + 1; opt < upto ; opt++)
4756 OP(opt) = OPTIMIZED;
4760 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4761 || OP(scan) == UNLESSM )
4763 /* Negative Lookahead/lookbehind
4764 In this case we can't do fixed string optimisation.
4767 SSize_t deltanext, minnext, fake = 0;
4772 data_fake.flags = 0;
4774 data_fake.whilem_c = data->whilem_c;
4775 data_fake.last_closep = data->last_closep;
4778 data_fake.last_closep = &fake;
4779 data_fake.pos_delta = delta;
4780 if ( flags & SCF_DO_STCLASS && !scan->flags
4781 && OP(scan) == IFMATCH ) { /* Lookahead */
4782 ssc_init(pRExC_state, &intrnl);
4783 data_fake.start_class = &intrnl;
4784 f |= SCF_DO_STCLASS_AND;
4786 if (flags & SCF_WHILEM_VISITED_POS)
4787 f |= SCF_WHILEM_VISITED_POS;
4788 next = regnext(scan);
4789 nscan = NEXTOPER(NEXTOPER(scan));
4790 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4791 last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1);
4794 FAIL("Variable length lookbehind not implemented");
4796 else if (minnext > (I32)U8_MAX) {
4797 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4799 scan->flags = (U8)minnext;
4802 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4804 if (data_fake.flags & SF_HAS_EVAL)
4805 data->flags |= SF_HAS_EVAL;
4806 data->whilem_c = data_fake.whilem_c;
4808 if (f & SCF_DO_STCLASS_AND) {
4809 if (flags & SCF_DO_STCLASS_OR) {
4810 /* OR before, AND after: ideally we would recurse with
4811 * data_fake to get the AND applied by study of the
4812 * remainder of the pattern, and then derecurse;
4813 * *** HACK *** for now just treat as "no information".
4814 * See [perl #56690].
4816 ssc_init(pRExC_state, data->start_class);
4818 /* AND before and after: combine and continue */
4819 ssc_and(pRExC_state, data->start_class, &intrnl);
4823 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4825 /* Positive Lookahead/lookbehind
4826 In this case we can do fixed string optimisation,
4827 but we must be careful about it. Note in the case of
4828 lookbehind the positions will be offset by the minimum
4829 length of the pattern, something we won't know about
4830 until after the recurse.
4832 SSize_t deltanext, fake = 0;
4836 /* We use SAVEFREEPV so that when the full compile
4837 is finished perl will clean up the allocated
4838 minlens when it's all done. This way we don't
4839 have to worry about freeing them when we know
4840 they wont be used, which would be a pain.
4843 Newx( minnextp, 1, SSize_t );
4844 SAVEFREEPV(minnextp);
4847 StructCopy(data, &data_fake, scan_data_t);
4848 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4851 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4852 data_fake.last_found=newSVsv(data->last_found);
4856 data_fake.last_closep = &fake;
4857 data_fake.flags = 0;
4858 data_fake.pos_delta = delta;
4860 data_fake.flags |= SF_IS_INF;
4861 if ( flags & SCF_DO_STCLASS && !scan->flags
4862 && OP(scan) == IFMATCH ) { /* Lookahead */
4863 ssc_init(pRExC_state, &intrnl);
4864 data_fake.start_class = &intrnl;
4865 f |= SCF_DO_STCLASS_AND;
4867 if (flags & SCF_WHILEM_VISITED_POS)
4868 f |= SCF_WHILEM_VISITED_POS;
4869 next = regnext(scan);
4870 nscan = NEXTOPER(NEXTOPER(scan));
4872 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4873 last, &data_fake, stopparen, recursed_depth, NULL, f,depth+1);
4876 FAIL("Variable length lookbehind not implemented");
4878 else if (*minnextp > (I32)U8_MAX) {
4879 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4881 scan->flags = (U8)*minnextp;
4886 if (f & SCF_DO_STCLASS_AND) {
4887 ssc_and(pRExC_state, data->start_class, &intrnl);
4890 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4892 if (data_fake.flags & SF_HAS_EVAL)
4893 data->flags |= SF_HAS_EVAL;
4894 data->whilem_c = data_fake.whilem_c;
4895 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4896 if (RExC_rx->minlen<*minnextp)
4897 RExC_rx->minlen=*minnextp;
4898 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4899 SvREFCNT_dec_NN(data_fake.last_found);
4901 if ( data_fake.minlen_fixed != minlenp )
4903 data->offset_fixed= data_fake.offset_fixed;
4904 data->minlen_fixed= data_fake.minlen_fixed;
4905 data->lookbehind_fixed+= scan->flags;
4907 if ( data_fake.minlen_float != minlenp )
4909 data->minlen_float= data_fake.minlen_float;
4910 data->offset_float_min=data_fake.offset_float_min;
4911 data->offset_float_max=data_fake.offset_float_max;
4912 data->lookbehind_float+= scan->flags;
4919 else if (OP(scan) == OPEN) {
4920 if (stopparen != (I32)ARG(scan))
4923 else if (OP(scan) == CLOSE) {
4924 if (stopparen == (I32)ARG(scan)) {
4927 if ((I32)ARG(scan) == is_par) {
4928 next = regnext(scan);
4930 if ( next && (OP(next) != WHILEM) && next < last)
4931 is_par = 0; /* Disable optimization */
4934 *(data->last_closep) = ARG(scan);
4936 else if (OP(scan) == EVAL) {
4938 data->flags |= SF_HAS_EVAL;
4940 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4941 if (flags & SCF_DO_SUBSTR) {
4942 SCAN_COMMIT(pRExC_state,data,minlenp);
4943 flags &= ~SCF_DO_SUBSTR;
4945 if (data && OP(scan)==ACCEPT) {
4946 data->flags |= SCF_SEEN_ACCEPT;
4951 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4953 if (flags & SCF_DO_SUBSTR) {
4954 SCAN_COMMIT(pRExC_state,data,minlenp);
4955 data->longest = &(data->longest_float);
4957 is_inf = is_inf_internal = 1;
4958 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4959 ssc_anything(data->start_class);
4960 flags &= ~SCF_DO_STCLASS;
4962 else if (OP(scan) == GPOS) {
4963 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4964 !(delta || is_inf || (data && data->pos_delta)))
4966 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4967 RExC_rx->extflags |= RXf_ANCH_GPOS;
4968 if (RExC_rx->gofs < (STRLEN)min)
4969 RExC_rx->gofs = min;
4971 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4975 #ifdef TRIE_STUDY_OPT
4976 #ifdef FULL_TRIE_STUDY
4977 else if (PL_regkind[OP(scan)] == TRIE) {
4978 /* NOTE - There is similar code to this block above for handling
4979 BRANCH nodes on the initial study. If you change stuff here
4981 regnode *trie_node= scan;
4982 regnode *tail= regnext(scan);
4983 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4984 SSize_t max1 = 0, min1 = SSize_t_MAX;
4987 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4988 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4989 if (flags & SCF_DO_STCLASS)
4990 ssc_init_zero(pRExC_state, &accum);
4996 const regnode *nextbranch= NULL;
4999 for ( word=1 ; word <= trie->wordcount ; word++)
5001 SSize_t deltanext=0, minnext=0, f = 0, fake;
5002 regnode_ssc this_class;
5004 data_fake.flags = 0;
5006 data_fake.whilem_c = data->whilem_c;
5007 data_fake.last_closep = data->last_closep;
5010 data_fake.last_closep = &fake;
5011 data_fake.pos_delta = delta;
5012 if (flags & SCF_DO_STCLASS) {
5013 ssc_init(pRExC_state, &this_class);
5014 data_fake.start_class = &this_class;
5015 f = SCF_DO_STCLASS_AND;
5017 if (flags & SCF_WHILEM_VISITED_POS)
5018 f |= SCF_WHILEM_VISITED_POS;
5020 if (trie->jump[word]) {
5022 nextbranch = trie_node + trie->jump[0];
5023 scan= trie_node + trie->jump[word];
5024 /* We go from the jump point to the branch that follows
5025 it. Note this means we need the vestigal unused branches
5026 even though they arent otherwise used.
5028 minnext = study_chunk(pRExC_state, &scan, minlenp,
5029 &deltanext, (regnode *)nextbranch, &data_fake,
5030 stopparen, recursed_depth, NULL, f,depth+1);
5032 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5033 nextbranch= regnext((regnode*)nextbranch);
5035 if (min1 > (SSize_t)(minnext + trie->minlen))
5036 min1 = minnext + trie->minlen;
5037 if (deltanext == SSize_t_MAX) {
5038 is_inf = is_inf_internal = 1;
5040 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5041 max1 = minnext + deltanext + trie->maxlen;
5043 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5045 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5046 if ( stopmin > min + min1)
5047 stopmin = min + min1;
5048 flags &= ~SCF_DO_SUBSTR;
5050 data->flags |= SCF_SEEN_ACCEPT;
5053 if (data_fake.flags & SF_HAS_EVAL)
5054 data->flags |= SF_HAS_EVAL;
5055 data->whilem_c = data_fake.whilem_c;
5057 if (flags & SCF_DO_STCLASS)
5058 ssc_or(pRExC_state, &accum, &this_class);
5061 if (flags & SCF_DO_SUBSTR) {
5062 data->pos_min += min1;
5063 data->pos_delta += max1 - min1;
5064 if (max1 != min1 || is_inf)
5065 data->longest = &(data->longest_float);
5068 delta += max1 - min1;
5069 if (flags & SCF_DO_STCLASS_OR) {
5070 ssc_or(pRExC_state, data->start_class, &accum);
5072 ssc_and(pRExC_state, data->start_class, and_withp);
5073 flags &= ~SCF_DO_STCLASS;
5076 else if (flags & SCF_DO_STCLASS_AND) {
5078 ssc_and(pRExC_state, data->start_class, &accum);
5079 flags &= ~SCF_DO_STCLASS;
5082 /* Switch to OR mode: cache the old value of
5083 * data->start_class */
5085 StructCopy(data->start_class, and_withp, regnode_ssc);
5086 flags &= ~SCF_DO_STCLASS_AND;
5087 StructCopy(&accum, data->start_class, regnode_ssc);
5088 flags |= SCF_DO_STCLASS_OR;
5095 else if (PL_regkind[OP(scan)] == TRIE) {
5096 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5099 min += trie->minlen;
5100 delta += (trie->maxlen - trie->minlen);
5101 flags &= ~SCF_DO_STCLASS; /* xxx */
5102 if (flags & SCF_DO_SUBSTR) {
5103 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
5104 data->pos_min += trie->minlen;
5105 data->pos_delta += (trie->maxlen - trie->minlen);
5106 if (trie->maxlen != trie->minlen)
5107 data->longest = &(data->longest_float);
5109 if (trie->jump) /* no more substrings -- for now /grr*/
5110 flags &= ~SCF_DO_SUBSTR;
5112 #endif /* old or new */
5113 #endif /* TRIE_STUDY_OPT */
5115 /* Else: zero-length, ignore. */
5116 scan = regnext(scan);
5118 /* If we are exiting a recursion we can unset its recursed bit
5119 * and allow ourselves to enter it again - no danger of an
5120 * infinite loop there.
5121 if (stopparen > -1 && recursed) {
5122 DEBUG_STUDYDATA("unset:", data,depth);
5123 PAREN_UNSET( recursed, stopparen);
5127 DEBUG_STUDYDATA("frame-end:",data,depth);
5128 DEBUG_PEEP("fend", scan, depth);
5129 /* restore previous context */
5132 stopparen = frame->stop;
5133 recursed_depth = frame->prev_recursed_depth;
5136 frame = frame->prev;
5137 goto fake_study_recurse;
5142 DEBUG_STUDYDATA("pre-fin:",data,depth);
5145 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5146 if (flags & SCF_DO_SUBSTR && is_inf)
5147 data->pos_delta = SSize_t_MAX - data->pos_min;
5148 if (is_par > (I32)U8_MAX)
5150 if (is_par && pars==1 && data) {
5151 data->flags |= SF_IN_PAR;
5152 data->flags &= ~SF_HAS_PAR;
5154 else if (pars && data) {
5155 data->flags |= SF_HAS_PAR;
5156 data->flags &= ~SF_IN_PAR;
5158 if (flags & SCF_DO_STCLASS_OR)
5159 ssc_and(pRExC_state, data->start_class, and_withp);
5160 if (flags & SCF_TRIE_RESTUDY)
5161 data->flags |= SCF_TRIE_RESTUDY;
5163 DEBUG_STUDYDATA("post-fin:",data,depth);
5165 return min < stopmin ? min : stopmin;
5169 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5171 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5173 PERL_ARGS_ASSERT_ADD_DATA;
5175 Renewc(RExC_rxi->data,
5176 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5177 char, struct reg_data);
5179 Renew(RExC_rxi->data->what, count + n, U8);
5181 Newx(RExC_rxi->data->what, n, U8);
5182 RExC_rxi->data->count = count + n;
5183 Copy(s, RExC_rxi->data->what + count, n, U8);
5187 /*XXX: todo make this not included in a non debugging perl */
5188 #ifndef PERL_IN_XSUB_RE
5190 Perl_reginitcolors(pTHX)
5193 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5195 char *t = savepv(s);
5199 t = strchr(t, '\t');
5205 PL_colors[i] = t = (char *)"";
5210 PL_colors[i++] = (char *)"";
5217 #ifdef TRIE_STUDY_OPT
5218 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5221 (data.flags & SCF_TRIE_RESTUDY) \
5229 #define CHECK_RESTUDY_GOTO_butfirst
5233 * pregcomp - compile a regular expression into internal code
5235 * Decides which engine's compiler to call based on the hint currently in
5239 #ifndef PERL_IN_XSUB_RE
5241 /* return the currently in-scope regex engine (or the default if none) */
5243 regexp_engine const *
5244 Perl_current_re_engine(pTHX)
5248 if (IN_PERL_COMPILETIME) {
5249 HV * const table = GvHV(PL_hintgv);
5252 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5253 return &PL_core_reg_engine;
5254 ptr = hv_fetchs(table, "regcomp", FALSE);
5255 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5256 return &PL_core_reg_engine;
5257 return INT2PTR(regexp_engine*,SvIV(*ptr));
5261 if (!PL_curcop->cop_hints_hash)
5262 return &PL_core_reg_engine;
5263 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5264 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5265 return &PL_core_reg_engine;
5266 return INT2PTR(regexp_engine*,SvIV(ptr));
5272 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5275 regexp_engine const *eng = current_re_engine();
5276 GET_RE_DEBUG_FLAGS_DECL;
5278 PERL_ARGS_ASSERT_PREGCOMP;
5280 /* Dispatch a request to compile a regexp to correct regexp engine. */
5282 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5285 return CALLREGCOMP_ENG(eng, pattern, flags);
5289 /* public(ish) entry point for the perl core's own regex compiling code.
5290 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5291 * pattern rather than a list of OPs, and uses the internal engine rather
5292 * than the current one */
5295 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5297 SV *pat = pattern; /* defeat constness! */
5298 PERL_ARGS_ASSERT_RE_COMPILE;
5299 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5300 #ifdef PERL_IN_XSUB_RE
5303 &PL_core_reg_engine,
5305 NULL, NULL, rx_flags, 0);
5309 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5310 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5311 * point to the realloced string and length.
5313 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5317 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5318 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5320 U8 *const src = (U8*)*pat_p;
5323 STRLEN s = 0, d = 0;
5325 GET_RE_DEBUG_FLAGS_DECL;
5327 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5328 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5330 Newx(dst, *plen_p * 2 + 1, U8);
5332 while (s < *plen_p) {
5333 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5336 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5337 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5339 if (n < num_code_blocks) {
5340 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5341 pRExC_state->code_blocks[n].start = d;
5342 assert(dst[d] == '(');
5345 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5346 pRExC_state->code_blocks[n].end = d;
5347 assert(dst[d] == ')');
5357 *pat_p = (char*) dst;
5359 RExC_orig_utf8 = RExC_utf8 = 1;
5364 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5365 * while recording any code block indices, and handling overloading,
5366 * nested qr// objects etc. If pat is null, it will allocate a new
5367 * string, or just return the first arg, if there's only one.
5369 * Returns the malloced/updated pat.
5370 * patternp and pat_count is the array of SVs to be concatted;
5371 * oplist is the optional list of ops that generated the SVs;
5372 * recompile_p is a pointer to a boolean that will be set if
5373 * the regex will need to be recompiled.
5374 * delim, if non-null is an SV that will be inserted between each element
5378 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5379 SV *pat, SV ** const patternp, int pat_count,
5380 OP *oplist, bool *recompile_p, SV *delim)
5384 bool use_delim = FALSE;
5385 bool alloced = FALSE;
5387 /* if we know we have at least two args, create an empty string,
5388 * then concatenate args to that. For no args, return an empty string */
5389 if (!pat && pat_count != 1) {
5390 pat = newSVpvn("", 0);
5395 for (svp = patternp; svp < patternp + pat_count; svp++) {
5398 STRLEN orig_patlen = 0;
5400 SV *msv = use_delim ? delim : *svp;
5401 if (!msv) msv = &PL_sv_undef;
5403 /* if we've got a delimiter, we go round the loop twice for each
5404 * svp slot (except the last), using the delimiter the second
5413 if (SvTYPE(msv) == SVt_PVAV) {
5414 /* we've encountered an interpolated array within
5415 * the pattern, e.g. /...@a..../. Expand the list of elements,
5416 * then recursively append elements.
5417 * The code in this block is based on S_pushav() */
5419 AV *const av = (AV*)msv;
5420 const SSize_t maxarg = AvFILL(av) + 1;
5424 assert(oplist->op_type == OP_PADAV
5425 || oplist->op_type == OP_RV2AV);
5426 oplist = oplist->op_sibling;;
5429 if (SvRMAGICAL(av)) {
5432 Newx(array, maxarg, SV*);
5434 for (i=0; i < maxarg; i++) {
5435 SV ** const svp = av_fetch(av, i, FALSE);
5436 array[i] = svp ? *svp : &PL_sv_undef;
5440 array = AvARRAY(av);
5442 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5443 array, maxarg, NULL, recompile_p,
5445 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5451 /* we make the assumption here that each op in the list of
5452 * op_siblings maps to one SV pushed onto the stack,
5453 * except for code blocks, with have both an OP_NULL and
5455 * This allows us to match up the list of SVs against the
5456 * list of OPs to find the next code block.
5458 * Note that PUSHMARK PADSV PADSV ..
5460 * PADRANGE PADSV PADSV ..
5461 * so the alignment still works. */
5464 if (oplist->op_type == OP_NULL
5465 && (oplist->op_flags & OPf_SPECIAL))
5467 assert(n < pRExC_state->num_code_blocks);
5468 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5469 pRExC_state->code_blocks[n].block = oplist;
5470 pRExC_state->code_blocks[n].src_regex = NULL;
5473 oplist = oplist->op_sibling; /* skip CONST */
5476 oplist = oplist->op_sibling;;
5479 /* apply magic and QR overloading to arg */
5482 if (SvROK(msv) && SvAMAGIC(msv)) {
5483 SV *sv = AMG_CALLunary(msv, regexp_amg);
5487 if (SvTYPE(sv) != SVt_REGEXP)
5488 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5493 /* try concatenation overload ... */
5494 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5495 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5498 /* overloading involved: all bets are off over literal
5499 * code. Pretend we haven't seen it */
5500 pRExC_state->num_code_blocks -= n;
5504 /* ... or failing that, try "" overload */
5505 while (SvAMAGIC(msv)
5506 && (sv = AMG_CALLunary(msv, string_amg))
5510 && SvRV(msv) == SvRV(sv))
5515 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5519 /* this is a partially unrolled
5520 * sv_catsv_nomg(pat, msv);
5521 * that allows us to adjust code block indices if
5524 char *dst = SvPV_force_nomg(pat, dlen);
5526 if (SvUTF8(msv) && !SvUTF8(pat)) {
5527 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5528 sv_setpvn(pat, dst, dlen);
5531 sv_catsv_nomg(pat, msv);
5538 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5541 /* extract any code blocks within any embedded qr//'s */
5542 if (rx && SvTYPE(rx) == SVt_REGEXP
5543 && RX_ENGINE((REGEXP*)rx)->op_comp)
5546 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5547 if (ri->num_code_blocks) {
5549 /* the presence of an embedded qr// with code means
5550 * we should always recompile: the text of the
5551 * qr// may not have changed, but it may be a
5552 * different closure than last time */
5554 Renew(pRExC_state->code_blocks,
5555 pRExC_state->num_code_blocks + ri->num_code_blocks,
5556 struct reg_code_block);
5557 pRExC_state->num_code_blocks += ri->num_code_blocks;
5559 for (i=0; i < ri->num_code_blocks; i++) {
5560 struct reg_code_block *src, *dst;
5561 STRLEN offset = orig_patlen
5562 + ReANY((REGEXP *)rx)->pre_prefix;
5563 assert(n < pRExC_state->num_code_blocks);
5564 src = &ri->code_blocks[i];
5565 dst = &pRExC_state->code_blocks[n];
5566 dst->start = src->start + offset;
5567 dst->end = src->end + offset;
5568 dst->block = src->block;
5569 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5578 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5587 /* see if there are any run-time code blocks in the pattern.
5588 * False positives are allowed */
5591 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5592 char *pat, STRLEN plen)
5597 for (s = 0; s < plen; s++) {
5598 if (n < pRExC_state->num_code_blocks
5599 && s == pRExC_state->code_blocks[n].start)
5601 s = pRExC_state->code_blocks[n].end;
5605 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5607 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5609 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5616 /* Handle run-time code blocks. We will already have compiled any direct
5617 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5618 * copy of it, but with any literal code blocks blanked out and
5619 * appropriate chars escaped; then feed it into
5621 * eval "qr'modified_pattern'"
5625 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5629 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5631 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5632 * and merge them with any code blocks of the original regexp.
5634 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5635 * instead, just save the qr and return FALSE; this tells our caller that
5636 * the original pattern needs upgrading to utf8.
5640 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5641 char *pat, STRLEN plen)
5645 GET_RE_DEBUG_FLAGS_DECL;
5647 if (pRExC_state->runtime_code_qr) {
5648 /* this is the second time we've been called; this should
5649 * only happen if the main pattern got upgraded to utf8
5650 * during compilation; re-use the qr we compiled first time
5651 * round (which should be utf8 too)
5653 qr = pRExC_state->runtime_code_qr;
5654 pRExC_state->runtime_code_qr = NULL;
5655 assert(RExC_utf8 && SvUTF8(qr));
5661 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5665 /* determine how many extra chars we need for ' and \ escaping */
5666 for (s = 0; s < plen; s++) {
5667 if (pat[s] == '\'' || pat[s] == '\\')
5671 Newx(newpat, newlen, char);
5673 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5675 for (s = 0; s < plen; s++) {
5676 if (n < pRExC_state->num_code_blocks
5677 && s == pRExC_state->code_blocks[n].start)
5679 /* blank out literal code block */
5680 assert(pat[s] == '(');
5681 while (s <= pRExC_state->code_blocks[n].end) {
5689 if (pat[s] == '\'' || pat[s] == '\\')
5694 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5698 PerlIO_printf(Perl_debug_log,
5699 "%sre-parsing pattern for runtime code:%s %s\n",
5700 PL_colors[4],PL_colors[5],newpat);
5703 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5709 PUSHSTACKi(PERLSI_REQUIRE);
5710 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5711 * parsing qr''; normally only q'' does this. It also alters
5713 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5714 SvREFCNT_dec_NN(sv);
5719 SV * const errsv = ERRSV;
5720 if (SvTRUE_NN(errsv))
5722 Safefree(pRExC_state->code_blocks);
5723 /* use croak_sv ? */
5724 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5727 assert(SvROK(qr_ref));
5729 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5730 /* the leaving below frees the tmp qr_ref.
5731 * Give qr a life of its own */
5739 if (!RExC_utf8 && SvUTF8(qr)) {
5740 /* first time through; the pattern got upgraded; save the
5741 * qr for the next time through */
5742 assert(!pRExC_state->runtime_code_qr);
5743 pRExC_state->runtime_code_qr = qr;
5748 /* extract any code blocks within the returned qr// */
5751 /* merge the main (r1) and run-time (r2) code blocks into one */
5753 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5754 struct reg_code_block *new_block, *dst;
5755 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5758 if (!r2->num_code_blocks) /* we guessed wrong */
5760 SvREFCNT_dec_NN(qr);
5765 r1->num_code_blocks + r2->num_code_blocks,
5766 struct reg_code_block);
5769 while ( i1 < r1->num_code_blocks
5770 || i2 < r2->num_code_blocks)
5772 struct reg_code_block *src;
5775 if (i1 == r1->num_code_blocks) {
5776 src = &r2->code_blocks[i2++];
5779 else if (i2 == r2->num_code_blocks)
5780 src = &r1->code_blocks[i1++];
5781 else if ( r1->code_blocks[i1].start
5782 < r2->code_blocks[i2].start)
5784 src = &r1->code_blocks[i1++];
5785 assert(src->end < r2->code_blocks[i2].start);
5788 assert( r1->code_blocks[i1].start
5789 > r2->code_blocks[i2].start);
5790 src = &r2->code_blocks[i2++];
5792 assert(src->end < r1->code_blocks[i1].start);
5795 assert(pat[src->start] == '(');
5796 assert(pat[src->end] == ')');
5797 dst->start = src->start;
5798 dst->end = src->end;
5799 dst->block = src->block;
5800 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5804 r1->num_code_blocks += r2->num_code_blocks;
5805 Safefree(r1->code_blocks);
5806 r1->code_blocks = new_block;
5809 SvREFCNT_dec_NN(qr);
5815 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5816 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5818 /* This is the common code for setting up the floating and fixed length
5819 * string data extracted from Perl_re_op_compile() below. Returns a boolean
5820 * as to whether succeeded or not */
5825 if (! (longest_length
5826 || (eol /* Can't have SEOL and MULTI */
5827 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5829 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5830 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5835 /* copy the information about the longest from the reg_scan_data
5836 over to the program. */
5837 if (SvUTF8(sv_longest)) {
5838 *rx_utf8 = sv_longest;
5841 *rx_substr = sv_longest;
5844 /* end_shift is how many chars that must be matched that
5845 follow this item. We calculate it ahead of time as once the
5846 lookbehind offset is added in we lose the ability to correctly
5848 ml = minlen ? *(minlen) : (SSize_t)longest_length;
5849 *rx_end_shift = ml - offset
5850 - longest_length + (SvTAIL(sv_longest) != 0)
5853 t = (eol/* Can't have SEOL and MULTI */
5854 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5855 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5861 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5862 * regular expression into internal code.
5863 * The pattern may be passed either as:
5864 * a list of SVs (patternp plus pat_count)
5865 * a list of OPs (expr)
5866 * If both are passed, the SV list is used, but the OP list indicates
5867 * which SVs are actually pre-compiled code blocks
5869 * The SVs in the list have magic and qr overloading applied to them (and
5870 * the list may be modified in-place with replacement SVs in the latter
5873 * If the pattern hasn't changed from old_re, then old_re will be
5876 * eng is the current engine. If that engine has an op_comp method, then
5877 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5878 * do the initial concatenation of arguments and pass on to the external
5881 * If is_bare_re is not null, set it to a boolean indicating whether the
5882 * arg list reduced (after overloading) to a single bare regex which has
5883 * been returned (i.e. /$qr/).
5885 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5887 * pm_flags contains the PMf_* flags, typically based on those from the
5888 * pm_flags field of the related PMOP. Currently we're only interested in
5889 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5891 * We can't allocate space until we know how big the compiled form will be,
5892 * but we can't compile it (and thus know how big it is) until we've got a
5893 * place to put the code. So we cheat: we compile it twice, once with code
5894 * generation turned off and size counting turned on, and once "for real".
5895 * This also means that we don't allocate space until we are sure that the
5896 * thing really will compile successfully, and we never have to move the
5897 * code and thus invalidate pointers into it. (Note that it has to be in
5898 * one piece because free() must be able to free it all.) [NB: not true in perl]
5900 * Beware that the optimization-preparation code in here knows about some
5901 * of the structure of the compiled regexp. [I'll say.]
5905 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5906 OP *expr, const regexp_engine* eng, REGEXP *old_re,
5907 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5912 regexp_internal *ri;
5920 SV *code_blocksv = NULL;
5921 SV** new_patternp = patternp;
5923 /* these are all flags - maybe they should be turned
5924 * into a single int with different bit masks */
5925 I32 sawlookahead = 0;
5930 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5932 bool runtime_code = 0;
5934 RExC_state_t RExC_state;
5935 RExC_state_t * const pRExC_state = &RExC_state;
5936 #ifdef TRIE_STUDY_OPT
5938 RExC_state_t copyRExC_state;
5940 GET_RE_DEBUG_FLAGS_DECL;
5942 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5944 DEBUG_r(if (!PL_colorset) reginitcolors());
5946 #ifndef PERL_IN_XSUB_RE
5947 /* Initialize these here instead of as-needed, as is quick and avoids
5948 * having to test them each time otherwise */
5949 if (! PL_AboveLatin1) {
5950 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5951 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5952 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5954 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5955 PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5956 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5958 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5959 = _new_invlist_C_array(L1PosixAlnum_invlist);
5960 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5961 = _new_invlist_C_array(PosixAlnum_invlist);
5963 PL_L1Posix_ptrs[_CC_ALPHA]
5964 = _new_invlist_C_array(L1PosixAlpha_invlist);
5965 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5967 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5968 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5970 /* Cased is the same as Alpha in the ASCII range */
5971 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5972 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5974 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5975 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5977 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5978 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5980 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5981 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5983 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5984 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5986 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5987 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5989 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5990 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5992 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5993 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5994 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5995 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5997 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5998 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
6000 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
6002 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
6003 PL_L1Posix_ptrs[_CC_WORDCHAR]
6004 = _new_invlist_C_array(L1PosixWord_invlist);
6006 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
6007 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
6009 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
6013 pRExC_state->code_blocks = NULL;
6014 pRExC_state->num_code_blocks = 0;
6017 *is_bare_re = FALSE;
6019 if (expr && (expr->op_type == OP_LIST ||
6020 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6021 /* allocate code_blocks if needed */
6025 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6026 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6027 ncode++; /* count of DO blocks */
6029 pRExC_state->num_code_blocks = ncode;
6030 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6035 /* compile-time pattern with just OP_CONSTs and DO blocks */
6040 /* find how many CONSTs there are */
6043 if (expr->op_type == OP_CONST)
6046 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6047 if (o->op_type == OP_CONST)
6051 /* fake up an SV array */
6053 assert(!new_patternp);
6054 Newx(new_patternp, n, SV*);
6055 SAVEFREEPV(new_patternp);
6059 if (expr->op_type == OP_CONST)
6060 new_patternp[n] = cSVOPx_sv(expr);
6062 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6063 if (o->op_type == OP_CONST)
6064 new_patternp[n++] = cSVOPo_sv;
6069 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6070 "Assembling pattern from %d elements%s\n", pat_count,
6071 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6073 /* set expr to the first arg op */
6075 if (pRExC_state->num_code_blocks
6076 && expr->op_type != OP_CONST)
6078 expr = cLISTOPx(expr)->op_first;
6079 assert( expr->op_type == OP_PUSHMARK
6080 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6081 || expr->op_type == OP_PADRANGE);
6082 expr = expr->op_sibling;
6085 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6086 expr, &recompile, NULL);
6088 /* handle bare (possibly after overloading) regex: foo =~ $re */
6093 if (SvTYPE(re) == SVt_REGEXP) {
6097 Safefree(pRExC_state->code_blocks);
6098 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6099 "Precompiled pattern%s\n",
6100 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6106 exp = SvPV_nomg(pat, plen);
6108 if (!eng->op_comp) {
6109 if ((SvUTF8(pat) && IN_BYTES)
6110 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6112 /* make a temporary copy; either to convert to bytes,
6113 * or to avoid repeating get-magic / overloaded stringify */
6114 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6115 (IN_BYTES ? 0 : SvUTF8(pat)));
6117 Safefree(pRExC_state->code_blocks);
6118 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6121 /* ignore the utf8ness if the pattern is 0 length */
6122 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6123 RExC_uni_semantics = 0;
6124 RExC_contains_locale = 0;
6125 RExC_contains_i = 0;
6126 pRExC_state->runtime_code_qr = NULL;
6129 SV *dsv= sv_newmortal();
6130 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6131 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6132 PL_colors[4],PL_colors[5],s);
6136 /* we jump here if we upgrade the pattern to utf8 and have to
6139 if ((pm_flags & PMf_USE_RE_EVAL)
6140 /* this second condition covers the non-regex literal case,
6141 * i.e. $foo =~ '(?{})'. */
6142 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6144 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6146 /* return old regex if pattern hasn't changed */
6147 /* XXX: note in the below we have to check the flags as well as the pattern.
6149 * Things get a touch tricky as we have to compare the utf8 flag independently
6150 * from the compile flags.
6155 && !!RX_UTF8(old_re) == !!RExC_utf8
6156 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6157 && RX_PRECOMP(old_re)
6158 && RX_PRELEN(old_re) == plen
6159 && memEQ(RX_PRECOMP(old_re), exp, plen)
6160 && !runtime_code /* with runtime code, always recompile */ )
6162 Safefree(pRExC_state->code_blocks);
6166 rx_flags = orig_rx_flags;
6168 if (rx_flags & PMf_FOLD) {
6169 RExC_contains_i = 1;
6171 if (initial_charset == REGEX_LOCALE_CHARSET) {
6172 RExC_contains_locale = 1;
6174 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6176 /* Set to use unicode semantics if the pattern is in utf8 and has the
6177 * 'depends' charset specified, as it means unicode when utf8 */
6178 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6182 RExC_flags = rx_flags;
6183 RExC_pm_flags = pm_flags;
6186 if (TAINTING_get && TAINT_get)
6187 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6189 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6190 /* whoops, we have a non-utf8 pattern, whilst run-time code
6191 * got compiled as utf8. Try again with a utf8 pattern */
6192 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6193 pRExC_state->num_code_blocks);
6194 goto redo_first_pass;
6197 assert(!pRExC_state->runtime_code_qr);
6202 RExC_in_lookbehind = 0;
6203 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6205 RExC_override_recoding = 0;
6206 RExC_in_multi_char_class = 0;
6208 /* First pass: determine size, legality. */
6211 RExC_end = exp + plen;
6216 RExC_emit = (regnode *) &RExC_emit_dummy;
6217 RExC_whilem_seen = 0;
6218 RExC_open_parens = NULL;
6219 RExC_close_parens = NULL;
6221 RExC_paren_names = NULL;
6223 RExC_paren_name_list = NULL;
6225 RExC_recurse = NULL;
6226 RExC_study_chunk_recursed = NULL;
6227 RExC_study_chunk_recursed_bytes= 0;
6228 RExC_recurse_count = 0;
6229 pRExC_state->code_index = 0;
6231 #if 0 /* REGC() is (currently) a NOP at the first pass.
6232 * Clever compilers notice this and complain. --jhi */
6233 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6236 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6238 RExC_lastparse=NULL;
6240 /* reg may croak on us, not giving us a chance to free
6241 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6242 need it to survive as long as the regexp (qr/(?{})/).
6243 We must check that code_blocksv is not already set, because we may
6244 have jumped back to restart the sizing pass. */
6245 if (pRExC_state->code_blocks && !code_blocksv) {
6246 code_blocksv = newSV_type(SVt_PV);
6247 SAVEFREESV(code_blocksv);
6248 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6249 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6251 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6252 /* It's possible to write a regexp in ascii that represents Unicode
6253 codepoints outside of the byte range, such as via \x{100}. If we
6254 detect such a sequence we have to convert the entire pattern to utf8
6255 and then recompile, as our sizing calculation will have been based
6256 on 1 byte == 1 character, but we will need to use utf8 to encode
6257 at least some part of the pattern, and therefore must convert the whole
6260 if (flags & RESTART_UTF8) {
6261 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6262 pRExC_state->num_code_blocks);
6263 goto redo_first_pass;
6265 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6268 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6271 PerlIO_printf(Perl_debug_log,
6272 "Required size %"IVdf" nodes\n"
6273 "Starting second pass (creation)\n",
6276 RExC_lastparse=NULL;
6279 /* The first pass could have found things that force Unicode semantics */
6280 if ((RExC_utf8 || RExC_uni_semantics)
6281 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6283 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6286 /* Small enough for pointer-storage convention?
6287 If extralen==0, this means that we will not need long jumps. */
6288 if (RExC_size >= 0x10000L && RExC_extralen)
6289 RExC_size += RExC_extralen;
6292 if (RExC_whilem_seen > 15)
6293 RExC_whilem_seen = 15;
6295 /* Allocate space and zero-initialize. Note, the two step process
6296 of zeroing when in debug mode, thus anything assigned has to
6297 happen after that */
6298 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6300 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6301 char, regexp_internal);
6302 if ( r == NULL || ri == NULL )
6303 FAIL("Regexp out of space");
6305 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6306 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6308 /* bulk initialize base fields with 0. */
6309 Zero(ri, sizeof(regexp_internal), char);
6312 /* non-zero initialization begins here */
6315 r->extflags = rx_flags;
6316 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6318 if (pm_flags & PMf_IS_QR) {
6319 ri->code_blocks = pRExC_state->code_blocks;
6320 ri->num_code_blocks = pRExC_state->num_code_blocks;
6325 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6326 if (pRExC_state->code_blocks[n].src_regex)
6327 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6328 SAVEFREEPV(pRExC_state->code_blocks);
6332 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6333 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6335 /* The caret is output if there are any defaults: if not all the STD
6336 * flags are set, or if no character set specifier is needed */
6338 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6340 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6341 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6342 >> RXf_PMf_STD_PMMOD_SHIFT);
6343 const char *fptr = STD_PAT_MODS; /*"msix"*/
6345 /* Allocate for the worst case, which is all the std flags are turned
6346 * on. If more precision is desired, we could do a population count of
6347 * the flags set. This could be done with a small lookup table, or by
6348 * shifting, masking and adding, or even, when available, assembly
6349 * language for a machine-language population count.
6350 * We never output a minus, as all those are defaults, so are
6351 * covered by the caret */
6352 const STRLEN wraplen = plen + has_p + has_runon
6353 + has_default /* If needs a caret */
6355 /* If needs a character set specifier */
6356 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6357 + (sizeof(STD_PAT_MODS) - 1)
6358 + (sizeof("(?:)") - 1);
6360 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6361 r->xpv_len_u.xpvlenu_pv = p;
6363 SvFLAGS(rx) |= SVf_UTF8;
6366 /* If a default, cover it using the caret */
6368 *p++= DEFAULT_PAT_MOD;
6372 const char* const name = get_regex_charset_name(r->extflags, &len);
6373 Copy(name, p, len, char);
6377 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6380 while((ch = *fptr++)) {
6388 Copy(RExC_precomp, p, plen, char);
6389 assert ((RX_WRAPPED(rx) - p) < 16);
6390 r->pre_prefix = p - RX_WRAPPED(rx);
6396 SvCUR_set(rx, p - RX_WRAPPED(rx));
6400 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6402 /* setup various meta data about recursion, this all requires
6403 * RExC_npar to be correctly set, and a bit later on we clear it */
6404 if (RExC_seen & REG_SEEN_RECURSE) {
6405 Newxz(RExC_open_parens, RExC_npar,regnode *);
6406 SAVEFREEPV(RExC_open_parens);
6407 Newxz(RExC_close_parens,RExC_npar,regnode *);
6408 SAVEFREEPV(RExC_close_parens);
6410 if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) {
6411 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6412 * So its 1 if there are no parens. */
6413 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6414 ((RExC_npar & 0x07) != 0);
6415 Newx(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6416 SAVEFREEPV(RExC_study_chunk_recursed);
6419 /* Useful during FAIL. */
6420 #ifdef RE_TRACK_PATTERN_OFFSETS
6421 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6422 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6423 "%s %"UVuf" bytes for offset annotations.\n",
6424 ri->u.offsets ? "Got" : "Couldn't get",
6425 (UV)((2*RExC_size+1) * sizeof(U32))));
6427 SetProgLen(ri,RExC_size);
6432 /* Second pass: emit code. */
6433 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6434 RExC_pm_flags = pm_flags;
6436 RExC_end = exp + plen;
6439 RExC_emit_start = ri->program;
6440 RExC_emit = ri->program;
6441 RExC_emit_bound = ri->program + RExC_size + 1;
6442 pRExC_state->code_index = 0;
6444 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6445 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6447 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6449 /* XXXX To minimize changes to RE engine we always allocate
6450 3-units-long substrs field. */
6451 Newx(r->substrs, 1, struct reg_substr_data);
6452 if (RExC_recurse_count) {
6453 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6454 SAVEFREEPV(RExC_recurse);
6458 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6459 Zero(r->substrs, 1, struct reg_substr_data);
6460 if (RExC_study_chunk_recursed)
6461 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6463 #ifdef TRIE_STUDY_OPT
6465 StructCopy(&zero_scan_data, &data, scan_data_t);
6466 copyRExC_state = RExC_state;
6469 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6471 RExC_state = copyRExC_state;
6472 if (seen & REG_TOP_LEVEL_BRANCHES)
6473 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6475 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6476 StructCopy(&zero_scan_data, &data, scan_data_t);
6479 StructCopy(&zero_scan_data, &data, scan_data_t);
6482 /* Dig out information for optimizations. */
6483 r->extflags = RExC_flags; /* was pm_op */
6484 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6487 SvUTF8_on(rx); /* Unicode in it? */
6488 ri->regstclass = NULL;
6489 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6490 r->intflags |= PREGf_NAUGHTY;
6491 scan = ri->program + 1; /* First BRANCH. */
6493 /* testing for BRANCH here tells us whether there is "must appear"
6494 data in the pattern. If there is then we can use it for optimisations */
6495 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6497 STRLEN longest_float_length, longest_fixed_length;
6498 regnode_ssc ch_class; /* pointed to by data */
6500 SSize_t last_close = 0; /* pointed to by data */
6501 regnode *first= scan;
6502 regnode *first_next= regnext(first);
6504 * Skip introductions and multiplicators >= 1
6505 * so that we can extract the 'meat' of the pattern that must
6506 * match in the large if() sequence following.
6507 * NOTE that EXACT is NOT covered here, as it is normally
6508 * picked up by the optimiser separately.
6510 * This is unfortunate as the optimiser isnt handling lookahead
6511 * properly currently.
6514 while ((OP(first) == OPEN && (sawopen = 1)) ||
6515 /* An OR of *one* alternative - should not happen now. */
6516 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6517 /* for now we can't handle lookbehind IFMATCH*/
6518 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6519 (OP(first) == PLUS) ||
6520 (OP(first) == MINMOD) ||
6521 /* An {n,m} with n>0 */
6522 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6523 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6526 * the only op that could be a regnode is PLUS, all the rest
6527 * will be regnode_1 or regnode_2.
6529 * (yves doesn't think this is true)
6531 if (OP(first) == PLUS)
6534 if (OP(first) == MINMOD)
6536 first += regarglen[OP(first)];
6538 first = NEXTOPER(first);
6539 first_next= regnext(first);
6542 /* Starting-point info. */
6544 DEBUG_PEEP("first:",first,0);
6545 /* Ignore EXACT as we deal with it later. */
6546 if (PL_regkind[OP(first)] == EXACT) {
6547 if (OP(first) == EXACT)
6548 NOOP; /* Empty, get anchored substr later. */
6550 ri->regstclass = first;
6553 else if (PL_regkind[OP(first)] == TRIE &&
6554 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6557 /* this can happen only on restudy */
6558 if ( OP(first) == TRIE ) {
6559 struct regnode_1 *trieop = (struct regnode_1 *)
6560 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6561 StructCopy(first,trieop,struct regnode_1);
6562 trie_op=(regnode *)trieop;
6564 struct regnode_charclass *trieop = (struct regnode_charclass *)
6565 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6566 StructCopy(first,trieop,struct regnode_charclass);
6567 trie_op=(regnode *)trieop;
6570 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6571 ri->regstclass = trie_op;
6574 else if (REGNODE_SIMPLE(OP(first)))
6575 ri->regstclass = first;
6576 else if (PL_regkind[OP(first)] == BOUND ||
6577 PL_regkind[OP(first)] == NBOUND)
6578 ri->regstclass = first;
6579 else if (PL_regkind[OP(first)] == BOL) {
6580 r->extflags |= (OP(first) == MBOL
6582 : (OP(first) == SBOL
6585 first = NEXTOPER(first);
6588 else if (OP(first) == GPOS) {
6589 r->extflags |= RXf_ANCH_GPOS;
6590 first = NEXTOPER(first);
6593 else if ((!sawopen || !RExC_sawback) &&
6594 (OP(first) == STAR &&
6595 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6596 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6598 /* turn .* into ^.* with an implied $*=1 */
6600 (OP(NEXTOPER(first)) == REG_ANY)
6603 r->extflags |= type;
6604 r->intflags |= PREGf_IMPLICIT;
6605 first = NEXTOPER(first);
6608 if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6609 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6610 /* x+ must match at the 1st pos of run of x's */
6611 r->intflags |= PREGf_SKIP;
6613 /* Scan is after the zeroth branch, first is atomic matcher. */
6614 #ifdef TRIE_STUDY_OPT
6617 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6618 (IV)(first - scan + 1))
6622 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6623 (IV)(first - scan + 1))
6629 * If there's something expensive in the r.e., find the
6630 * longest literal string that must appear and make it the
6631 * regmust. Resolve ties in favor of later strings, since
6632 * the regstart check works with the beginning of the r.e.
6633 * and avoiding duplication strengthens checking. Not a
6634 * strong reason, but sufficient in the absence of others.
6635 * [Now we resolve ties in favor of the earlier string if
6636 * it happens that c_offset_min has been invalidated, since the
6637 * earlier string may buy us something the later one won't.]
6640 data.longest_fixed = newSVpvs("");
6641 data.longest_float = newSVpvs("");
6642 data.last_found = newSVpvs("");
6643 data.longest = &(data.longest_fixed);
6644 ENTER_with_name("study_chunk");
6645 SAVEFREESV(data.longest_fixed);
6646 SAVEFREESV(data.longest_float);
6647 SAVEFREESV(data.last_found);
6649 if (!ri->regstclass) {
6650 ssc_init(pRExC_state, &ch_class);
6651 data.start_class = &ch_class;
6652 stclass_flag = SCF_DO_STCLASS_AND;
6653 } else /* XXXX Check for BOUND? */
6655 data.last_closep = &last_close;
6658 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6660 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6661 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6665 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6668 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6669 && data.last_start_min == 0 && data.last_end > 0
6670 && !RExC_seen_zerolen
6671 && !(RExC_seen & REG_SEEN_VERBARG)
6672 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6673 r->extflags |= RXf_CHECK_ALL;
6674 scan_commit(pRExC_state, &data,&minlen,0);
6676 longest_float_length = CHR_SVLEN(data.longest_float);
6678 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6679 && data.offset_fixed == data.offset_float_min
6680 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6681 && S_setup_longest (aTHX_ pRExC_state,
6685 &(r->float_end_shift),
6686 data.lookbehind_float,
6687 data.offset_float_min,
6689 longest_float_length,
6690 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6691 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6693 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6694 r->float_max_offset = data.offset_float_max;
6695 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6696 r->float_max_offset -= data.lookbehind_float;
6697 SvREFCNT_inc_simple_void_NN(data.longest_float);
6700 r->float_substr = r->float_utf8 = NULL;
6701 longest_float_length = 0;
6704 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6706 if (S_setup_longest (aTHX_ pRExC_state,
6708 &(r->anchored_utf8),
6709 &(r->anchored_substr),
6710 &(r->anchored_end_shift),
6711 data.lookbehind_fixed,
6714 longest_fixed_length,
6715 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6716 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6718 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6719 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6722 r->anchored_substr = r->anchored_utf8 = NULL;
6723 longest_fixed_length = 0;
6725 LEAVE_with_name("study_chunk");
6728 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6729 ri->regstclass = NULL;
6731 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6733 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6734 && !ssc_is_anything(data.start_class))
6736 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6738 ssc_finalize(pRExC_state, data.start_class);
6740 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6741 StructCopy(data.start_class,
6742 (regnode_ssc*)RExC_rxi->data->data[n],
6744 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6745 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6746 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6747 regprop(r, sv, (regnode*)data.start_class);
6748 PerlIO_printf(Perl_debug_log,
6749 "synthetic stclass \"%s\".\n",
6750 SvPVX_const(sv));});
6751 data.start_class = NULL;
6754 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6755 if (longest_fixed_length > longest_float_length) {
6756 r->check_end_shift = r->anchored_end_shift;
6757 r->check_substr = r->anchored_substr;
6758 r->check_utf8 = r->anchored_utf8;
6759 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6760 if (r->extflags & RXf_ANCH_SINGLE)
6761 r->extflags |= RXf_NOSCAN;
6764 r->check_end_shift = r->float_end_shift;
6765 r->check_substr = r->float_substr;
6766 r->check_utf8 = r->float_utf8;
6767 r->check_offset_min = r->float_min_offset;
6768 r->check_offset_max = r->float_max_offset;
6770 if ((r->check_substr || r->check_utf8) ) {
6771 r->extflags |= RXf_USE_INTUIT;
6772 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6773 r->extflags |= RXf_INTUIT_TAIL;
6775 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6776 if ( (STRLEN)minlen < longest_float_length )
6777 minlen= longest_float_length;
6778 if ( (STRLEN)minlen < longest_fixed_length )
6779 minlen= longest_fixed_length;
6783 /* Several toplevels. Best we can is to set minlen. */
6785 regnode_ssc ch_class;
6786 SSize_t last_close = 0;
6788 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6790 scan = ri->program + 1;
6791 ssc_init(pRExC_state, &ch_class);
6792 data.start_class = &ch_class;
6793 data.last_closep = &last_close;
6796 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6798 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6799 |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6802 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6804 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6805 = r->float_substr = r->float_utf8 = NULL;
6807 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6808 && ! ssc_is_anything(data.start_class))
6810 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6812 ssc_finalize(pRExC_state, data.start_class);
6814 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6815 StructCopy(data.start_class,
6816 (regnode_ssc*)RExC_rxi->data->data[n],
6818 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6819 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6820 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6821 regprop(r, sv, (regnode*)data.start_class);
6822 PerlIO_printf(Perl_debug_log,
6823 "synthetic stclass \"%s\".\n",
6824 SvPVX_const(sv));});
6825 data.start_class = NULL;
6829 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6830 the "real" pattern. */
6832 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6833 (IV)minlen, (IV)r->minlen);
6835 r->minlenret = minlen;
6836 if (r->minlen < minlen)
6839 if (RExC_seen & REG_SEEN_GPOS)
6840 r->extflags |= RXf_GPOS_SEEN;
6841 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6842 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6843 if (pRExC_state->num_code_blocks)
6844 r->extflags |= RXf_EVAL_SEEN;
6845 if (RExC_seen & REG_SEEN_CANY)
6846 r->extflags |= RXf_CANY_SEEN;
6847 if (RExC_seen & REG_SEEN_VERBARG)
6849 r->intflags |= PREGf_VERBARG_SEEN;
6850 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6852 if (RExC_seen & REG_SEEN_CUTGROUP)
6853 r->intflags |= PREGf_CUTGROUP_SEEN;
6854 if (pm_flags & PMf_USE_RE_EVAL)
6855 r->intflags |= PREGf_USE_RE_EVAL;
6856 if (RExC_paren_names)
6857 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6859 RXp_PAREN_NAMES(r) = NULL;
6862 regnode *first = ri->program + 1;
6864 regnode *next = NEXTOPER(first);
6867 if (PL_regkind[fop] == NOTHING && nop == END)
6868 r->extflags |= RXf_NULL;
6869 else if (PL_regkind[fop] == BOL && nop == END)
6870 r->extflags |= RXf_START_ONLY;
6871 else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6872 r->extflags |= RXf_WHITE;
6873 else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6874 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6878 if (RExC_paren_names) {
6879 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6880 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6883 ri->name_list_idx = 0;
6885 if (RExC_recurse_count) {
6886 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6887 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6888 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6891 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6892 /* assume we don't need to swap parens around before we match */
6896 PerlIO_printf(Perl_debug_log,"Final program:\n");
6899 #ifdef RE_TRACK_PATTERN_OFFSETS
6900 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6901 const STRLEN len = ri->u.offsets[0];
6903 GET_RE_DEBUG_FLAGS_DECL;
6904 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6905 for (i = 1; i <= len; i++) {
6906 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6907 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6908 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6910 PerlIO_printf(Perl_debug_log, "\n");
6915 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6916 * by setting the regexp SV to readonly-only instead. If the
6917 * pattern's been recompiled, the USEDness should remain. */
6918 if (old_re && SvREADONLY(old_re))
6926 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6929 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6931 PERL_UNUSED_ARG(value);
6933 if (flags & RXapif_FETCH) {
6934 return reg_named_buff_fetch(rx, key, flags);
6935 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6936 Perl_croak_no_modify();
6938 } else if (flags & RXapif_EXISTS) {
6939 return reg_named_buff_exists(rx, key, flags)
6942 } else if (flags & RXapif_REGNAMES) {
6943 return reg_named_buff_all(rx, flags);
6944 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6945 return reg_named_buff_scalar(rx, flags);
6947 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6953 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6956 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6957 PERL_UNUSED_ARG(lastkey);
6959 if (flags & RXapif_FIRSTKEY)
6960 return reg_named_buff_firstkey(rx, flags);
6961 else if (flags & RXapif_NEXTKEY)
6962 return reg_named_buff_nextkey(rx, flags);
6964 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6970 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6973 AV *retarray = NULL;
6975 struct regexp *const rx = ReANY(r);
6977 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6979 if (flags & RXapif_ALL)
6982 if (rx && RXp_PAREN_NAMES(rx)) {
6983 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6986 SV* sv_dat=HeVAL(he_str);
6987 I32 *nums=(I32*)SvPVX(sv_dat);
6988 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6989 if ((I32)(rx->nparens) >= nums[i]
6990 && rx->offs[nums[i]].start != -1
6991 && rx->offs[nums[i]].end != -1)
6994 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6999 ret = newSVsv(&PL_sv_undef);
7002 av_push(retarray, ret);
7005 return newRV_noinc(MUTABLE_SV(retarray));
7012 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7015 struct regexp *const rx = ReANY(r);
7017 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7019 if (rx && RXp_PAREN_NAMES(rx)) {
7020 if (flags & RXapif_ALL) {
7021 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7023 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7025 SvREFCNT_dec_NN(sv);
7037 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7039 struct regexp *const rx = ReANY(r);
7041 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7043 if ( rx && RXp_PAREN_NAMES(rx) ) {
7044 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7046 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7053 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7055 struct regexp *const rx = ReANY(r);
7056 GET_RE_DEBUG_FLAGS_DECL;
7058 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7060 if (rx && RXp_PAREN_NAMES(rx)) {
7061 HV *hv = RXp_PAREN_NAMES(rx);
7063 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7066 SV* sv_dat = HeVAL(temphe);
7067 I32 *nums = (I32*)SvPVX(sv_dat);
7068 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7069 if ((I32)(rx->lastparen) >= nums[i] &&
7070 rx->offs[nums[i]].start != -1 &&
7071 rx->offs[nums[i]].end != -1)
7077 if (parno || flags & RXapif_ALL) {
7078 return newSVhek(HeKEY_hek(temphe));
7086 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7091 struct regexp *const rx = ReANY(r);
7093 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7095 if (rx && RXp_PAREN_NAMES(rx)) {
7096 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7097 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7098 } else if (flags & RXapif_ONE) {
7099 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7100 av = MUTABLE_AV(SvRV(ret));
7101 length = av_len(av);
7102 SvREFCNT_dec_NN(ret);
7103 return newSViv(length + 1);
7105 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
7109 return &PL_sv_undef;
7113 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7115 struct regexp *const rx = ReANY(r);
7118 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7120 if (rx && RXp_PAREN_NAMES(rx)) {
7121 HV *hv= RXp_PAREN_NAMES(rx);
7123 (void)hv_iterinit(hv);
7124 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7127 SV* sv_dat = HeVAL(temphe);
7128 I32 *nums = (I32*)SvPVX(sv_dat);
7129 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7130 if ((I32)(rx->lastparen) >= nums[i] &&
7131 rx->offs[nums[i]].start != -1 &&
7132 rx->offs[nums[i]].end != -1)
7138 if (parno || flags & RXapif_ALL) {
7139 av_push(av, newSVhek(HeKEY_hek(temphe)));
7144 return newRV_noinc(MUTABLE_SV(av));
7148 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7151 struct regexp *const rx = ReANY(r);
7157 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7159 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7160 || n == RX_BUFF_IDX_CARET_FULLMATCH
7161 || n == RX_BUFF_IDX_CARET_POSTMATCH
7164 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7166 /* on something like
7169 * the KEEPCOPY is set on the PMOP rather than the regex */
7170 if (PL_curpm && r == PM_GETRE(PL_curpm))
7171 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7180 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7181 /* no need to distinguish between them any more */
7182 n = RX_BUFF_IDX_FULLMATCH;
7184 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7185 && rx->offs[0].start != -1)
7187 /* $`, ${^PREMATCH} */
7188 i = rx->offs[0].start;
7192 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7193 && rx->offs[0].end != -1)
7195 /* $', ${^POSTMATCH} */
7196 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7197 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7200 if ( 0 <= n && n <= (I32)rx->nparens &&
7201 (s1 = rx->offs[n].start) != -1 &&
7202 (t1 = rx->offs[n].end) != -1)
7204 /* $&, ${^MATCH}, $1 ... */
7206 s = rx->subbeg + s1 - rx->suboffset;
7211 assert(s >= rx->subbeg);
7212 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7214 #if NO_TAINT_SUPPORT
7215 sv_setpvn(sv, s, i);
7217 const int oldtainted = TAINT_get;
7219 sv_setpvn(sv, s, i);
7220 TAINT_set(oldtainted);
7222 if ( (rx->extflags & RXf_CANY_SEEN)
7223 ? (RXp_MATCH_UTF8(rx)
7224 && (!i || is_utf8_string((U8*)s, i)))
7225 : (RXp_MATCH_UTF8(rx)) )
7232 if (RXp_MATCH_TAINTED(rx)) {
7233 if (SvTYPE(sv) >= SVt_PVMG) {
7234 MAGIC* const mg = SvMAGIC(sv);
7237 SvMAGIC_set(sv, mg->mg_moremagic);
7239 if ((mgt = SvMAGIC(sv))) {
7240 mg->mg_moremagic = mgt;
7241 SvMAGIC_set(sv, mg);
7252 sv_setsv(sv,&PL_sv_undef);
7258 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7259 SV const * const value)
7261 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7263 PERL_UNUSED_ARG(rx);
7264 PERL_UNUSED_ARG(paren);
7265 PERL_UNUSED_ARG(value);
7268 Perl_croak_no_modify();
7272 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7275 struct regexp *const rx = ReANY(r);
7279 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7281 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7282 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7283 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7286 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7288 /* on something like
7291 * the KEEPCOPY is set on the PMOP rather than the regex */
7292 if (PL_curpm && r == PM_GETRE(PL_curpm))
7293 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7299 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7301 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7302 case RX_BUFF_IDX_PREMATCH: /* $` */
7303 if (rx->offs[0].start != -1) {
7304 i = rx->offs[0].start;
7313 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7314 case RX_BUFF_IDX_POSTMATCH: /* $' */
7315 if (rx->offs[0].end != -1) {
7316 i = rx->sublen - rx->offs[0].end;
7318 s1 = rx->offs[0].end;
7325 default: /* $& / ${^MATCH}, $1, $2, ... */
7326 if (paren <= (I32)rx->nparens &&
7327 (s1 = rx->offs[paren].start) != -1 &&
7328 (t1 = rx->offs[paren].end) != -1)
7334 if (ckWARN(WARN_UNINITIALIZED))
7335 report_uninit((const SV *)sv);
7340 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7341 const char * const s = rx->subbeg - rx->suboffset + s1;
7346 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7353 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7355 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7356 PERL_UNUSED_ARG(rx);
7360 return newSVpvs("Regexp");
7363 /* Scans the name of a named buffer from the pattern.
7364 * If flags is REG_RSN_RETURN_NULL returns null.
7365 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7366 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7367 * to the parsed name as looked up in the RExC_paren_names hash.
7368 * If there is an error throws a vFAIL().. type exception.
7371 #define REG_RSN_RETURN_NULL 0
7372 #define REG_RSN_RETURN_NAME 1
7373 #define REG_RSN_RETURN_DATA 2
7376 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7378 char *name_start = RExC_parse;
7380 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7382 assert (RExC_parse <= RExC_end);
7383 if (RExC_parse == RExC_end) NOOP;
7384 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7385 /* skip IDFIRST by using do...while */
7388 RExC_parse += UTF8SKIP(RExC_parse);
7389 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7393 } while (isWORDCHAR(*RExC_parse));
7395 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7396 vFAIL("Group name must start with a non-digit word character");
7400 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7401 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7402 if ( flags == REG_RSN_RETURN_NAME)
7404 else if (flags==REG_RSN_RETURN_DATA) {
7407 if ( ! sv_name ) /* should not happen*/
7408 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7409 if (RExC_paren_names)
7410 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7412 sv_dat = HeVAL(he_str);
7414 vFAIL("Reference to nonexistent named group");
7418 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7419 (unsigned long) flags);
7421 assert(0); /* NOT REACHED */
7426 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7427 int rem=(int)(RExC_end - RExC_parse); \
7436 if (RExC_lastparse!=RExC_parse) \
7437 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7440 iscut ? "..." : "<" \
7443 PerlIO_printf(Perl_debug_log,"%16s",""); \
7446 num = RExC_size + 1; \
7448 num=REG_NODE_NUM(RExC_emit); \
7449 if (RExC_lastnum!=num) \
7450 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7452 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7453 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7454 (int)((depth*2)), "", \
7458 RExC_lastparse=RExC_parse; \
7463 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7464 DEBUG_PARSE_MSG((funcname)); \
7465 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7467 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7468 DEBUG_PARSE_MSG((funcname)); \
7469 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7472 /* This section of code defines the inversion list object and its methods. The
7473 * interfaces are highly subject to change, so as much as possible is static to
7474 * this file. An inversion list is here implemented as a malloc'd C UV array
7475 * as an SVt_INVLIST scalar.
7477 * An inversion list for Unicode is an array of code points, sorted by ordinal
7478 * number. The zeroth element is the first code point in the list. The 1th
7479 * element is the first element beyond that not in the list. In other words,
7480 * the first range is
7481 * invlist[0]..(invlist[1]-1)
7482 * The other ranges follow. Thus every element whose index is divisible by two
7483 * marks the beginning of a range that is in the list, and every element not
7484 * divisible by two marks the beginning of a range not in the list. A single
7485 * element inversion list that contains the single code point N generally
7486 * consists of two elements
7489 * (The exception is when N is the highest representable value on the
7490 * machine, in which case the list containing just it would be a single
7491 * element, itself. By extension, if the last range in the list extends to
7492 * infinity, then the first element of that range will be in the inversion list
7493 * at a position that is divisible by two, and is the final element in the
7495 * Taking the complement (inverting) an inversion list is quite simple, if the
7496 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7497 * This implementation reserves an element at the beginning of each inversion
7498 * list to always contain 0; there is an additional flag in the header which
7499 * indicates if the list begins at the 0, or is offset to begin at the next
7502 * More about inversion lists can be found in "Unicode Demystified"
7503 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7504 * More will be coming when functionality is added later.
7506 * The inversion list data structure is currently implemented as an SV pointing
7507 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7508 * array of UV whose memory management is automatically handled by the existing
7509 * facilities for SV's.
7511 * Some of the methods should always be private to the implementation, and some
7512 * should eventually be made public */
7514 /* The header definitions are in F<inline_invlist.c> */
7516 PERL_STATIC_INLINE UV*
7517 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7519 /* Returns a pointer to the first element in the inversion list's array.
7520 * This is called upon initialization of an inversion list. Where the
7521 * array begins depends on whether the list has the code point U+0000 in it
7522 * or not. The other parameter tells it whether the code that follows this
7523 * call is about to put a 0 in the inversion list or not. The first
7524 * element is either the element reserved for 0, if TRUE, or the element
7525 * after it, if FALSE */
7527 bool* offset = get_invlist_offset_addr(invlist);
7528 UV* zero_addr = (UV *) SvPVX(invlist);
7530 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7533 assert(! _invlist_len(invlist));
7537 /* 1^1 = 0; 1^0 = 1 */
7538 *offset = 1 ^ will_have_0;
7539 return zero_addr + *offset;
7542 PERL_STATIC_INLINE UV*
7543 S_invlist_array(pTHX_ SV* const invlist)
7545 /* Returns the pointer to the inversion list's array. Every time the
7546 * length changes, this needs to be called in case malloc or realloc moved
7549 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7551 /* Must not be empty. If these fail, you probably didn't check for <len>
7552 * being non-zero before trying to get the array */
7553 assert(_invlist_len(invlist));
7555 /* The very first element always contains zero, The array begins either
7556 * there, or if the inversion list is offset, at the element after it.
7557 * The offset header field determines which; it contains 0 or 1 to indicate
7558 * how much additionally to add */
7559 assert(0 == *(SvPVX(invlist)));
7560 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7563 PERL_STATIC_INLINE void
7564 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7566 /* Sets the current number of elements stored in the inversion list.
7567 * Updates SvCUR correspondingly */
7569 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7571 assert(SvTYPE(invlist) == SVt_INVLIST);
7576 : TO_INTERNAL_SIZE(len + offset));
7577 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7580 PERL_STATIC_INLINE IV*
7581 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7583 /* Return the address of the IV that is reserved to hold the cached index
7586 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7588 assert(SvTYPE(invlist) == SVt_INVLIST);
7590 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7593 PERL_STATIC_INLINE IV
7594 S_invlist_previous_index(pTHX_ SV* const invlist)
7596 /* Returns cached index of previous search */
7598 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7600 return *get_invlist_previous_index_addr(invlist);
7603 PERL_STATIC_INLINE void
7604 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7606 /* Caches <index> for later retrieval */
7608 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7610 assert(index == 0 || index < (int) _invlist_len(invlist));
7612 *get_invlist_previous_index_addr(invlist) = index;
7615 PERL_STATIC_INLINE UV
7616 S_invlist_max(pTHX_ SV* const invlist)
7618 /* Returns the maximum number of elements storable in the inversion list's
7619 * array, without having to realloc() */
7621 PERL_ARGS_ASSERT_INVLIST_MAX;
7623 assert(SvTYPE(invlist) == SVt_INVLIST);
7625 /* Assumes worst case, in which the 0 element is not counted in the
7626 * inversion list, so subtracts 1 for that */
7627 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7628 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7629 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7632 #ifndef PERL_IN_XSUB_RE
7634 Perl__new_invlist(pTHX_ IV initial_size)
7637 /* Return a pointer to a newly constructed inversion list, with enough
7638 * space to store 'initial_size' elements. If that number is negative, a
7639 * system default is used instead */
7643 if (initial_size < 0) {
7647 /* Allocate the initial space */
7648 new_list = newSV_type(SVt_INVLIST);
7650 /* First 1 is in case the zero element isn't in the list; second 1 is for
7652 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7653 invlist_set_len(new_list, 0, 0);
7655 /* Force iterinit() to be used to get iteration to work */
7656 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7658 *get_invlist_previous_index_addr(new_list) = 0;
7665 S__new_invlist_C_array(pTHX_ const UV* const list)
7667 /* Return a pointer to a newly constructed inversion list, initialized to
7668 * point to <list>, which has to be in the exact correct inversion list
7669 * form, including internal fields. Thus this is a dangerous routine that
7670 * should not be used in the wrong hands. The passed in 'list' contains
7671 * several header fields at the beginning that are not part of the
7672 * inversion list body proper */
7674 const STRLEN length = (STRLEN) list[0];
7675 const UV version_id = list[1];
7676 const bool offset = cBOOL(list[2]);
7677 #define HEADER_LENGTH 3
7678 /* If any of the above changes in any way, you must change HEADER_LENGTH
7679 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7680 * perl -E 'say int(rand 2**31-1)'
7682 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7683 data structure type, so that one being
7684 passed in can be validated to be an
7685 inversion list of the correct vintage.
7688 SV* invlist = newSV_type(SVt_INVLIST);
7690 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7692 if (version_id != INVLIST_VERSION_ID) {
7693 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7696 /* The generated array passed in includes header elements that aren't part
7697 * of the list proper, so start it just after them */
7698 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7700 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7701 shouldn't touch it */
7703 *(get_invlist_offset_addr(invlist)) = offset;
7705 /* The 'length' passed to us is the physical number of elements in the
7706 * inversion list. But if there is an offset the logical number is one
7708 invlist_set_len(invlist, length - offset, offset);
7710 invlist_set_previous_index(invlist, 0);
7712 /* Initialize the iteration pointer. */
7713 invlist_iterfinish(invlist);
7719 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7721 /* Grow the maximum size of an inversion list */
7723 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7725 assert(SvTYPE(invlist) == SVt_INVLIST);
7727 /* Add one to account for the zero element at the beginning which may not
7728 * be counted by the calling parameters */
7729 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7732 PERL_STATIC_INLINE void
7733 S_invlist_trim(pTHX_ SV* const invlist)
7735 PERL_ARGS_ASSERT_INVLIST_TRIM;
7737 assert(SvTYPE(invlist) == SVt_INVLIST);
7739 /* Change the length of the inversion list to how many entries it currently
7741 SvPV_shrink_to_cur((SV *) invlist);
7745 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7747 /* Subject to change or removal. Append the range from 'start' to 'end' at
7748 * the end of the inversion list. The range must be above any existing
7752 UV max = invlist_max(invlist);
7753 UV len = _invlist_len(invlist);
7756 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7758 if (len == 0) { /* Empty lists must be initialized */
7759 offset = start != 0;
7760 array = _invlist_array_init(invlist, ! offset);
7763 /* Here, the existing list is non-empty. The current max entry in the
7764 * list is generally the first value not in the set, except when the
7765 * set extends to the end of permissible values, in which case it is
7766 * the first entry in that final set, and so this call is an attempt to
7767 * append out-of-order */
7769 UV final_element = len - 1;
7770 array = invlist_array(invlist);
7771 if (array[final_element] > start
7772 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7774 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7775 array[final_element], start,
7776 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7779 /* Here, it is a legal append. If the new range begins with the first
7780 * value not in the set, it is extending the set, so the new first
7781 * value not in the set is one greater than the newly extended range.
7783 offset = *get_invlist_offset_addr(invlist);
7784 if (array[final_element] == start) {
7785 if (end != UV_MAX) {
7786 array[final_element] = end + 1;
7789 /* But if the end is the maximum representable on the machine,
7790 * just let the range that this would extend to have no end */
7791 invlist_set_len(invlist, len - 1, offset);
7797 /* Here the new range doesn't extend any existing set. Add it */
7799 len += 2; /* Includes an element each for the start and end of range */
7801 /* If wll overflow the existing space, extend, which may cause the array to
7804 invlist_extend(invlist, len);
7806 /* Have to set len here to avoid assert failure in invlist_array() */
7807 invlist_set_len(invlist, len, offset);
7809 array = invlist_array(invlist);
7812 invlist_set_len(invlist, len, offset);
7815 /* The next item on the list starts the range, the one after that is
7816 * one past the new range. */
7817 array[len - 2] = start;
7818 if (end != UV_MAX) {
7819 array[len - 1] = end + 1;
7822 /* But if the end is the maximum representable on the machine, just let
7823 * the range have no end */
7824 invlist_set_len(invlist, len - 1, offset);
7828 #ifndef PERL_IN_XSUB_RE
7831 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7833 /* Searches the inversion list for the entry that contains the input code
7834 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7835 * return value is the index into the list's array of the range that
7840 IV high = _invlist_len(invlist);
7841 const IV highest_element = high - 1;
7844 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7846 /* If list is empty, return failure. */
7851 /* (We can't get the array unless we know the list is non-empty) */
7852 array = invlist_array(invlist);
7854 mid = invlist_previous_index(invlist);
7855 assert(mid >=0 && mid <= highest_element);
7857 /* <mid> contains the cache of the result of the previous call to this
7858 * function (0 the first time). See if this call is for the same result,
7859 * or if it is for mid-1. This is under the theory that calls to this
7860 * function will often be for related code points that are near each other.
7861 * And benchmarks show that caching gives better results. We also test
7862 * here if the code point is within the bounds of the list. These tests
7863 * replace others that would have had to be made anyway to make sure that
7864 * the array bounds were not exceeded, and these give us extra information
7865 * at the same time */
7866 if (cp >= array[mid]) {
7867 if (cp >= array[highest_element]) {
7868 return highest_element;
7871 /* Here, array[mid] <= cp < array[highest_element]. This means that
7872 * the final element is not the answer, so can exclude it; it also
7873 * means that <mid> is not the final element, so can refer to 'mid + 1'
7875 if (cp < array[mid + 1]) {
7881 else { /* cp < aray[mid] */
7882 if (cp < array[0]) { /* Fail if outside the array */
7886 if (cp >= array[mid - 1]) {
7891 /* Binary search. What we are looking for is <i> such that
7892 * array[i] <= cp < array[i+1]
7893 * The loop below converges on the i+1. Note that there may not be an
7894 * (i+1)th element in the array, and things work nonetheless */
7895 while (low < high) {
7896 mid = (low + high) / 2;
7897 assert(mid <= highest_element);
7898 if (array[mid] <= cp) { /* cp >= array[mid] */
7901 /* We could do this extra test to exit the loop early.
7902 if (cp < array[low]) {
7907 else { /* cp < array[mid] */
7914 invlist_set_previous_index(invlist, high);
7919 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7921 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7922 * but is used when the swash has an inversion list. This makes this much
7923 * faster, as it uses a binary search instead of a linear one. This is
7924 * intimately tied to that function, and perhaps should be in utf8.c,
7925 * except it is intimately tied to inversion lists as well. It assumes
7926 * that <swatch> is all 0's on input */
7929 const IV len = _invlist_len(invlist);
7933 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7935 if (len == 0) { /* Empty inversion list */
7939 array = invlist_array(invlist);
7941 /* Find which element it is */
7942 i = _invlist_search(invlist, start);
7944 /* We populate from <start> to <end> */
7945 while (current < end) {
7948 /* The inversion list gives the results for every possible code point
7949 * after the first one in the list. Only those ranges whose index is
7950 * even are ones that the inversion list matches. For the odd ones,
7951 * and if the initial code point is not in the list, we have to skip
7952 * forward to the next element */
7953 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7955 if (i >= len) { /* Finished if beyond the end of the array */
7959 if (current >= end) { /* Finished if beyond the end of what we
7961 if (LIKELY(end < UV_MAX)) {
7965 /* We get here when the upper bound is the maximum
7966 * representable on the machine, and we are looking for just
7967 * that code point. Have to special case it */
7969 goto join_end_of_list;
7972 assert(current >= start);
7974 /* The current range ends one below the next one, except don't go past
7977 upper = (i < len && array[i] < end) ? array[i] : end;
7979 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7980 * for each code point in it */
7981 for (; current < upper; current++) {
7982 const STRLEN offset = (STRLEN)(current - start);
7983 swatch[offset >> 3] |= 1 << (offset & 7);
7988 /* Quit if at the end of the list */
7991 /* But first, have to deal with the highest possible code point on
7992 * the platform. The previous code assumes that <end> is one
7993 * beyond where we want to populate, but that is impossible at the
7994 * platform's infinity, so have to handle it specially */
7995 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7997 const STRLEN offset = (STRLEN)(end - start);
7998 swatch[offset >> 3] |= 1 << (offset & 7);
8003 /* Advance to the next range, which will be for code points not in the
8012 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
8014 /* Take the union of two inversion lists and point <output> to it. *output
8015 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8016 * the reference count to that list will be decremented if not already a
8017 * temporary (mortal); otherwise *output will be made correspondingly
8018 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8019 * second list is returned. If <complement_b> is TRUE, the union is taken
8020 * of the complement (inversion) of <b> instead of b itself.
8022 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8023 * Richard Gillam, published by Addison-Wesley, and explained at some
8024 * length there. The preface says to incorporate its examples into your
8025 * code at your own risk.
8027 * The algorithm is like a merge sort.
8029 * XXX A potential performance improvement is to keep track as we go along
8030 * if only one of the inputs contributes to the result, meaning the other
8031 * is a subset of that one. In that case, we can skip the final copy and
8032 * return the larger of the input lists, but then outside code might need
8033 * to keep track of whether to free the input list or not */
8035 const UV* array_a; /* a's array */
8037 UV len_a; /* length of a's array */
8040 SV* u; /* the resulting union */
8044 UV i_a = 0; /* current index into a's array */
8048 /* running count, as explained in the algorithm source book; items are
8049 * stopped accumulating and are output when the count changes to/from 0.
8050 * The count is incremented when we start a range that's in the set, and
8051 * decremented when we start a range that's not in the set. So its range
8052 * is 0 to 2. Only when the count is zero is something not in the set.
8056 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8059 /* If either one is empty, the union is the other one */
8060 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8061 bool make_temp = FALSE; /* Should we mortalize the result? */
8065 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8071 *output = invlist_clone(b);
8073 _invlist_invert(*output);
8075 } /* else *output already = b; */
8078 sv_2mortal(*output);
8082 else if ((len_b = _invlist_len(b)) == 0) {
8083 bool make_temp = FALSE;
8085 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8090 /* The complement of an empty list is a list that has everything in it,
8091 * so the union with <a> includes everything too */
8094 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8098 *output = _new_invlist(1);
8099 _append_range_to_invlist(*output, 0, UV_MAX);
8101 else if (*output != a) {
8102 *output = invlist_clone(a);
8104 /* else *output already = a; */
8107 sv_2mortal(*output);
8112 /* Here both lists exist and are non-empty */
8113 array_a = invlist_array(a);
8114 array_b = invlist_array(b);
8116 /* If are to take the union of 'a' with the complement of b, set it
8117 * up so are looking at b's complement. */
8120 /* To complement, we invert: if the first element is 0, remove it. To
8121 * do this, we just pretend the array starts one later */
8122 if (array_b[0] == 0) {
8128 /* But if the first element is not zero, we pretend the list starts
8129 * at the 0 that is always stored immediately before the array. */
8135 /* Size the union for the worst case: that the sets are completely
8137 u = _new_invlist(len_a + len_b);
8139 /* Will contain U+0000 if either component does */
8140 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8141 || (len_b > 0 && array_b[0] == 0));
8143 /* Go through each list item by item, stopping when exhausted one of
8145 while (i_a < len_a && i_b < len_b) {
8146 UV cp; /* The element to potentially add to the union's array */
8147 bool cp_in_set; /* is it in the the input list's set or not */
8149 /* We need to take one or the other of the two inputs for the union.
8150 * Since we are merging two sorted lists, we take the smaller of the
8151 * next items. In case of a tie, we take the one that is in its set
8152 * first. If we took one not in the set first, it would decrement the
8153 * count, possibly to 0 which would cause it to be output as ending the
8154 * range, and the next time through we would take the same number, and
8155 * output it again as beginning the next range. By doing it the
8156 * opposite way, there is no possibility that the count will be
8157 * momentarily decremented to 0, and thus the two adjoining ranges will
8158 * be seamlessly merged. (In a tie and both are in the set or both not
8159 * in the set, it doesn't matter which we take first.) */
8160 if (array_a[i_a] < array_b[i_b]
8161 || (array_a[i_a] == array_b[i_b]
8162 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8164 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8168 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8169 cp = array_b[i_b++];
8172 /* Here, have chosen which of the two inputs to look at. Only output
8173 * if the running count changes to/from 0, which marks the
8174 * beginning/end of a range in that's in the set */
8177 array_u[i_u++] = cp;
8184 array_u[i_u++] = cp;
8189 /* Here, we are finished going through at least one of the lists, which
8190 * means there is something remaining in at most one. We check if the list
8191 * that hasn't been exhausted is positioned such that we are in the middle
8192 * of a range in its set or not. (i_a and i_b point to the element beyond
8193 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8194 * is potentially more to output.
8195 * There are four cases:
8196 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8197 * in the union is entirely from the non-exhausted set.
8198 * 2) Both were in their sets, count is 2. Nothing further should
8199 * be output, as everything that remains will be in the exhausted
8200 * list's set, hence in the union; decrementing to 1 but not 0 insures
8202 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8203 * Nothing further should be output because the union includes
8204 * everything from the exhausted set. Not decrementing ensures that.
8205 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8206 * decrementing to 0 insures that we look at the remainder of the
8207 * non-exhausted set */
8208 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8209 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8214 /* The final length is what we've output so far, plus what else is about to
8215 * be output. (If 'count' is non-zero, then the input list we exhausted
8216 * has everything remaining up to the machine's limit in its set, and hence
8217 * in the union, so there will be no further output. */
8220 /* At most one of the subexpressions will be non-zero */
8221 len_u += (len_a - i_a) + (len_b - i_b);
8224 /* Set result to final length, which can change the pointer to array_u, so
8226 if (len_u != _invlist_len(u)) {
8227 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8229 array_u = invlist_array(u);
8232 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8233 * the other) ended with everything above it not in its set. That means
8234 * that the remaining part of the union is precisely the same as the
8235 * non-exhausted list, so can just copy it unchanged. (If both list were
8236 * exhausted at the same time, then the operations below will be both 0.)
8239 IV copy_count; /* At most one will have a non-zero copy count */
8240 if ((copy_count = len_a - i_a) > 0) {
8241 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8243 else if ((copy_count = len_b - i_b) > 0) {
8244 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8248 /* We may be removing a reference to one of the inputs. If so, the output
8249 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8250 * count decremented) */
8251 if (a == *output || b == *output) {
8252 assert(! invlist_is_iterating(*output));
8253 if ((SvTEMP(*output))) {
8257 SvREFCNT_dec_NN(*output);
8267 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8269 /* Take the intersection of two inversion lists and point <i> to it. *i
8270 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8271 * the reference count to that list will be decremented if not already a
8272 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8273 * The first list, <a>, may be NULL, in which case an empty list is
8274 * returned. If <complement_b> is TRUE, the result will be the
8275 * intersection of <a> and the complement (or inversion) of <b> instead of
8278 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8279 * Richard Gillam, published by Addison-Wesley, and explained at some
8280 * length there. The preface says to incorporate its examples into your
8281 * code at your own risk. In fact, it had bugs
8283 * The algorithm is like a merge sort, and is essentially the same as the
8287 const UV* array_a; /* a's array */
8289 UV len_a; /* length of a's array */
8292 SV* r; /* the resulting intersection */
8296 UV i_a = 0; /* current index into a's array */
8300 /* running count, as explained in the algorithm source book; items are
8301 * stopped accumulating and are output when the count changes to/from 2.
8302 * The count is incremented when we start a range that's in the set, and
8303 * decremented when we start a range that's not in the set. So its range
8304 * is 0 to 2. Only when the count is 2 is something in the intersection.
8308 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8311 /* Special case if either one is empty */
8312 len_a = (a == NULL) ? 0 : _invlist_len(a);
8313 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8314 bool make_temp = FALSE;
8316 if (len_a != 0 && complement_b) {
8318 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8319 * be empty. Here, also we are using 'b's complement, which hence
8320 * must be every possible code point. Thus the intersection is
8324 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8329 *i = invlist_clone(a);
8331 /* else *i is already 'a' */
8339 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8340 * intersection must be empty */
8342 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8347 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8351 *i = _new_invlist(0);
8359 /* Here both lists exist and are non-empty */
8360 array_a = invlist_array(a);
8361 array_b = invlist_array(b);
8363 /* If are to take the intersection of 'a' with the complement of b, set it
8364 * up so are looking at b's complement. */
8367 /* To complement, we invert: if the first element is 0, remove it. To
8368 * do this, we just pretend the array starts one later */
8369 if (array_b[0] == 0) {
8375 /* But if the first element is not zero, we pretend the list starts
8376 * at the 0 that is always stored immediately before the array. */
8382 /* Size the intersection for the worst case: that the intersection ends up
8383 * fragmenting everything to be completely disjoint */
8384 r= _new_invlist(len_a + len_b);
8386 /* Will contain U+0000 iff both components do */
8387 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8388 && len_b > 0 && array_b[0] == 0);
8390 /* Go through each list item by item, stopping when exhausted one of
8392 while (i_a < len_a && i_b < len_b) {
8393 UV cp; /* The element to potentially add to the intersection's
8395 bool cp_in_set; /* Is it in the input list's set or not */
8397 /* We need to take one or the other of the two inputs for the
8398 * intersection. Since we are merging two sorted lists, we take the
8399 * smaller of the next items. In case of a tie, we take the one that
8400 * is not in its set first (a difference from the union algorithm). If
8401 * we took one in the set first, it would increment the count, possibly
8402 * to 2 which would cause it to be output as starting a range in the
8403 * intersection, and the next time through we would take that same
8404 * number, and output it again as ending the set. By doing it the
8405 * opposite of this, there is no possibility that the count will be
8406 * momentarily incremented to 2. (In a tie and both are in the set or
8407 * both not in the set, it doesn't matter which we take first.) */
8408 if (array_a[i_a] < array_b[i_b]
8409 || (array_a[i_a] == array_b[i_b]
8410 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8412 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8416 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8420 /* Here, have chosen which of the two inputs to look at. Only output
8421 * if the running count changes to/from 2, which marks the
8422 * beginning/end of a range that's in the intersection */
8426 array_r[i_r++] = cp;
8431 array_r[i_r++] = cp;
8437 /* Here, we are finished going through at least one of the lists, which
8438 * means there is something remaining in at most one. We check if the list
8439 * that has been exhausted is positioned such that we are in the middle
8440 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8441 * the ones we care about.) There are four cases:
8442 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8443 * nothing left in the intersection.
8444 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8445 * above 2. What should be output is exactly that which is in the
8446 * non-exhausted set, as everything it has is also in the intersection
8447 * set, and everything it doesn't have can't be in the intersection
8448 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8449 * gets incremented to 2. Like the previous case, the intersection is
8450 * everything that remains in the non-exhausted set.
8451 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8452 * remains 1. And the intersection has nothing more. */
8453 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8454 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8459 /* The final length is what we've output so far plus what else is in the
8460 * intersection. At most one of the subexpressions below will be non-zero */
8463 len_r += (len_a - i_a) + (len_b - i_b);
8466 /* Set result to final length, which can change the pointer to array_r, so
8468 if (len_r != _invlist_len(r)) {
8469 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8471 array_r = invlist_array(r);
8474 /* Finish outputting any remaining */
8475 if (count >= 2) { /* At most one will have a non-zero copy count */
8477 if ((copy_count = len_a - i_a) > 0) {
8478 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8480 else if ((copy_count = len_b - i_b) > 0) {
8481 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8485 /* We may be removing a reference to one of the inputs. If so, the output
8486 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8487 * count decremented) */
8488 if (a == *i || b == *i) {
8489 assert(! invlist_is_iterating(*i));
8494 SvREFCNT_dec_NN(*i);
8504 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8506 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8507 * set. A pointer to the inversion list is returned. This may actually be
8508 * a new list, in which case the passed in one has been destroyed. The
8509 * passed in inversion list can be NULL, in which case a new one is created
8510 * with just the one range in it */
8515 if (invlist == NULL) {
8516 invlist = _new_invlist(2);
8520 len = _invlist_len(invlist);
8523 /* If comes after the final entry actually in the list, can just append it
8526 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8527 && start >= invlist_array(invlist)[len - 1]))
8529 _append_range_to_invlist(invlist, start, end);
8533 /* Here, can't just append things, create and return a new inversion list
8534 * which is the union of this range and the existing inversion list */
8535 range_invlist = _new_invlist(2);
8536 _append_range_to_invlist(range_invlist, start, end);
8538 _invlist_union(invlist, range_invlist, &invlist);
8540 /* The temporary can be freed */
8541 SvREFCNT_dec_NN(range_invlist);
8548 PERL_STATIC_INLINE SV*
8549 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8550 return _add_range_to_invlist(invlist, cp, cp);
8553 #ifndef PERL_IN_XSUB_RE
8555 Perl__invlist_invert(pTHX_ SV* const invlist)
8557 /* Complement the input inversion list. This adds a 0 if the list didn't
8558 * have a zero; removes it otherwise. As described above, the data
8559 * structure is set up so that this is very efficient */
8561 PERL_ARGS_ASSERT__INVLIST_INVERT;
8563 assert(! invlist_is_iterating(invlist));
8565 /* The inverse of matching nothing is matching everything */
8566 if (_invlist_len(invlist) == 0) {
8567 _append_range_to_invlist(invlist, 0, UV_MAX);
8571 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8575 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8577 /* Complement the input inversion list (which must be a Unicode property,
8578 * all of which don't match above the Unicode maximum code point.) And
8579 * Perl has chosen to not have the inversion match above that either. This
8580 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8586 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8588 _invlist_invert(invlist);
8590 len = _invlist_len(invlist);
8592 if (len != 0) { /* If empty do nothing */
8593 array = invlist_array(invlist);
8594 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8595 /* Add 0x110000. First, grow if necessary */
8597 if (invlist_max(invlist) < len) {
8598 invlist_extend(invlist, len);
8599 array = invlist_array(invlist);
8601 invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8602 array[len - 1] = PERL_UNICODE_MAX + 1;
8604 else { /* Remove the 0x110000 */
8605 invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8613 PERL_STATIC_INLINE SV*
8614 S_invlist_clone(pTHX_ SV* const invlist)
8617 /* Return a new inversion list that is a copy of the input one, which is
8618 * unchanged. The new list will not be mortal even if the old one was. */
8620 /* Need to allocate extra space to accommodate Perl's addition of a
8621 * trailing NUL to SvPV's, since it thinks they are always strings */
8622 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8623 STRLEN physical_length = SvCUR(invlist);
8624 bool offset = *(get_invlist_offset_addr(invlist));
8626 PERL_ARGS_ASSERT_INVLIST_CLONE;
8628 *(get_invlist_offset_addr(new_invlist)) = offset;
8629 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8630 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8635 PERL_STATIC_INLINE STRLEN*
8636 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8638 /* Return the address of the UV that contains the current iteration
8641 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8643 assert(SvTYPE(invlist) == SVt_INVLIST);
8645 return &(((XINVLIST*) SvANY(invlist))->iterator);
8648 PERL_STATIC_INLINE void
8649 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8651 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8653 *get_invlist_iter_addr(invlist) = 0;
8656 PERL_STATIC_INLINE void
8657 S_invlist_iterfinish(pTHX_ SV* invlist)
8659 /* Terminate iterator for invlist. This is to catch development errors.
8660 * Any iteration that is interrupted before completed should call this
8661 * function. Functions that add code points anywhere else but to the end
8662 * of an inversion list assert that they are not in the middle of an
8663 * iteration. If they were, the addition would make the iteration
8664 * problematical: if the iteration hadn't reached the place where things
8665 * were being added, it would be ok */
8667 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8669 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8673 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8675 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8676 * This call sets in <*start> and <*end>, the next range in <invlist>.
8677 * Returns <TRUE> if successful and the next call will return the next
8678 * range; <FALSE> if was already at the end of the list. If the latter,
8679 * <*start> and <*end> are unchanged, and the next call to this function
8680 * will start over at the beginning of the list */
8682 STRLEN* pos = get_invlist_iter_addr(invlist);
8683 UV len = _invlist_len(invlist);
8686 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8689 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8693 array = invlist_array(invlist);
8695 *start = array[(*pos)++];
8701 *end = array[(*pos)++] - 1;
8707 PERL_STATIC_INLINE bool
8708 S_invlist_is_iterating(pTHX_ SV* const invlist)
8710 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8712 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8715 PERL_STATIC_INLINE UV
8716 S_invlist_highest(pTHX_ SV* const invlist)
8718 /* Returns the highest code point that matches an inversion list. This API
8719 * has an ambiguity, as it returns 0 under either the highest is actually
8720 * 0, or if the list is empty. If this distinction matters to you, check
8721 * for emptiness before calling this function */
8723 UV len = _invlist_len(invlist);
8726 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8732 array = invlist_array(invlist);
8734 /* The last element in the array in the inversion list always starts a
8735 * range that goes to infinity. That range may be for code points that are
8736 * matched in the inversion list, or it may be for ones that aren't
8737 * matched. In the latter case, the highest code point in the set is one
8738 * less than the beginning of this range; otherwise it is the final element
8739 * of this range: infinity */
8740 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8742 : array[len - 1] - 1;
8745 #ifndef PERL_IN_XSUB_RE
8747 Perl__invlist_contents(pTHX_ SV* const invlist)
8749 /* Get the contents of an inversion list into a string SV so that they can
8750 * be printed out. It uses the format traditionally done for debug tracing
8754 SV* output = newSVpvs("\n");
8756 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8758 assert(! invlist_is_iterating(invlist));
8760 invlist_iterinit(invlist);
8761 while (invlist_iternext(invlist, &start, &end)) {
8762 if (end == UV_MAX) {
8763 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8765 else if (end != start) {
8766 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8770 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8778 #ifndef PERL_IN_XSUB_RE
8780 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8782 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8783 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8784 * the string 'indent'. The output looks like this:
8785 [0] 0x000A .. 0x000D
8787 [4] 0x2028 .. 0x2029
8788 [6] 0x3104 .. INFINITY
8789 * This means that the first range of code points matched by the list are
8790 * 0xA through 0xD; the second range contains only the single code point
8791 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8792 * are used to define each range (except if the final range extends to
8793 * infinity, only a single element is needed). The array index of the
8794 * first element for the corresponding range is given in brackets. */
8799 PERL_ARGS_ASSERT__INVLIST_DUMP;
8801 if (invlist_is_iterating(invlist)) {
8802 Perl_dump_indent(aTHX_ level, file,
8803 "%sCan't dump inversion list because is in middle of iterating\n",
8808 invlist_iterinit(invlist);
8809 while (invlist_iternext(invlist, &start, &end)) {
8810 if (end == UV_MAX) {
8811 Perl_dump_indent(aTHX_ level, file,
8812 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8813 indent, (UV)count, start);
8815 else if (end != start) {
8816 Perl_dump_indent(aTHX_ level, file,
8817 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8818 indent, (UV)count, start, end);
8821 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8822 indent, (UV)count, start);
8829 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8831 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8833 /* Return a boolean as to if the two passed in inversion lists are
8834 * identical. The final argument, if TRUE, says to take the complement of
8835 * the second inversion list before doing the comparison */
8837 const UV* array_a = invlist_array(a);
8838 const UV* array_b = invlist_array(b);
8839 UV len_a = _invlist_len(a);
8840 UV len_b = _invlist_len(b);
8842 UV i = 0; /* current index into the arrays */
8843 bool retval = TRUE; /* Assume are identical until proven otherwise */
8845 PERL_ARGS_ASSERT__INVLISTEQ;
8847 /* If are to compare 'a' with the complement of b, set it
8848 * up so are looking at b's complement. */
8851 /* The complement of nothing is everything, so <a> would have to have
8852 * just one element, starting at zero (ending at infinity) */
8854 return (len_a == 1 && array_a[0] == 0);
8856 else if (array_b[0] == 0) {
8858 /* Otherwise, to complement, we invert. Here, the first element is
8859 * 0, just remove it. To do this, we just pretend the array starts
8867 /* But if the first element is not zero, we pretend the list starts
8868 * at the 0 that is always stored immediately before the array. */
8874 /* Make sure that the lengths are the same, as well as the final element
8875 * before looping through the remainder. (Thus we test the length, final,
8876 * and first elements right off the bat) */
8877 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8880 else for (i = 0; i < len_a - 1; i++) {
8881 if (array_a[i] != array_b[i]) {
8891 #undef HEADER_LENGTH
8892 #undef TO_INTERNAL_SIZE
8893 #undef FROM_INTERNAL_SIZE
8894 #undef INVLIST_VERSION_ID
8896 /* End of inversion list object */
8899 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8901 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8902 * constructs, and updates RExC_flags with them. On input, RExC_parse
8903 * should point to the first flag; it is updated on output to point to the
8904 * final ')' or ':'. There needs to be at least one flag, or this will
8907 /* for (?g), (?gc), and (?o) warnings; warning
8908 about (?c) will warn about (?g) -- japhy */
8910 #define WASTED_O 0x01
8911 #define WASTED_G 0x02
8912 #define WASTED_C 0x04
8913 #define WASTED_GC (WASTED_G|WASTED_C)
8914 I32 wastedflags = 0x00;
8915 U32 posflags = 0, negflags = 0;
8916 U32 *flagsp = &posflags;
8917 char has_charset_modifier = '\0';
8919 bool has_use_defaults = FALSE;
8920 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8922 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8924 /* '^' as an initial flag sets certain defaults */
8925 if (UCHARAT(RExC_parse) == '^') {
8927 has_use_defaults = TRUE;
8928 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8929 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8930 ? REGEX_UNICODE_CHARSET
8931 : REGEX_DEPENDS_CHARSET);
8934 cs = get_regex_charset(RExC_flags);
8935 if (cs == REGEX_DEPENDS_CHARSET
8936 && (RExC_utf8 || RExC_uni_semantics))
8938 cs = REGEX_UNICODE_CHARSET;
8941 while (*RExC_parse) {
8942 /* && strchr("iogcmsx", *RExC_parse) */
8943 /* (?g), (?gc) and (?o) are useless here
8944 and must be globally applied -- japhy */
8945 switch (*RExC_parse) {
8947 /* Code for the imsx flags */
8948 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8950 case LOCALE_PAT_MOD:
8951 if (has_charset_modifier) {
8952 goto excess_modifier;
8954 else if (flagsp == &negflags) {
8957 cs = REGEX_LOCALE_CHARSET;
8958 has_charset_modifier = LOCALE_PAT_MOD;
8959 RExC_contains_locale = 1;
8961 case UNICODE_PAT_MOD:
8962 if (has_charset_modifier) {
8963 goto excess_modifier;
8965 else if (flagsp == &negflags) {
8968 cs = REGEX_UNICODE_CHARSET;
8969 has_charset_modifier = UNICODE_PAT_MOD;
8971 case ASCII_RESTRICT_PAT_MOD:
8972 if (flagsp == &negflags) {
8975 if (has_charset_modifier) {
8976 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8977 goto excess_modifier;
8979 /* Doubled modifier implies more restricted */
8980 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8983 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8985 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8987 case DEPENDS_PAT_MOD:
8988 if (has_use_defaults) {
8989 goto fail_modifiers;
8991 else if (flagsp == &negflags) {
8994 else if (has_charset_modifier) {
8995 goto excess_modifier;
8998 /* The dual charset means unicode semantics if the
8999 * pattern (or target, not known until runtime) are
9000 * utf8, or something in the pattern indicates unicode
9002 cs = (RExC_utf8 || RExC_uni_semantics)
9003 ? REGEX_UNICODE_CHARSET
9004 : REGEX_DEPENDS_CHARSET;
9005 has_charset_modifier = DEPENDS_PAT_MOD;
9009 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9010 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9012 else if (has_charset_modifier == *(RExC_parse - 1)) {
9013 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9016 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9021 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9023 case ONCE_PAT_MOD: /* 'o' */
9024 case GLOBAL_PAT_MOD: /* 'g' */
9025 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9026 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9027 if (! (wastedflags & wflagbit) ) {
9028 wastedflags |= wflagbit;
9029 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9032 "Useless (%s%c) - %suse /%c modifier",
9033 flagsp == &negflags ? "?-" : "?",
9035 flagsp == &negflags ? "don't " : "",
9042 case CONTINUE_PAT_MOD: /* 'c' */
9043 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9044 if (! (wastedflags & WASTED_C) ) {
9045 wastedflags |= WASTED_GC;
9046 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9049 "Useless (%sc) - %suse /gc modifier",
9050 flagsp == &negflags ? "?-" : "?",
9051 flagsp == &negflags ? "don't " : ""
9056 case KEEPCOPY_PAT_MOD: /* 'p' */
9057 if (flagsp == &negflags) {
9059 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9061 *flagsp |= RXf_PMf_KEEPCOPY;
9065 /* A flag is a default iff it is following a minus, so
9066 * if there is a minus, it means will be trying to
9067 * re-specify a default which is an error */
9068 if (has_use_defaults || flagsp == &negflags) {
9069 goto fail_modifiers;
9072 wastedflags = 0; /* reset so (?g-c) warns twice */
9076 RExC_flags |= posflags;
9077 RExC_flags &= ~negflags;
9078 set_regex_charset(&RExC_flags, cs);
9079 if (RExC_flags & RXf_PMf_FOLD) {
9080 RExC_contains_i = 1;
9086 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9087 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9088 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9089 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9098 - reg - regular expression, i.e. main body or parenthesized thing
9100 * Caller must absorb opening parenthesis.
9102 * Combining parenthesis handling with the base level of regular expression
9103 * is a trifle forced, but the need to tie the tails of the branches to what
9104 * follows makes it hard to avoid.
9106 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9108 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9110 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9113 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9114 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9115 needs to be restarted.
9116 Otherwise would only return NULL if regbranch() returns NULL, which
9119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9120 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9121 * 2 is like 1, but indicates that nextchar() has been called to advance
9122 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9123 * this flag alerts us to the need to check for that */
9126 regnode *ret; /* Will be the head of the group. */
9129 regnode *ender = NULL;
9132 U32 oregflags = RExC_flags;
9133 bool have_branch = 0;
9135 I32 freeze_paren = 0;
9136 I32 after_freeze = 0;
9138 char * parse_start = RExC_parse; /* MJD */
9139 char * const oregcomp_parse = RExC_parse;
9141 GET_RE_DEBUG_FLAGS_DECL;
9143 PERL_ARGS_ASSERT_REG;
9144 DEBUG_PARSE("reg ");
9146 *flagp = 0; /* Tentatively. */
9149 /* Make an OPEN node, if parenthesized. */
9152 /* Under /x, space and comments can be gobbled up between the '(' and
9153 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9154 * intervening space, as the sequence is a token, and a token should be
9156 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9158 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9159 char *start_verb = RExC_parse;
9160 STRLEN verb_len = 0;
9161 char *start_arg = NULL;
9162 unsigned char op = 0;
9164 int internal_argval = 0; /* internal_argval is only useful if !argok */
9166 if (has_intervening_patws && SIZE_ONLY) {
9167 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9169 while ( *RExC_parse && *RExC_parse != ')' ) {
9170 if ( *RExC_parse == ':' ) {
9171 start_arg = RExC_parse + 1;
9177 verb_len = RExC_parse - start_verb;
9180 while ( *RExC_parse && *RExC_parse != ')' )
9182 if ( *RExC_parse != ')' )
9183 vFAIL("Unterminated verb pattern argument");
9184 if ( RExC_parse == start_arg )
9187 if ( *RExC_parse != ')' )
9188 vFAIL("Unterminated verb pattern");
9191 switch ( *start_verb ) {
9192 case 'A': /* (*ACCEPT) */
9193 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9195 internal_argval = RExC_nestroot;
9198 case 'C': /* (*COMMIT) */
9199 if ( memEQs(start_verb,verb_len,"COMMIT") )
9202 case 'F': /* (*FAIL) */
9203 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9208 case ':': /* (*:NAME) */
9209 case 'M': /* (*MARK:NAME) */
9210 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9215 case 'P': /* (*PRUNE) */
9216 if ( memEQs(start_verb,verb_len,"PRUNE") )
9219 case 'S': /* (*SKIP) */
9220 if ( memEQs(start_verb,verb_len,"SKIP") )
9223 case 'T': /* (*THEN) */
9224 /* [19:06] <TimToady> :: is then */
9225 if ( memEQs(start_verb,verb_len,"THEN") ) {
9227 RExC_seen |= REG_SEEN_CUTGROUP;
9232 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9234 "Unknown verb pattern '%"UTF8f"'",
9235 UTF8fARG(UTF, verb_len, start_verb));
9238 if ( start_arg && internal_argval ) {
9239 vFAIL3("Verb pattern '%.*s' may not have an argument",
9240 verb_len, start_verb);
9241 } else if ( argok < 0 && !start_arg ) {
9242 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9243 verb_len, start_verb);
9245 ret = reganode(pRExC_state, op, internal_argval);
9246 if ( ! internal_argval && ! SIZE_ONLY ) {
9248 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9249 ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9250 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9257 if (!internal_argval)
9258 RExC_seen |= REG_SEEN_VERBARG;
9259 } else if ( start_arg ) {
9260 vFAIL3("Verb pattern '%.*s' may not have an argument",
9261 verb_len, start_verb);
9263 ret = reg_node(pRExC_state, op);
9265 nextchar(pRExC_state);
9268 else if (*RExC_parse == '?') { /* (?...) */
9269 bool is_logical = 0;
9270 const char * const seqstart = RExC_parse;
9271 if (has_intervening_patws && SIZE_ONLY) {
9272 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9276 paren = *RExC_parse++;
9277 ret = NULL; /* For look-ahead/behind. */
9280 case 'P': /* (?P...) variants for those used to PCRE/Python */
9281 paren = *RExC_parse++;
9282 if ( paren == '<') /* (?P<...>) named capture */
9284 else if (paren == '>') { /* (?P>name) named recursion */
9285 goto named_recursion;
9287 else if (paren == '=') { /* (?P=...) named backref */
9288 /* this pretty much dupes the code for \k<NAME> in regatom(), if
9289 you change this make sure you change that */
9290 char* name_start = RExC_parse;
9292 SV *sv_dat = reg_scan_name(pRExC_state,
9293 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9294 if (RExC_parse == name_start || *RExC_parse != ')')
9295 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9296 vFAIL2("Sequence %.3s... not terminated",parse_start);
9299 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9300 RExC_rxi->data->data[num]=(void*)sv_dat;
9301 SvREFCNT_inc_simple_void(sv_dat);
9304 ret = reganode(pRExC_state,
9307 : (ASCII_FOLD_RESTRICTED)
9309 : (AT_LEAST_UNI_SEMANTICS)
9317 Set_Node_Offset(ret, parse_start+1);
9318 Set_Node_Cur_Length(ret, parse_start);
9320 nextchar(pRExC_state);
9324 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9325 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9327 case '<': /* (?<...) */
9328 if (*RExC_parse == '!')
9330 else if (*RExC_parse != '=')
9336 case '\'': /* (?'...') */
9337 name_start= RExC_parse;
9338 svname = reg_scan_name(pRExC_state,
9339 SIZE_ONLY /* reverse test from the others */
9340 ? REG_RSN_RETURN_NAME
9341 : REG_RSN_RETURN_NULL);
9342 if (RExC_parse == name_start || *RExC_parse != paren)
9343 vFAIL2("Sequence (?%c... not terminated",
9344 paren=='>' ? '<' : paren);
9348 if (!svname) /* shouldn't happen */
9350 "panic: reg_scan_name returned NULL");
9351 if (!RExC_paren_names) {
9352 RExC_paren_names= newHV();
9353 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9355 RExC_paren_name_list= newAV();
9356 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9359 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9361 sv_dat = HeVAL(he_str);
9363 /* croak baby croak */
9365 "panic: paren_name hash element allocation failed");
9366 } else if ( SvPOK(sv_dat) ) {
9367 /* (?|...) can mean we have dupes so scan to check
9368 its already been stored. Maybe a flag indicating
9369 we are inside such a construct would be useful,
9370 but the arrays are likely to be quite small, so
9371 for now we punt -- dmq */
9372 IV count = SvIV(sv_dat);
9373 I32 *pv = (I32*)SvPVX(sv_dat);
9375 for ( i = 0 ; i < count ; i++ ) {
9376 if ( pv[i] == RExC_npar ) {
9382 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9383 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9384 pv[count] = RExC_npar;
9385 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9388 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9389 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9391 SvIV_set(sv_dat, 1);
9394 /* Yes this does cause a memory leak in debugging Perls */
9395 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9396 SvREFCNT_dec_NN(svname);
9399 /*sv_dump(sv_dat);*/
9401 nextchar(pRExC_state);
9403 goto capturing_parens;
9405 RExC_seen |= REG_SEEN_LOOKBEHIND;
9406 RExC_in_lookbehind++;
9408 case '=': /* (?=...) */
9409 RExC_seen_zerolen++;
9411 case '!': /* (?!...) */
9412 RExC_seen_zerolen++;
9413 if (*RExC_parse == ')') {
9414 ret=reg_node(pRExC_state, OPFAIL);
9415 nextchar(pRExC_state);
9419 case '|': /* (?|...) */
9420 /* branch reset, behave like a (?:...) except that
9421 buffers in alternations share the same numbers */
9423 after_freeze = freeze_paren = RExC_npar;
9425 case ':': /* (?:...) */
9426 case '>': /* (?>...) */
9428 case '$': /* (?$...) */
9429 case '@': /* (?@...) */
9430 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9432 case '#': /* (?#...) */
9433 /* XXX As soon as we disallow separating the '?' and '*' (by
9434 * spaces or (?#...) comment), it is believed that this case
9435 * will be unreachable and can be removed. See
9437 while (*RExC_parse && *RExC_parse != ')')
9439 if (*RExC_parse != ')')
9440 FAIL("Sequence (?#... not terminated");
9441 nextchar(pRExC_state);
9444 case '0' : /* (?0) */
9445 case 'R' : /* (?R) */
9446 if (*RExC_parse != ')')
9447 FAIL("Sequence (?R) not terminated");
9448 ret = reg_node(pRExC_state, GOSTART);
9449 RExC_seen |= REG_SEEN_GOSTART;
9450 *flagp |= POSTPONED;
9451 nextchar(pRExC_state);
9454 { /* named and numeric backreferences */
9456 case '&': /* (?&NAME) */
9457 parse_start = RExC_parse - 1;
9460 SV *sv_dat = reg_scan_name(pRExC_state,
9461 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9462 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9464 if (RExC_parse == RExC_end || *RExC_parse != ')')
9465 vFAIL("Sequence (?&... not terminated");
9466 goto gen_recurse_regop;
9467 assert(0); /* NOT REACHED */
9469 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9471 vFAIL("Illegal pattern");
9473 goto parse_recursion;
9475 case '-': /* (?-1) */
9476 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9477 RExC_parse--; /* rewind to let it be handled later */
9481 case '1': case '2': case '3': case '4': /* (?1) */
9482 case '5': case '6': case '7': case '8': case '9':
9485 num = atoi(RExC_parse);
9486 parse_start = RExC_parse - 1; /* MJD */
9487 if (*RExC_parse == '-')
9489 while (isDIGIT(*RExC_parse))
9491 if (*RExC_parse!=')')
9492 vFAIL("Expecting close bracket");
9495 if ( paren == '-' ) {
9497 Diagram of capture buffer numbering.
9498 Top line is the normal capture buffer numbers
9499 Bottom line is the negative indexing as from
9503 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9507 num = RExC_npar + num;
9510 vFAIL("Reference to nonexistent group");
9512 } else if ( paren == '+' ) {
9513 num = RExC_npar + num - 1;
9516 ret = reganode(pRExC_state, GOSUB, num);
9518 if (num > (I32)RExC_rx->nparens) {
9520 vFAIL("Reference to nonexistent group");
9522 ARG2L_SET( ret, RExC_recurse_count++);
9524 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9525 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9529 RExC_seen |= REG_SEEN_RECURSE;
9530 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9531 Set_Node_Offset(ret, parse_start); /* MJD */
9533 *flagp |= POSTPONED;
9534 nextchar(pRExC_state);
9536 } /* named and numeric backreferences */
9537 assert(0); /* NOT REACHED */
9539 case '?': /* (??...) */
9541 if (*RExC_parse != '{') {
9543 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9545 "Sequence (%"UTF8f"...) not recognized",
9546 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9549 *flagp |= POSTPONED;
9550 paren = *RExC_parse++;
9552 case '{': /* (?{...}) */
9555 struct reg_code_block *cb;
9557 RExC_seen_zerolen++;
9559 if ( !pRExC_state->num_code_blocks
9560 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9561 || pRExC_state->code_blocks[pRExC_state->code_index].start
9562 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9565 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9566 FAIL("panic: Sequence (?{...}): no code block found\n");
9567 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9569 /* this is a pre-compiled code block (?{...}) */
9570 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9571 RExC_parse = RExC_start + cb->end;
9574 if (cb->src_regex) {
9575 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9576 RExC_rxi->data->data[n] =
9577 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9578 RExC_rxi->data->data[n+1] = (void*)o;
9581 n = add_data(pRExC_state,
9582 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9583 RExC_rxi->data->data[n] = (void*)o;
9586 pRExC_state->code_index++;
9587 nextchar(pRExC_state);
9591 ret = reg_node(pRExC_state, LOGICAL);
9592 eval = reganode(pRExC_state, EVAL, n);
9595 /* for later propagation into (??{}) return value */
9596 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9598 REGTAIL(pRExC_state, ret, eval);
9599 /* deal with the length of this later - MJD */
9602 ret = reganode(pRExC_state, EVAL, n);
9603 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9604 Set_Node_Offset(ret, parse_start);
9607 case '(': /* (?(?{...})...) and (?(?=...)...) */
9610 if (RExC_parse[0] == '?') { /* (?(?...)) */
9611 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9612 || RExC_parse[1] == '<'
9613 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9617 ret = reg_node(pRExC_state, LOGICAL);
9621 tail = reg(pRExC_state, 1, &flag, depth+1);
9622 if (flag & RESTART_UTF8) {
9623 *flagp = RESTART_UTF8;
9626 REGTAIL(pRExC_state, ret, tail);
9630 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9631 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9633 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9634 char *name_start= RExC_parse++;
9636 SV *sv_dat=reg_scan_name(pRExC_state,
9637 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9638 if (RExC_parse == name_start || *RExC_parse != ch)
9639 vFAIL2("Sequence (?(%c... not terminated",
9640 (ch == '>' ? '<' : ch));
9643 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9644 RExC_rxi->data->data[num]=(void*)sv_dat;
9645 SvREFCNT_inc_simple_void(sv_dat);
9647 ret = reganode(pRExC_state,NGROUPP,num);
9648 goto insert_if_check_paren;
9650 else if (RExC_parse[0] == 'D' &&
9651 RExC_parse[1] == 'E' &&
9652 RExC_parse[2] == 'F' &&
9653 RExC_parse[3] == 'I' &&
9654 RExC_parse[4] == 'N' &&
9655 RExC_parse[5] == 'E')
9657 ret = reganode(pRExC_state,DEFINEP,0);
9660 goto insert_if_check_paren;
9662 else if (RExC_parse[0] == 'R') {
9665 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9666 parno = atoi(RExC_parse++);
9667 while (isDIGIT(*RExC_parse))
9669 } else if (RExC_parse[0] == '&') {
9672 sv_dat = reg_scan_name(pRExC_state,
9673 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9674 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9676 ret = reganode(pRExC_state,INSUBP,parno);
9677 goto insert_if_check_paren;
9679 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9683 parno = atoi(RExC_parse++);
9685 while (isDIGIT(*RExC_parse))
9687 ret = reganode(pRExC_state, GROUPP, parno);
9689 insert_if_check_paren:
9690 if (*(tmp = nextchar(pRExC_state)) != ')') {
9691 /* nextchar also skips comments, so undo its work
9692 * and skip over the the next character.
9695 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9696 vFAIL("Switch condition not recognized");
9699 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9700 br = regbranch(pRExC_state, &flags, 1,depth+1);
9702 if (flags & RESTART_UTF8) {
9703 *flagp = RESTART_UTF8;
9706 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9709 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9710 c = *nextchar(pRExC_state);
9715 vFAIL("(?(DEFINE)....) does not allow branches");
9716 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9717 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9718 if (flags & RESTART_UTF8) {
9719 *flagp = RESTART_UTF8;
9722 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9725 REGTAIL(pRExC_state, ret, lastbr);
9728 c = *nextchar(pRExC_state);
9733 vFAIL("Switch (?(condition)... contains too many branches");
9734 ender = reg_node(pRExC_state, TAIL);
9735 REGTAIL(pRExC_state, br, ender);
9737 REGTAIL(pRExC_state, lastbr, ender);
9738 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9741 REGTAIL(pRExC_state, ret, ender);
9742 RExC_size++; /* XXX WHY do we need this?!!
9743 For large programs it seems to be required
9744 but I can't figure out why. -- dmq*/
9748 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9749 vFAIL("Unknown switch condition (?(...))");
9752 case '[': /* (?[ ... ]) */
9753 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9756 RExC_parse--; /* for vFAIL to print correctly */
9757 vFAIL("Sequence (? incomplete");
9759 default: /* e.g., (?i) */
9762 parse_lparen_question_flags(pRExC_state);
9763 if (UCHARAT(RExC_parse) != ':') {
9764 nextchar(pRExC_state);
9769 nextchar(pRExC_state);
9779 ret = reganode(pRExC_state, OPEN, parno);
9782 RExC_nestroot = parno;
9783 if (RExC_seen & REG_SEEN_RECURSE
9784 && !RExC_open_parens[parno-1])
9786 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9787 "Setting open paren #%"IVdf" to %d\n",
9788 (IV)parno, REG_NODE_NUM(ret)));
9789 RExC_open_parens[parno-1]= ret;
9792 Set_Node_Length(ret, 1); /* MJD */
9793 Set_Node_Offset(ret, RExC_parse); /* MJD */
9801 /* Pick up the branches, linking them together. */
9802 parse_start = RExC_parse; /* MJD */
9803 br = regbranch(pRExC_state, &flags, 1,depth+1);
9805 /* branch_len = (paren != 0); */
9808 if (flags & RESTART_UTF8) {
9809 *flagp = RESTART_UTF8;
9812 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9814 if (*RExC_parse == '|') {
9815 if (!SIZE_ONLY && RExC_extralen) {
9816 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9819 reginsert(pRExC_state, BRANCH, br, depth+1);
9820 Set_Node_Length(br, paren != 0);
9821 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9825 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9827 else if (paren == ':') {
9828 *flagp |= flags&SIMPLE;
9830 if (is_open) { /* Starts with OPEN. */
9831 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9833 else if (paren != '?') /* Not Conditional */
9835 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9837 while (*RExC_parse == '|') {
9838 if (!SIZE_ONLY && RExC_extralen) {
9839 ender = reganode(pRExC_state, LONGJMP,0);
9840 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9843 RExC_extralen += 2; /* Account for LONGJMP. */
9844 nextchar(pRExC_state);
9846 if (RExC_npar > after_freeze)
9847 after_freeze = RExC_npar;
9848 RExC_npar = freeze_paren;
9850 br = regbranch(pRExC_state, &flags, 0, depth+1);
9853 if (flags & RESTART_UTF8) {
9854 *flagp = RESTART_UTF8;
9857 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9859 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9861 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9864 if (have_branch || paren != ':') {
9865 /* Make a closing node, and hook it on the end. */
9868 ender = reg_node(pRExC_state, TAIL);
9871 ender = reganode(pRExC_state, CLOSE, parno);
9872 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9873 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9874 "Setting close paren #%"IVdf" to %d\n",
9875 (IV)parno, REG_NODE_NUM(ender)));
9876 RExC_close_parens[parno-1]= ender;
9877 if (RExC_nestroot == parno)
9880 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9881 Set_Node_Length(ender,1); /* MJD */
9887 *flagp &= ~HASWIDTH;
9890 ender = reg_node(pRExC_state, SUCCEED);
9893 ender = reg_node(pRExC_state, END);
9895 assert(!RExC_opend); /* there can only be one! */
9900 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9901 SV * const mysv_val1=sv_newmortal();
9902 SV * const mysv_val2=sv_newmortal();
9903 DEBUG_PARSE_MSG("lsbr");
9904 regprop(RExC_rx, mysv_val1, lastbr);
9905 regprop(RExC_rx, mysv_val2, ender);
9906 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9907 SvPV_nolen_const(mysv_val1),
9908 (IV)REG_NODE_NUM(lastbr),
9909 SvPV_nolen_const(mysv_val2),
9910 (IV)REG_NODE_NUM(ender),
9911 (IV)(ender - lastbr)
9914 REGTAIL(pRExC_state, lastbr, ender);
9916 if (have_branch && !SIZE_ONLY) {
9919 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9921 /* Hook the tails of the branches to the closing node. */
9922 for (br = ret; br; br = regnext(br)) {
9923 const U8 op = PL_regkind[OP(br)];
9925 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9926 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9929 else if (op == BRANCHJ) {
9930 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9931 /* for now we always disable this optimisation * /
9932 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9938 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9939 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9940 SV * const mysv_val1=sv_newmortal();
9941 SV * const mysv_val2=sv_newmortal();
9942 DEBUG_PARSE_MSG("NADA");
9943 regprop(RExC_rx, mysv_val1, ret);
9944 regprop(RExC_rx, mysv_val2, ender);
9945 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9946 SvPV_nolen_const(mysv_val1),
9947 (IV)REG_NODE_NUM(ret),
9948 SvPV_nolen_const(mysv_val2),
9949 (IV)REG_NODE_NUM(ender),
9954 if (OP(ender) == TAIL) {
9959 for ( opt= br + 1; opt < ender ; opt++ )
9961 NEXT_OFF(br)= ender - br;
9969 static const char parens[] = "=!<,>";
9971 if (paren && (p = strchr(parens, paren))) {
9972 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9973 int flag = (p - parens) > 1;
9976 node = SUSPEND, flag = 0;
9977 reginsert(pRExC_state, node,ret, depth+1);
9978 Set_Node_Cur_Length(ret, parse_start);
9979 Set_Node_Offset(ret, parse_start + 1);
9981 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9985 /* Check for proper termination. */
9987 /* restore original flags, but keep (?p) */
9988 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9989 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9990 RExC_parse = oregcomp_parse;
9991 vFAIL("Unmatched (");
9994 else if (!paren && RExC_parse < RExC_end) {
9995 if (*RExC_parse == ')') {
9997 vFAIL("Unmatched )");
10000 FAIL("Junk on end of regexp"); /* "Can't happen". */
10001 assert(0); /* NOTREACHED */
10004 if (RExC_in_lookbehind) {
10005 RExC_in_lookbehind--;
10007 if (after_freeze > RExC_npar)
10008 RExC_npar = after_freeze;
10013 - regbranch - one alternative of an | operator
10015 * Implements the concatenation operator.
10017 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10021 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10025 regnode *chain = NULL;
10027 I32 flags = 0, c = 0;
10028 GET_RE_DEBUG_FLAGS_DECL;
10030 PERL_ARGS_ASSERT_REGBRANCH;
10032 DEBUG_PARSE("brnc");
10037 if (!SIZE_ONLY && RExC_extralen)
10038 ret = reganode(pRExC_state, BRANCHJ,0);
10040 ret = reg_node(pRExC_state, BRANCH);
10041 Set_Node_Length(ret, 1);
10045 if (!first && SIZE_ONLY)
10046 RExC_extralen += 1; /* BRANCHJ */
10048 *flagp = WORST; /* Tentatively. */
10051 nextchar(pRExC_state);
10052 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10053 flags &= ~TRYAGAIN;
10054 latest = regpiece(pRExC_state, &flags,depth+1);
10055 if (latest == NULL) {
10056 if (flags & TRYAGAIN)
10058 if (flags & RESTART_UTF8) {
10059 *flagp = RESTART_UTF8;
10062 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10064 else if (ret == NULL)
10066 *flagp |= flags&(HASWIDTH|POSTPONED);
10067 if (chain == NULL) /* First piece. */
10068 *flagp |= flags&SPSTART;
10071 REGTAIL(pRExC_state, chain, latest);
10076 if (chain == NULL) { /* Loop ran zero times. */
10077 chain = reg_node(pRExC_state, NOTHING);
10082 *flagp |= flags&SIMPLE;
10089 - regpiece - something followed by possible [*+?]
10091 * Note that the branching code sequences used for ? and the general cases
10092 * of * and + are somewhat optimized: they use the same NOTHING node as
10093 * both the endmarker for their branch list and the body of the last branch.
10094 * It might seem that this node could be dispensed with entirely, but the
10095 * endmarker role is not redundant.
10097 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10099 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10103 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10110 const char * const origparse = RExC_parse;
10112 I32 max = REG_INFTY;
10113 #ifdef RE_TRACK_PATTERN_OFFSETS
10116 const char *maxpos = NULL;
10118 /* Save the original in case we change the emitted regop to a FAIL. */
10119 regnode * const orig_emit = RExC_emit;
10121 GET_RE_DEBUG_FLAGS_DECL;
10123 PERL_ARGS_ASSERT_REGPIECE;
10125 DEBUG_PARSE("piec");
10127 ret = regatom(pRExC_state, &flags,depth+1);
10129 if (flags & (TRYAGAIN|RESTART_UTF8))
10130 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10132 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10138 if (op == '{' && regcurly(RExC_parse, FALSE)) {
10140 #ifdef RE_TRACK_PATTERN_OFFSETS
10141 parse_start = RExC_parse; /* MJD */
10143 next = RExC_parse + 1;
10144 while (isDIGIT(*next) || *next == ',') {
10145 if (*next == ',') {
10153 if (*next == '}') { /* got one */
10157 min = atoi(RExC_parse);
10158 if (*maxpos == ',')
10161 maxpos = RExC_parse;
10162 max = atoi(maxpos);
10163 if (!max && *maxpos != '0')
10164 max = REG_INFTY; /* meaning "infinity" */
10165 else if (max >= REG_INFTY)
10166 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10168 nextchar(pRExC_state);
10169 if (max < min) { /* If can't match, warn and optimize to fail
10172 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10174 /* We can't back off the size because we have to reserve
10175 * enough space for all the things we are about to throw
10176 * away, but we can shrink it by the ammount we are about
10177 * to re-use here */
10178 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10181 RExC_emit = orig_emit;
10183 ret = reg_node(pRExC_state, OPFAIL);
10186 else if (min == max
10187 && RExC_parse < RExC_end
10188 && (*RExC_parse == '?' || *RExC_parse == '+'))
10191 ckWARN2reg(RExC_parse + 1,
10192 "Useless use of greediness modifier '%c'",
10195 /* Absorb the modifier, so later code doesn't see nor use
10197 nextchar(pRExC_state);
10201 if ((flags&SIMPLE)) {
10202 RExC_naughty += 2 + RExC_naughty / 2;
10203 reginsert(pRExC_state, CURLY, ret, depth+1);
10204 Set_Node_Offset(ret, parse_start+1); /* MJD */
10205 Set_Node_Cur_Length(ret, parse_start);
10208 regnode * const w = reg_node(pRExC_state, WHILEM);
10211 REGTAIL(pRExC_state, ret, w);
10212 if (!SIZE_ONLY && RExC_extralen) {
10213 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10214 reginsert(pRExC_state, NOTHING,ret, depth+1);
10215 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10217 reginsert(pRExC_state, CURLYX,ret, depth+1);
10219 Set_Node_Offset(ret, parse_start+1);
10220 Set_Node_Length(ret,
10221 op == '{' ? (RExC_parse - parse_start) : 1);
10223 if (!SIZE_ONLY && RExC_extralen)
10224 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10225 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10227 RExC_whilem_seen++, RExC_extralen += 3;
10228 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10235 *flagp |= HASWIDTH;
10237 ARG1_SET(ret, (U16)min);
10238 ARG2_SET(ret, (U16)max);
10245 if (!ISMULT1(op)) {
10250 #if 0 /* Now runtime fix should be reliable. */
10252 /* if this is reinstated, don't forget to put this back into perldiag:
10254 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10256 (F) The part of the regexp subject to either the * or + quantifier
10257 could match an empty string. The {#} shows in the regular
10258 expression about where the problem was discovered.
10262 if (!(flags&HASWIDTH) && op != '?')
10263 vFAIL("Regexp *+ operand could be empty");
10266 #ifdef RE_TRACK_PATTERN_OFFSETS
10267 parse_start = RExC_parse;
10269 nextchar(pRExC_state);
10271 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10273 if (op == '*' && (flags&SIMPLE)) {
10274 reginsert(pRExC_state, STAR, ret, depth+1);
10278 else if (op == '*') {
10282 else if (op == '+' && (flags&SIMPLE)) {
10283 reginsert(pRExC_state, PLUS, ret, depth+1);
10287 else if (op == '+') {
10291 else if (op == '?') {
10296 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10297 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10298 ckWARN2reg(RExC_parse,
10299 "%"UTF8f" matches null string many times",
10300 UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10302 (void)ReREFCNT_inc(RExC_rx_sv);
10305 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10306 nextchar(pRExC_state);
10307 reginsert(pRExC_state, MINMOD, ret, depth+1);
10308 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10311 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10313 nextchar(pRExC_state);
10314 ender = reg_node(pRExC_state, SUCCEED);
10315 REGTAIL(pRExC_state, ret, ender);
10316 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10318 ender = reg_node(pRExC_state, TAIL);
10319 REGTAIL(pRExC_state, ret, ender);
10322 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10324 vFAIL("Nested quantifiers");
10331 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10332 const bool strict /* Apply stricter parsing rules? */
10336 /* This is expected to be called by a parser routine that has recognized '\N'
10337 and needs to handle the rest. RExC_parse is expected to point at the first
10338 char following the N at the time of the call. On successful return,
10339 RExC_parse has been updated to point to just after the sequence identified
10340 by this routine, and <*flagp> has been updated.
10342 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10345 \N may begin either a named sequence, or if outside a character class, mean
10346 to match a non-newline. For non single-quoted regexes, the tokenizer has
10347 attempted to decide which, and in the case of a named sequence, converted it
10348 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10349 where c1... are the characters in the sequence. For single-quoted regexes,
10350 the tokenizer passes the \N sequence through unchanged; this code will not
10351 attempt to determine this nor expand those, instead raising a syntax error.
10352 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10353 or there is no '}', it signals that this \N occurrence means to match a
10356 Only the \N{U+...} form should occur in a character class, for the same
10357 reason that '.' inside a character class means to just match a period: it
10358 just doesn't make sense.
10360 The function raises an error (via vFAIL), and doesn't return for various
10361 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10362 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10363 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10364 only possible if node_p is non-NULL.
10367 If <valuep> is non-null, it means the caller can accept an input sequence
10368 consisting of a just a single code point; <*valuep> is set to that value
10369 if the input is such.
10371 If <node_p> is non-null it signifies that the caller can accept any other
10372 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10374 1) \N means not-a-NL: points to a newly created REG_ANY node;
10375 2) \N{}: points to a new NOTHING node;
10376 3) otherwise: points to a new EXACT node containing the resolved
10378 Note that FALSE is returned for single code point sequences if <valuep> is
10382 char * endbrace; /* '}' following the name */
10384 char *endchar; /* Points to '.' or '}' ending cur char in the input
10386 bool has_multiple_chars; /* true if the input stream contains a sequence of
10387 more than one character */
10389 GET_RE_DEBUG_FLAGS_DECL;
10391 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10393 GET_RE_DEBUG_FLAGS;
10395 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10397 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10398 * modifier. The other meaning does not, so use a temporary until we find
10399 * out which we are being called with */
10400 p = (RExC_flags & RXf_PMf_EXTENDED)
10401 ? regwhite( pRExC_state, RExC_parse )
10404 /* Disambiguate between \N meaning a named character versus \N meaning
10405 * [^\n]. The former is assumed when it can't be the latter. */
10406 if (*p != '{' || regcurly(p, FALSE)) {
10409 /* no bare \N allowed in a charclass */
10410 if (in_char_class) {
10411 vFAIL("\\N in a character class must be a named character: \\N{...}");
10415 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10417 nextchar(pRExC_state);
10418 *node_p = reg_node(pRExC_state, REG_ANY);
10419 *flagp |= HASWIDTH|SIMPLE;
10421 Set_Node_Length(*node_p, 1); /* MJD */
10425 /* Here, we have decided it should be a named character or sequence */
10427 /* The test above made sure that the next real character is a '{', but
10428 * under the /x modifier, it could be separated by space (or a comment and
10429 * \n) and this is not allowed (for consistency with \x{...} and the
10430 * tokenizer handling of \N{NAME}). */
10431 if (*RExC_parse != '{') {
10432 vFAIL("Missing braces on \\N{}");
10435 RExC_parse++; /* Skip past the '{' */
10437 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10438 || ! (endbrace == RExC_parse /* nothing between the {} */
10439 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
10440 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10442 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10443 vFAIL("\\N{NAME} must be resolved by the lexer");
10446 if (endbrace == RExC_parse) { /* empty: \N{} */
10449 *node_p = reg_node(pRExC_state,NOTHING);
10451 else if (in_char_class) {
10452 if (SIZE_ONLY && in_char_class) {
10454 RExC_parse++; /* Position after the "}" */
10455 vFAIL("Zero length \\N{}");
10458 ckWARNreg(RExC_parse,
10459 "Ignoring zero length \\N{} in character class");
10467 nextchar(pRExC_state);
10471 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10472 RExC_parse += 2; /* Skip past the 'U+' */
10474 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10476 /* Code points are separated by dots. If none, there is only one code
10477 * point, and is terminated by the brace */
10478 has_multiple_chars = (endchar < endbrace);
10480 if (valuep && (! has_multiple_chars || in_char_class)) {
10481 /* We only pay attention to the first char of
10482 multichar strings being returned in char classes. I kinda wonder
10483 if this makes sense as it does change the behaviour
10484 from earlier versions, OTOH that behaviour was broken
10485 as well. XXX Solution is to recharacterize as
10486 [rest-of-class]|multi1|multi2... */
10488 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10489 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10490 | PERL_SCAN_DISALLOW_PREFIX
10491 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10493 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10495 /* The tokenizer should have guaranteed validity, but it's possible to
10496 * bypass it by using single quoting, so check */
10497 if (length_of_hex == 0
10498 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10500 RExC_parse += length_of_hex; /* Includes all the valid */
10501 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10502 ? UTF8SKIP(RExC_parse)
10504 /* Guard against malformed utf8 */
10505 if (RExC_parse >= endchar) {
10506 RExC_parse = endchar;
10508 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10511 if (in_char_class && has_multiple_chars) {
10513 RExC_parse = endbrace;
10514 vFAIL("\\N{} in character class restricted to one character");
10517 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10521 RExC_parse = endbrace + 1;
10523 else if (! node_p || ! has_multiple_chars) {
10525 /* Here, the input is legal, but not according to the caller's
10526 * options. We fail without advancing the parse, so that the
10527 * caller can try again */
10533 /* What is done here is to convert this to a sub-pattern of the form
10534 * (?:\x{char1}\x{char2}...)
10535 * and then call reg recursively. That way, it retains its atomicness,
10536 * while not having to worry about special handling that some code
10537 * points may have. toke.c has converted the original Unicode values
10538 * to native, so that we can just pass on the hex values unchanged. We
10539 * do have to set a flag to keep recoding from happening in the
10542 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10544 char *orig_end = RExC_end;
10547 while (RExC_parse < endbrace) {
10549 /* Convert to notation the rest of the code understands */
10550 sv_catpv(substitute_parse, "\\x{");
10551 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10552 sv_catpv(substitute_parse, "}");
10554 /* Point to the beginning of the next character in the sequence. */
10555 RExC_parse = endchar + 1;
10556 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10558 sv_catpv(substitute_parse, ")");
10560 RExC_parse = SvPV(substitute_parse, len);
10562 /* Don't allow empty number */
10564 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10566 RExC_end = RExC_parse + len;
10568 /* The values are Unicode, and therefore not subject to recoding */
10569 RExC_override_recoding = 1;
10571 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10572 if (flags & RESTART_UTF8) {
10573 *flagp = RESTART_UTF8;
10576 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10579 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10581 RExC_parse = endbrace;
10582 RExC_end = orig_end;
10583 RExC_override_recoding = 0;
10585 nextchar(pRExC_state);
10595 * It returns the code point in utf8 for the value in *encp.
10596 * value: a code value in the source encoding
10597 * encp: a pointer to an Encode object
10599 * If the result from Encode is not a single character,
10600 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10603 S_reg_recode(pTHX_ const char value, SV **encp)
10606 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10607 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10608 const STRLEN newlen = SvCUR(sv);
10609 UV uv = UNICODE_REPLACEMENT;
10611 PERL_ARGS_ASSERT_REG_RECODE;
10615 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10618 if (!newlen || numlen != newlen) {
10619 uv = UNICODE_REPLACEMENT;
10625 PERL_STATIC_INLINE U8
10626 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10630 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10636 op = get_regex_charset(RExC_flags);
10637 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10638 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10639 been, so there is no hole */
10642 return op + EXACTF;
10645 PERL_STATIC_INLINE void
10646 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10648 /* This knows the details about sizing an EXACTish node, setting flags for
10649 * it (by setting <*flagp>, and potentially populating it with a single
10652 * If <len> (the length in bytes) is non-zero, this function assumes that
10653 * the node has already been populated, and just does the sizing. In this
10654 * case <code_point> should be the final code point that has already been
10655 * placed into the node. This value will be ignored except that under some
10656 * circumstances <*flagp> is set based on it.
10658 * If <len> is zero, the function assumes that the node is to contain only
10659 * the single character given by <code_point> and calculates what <len>
10660 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10661 * additionally will populate the node's STRING with <code_point>, if <len>
10662 * is 0. In both cases <*flagp> is appropriately set
10664 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10665 * 255, must be folded (the former only when the rules indicate it can
10668 bool len_passed_in = cBOOL(len != 0);
10669 U8 character[UTF8_MAXBYTES_CASE+1];
10671 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10673 if (! len_passed_in) {
10675 if (FOLD && (! LOC || code_point > 255)) {
10676 _to_uni_fold_flags(code_point,
10679 FOLD_FLAGS_FULL | ((LOC)
10680 ? FOLD_FLAGS_LOCALE
10681 : (ASCII_FOLD_RESTRICTED)
10682 ? FOLD_FLAGS_NOMIX_ASCII
10686 uvchr_to_utf8( character, code_point);
10687 len = UTF8SKIP(character);
10691 || code_point != LATIN_SMALL_LETTER_SHARP_S
10692 || ASCII_FOLD_RESTRICTED
10693 || ! AT_LEAST_UNI_SEMANTICS)
10695 *character = (U8) code_point;
10700 *(character + 1) = 's';
10706 RExC_size += STR_SZ(len);
10709 RExC_emit += STR_SZ(len);
10710 STR_LEN(node) = len;
10711 if (! len_passed_in) {
10712 Copy((char *) character, STRING(node), len, char);
10716 *flagp |= HASWIDTH;
10718 /* A single character node is SIMPLE, except for the special-cased SHARP S
10720 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10721 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10722 || ! FOLD || ! DEPENDS_SEMANTICS))
10729 /* return atoi(p), unless it's too big to sensibly be a backref,
10730 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10733 S_backref_value(char *p)
10737 for (;isDIGIT(*q); q++); /* calculate length of num */
10738 if (q - p == 0 || q - p > 9)
10745 - regatom - the lowest level
10747 Try to identify anything special at the start of the pattern. If there
10748 is, then handle it as required. This may involve generating a single regop,
10749 such as for an assertion; or it may involve recursing, such as to
10750 handle a () structure.
10752 If the string doesn't start with something special then we gobble up
10753 as much literal text as we can.
10755 Once we have been able to handle whatever type of thing started the
10756 sequence, we return.
10758 Note: we have to be careful with escapes, as they can be both literal
10759 and special, and in the case of \10 and friends, context determines which.
10761 A summary of the code structure is:
10763 switch (first_byte) {
10764 cases for each special:
10765 handle this special;
10768 switch (2nd byte) {
10769 cases for each unambiguous special:
10770 handle this special;
10772 cases for each ambigous special/literal:
10774 if (special) handle here
10776 default: // unambiguously literal:
10779 default: // is a literal char
10782 create EXACTish node for literal;
10783 while (more input and node isn't full) {
10784 switch (input_byte) {
10785 cases for each special;
10786 make sure parse pointer is set so that the next call to
10787 regatom will see this special first
10788 goto loopdone; // EXACTish node terminated by prev. char
10790 append char to EXACTISH node;
10792 get next input byte;
10796 return the generated node;
10798 Specifically there are two separate switches for handling
10799 escape sequences, with the one for handling literal escapes requiring
10800 a dummy entry for all of the special escapes that are actually handled
10803 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10805 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10807 Otherwise does not return NULL.
10811 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10814 regnode *ret = NULL;
10816 char *parse_start = RExC_parse;
10820 GET_RE_DEBUG_FLAGS_DECL;
10822 *flagp = WORST; /* Tentatively. */
10824 DEBUG_PARSE("atom");
10826 PERL_ARGS_ASSERT_REGATOM;
10829 switch ((U8)*RExC_parse) {
10831 RExC_seen_zerolen++;
10832 nextchar(pRExC_state);
10833 if (RExC_flags & RXf_PMf_MULTILINE)
10834 ret = reg_node(pRExC_state, MBOL);
10835 else if (RExC_flags & RXf_PMf_SINGLELINE)
10836 ret = reg_node(pRExC_state, SBOL);
10838 ret = reg_node(pRExC_state, BOL);
10839 Set_Node_Length(ret, 1); /* MJD */
10842 nextchar(pRExC_state);
10844 RExC_seen_zerolen++;
10845 if (RExC_flags & RXf_PMf_MULTILINE)
10846 ret = reg_node(pRExC_state, MEOL);
10847 else if (RExC_flags & RXf_PMf_SINGLELINE)
10848 ret = reg_node(pRExC_state, SEOL);
10850 ret = reg_node(pRExC_state, EOL);
10851 Set_Node_Length(ret, 1); /* MJD */
10854 nextchar(pRExC_state);
10855 if (RExC_flags & RXf_PMf_SINGLELINE)
10856 ret = reg_node(pRExC_state, SANY);
10858 ret = reg_node(pRExC_state, REG_ANY);
10859 *flagp |= HASWIDTH|SIMPLE;
10861 Set_Node_Length(ret, 1); /* MJD */
10865 char * const oregcomp_parse = ++RExC_parse;
10866 ret = regclass(pRExC_state, flagp,depth+1,
10867 FALSE, /* means parse the whole char class */
10868 TRUE, /* allow multi-char folds */
10869 FALSE, /* don't silence non-portable warnings. */
10871 if (*RExC_parse != ']') {
10872 RExC_parse = oregcomp_parse;
10873 vFAIL("Unmatched [");
10876 if (*flagp & RESTART_UTF8)
10878 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10881 nextchar(pRExC_state);
10882 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10886 nextchar(pRExC_state);
10887 ret = reg(pRExC_state, 2, &flags,depth+1);
10889 if (flags & TRYAGAIN) {
10890 if (RExC_parse == RExC_end) {
10891 /* Make parent create an empty node if needed. */
10892 *flagp |= TRYAGAIN;
10897 if (flags & RESTART_UTF8) {
10898 *flagp = RESTART_UTF8;
10901 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10903 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10907 if (flags & TRYAGAIN) {
10908 *flagp |= TRYAGAIN;
10911 vFAIL("Internal urp");
10912 /* Supposed to be caught earlier. */
10915 if (!regcurly(RExC_parse, FALSE)) {
10924 vFAIL("Quantifier follows nothing");
10929 This switch handles escape sequences that resolve to some kind
10930 of special regop and not to literal text. Escape sequnces that
10931 resolve to literal text are handled below in the switch marked
10934 Every entry in this switch *must* have a corresponding entry
10935 in the literal escape switch. However, the opposite is not
10936 required, as the default for this switch is to jump to the
10937 literal text handling code.
10939 switch ((U8)*++RExC_parse) {
10941 /* Special Escapes */
10943 RExC_seen_zerolen++;
10944 ret = reg_node(pRExC_state, SBOL);
10946 goto finish_meta_pat;
10948 ret = reg_node(pRExC_state, GPOS);
10949 RExC_seen |= REG_SEEN_GPOS;
10951 goto finish_meta_pat;
10953 RExC_seen_zerolen++;
10954 ret = reg_node(pRExC_state, KEEPS);
10956 /* XXX:dmq : disabling in-place substitution seems to
10957 * be necessary here to avoid cases of memory corruption, as
10958 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10960 RExC_seen |= REG_SEEN_LOOKBEHIND;
10961 goto finish_meta_pat;
10963 ret = reg_node(pRExC_state, SEOL);
10965 RExC_seen_zerolen++; /* Do not optimize RE away */
10966 goto finish_meta_pat;
10968 ret = reg_node(pRExC_state, EOS);
10970 RExC_seen_zerolen++; /* Do not optimize RE away */
10971 goto finish_meta_pat;
10973 ret = reg_node(pRExC_state, CANY);
10974 RExC_seen |= REG_SEEN_CANY;
10975 *flagp |= HASWIDTH|SIMPLE;
10976 goto finish_meta_pat;
10978 ret = reg_node(pRExC_state, CLUMP);
10979 *flagp |= HASWIDTH;
10980 goto finish_meta_pat;
10986 arg = ANYOF_WORDCHAR;
10990 RExC_seen_zerolen++;
10991 RExC_seen |= REG_SEEN_LOOKBEHIND;
10992 op = BOUND + get_regex_charset(RExC_flags);
10993 if (op > BOUNDA) { /* /aa is same as /a */
10996 ret = reg_node(pRExC_state, op);
10997 FLAGS(ret) = get_regex_charset(RExC_flags);
10999 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11000 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
11002 goto finish_meta_pat;
11004 RExC_seen_zerolen++;
11005 RExC_seen |= REG_SEEN_LOOKBEHIND;
11006 op = NBOUND + get_regex_charset(RExC_flags);
11007 if (op > NBOUNDA) { /* /aa is same as /a */
11010 ret = reg_node(pRExC_state, op);
11011 FLAGS(ret) = get_regex_charset(RExC_flags);
11013 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11014 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
11016 goto finish_meta_pat;
11026 ret = reg_node(pRExC_state, LNBREAK);
11027 *flagp |= HASWIDTH|SIMPLE;
11028 goto finish_meta_pat;
11036 goto join_posix_op_known;
11042 arg = ANYOF_VERTWS;
11044 goto join_posix_op_known;
11054 op = POSIXD + get_regex_charset(RExC_flags);
11055 if (op > POSIXA) { /* /aa is same as /a */
11059 join_posix_op_known:
11062 op += NPOSIXD - POSIXD;
11065 ret = reg_node(pRExC_state, op);
11067 FLAGS(ret) = namedclass_to_classnum(arg);
11070 *flagp |= HASWIDTH|SIMPLE;
11074 nextchar(pRExC_state);
11075 Set_Node_Length(ret, 2); /* MJD */
11081 char* parse_start = RExC_parse - 2;
11086 ret = regclass(pRExC_state, flagp,depth+1,
11087 TRUE, /* means just parse this element */
11088 FALSE, /* don't allow multi-char folds */
11089 FALSE, /* don't silence non-portable warnings.
11090 It would be a bug if these returned
11093 /* regclass() can only return RESTART_UTF8 if multi-char folds
11096 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11101 Set_Node_Offset(ret, parse_start + 2);
11102 Set_Node_Cur_Length(ret, parse_start);
11103 nextchar(pRExC_state);
11107 /* Handle \N and \N{NAME} with multiple code points here and not
11108 * below because it can be multicharacter. join_exact() will join
11109 * them up later on. Also this makes sure that things like
11110 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11111 * The options to the grok function call causes it to fail if the
11112 * sequence is just a single code point. We then go treat it as
11113 * just another character in the current EXACT node, and hence it
11114 * gets uniform treatment with all the other characters. The
11115 * special treatment for quantifiers is not needed for such single
11116 * character sequences */
11118 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11119 FALSE /* not strict */ )) {
11120 if (*flagp & RESTART_UTF8)
11126 case 'k': /* Handle \k<NAME> and \k'NAME' */
11129 char ch= RExC_parse[1];
11130 if (ch != '<' && ch != '\'' && ch != '{') {
11132 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11133 vFAIL2("Sequence %.2s... not terminated",parse_start);
11135 /* this pretty much dupes the code for (?P=...) in reg(), if
11136 you change this make sure you change that */
11137 char* name_start = (RExC_parse += 2);
11139 SV *sv_dat = reg_scan_name(pRExC_state,
11140 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11141 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11142 if (RExC_parse == name_start || *RExC_parse != ch)
11143 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11144 vFAIL2("Sequence %.3s... not terminated",parse_start);
11147 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11148 RExC_rxi->data->data[num]=(void*)sv_dat;
11149 SvREFCNT_inc_simple_void(sv_dat);
11153 ret = reganode(pRExC_state,
11156 : (ASCII_FOLD_RESTRICTED)
11158 : (AT_LEAST_UNI_SEMANTICS)
11164 *flagp |= HASWIDTH;
11166 /* override incorrect value set in reganode MJD */
11167 Set_Node_Offset(ret, parse_start+1);
11168 Set_Node_Cur_Length(ret, parse_start);
11169 nextchar(pRExC_state);
11175 case '1': case '2': case '3': case '4':
11176 case '5': case '6': case '7': case '8': case '9':
11181 if (*RExC_parse == 'g') {
11185 if (*RExC_parse == '{') {
11189 if (*RExC_parse == '-') {
11193 if (hasbrace && !isDIGIT(*RExC_parse)) {
11194 if (isrel) RExC_parse--;
11196 goto parse_named_seq;
11199 num = S_backref_value(RExC_parse);
11201 vFAIL("Reference to invalid group 0");
11202 else if (num == I32_MAX) {
11203 if (isDIGIT(*RExC_parse))
11204 vFAIL("Reference to nonexistent group");
11206 vFAIL("Unterminated \\g... pattern");
11210 num = RExC_npar - num;
11212 vFAIL("Reference to nonexistent or unclosed group");
11216 num = S_backref_value(RExC_parse);
11217 /* bare \NNN might be backref or octal */
11218 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11219 && *RExC_parse != '8' && *RExC_parse != '9'))
11220 /* Probably a character specified in octal, e.g. \35 */
11224 /* at this point RExC_parse definitely points to a backref
11227 #ifdef RE_TRACK_PATTERN_OFFSETS
11228 char * const parse_start = RExC_parse - 1; /* MJD */
11230 while (isDIGIT(*RExC_parse))
11233 if (*RExC_parse != '}')
11234 vFAIL("Unterminated \\g{...} pattern");
11238 if (num > (I32)RExC_rx->nparens)
11239 vFAIL("Reference to nonexistent group");
11242 ret = reganode(pRExC_state,
11245 : (ASCII_FOLD_RESTRICTED)
11247 : (AT_LEAST_UNI_SEMANTICS)
11253 *flagp |= HASWIDTH;
11255 /* override incorrect value set in reganode MJD */
11256 Set_Node_Offset(ret, parse_start+1);
11257 Set_Node_Cur_Length(ret, parse_start);
11259 nextchar(pRExC_state);
11264 if (RExC_parse >= RExC_end)
11265 FAIL("Trailing \\");
11268 /* Do not generate "unrecognized" warnings here, we fall
11269 back into the quick-grab loop below */
11276 if (RExC_flags & RXf_PMf_EXTENDED) {
11277 if ( reg_skipcomment( pRExC_state ) )
11284 parse_start = RExC_parse - 1;
11293 #define MAX_NODE_STRING_SIZE 127
11294 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11296 U8 upper_parse = MAX_NODE_STRING_SIZE;
11298 U8 node_type = compute_EXACTish(pRExC_state);
11299 bool next_is_quantifier;
11300 char * oldp = NULL;
11302 /* We can convert EXACTF nodes to EXACTFU if they contain only
11303 * characters that match identically regardless of the target
11304 * string's UTF8ness. The reason to do this is that EXACTF is not
11305 * trie-able, EXACTFU is. (We don't need to figure this out until
11307 bool maybe_exactfu = node_type == EXACTF && PASS2;
11309 /* If a folding node contains only code points that don't
11310 * participate in folds, it can be changed into an EXACT node,
11311 * which allows the optimizer more things to look for */
11314 ret = reg_node(pRExC_state, node_type);
11316 /* In pass1, folded, we use a temporary buffer instead of the
11317 * actual node, as the node doesn't exist yet */
11318 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11324 /* We do the EXACTFish to EXACT node only if folding, and not if in
11325 * locale, as whether a character folds or not isn't known until
11326 * runtime. (And we don't need to figure this out until pass 2) */
11327 maybe_exact = FOLD && ! LOC && PASS2;
11329 /* XXX The node can hold up to 255 bytes, yet this only goes to
11330 * 127. I (khw) do not know why. Keeping it somewhat less than
11331 * 255 allows us to not have to worry about overflow due to
11332 * converting to utf8 and fold expansion, but that value is
11333 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11334 * split up by this limit into a single one using the real max of
11335 * 255. Even at 127, this breaks under rare circumstances. If
11336 * folding, we do not want to split a node at a character that is a
11337 * non-final in a multi-char fold, as an input string could just
11338 * happen to want to match across the node boundary. The join
11339 * would solve that problem if the join actually happens. But a
11340 * series of more than two nodes in a row each of 127 would cause
11341 * the first join to succeed to get to 254, but then there wouldn't
11342 * be room for the next one, which could at be one of those split
11343 * multi-char folds. I don't know of any fool-proof solution. One
11344 * could back off to end with only a code point that isn't such a
11345 * non-final, but it is possible for there not to be any in the
11347 for (p = RExC_parse - 1;
11348 len < upper_parse && p < RExC_end;
11353 if (RExC_flags & RXf_PMf_EXTENDED)
11354 p = regwhite( pRExC_state, p );
11365 /* Literal Escapes Switch
11367 This switch is meant to handle escape sequences that
11368 resolve to a literal character.
11370 Every escape sequence that represents something
11371 else, like an assertion or a char class, is handled
11372 in the switch marked 'Special Escapes' above in this
11373 routine, but also has an entry here as anything that
11374 isn't explicitly mentioned here will be treated as
11375 an unescaped equivalent literal.
11378 switch ((U8)*++p) {
11379 /* These are all the special escapes. */
11380 case 'A': /* Start assertion */
11381 case 'b': case 'B': /* Word-boundary assertion*/
11382 case 'C': /* Single char !DANGEROUS! */
11383 case 'd': case 'D': /* digit class */
11384 case 'g': case 'G': /* generic-backref, pos assertion */
11385 case 'h': case 'H': /* HORIZWS */
11386 case 'k': case 'K': /* named backref, keep marker */
11387 case 'p': case 'P': /* Unicode property */
11388 case 'R': /* LNBREAK */
11389 case 's': case 'S': /* space class */
11390 case 'v': case 'V': /* VERTWS */
11391 case 'w': case 'W': /* word class */
11392 case 'X': /* eXtended Unicode "combining character sequence" */
11393 case 'z': case 'Z': /* End of line/string assertion */
11397 /* Anything after here is an escape that resolves to a
11398 literal. (Except digits, which may or may not)
11404 case 'N': /* Handle a single-code point named character. */
11405 /* The options cause it to fail if a multiple code
11406 * point sequence. Handle those in the switch() above
11408 RExC_parse = p + 1;
11409 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11410 flagp, depth, FALSE,
11411 FALSE /* not strict */ ))
11413 if (*flagp & RESTART_UTF8)
11414 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11415 RExC_parse = p = oldp;
11419 if (ender > 0xff) {
11436 ender = ASCII_TO_NATIVE('\033');
11446 const char* error_msg;
11448 bool valid = grok_bslash_o(&p,
11451 TRUE, /* out warnings */
11452 FALSE, /* not strict */
11453 TRUE, /* Output warnings
11458 RExC_parse = p; /* going to die anyway; point
11459 to exact spot of failure */
11463 if (PL_encoding && ender < 0x100) {
11464 goto recode_encoding;
11466 if (ender > 0xff) {
11473 UV result = UV_MAX; /* initialize to erroneous
11475 const char* error_msg;
11477 bool valid = grok_bslash_x(&p,
11480 TRUE, /* out warnings */
11481 FALSE, /* not strict */
11482 TRUE, /* Output warnings
11487 RExC_parse = p; /* going to die anyway; point
11488 to exact spot of failure */
11493 if (PL_encoding && ender < 0x100) {
11494 goto recode_encoding;
11496 if (ender > 0xff) {
11503 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11505 case '8': case '9': /* must be a backreference */
11508 case '1': case '2': case '3':case '4':
11509 case '5': case '6': case '7':
11510 /* When we parse backslash escapes there is ambiguity
11511 * between backreferences and octal escapes. Any escape
11512 * from \1 - \9 is a backreference, any multi-digit
11513 * escape which does not start with 0 and which when
11514 * evaluated as decimal could refer to an already
11515 * parsed capture buffer is a backslash. Anything else
11518 * Note this implies that \118 could be interpreted as
11519 * 118 OR as "\11" . "8" depending on whether there
11520 * were 118 capture buffers defined already in the
11522 if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11523 { /* Not to be treated as an octal constant, go
11530 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11532 ender = grok_oct(p, &numlen, &flags, NULL);
11533 if (ender > 0xff) {
11537 if (SIZE_ONLY /* like \08, \178 */
11540 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11542 reg_warn_non_literal_string(
11544 form_short_octal_warning(p, numlen));
11547 if (PL_encoding && ender < 0x100)
11548 goto recode_encoding;
11551 if (! RExC_override_recoding) {
11552 SV* enc = PL_encoding;
11553 ender = reg_recode((const char)(U8)ender, &enc);
11554 if (!enc && SIZE_ONLY)
11555 ckWARNreg(p, "Invalid escape in the specified encoding");
11561 FAIL("Trailing \\");
11564 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11565 /* Include any { following the alpha to emphasize
11566 * that it could be part of an escape at some point
11568 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11569 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11571 goto normal_default;
11572 } /* End of switch on '\' */
11574 default: /* A literal character */
11577 && RExC_flags & RXf_PMf_EXTENDED
11578 && ckWARN_d(WARN_DEPRECATED)
11579 && is_PATWS_non_low(p, UTF))
11581 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11582 "Escape literal pattern white space under /x");
11586 if (UTF8_IS_START(*p) && UTF) {
11588 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11589 &numlen, UTF8_ALLOW_DEFAULT);
11595 } /* End of switch on the literal */
11597 /* Here, have looked at the literal character and <ender>
11598 * contains its ordinal, <p> points to the character after it
11601 if ( RExC_flags & RXf_PMf_EXTENDED)
11602 p = regwhite( pRExC_state, p );
11604 /* If the next thing is a quantifier, it applies to this
11605 * character only, which means that this character has to be in
11606 * its own node and can't just be appended to the string in an
11607 * existing node, so if there are already other characters in
11608 * the node, close the node with just them, and set up to do
11609 * this character again next time through, when it will be the
11610 * only thing in its new node */
11611 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11619 const STRLEN unilen = reguni(pRExC_state, ender, s);
11625 /* The loop increments <len> each time, as all but this
11626 * path (and one other) through it add a single byte to
11627 * the EXACTish node. But this one has changed len to
11628 * be the correct final value, so subtract one to
11629 * cancel out the increment that follows */
11633 REGC((char)ender, s++);
11636 else /* FOLD */ if (! ( UTF
11637 /* See comments for join_exact() as to why we fold this
11638 * non-UTF at compile time */
11639 || (node_type == EXACTFU
11640 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11642 if (IS_IN_SOME_FOLD_L1(ender)) {
11643 maybe_exact = FALSE;
11645 /* See if the character's fold differs between /d and
11646 * /u. This includes the multi-char fold SHARP S to
11649 && (PL_fold[ender] != PL_fold_latin1[ender]
11650 || ender == LATIN_SMALL_LETTER_SHARP_S
11652 && isARG2_lower_or_UPPER_ARG1('s', ender)
11653 && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11655 maybe_exactfu = FALSE;
11658 *(s++) = (char) ender;
11662 /* Prime the casefolded buffer. Locale rules, which apply
11663 * only to code points < 256, aren't known until execution,
11664 * so for them, just output the original character using
11665 * utf8. If we start to fold non-UTF patterns, be sure to
11666 * update join_exact() */
11667 if (LOC && ender < 256) {
11668 if (UVCHR_IS_INVARIANT(ender)) {
11672 *s = UTF8_TWO_BYTE_HI(ender);
11673 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11678 UV folded = _to_uni_fold_flags(
11683 | ((LOC) ? FOLD_FLAGS_LOCALE
11684 : (ASCII_FOLD_RESTRICTED)
11685 ? FOLD_FLAGS_NOMIX_ASCII
11689 /* If this node only contains non-folding code points
11690 * so far, see if this new one is also non-folding */
11692 if (folded != ender) {
11693 maybe_exact = FALSE;
11696 /* Here the fold is the original; we have
11697 * to check further to see if anything
11699 if (! PL_utf8_foldable) {
11700 SV* swash = swash_init("utf8",
11702 &PL_sv_undef, 1, 0);
11704 _get_swash_invlist(swash);
11705 SvREFCNT_dec_NN(swash);
11707 if (_invlist_contains_cp(PL_utf8_foldable,
11710 maybe_exact = FALSE;
11718 /* The loop increments <len> each time, as all but this
11719 * path (and one other) through it add a single byte to the
11720 * EXACTish node. But this one has changed len to be the
11721 * correct final value, so subtract one to cancel out the
11722 * increment that follows */
11723 len += foldlen - 1;
11726 if (next_is_quantifier) {
11728 /* Here, the next input is a quantifier, and to get here,
11729 * the current character is the only one in the node.
11730 * Also, here <len> doesn't include the final byte for this
11736 } /* End of loop through literal characters */
11738 /* Here we have either exhausted the input or ran out of room in
11739 * the node. (If we encountered a character that can't be in the
11740 * node, transfer is made directly to <loopdone>, and so we
11741 * wouldn't have fallen off the end of the loop.) In the latter
11742 * case, we artificially have to split the node into two, because
11743 * we just don't have enough space to hold everything. This
11744 * creates a problem if the final character participates in a
11745 * multi-character fold in the non-final position, as a match that
11746 * should have occurred won't, due to the way nodes are matched,
11747 * and our artificial boundary. So back off until we find a non-
11748 * problematic character -- one that isn't at the beginning or
11749 * middle of such a fold. (Either it doesn't participate in any
11750 * folds, or appears only in the final position of all the folds it
11751 * does participate in.) A better solution with far fewer false
11752 * positives, and that would fill the nodes more completely, would
11753 * be to actually have available all the multi-character folds to
11754 * test against, and to back-off only far enough to be sure that
11755 * this node isn't ending with a partial one. <upper_parse> is set
11756 * further below (if we need to reparse the node) to include just
11757 * up through that final non-problematic character that this code
11758 * identifies, so when it is set to less than the full node, we can
11759 * skip the rest of this */
11760 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11762 const STRLEN full_len = len;
11764 assert(len >= MAX_NODE_STRING_SIZE);
11766 /* Here, <s> points to the final byte of the final character.
11767 * Look backwards through the string until find a non-
11768 * problematic character */
11772 /* These two have no multi-char folds to non-UTF characters
11774 if (ASCII_FOLD_RESTRICTED || LOC) {
11778 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11782 if (! PL_NonL1NonFinalFold) {
11783 PL_NonL1NonFinalFold = _new_invlist_C_array(
11784 NonL1_Perl_Non_Final_Folds_invlist);
11787 /* Point to the first byte of the final character */
11788 s = (char *) utf8_hop((U8 *) s, -1);
11790 while (s >= s0) { /* Search backwards until find
11791 non-problematic char */
11792 if (UTF8_IS_INVARIANT(*s)) {
11794 /* There are no ascii characters that participate
11795 * in multi-char folds under /aa. In EBCDIC, the
11796 * non-ascii invariants are all control characters,
11797 * so don't ever participate in any folds. */
11798 if (ASCII_FOLD_RESTRICTED
11799 || ! IS_NON_FINAL_FOLD(*s))
11804 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11806 /* No Latin1 characters participate in multi-char
11807 * folds under /l */
11809 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11815 else if (! _invlist_contains_cp(
11816 PL_NonL1NonFinalFold,
11817 valid_utf8_to_uvchr((U8 *) s, NULL)))
11822 /* Here, the current character is problematic in that
11823 * it does occur in the non-final position of some
11824 * fold, so try the character before it, but have to
11825 * special case the very first byte in the string, so
11826 * we don't read outside the string */
11827 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11828 } /* End of loop backwards through the string */
11830 /* If there were only problematic characters in the string,
11831 * <s> will point to before s0, in which case the length
11832 * should be 0, otherwise include the length of the
11833 * non-problematic character just found */
11834 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11837 /* Here, have found the final character, if any, that is
11838 * non-problematic as far as ending the node without splitting
11839 * it across a potential multi-char fold. <len> contains the
11840 * number of bytes in the node up-to and including that
11841 * character, or is 0 if there is no such character, meaning
11842 * the whole node contains only problematic characters. In
11843 * this case, give up and just take the node as-is. We can't
11848 /* If the node ends in an 's' we make sure it stays EXACTF,
11849 * as if it turns into an EXACTFU, it could later get
11850 * joined with another 's' that would then wrongly match
11852 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11854 maybe_exactfu = FALSE;
11858 /* Here, the node does contain some characters that aren't
11859 * problematic. If one such is the final character in the
11860 * node, we are done */
11861 if (len == full_len) {
11864 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11866 /* If the final character is problematic, but the
11867 * penultimate is not, back-off that last character to
11868 * later start a new node with it */
11873 /* Here, the final non-problematic character is earlier
11874 * in the input than the penultimate character. What we do
11875 * is reparse from the beginning, going up only as far as
11876 * this final ok one, thus guaranteeing that the node ends
11877 * in an acceptable character. The reason we reparse is
11878 * that we know how far in the character is, but we don't
11879 * know how to correlate its position with the input parse.
11880 * An alternate implementation would be to build that
11881 * correlation as we go along during the original parse,
11882 * but that would entail extra work for every node, whereas
11883 * this code gets executed only when the string is too
11884 * large for the node, and the final two characters are
11885 * problematic, an infrequent occurrence. Yet another
11886 * possible strategy would be to save the tail of the
11887 * string, and the next time regatom is called, initialize
11888 * with that. The problem with this is that unless you
11889 * back off one more character, you won't be guaranteed
11890 * regatom will get called again, unless regbranch,
11891 * regpiece ... are also changed. If you do back off that
11892 * extra character, so that there is input guaranteed to
11893 * force calling regatom, you can't handle the case where
11894 * just the first character in the node is acceptable. I
11895 * (khw) decided to try this method which doesn't have that
11896 * pitfall; if performance issues are found, we can do a
11897 * combination of the current approach plus that one */
11903 } /* End of verifying node ends with an appropriate char */
11905 loopdone: /* Jumped to when encounters something that shouldn't be in
11908 /* I (khw) don't know if you can get here with zero length, but the
11909 * old code handled this situation by creating a zero-length EXACT
11910 * node. Might as well be NOTHING instead */
11916 /* If 'maybe_exact' is still set here, means there are no
11917 * code points in the node that participate in folds;
11918 * similarly for 'maybe_exactfu' and code points that match
11919 * differently depending on UTF8ness of the target string
11924 else if (maybe_exactfu) {
11928 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11931 RExC_parse = p - 1;
11932 Set_Node_Cur_Length(ret, parse_start);
11933 nextchar(pRExC_state);
11935 /* len is STRLEN which is unsigned, need to copy to signed */
11938 vFAIL("Internal disaster");
11941 } /* End of label 'defchar:' */
11943 } /* End of giant switch on input character */
11949 S_regwhite( RExC_state_t *pRExC_state, char *p )
11951 const char *e = RExC_end;
11953 PERL_ARGS_ASSERT_REGWHITE;
11958 else if (*p == '#') {
11961 if (*p++ == '\n') {
11967 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11976 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11978 /* Returns the next non-pattern-white space, non-comment character (the
11979 * latter only if 'recognize_comment is true) in the string p, which is
11980 * ended by RExC_end. If there is no line break ending a comment,
11981 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11982 const char *e = RExC_end;
11984 PERL_ARGS_ASSERT_REGPATWS;
11988 if ((len = is_PATWS_safe(p, e, UTF))) {
11991 else if (recognize_comment && *p == '#') {
11995 if (is_LNBREAK_safe(p, e, UTF)) {
12001 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12010 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12012 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12013 * sets up the bitmap and any flags, removing those code points from the
12014 * inversion list, setting it to NULL should it become completely empty */
12016 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12017 assert(PL_regkind[OP(node)] == ANYOF);
12019 ANYOF_BITMAP_ZERO(node);
12020 if (*invlist_ptr) {
12022 /* This gets set if we actually need to modify things */
12023 bool change_invlist = FALSE;
12027 /* Start looking through *invlist_ptr */
12028 invlist_iterinit(*invlist_ptr);
12029 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12033 if (end == UV_MAX && start <= 256) {
12034 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12037 /* Quit if are above what we should change */
12042 change_invlist = TRUE;
12044 /* Set all the bits in the range, up to the max that we are doing */
12045 high = (end < 255) ? end : 255;
12046 for (i = start; i <= (int) high; i++) {
12047 if (! ANYOF_BITMAP_TEST(node, i)) {
12048 ANYOF_BITMAP_SET(node, i);
12052 invlist_iterfinish(*invlist_ptr);
12054 /* Done with loop; remove any code points that are in the bitmap from
12055 * *invlist_ptr; similarly for code points above latin1 if we have a flag
12056 * to match all of them anyways */
12057 if (change_invlist) {
12058 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12060 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12061 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12064 /* If have completely emptied it, remove it completely */
12065 if (_invlist_len(*invlist_ptr) == 0) {
12066 SvREFCNT_dec_NN(*invlist_ptr);
12067 *invlist_ptr = NULL;
12072 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12073 Character classes ([:foo:]) can also be negated ([:^foo:]).
12074 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12075 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12076 but trigger failures because they are currently unimplemented. */
12078 #define POSIXCC_DONE(c) ((c) == ':')
12079 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12080 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12082 PERL_STATIC_INLINE I32
12083 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12086 I32 namedclass = OOB_NAMEDCLASS;
12088 PERL_ARGS_ASSERT_REGPPOSIXCC;
12090 if (value == '[' && RExC_parse + 1 < RExC_end &&
12091 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12092 POSIXCC(UCHARAT(RExC_parse)))
12094 const char c = UCHARAT(RExC_parse);
12095 char* const s = RExC_parse++;
12097 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12099 if (RExC_parse == RExC_end) {
12102 /* Try to give a better location for the error (than the end of
12103 * the string) by looking for the matching ']' */
12105 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12108 vFAIL2("Unmatched '%c' in POSIX class", c);
12110 /* Grandfather lone [:, [=, [. */
12114 const char* const t = RExC_parse++; /* skip over the c */
12117 if (UCHARAT(RExC_parse) == ']') {
12118 const char *posixcc = s + 1;
12119 RExC_parse++; /* skip over the ending ] */
12122 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12123 const I32 skip = t - posixcc;
12125 /* Initially switch on the length of the name. */
12128 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12129 this is the Perl \w
12131 namedclass = ANYOF_WORDCHAR;
12134 /* Names all of length 5. */
12135 /* alnum alpha ascii blank cntrl digit graph lower
12136 print punct space upper */
12137 /* Offset 4 gives the best switch position. */
12138 switch (posixcc[4]) {
12140 if (memEQ(posixcc, "alph", 4)) /* alpha */
12141 namedclass = ANYOF_ALPHA;
12144 if (memEQ(posixcc, "spac", 4)) /* space */
12145 namedclass = ANYOF_PSXSPC;
12148 if (memEQ(posixcc, "grap", 4)) /* graph */
12149 namedclass = ANYOF_GRAPH;
12152 if (memEQ(posixcc, "asci", 4)) /* ascii */
12153 namedclass = ANYOF_ASCII;
12156 if (memEQ(posixcc, "blan", 4)) /* blank */
12157 namedclass = ANYOF_BLANK;
12160 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12161 namedclass = ANYOF_CNTRL;
12164 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12165 namedclass = ANYOF_ALPHANUMERIC;
12168 if (memEQ(posixcc, "lowe", 4)) /* lower */
12169 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12170 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12171 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12174 if (memEQ(posixcc, "digi", 4)) /* digit */
12175 namedclass = ANYOF_DIGIT;
12176 else if (memEQ(posixcc, "prin", 4)) /* print */
12177 namedclass = ANYOF_PRINT;
12178 else if (memEQ(posixcc, "punc", 4)) /* punct */
12179 namedclass = ANYOF_PUNCT;
12184 if (memEQ(posixcc, "xdigit", 6))
12185 namedclass = ANYOF_XDIGIT;
12189 if (namedclass == OOB_NAMEDCLASS)
12191 "POSIX class [:%"UTF8f":] unknown",
12192 UTF8fARG(UTF, t - s - 1, s + 1));
12194 /* The #defines are structured so each complement is +1 to
12195 * the normal one */
12199 assert (posixcc[skip] == ':');
12200 assert (posixcc[skip+1] == ']');
12201 } else if (!SIZE_ONLY) {
12202 /* [[=foo=]] and [[.foo.]] are still future. */
12204 /* adjust RExC_parse so the warning shows after
12205 the class closes */
12206 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12208 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12211 /* Maternal grandfather:
12212 * "[:" ending in ":" but not in ":]" */
12214 vFAIL("Unmatched '[' in POSIX class");
12217 /* Grandfather lone [:, [=, [. */
12227 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12229 /* This applies some heuristics at the current parse position (which should
12230 * be at a '[') to see if what follows might be intended to be a [:posix:]
12231 * class. It returns true if it really is a posix class, of course, but it
12232 * also can return true if it thinks that what was intended was a posix
12233 * class that didn't quite make it.
12235 * It will return true for
12237 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12238 * ')' indicating the end of the (?[
12239 * [:any garbage including %^&$ punctuation:]
12241 * This is designed to be called only from S_handle_regex_sets; it could be
12242 * easily adapted to be called from the spot at the beginning of regclass()
12243 * that checks to see in a normal bracketed class if the surrounding []
12244 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12245 * change long-standing behavior, so I (khw) didn't do that */
12246 char* p = RExC_parse + 1;
12247 char first_char = *p;
12249 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12251 assert(*(p - 1) == '[');
12253 if (! POSIXCC(first_char)) {
12258 while (p < RExC_end && isWORDCHAR(*p)) p++;
12260 if (p >= RExC_end) {
12264 if (p - RExC_parse > 2 /* Got at least 1 word character */
12265 && (*p == first_char
12266 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12271 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12274 && p - RExC_parse > 2 /* [:] evaluates to colon;
12275 [::] is a bad posix class. */
12276 && first_char == *(p - 1));
12280 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12281 char * const oregcomp_parse)
12283 /* Handle the (?[...]) construct to do set operations */
12286 UV start, end; /* End points of code point ranges */
12288 char *save_end, *save_parse;
12293 const bool save_fold = FOLD;
12295 GET_RE_DEBUG_FLAGS_DECL;
12297 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12300 vFAIL("(?[...]) not valid in locale");
12302 RExC_uni_semantics = 1;
12304 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12305 * (such as EXACT). Thus we can skip most everything if just sizing. We
12306 * call regclass to handle '[]' so as to not have to reinvent its parsing
12307 * rules here (throwing away the size it computes each time). And, we exit
12308 * upon an unescaped ']' that isn't one ending a regclass. To do both
12309 * these things, we need to realize that something preceded by a backslash
12310 * is escaped, so we have to keep track of backslashes */
12312 UV depth = 0; /* how many nested (?[...]) constructs */
12314 Perl_ck_warner_d(aTHX_
12315 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12316 "The regex_sets feature is experimental" REPORT_LOCATION,
12317 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12318 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12320 while (RExC_parse < RExC_end) {
12321 SV* current = NULL;
12322 RExC_parse = regpatws(pRExC_state, RExC_parse,
12323 TRUE); /* means recognize comments */
12324 switch (*RExC_parse) {
12326 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12331 /* Skip the next byte (which could cause us to end up in
12332 * the middle of a UTF-8 character, but since none of those
12333 * are confusable with anything we currently handle in this
12334 * switch (invariants all), it's safe. We'll just hit the
12335 * default: case next time and keep on incrementing until
12336 * we find one of the invariants we do handle. */
12341 /* If this looks like it is a [:posix:] class, leave the
12342 * parse pointer at the '[' to fool regclass() into
12343 * thinking it is part of a '[[:posix:]]'. That function
12344 * will use strict checking to force a syntax error if it
12345 * doesn't work out to a legitimate class */
12346 bool is_posix_class
12347 = could_it_be_a_POSIX_class(pRExC_state);
12348 if (! is_posix_class) {
12352 /* regclass() can only return RESTART_UTF8 if multi-char
12353 folds are allowed. */
12354 if (!regclass(pRExC_state, flagp,depth+1,
12355 is_posix_class, /* parse the whole char
12356 class only if not a
12358 FALSE, /* don't allow multi-char folds */
12359 TRUE, /* silence non-portable warnings. */
12361 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12364 /* function call leaves parse pointing to the ']', except
12365 * if we faked it */
12366 if (is_posix_class) {
12370 SvREFCNT_dec(current); /* In case it returned something */
12375 if (depth--) break;
12377 if (RExC_parse < RExC_end
12378 && *RExC_parse == ')')
12380 node = reganode(pRExC_state, ANYOF, 0);
12381 RExC_size += ANYOF_SKIP;
12382 nextchar(pRExC_state);
12383 Set_Node_Length(node,
12384 RExC_parse - oregcomp_parse + 1); /* MJD */
12393 FAIL("Syntax error in (?[...])");
12396 /* Pass 2 only after this. Everything in this construct is a
12397 * metacharacter. Operands begin with either a '\' (for an escape
12398 * sequence), or a '[' for a bracketed character class. Any other
12399 * character should be an operator, or parenthesis for grouping. Both
12400 * types of operands are handled by calling regclass() to parse them. It
12401 * is called with a parameter to indicate to return the computed inversion
12402 * list. The parsing here is implemented via a stack. Each entry on the
12403 * stack is a single character representing one of the operators, or the
12404 * '('; or else a pointer to an operand inversion list. */
12406 #define IS_OPERAND(a) (! SvIOK(a))
12408 /* The stack starts empty. It is a syntax error if the first thing parsed
12409 * is a binary operator; everything else is pushed on the stack. When an
12410 * operand is parsed, the top of the stack is examined. If it is a binary
12411 * operator, the item before it should be an operand, and both are replaced
12412 * by the result of doing that operation on the new operand and the one on
12413 * the stack. Thus a sequence of binary operands is reduced to a single
12414 * one before the next one is parsed.
12416 * A unary operator may immediately follow a binary in the input, for
12419 * When an operand is parsed and the top of the stack is a unary operator,
12420 * the operation is performed, and then the stack is rechecked to see if
12421 * this new operand is part of a binary operation; if so, it is handled as
12424 * A '(' is simply pushed on the stack; it is valid only if the stack is
12425 * empty, or the top element of the stack is an operator or another '('
12426 * (for which the parenthesized expression will become an operand). By the
12427 * time the corresponding ')' is parsed everything in between should have
12428 * been parsed and evaluated to a single operand (or else is a syntax
12429 * error), and is handled as a regular operand */
12431 sv_2mortal((SV *)(stack = newAV()));
12433 while (RExC_parse < RExC_end) {
12434 I32 top_index = av_tindex(stack);
12436 SV* current = NULL;
12438 /* Skip white space */
12439 RExC_parse = regpatws(pRExC_state, RExC_parse,
12440 TRUE); /* means recognize comments */
12441 if (RExC_parse >= RExC_end) {
12442 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12444 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12451 if (av_tindex(stack) >= 0 /* This makes sure that we can
12452 safely subtract 1 from
12453 RExC_parse in the next clause.
12454 If we have something on the
12455 stack, we have parsed something
12457 && UCHARAT(RExC_parse - 1) == '('
12458 && RExC_parse < RExC_end)
12460 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12461 * This happens when we have some thing like
12463 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12465 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12467 * Here we would be handling the interpolated
12468 * '$thai_or_lao'. We handle this by a recursive call to
12469 * ourselves which returns the inversion list the
12470 * interpolated expression evaluates to. We use the flags
12471 * from the interpolated pattern. */
12472 U32 save_flags = RExC_flags;
12473 const char * const save_parse = ++RExC_parse;
12475 parse_lparen_question_flags(pRExC_state);
12477 if (RExC_parse == save_parse /* Makes sure there was at
12478 least one flag (or this
12479 embedding wasn't compiled)
12481 || RExC_parse >= RExC_end - 4
12482 || UCHARAT(RExC_parse) != ':'
12483 || UCHARAT(++RExC_parse) != '('
12484 || UCHARAT(++RExC_parse) != '?'
12485 || UCHARAT(++RExC_parse) != '[')
12488 /* In combination with the above, this moves the
12489 * pointer to the point just after the first erroneous
12490 * character (or if there are no flags, to where they
12491 * should have been) */
12492 if (RExC_parse >= RExC_end - 4) {
12493 RExC_parse = RExC_end;
12495 else if (RExC_parse != save_parse) {
12496 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12498 vFAIL("Expecting '(?flags:(?[...'");
12501 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12502 depth+1, oregcomp_parse);
12504 /* Here, 'current' contains the embedded expression's
12505 * inversion list, and RExC_parse points to the trailing
12506 * ']'; the next character should be the ')' which will be
12507 * paired with the '(' that has been put on the stack, so
12508 * the whole embedded expression reduces to '(operand)' */
12511 RExC_flags = save_flags;
12512 goto handle_operand;
12517 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12518 vFAIL("Unexpected character");
12521 /* regclass() can only return RESTART_UTF8 if multi-char
12522 folds are allowed. */
12523 if (!regclass(pRExC_state, flagp,depth+1,
12524 TRUE, /* means parse just the next thing */
12525 FALSE, /* don't allow multi-char folds */
12526 FALSE, /* don't silence non-portable warnings. */
12528 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12530 /* regclass() will return with parsing just the \ sequence,
12531 * leaving the parse pointer at the next thing to parse */
12533 goto handle_operand;
12535 case '[': /* Is a bracketed character class */
12537 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12539 if (! is_posix_class) {
12543 /* regclass() can only return RESTART_UTF8 if multi-char
12544 folds are allowed. */
12545 if(!regclass(pRExC_state, flagp,depth+1,
12546 is_posix_class, /* parse the whole char class
12547 only if not a posix class */
12548 FALSE, /* don't allow multi-char folds */
12549 FALSE, /* don't silence non-portable warnings. */
12551 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12553 /* function call leaves parse pointing to the ']', except if we
12555 if (is_posix_class) {
12559 goto handle_operand;
12568 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12569 || ! IS_OPERAND(*top_ptr))
12572 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12574 av_push(stack, newSVuv(curchar));
12578 av_push(stack, newSVuv(curchar));
12582 if (top_index >= 0) {
12583 top_ptr = av_fetch(stack, top_index, FALSE);
12585 if (IS_OPERAND(*top_ptr)) {
12587 vFAIL("Unexpected '(' with no preceding operator");
12590 av_push(stack, newSVuv(curchar));
12597 || ! (current = av_pop(stack))
12598 || ! IS_OPERAND(current)
12599 || ! (lparen = av_pop(stack))
12600 || IS_OPERAND(lparen)
12601 || SvUV(lparen) != '(')
12603 SvREFCNT_dec(current);
12605 vFAIL("Unexpected ')'");
12608 SvREFCNT_dec_NN(lparen);
12615 /* Here, we have an operand to process, in 'current' */
12617 if (top_index < 0) { /* Just push if stack is empty */
12618 av_push(stack, current);
12621 SV* top = av_pop(stack);
12623 char current_operator;
12625 if (IS_OPERAND(top)) {
12626 SvREFCNT_dec_NN(top);
12627 SvREFCNT_dec_NN(current);
12628 vFAIL("Operand with no preceding operator");
12630 current_operator = (char) SvUV(top);
12631 switch (current_operator) {
12632 case '(': /* Push the '(' back on followed by the new
12634 av_push(stack, top);
12635 av_push(stack, current);
12636 SvREFCNT_inc(top); /* Counters the '_dec' done
12637 just after the 'break', so
12638 it doesn't get wrongly freed
12643 _invlist_invert(current);
12645 /* Unlike binary operators, the top of the stack,
12646 * now that this unary one has been popped off, may
12647 * legally be an operator, and we now have operand
12650 SvREFCNT_dec_NN(top);
12651 goto handle_operand;
12654 prev = av_pop(stack);
12655 _invlist_intersection(prev,
12658 av_push(stack, current);
12663 prev = av_pop(stack);
12664 _invlist_union(prev, current, ¤t);
12665 av_push(stack, current);
12669 prev = av_pop(stack);;
12670 _invlist_subtract(prev, current, ¤t);
12671 av_push(stack, current);
12674 case '^': /* The union minus the intersection */
12680 prev = av_pop(stack);
12681 _invlist_union(prev, current, &u);
12682 _invlist_intersection(prev, current, &i);
12683 /* _invlist_subtract will overwrite current
12684 without freeing what it already contains */
12686 _invlist_subtract(u, i, ¤t);
12687 av_push(stack, current);
12688 SvREFCNT_dec_NN(i);
12689 SvREFCNT_dec_NN(u);
12690 SvREFCNT_dec_NN(element);
12695 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12697 SvREFCNT_dec_NN(top);
12698 SvREFCNT_dec(prev);
12702 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12705 if (av_tindex(stack) < 0 /* Was empty */
12706 || ((final = av_pop(stack)) == NULL)
12707 || ! IS_OPERAND(final)
12708 || av_tindex(stack) >= 0) /* More left on stack */
12710 vFAIL("Incomplete expression within '(?[ ])'");
12713 /* Here, 'final' is the resultant inversion list from evaluating the
12714 * expression. Return it if so requested */
12715 if (return_invlist) {
12716 *return_invlist = final;
12720 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12721 * expecting a string of ranges and individual code points */
12722 invlist_iterinit(final);
12723 result_string = newSVpvs("");
12724 while (invlist_iternext(final, &start, &end)) {
12725 if (start == end) {
12726 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12729 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12734 save_parse = RExC_parse;
12735 RExC_parse = SvPV(result_string, len);
12736 save_end = RExC_end;
12737 RExC_end = RExC_parse + len;
12739 /* We turn off folding around the call, as the class we have constructed
12740 * already has all folding taken into consideration, and we don't want
12741 * regclass() to add to that */
12742 RExC_flags &= ~RXf_PMf_FOLD;
12743 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12745 node = regclass(pRExC_state, flagp,depth+1,
12746 FALSE, /* means parse the whole char class */
12747 FALSE, /* don't allow multi-char folds */
12748 TRUE, /* silence non-portable warnings. The above may very
12749 well have generated non-portable code points, but
12750 they're valid on this machine */
12753 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12756 RExC_flags |= RXf_PMf_FOLD;
12758 RExC_parse = save_parse + 1;
12759 RExC_end = save_end;
12760 SvREFCNT_dec_NN(final);
12761 SvREFCNT_dec_NN(result_string);
12763 nextchar(pRExC_state);
12764 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12769 /* The names of properties whose definitions are not known at compile time are
12770 * stored in this SV, after a constant heading. So if the length has been
12771 * changed since initialization, then there is a run-time definition. */
12772 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12775 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12776 const bool stop_at_1, /* Just parse the next thing, don't
12777 look for a full character class */
12778 bool allow_multi_folds,
12779 const bool silence_non_portable, /* Don't output warnings
12782 SV** ret_invlist) /* Return an inversion list, not a node */
12784 /* parse a bracketed class specification. Most of these will produce an
12785 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12786 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12787 * under /i with multi-character folds: it will be rewritten following the
12788 * paradigm of this example, where the <multi-fold>s are characters which
12789 * fold to multiple character sequences:
12790 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12791 * gets effectively rewritten as:
12792 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12793 * reg() gets called (recursively) on the rewritten version, and this
12794 * function will return what it constructs. (Actually the <multi-fold>s
12795 * aren't physically removed from the [abcdefghi], it's just that they are
12796 * ignored in the recursion by means of a flag:
12797 * <RExC_in_multi_char_class>.)
12799 * ANYOF nodes contain a bit map for the first 256 characters, with the
12800 * corresponding bit set if that character is in the list. For characters
12801 * above 255, a range list or swash is used. There are extra bits for \w,
12802 * etc. in locale ANYOFs, as what these match is not determinable at
12805 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12806 * to be restarted. This can only happen if ret_invlist is non-NULL.
12810 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12812 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12815 IV namedclass = OOB_NAMEDCLASS;
12816 char *rangebegin = NULL;
12817 bool need_class = 0;
12819 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12820 than just initialized. */
12821 SV* properties = NULL; /* Code points that match \p{} \P{} */
12822 SV* posixes = NULL; /* Code points that match classes like [:word:],
12823 extended beyond the Latin1 range. These have to
12824 be kept separate from other code points for much
12825 of this function because their handling is
12826 different under /i, and for most classes under
12828 UV element_count = 0; /* Number of distinct elements in the class.
12829 Optimizations may be possible if this is tiny */
12830 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12831 character; used under /i */
12833 char * stop_ptr = RExC_end; /* where to stop parsing */
12834 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12836 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12838 /* Unicode properties are stored in a swash; this holds the current one
12839 * being parsed. If this swash is the only above-latin1 component of the
12840 * character class, an optimization is to pass it directly on to the
12841 * execution engine. Otherwise, it is set to NULL to indicate that there
12842 * are other things in the class that have to be dealt with at execution
12844 SV* swash = NULL; /* Code points that match \p{} \P{} */
12846 /* Set if a component of this character class is user-defined; just passed
12847 * on to the engine */
12848 bool has_user_defined_property = FALSE;
12850 /* inversion list of code points this node matches only when the target
12851 * string is in UTF-8. (Because is under /d) */
12852 SV* depends_list = NULL;
12854 /* inversion list of code points this node matches. For much of the
12855 * function, it includes only those that match regardless of the utf8ness
12856 * of the target string */
12857 SV* cp_list = NULL;
12860 /* In a range, counts how many 0-2 of the ends of it came from literals,
12861 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12862 UV literal_endpoint = 0;
12864 bool invert = FALSE; /* Is this class to be complemented */
12866 /* Is there any thing like \W or [:^digit:] that matches above the legal
12867 * Unicode range? */
12868 bool runtime_posix_matches_above_Unicode = FALSE;
12870 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12871 case we need to change the emitted regop to an EXACT. */
12872 const char * orig_parse = RExC_parse;
12873 const SSize_t orig_size = RExC_size;
12874 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12875 GET_RE_DEBUG_FLAGS_DECL;
12877 PERL_ARGS_ASSERT_REGCLASS;
12879 PERL_UNUSED_ARG(depth);
12882 DEBUG_PARSE("clas");
12884 /* Assume we are going to generate an ANYOF node. */
12885 ret = reganode(pRExC_state, ANYOF, 0);
12888 RExC_size += ANYOF_SKIP;
12889 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12892 ANYOF_FLAGS(ret) = 0;
12894 RExC_emit += ANYOF_SKIP;
12896 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12898 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12899 initial_listsv_len = SvCUR(listsv);
12900 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12904 RExC_parse = regpatws(pRExC_state, RExC_parse,
12905 FALSE /* means don't recognize comments */);
12908 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12911 allow_multi_folds = FALSE;
12914 RExC_parse = regpatws(pRExC_state, RExC_parse,
12915 FALSE /* means don't recognize comments */);
12919 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12920 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12921 const char *s = RExC_parse;
12922 const char c = *s++;
12924 while (isWORDCHAR(*s))
12926 if (*s && c == *s && s[1] == ']') {
12927 SAVEFREESV(RExC_rx_sv);
12929 "POSIX syntax [%c %c] belongs inside character classes",
12931 (void)ReREFCNT_inc(RExC_rx_sv);
12935 /* If the caller wants us to just parse a single element, accomplish this
12936 * by faking the loop ending condition */
12937 if (stop_at_1 && RExC_end > RExC_parse) {
12938 stop_ptr = RExC_parse + 1;
12941 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12942 if (UCHARAT(RExC_parse) == ']')
12943 goto charclassloop;
12947 if (RExC_parse >= stop_ptr) {
12952 RExC_parse = regpatws(pRExC_state, RExC_parse,
12953 FALSE /* means don't recognize comments */);
12956 if (UCHARAT(RExC_parse) == ']') {
12962 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12963 save_value = value;
12964 save_prevvalue = prevvalue;
12967 rangebegin = RExC_parse;
12971 value = utf8n_to_uvchr((U8*)RExC_parse,
12972 RExC_end - RExC_parse,
12973 &numlen, UTF8_ALLOW_DEFAULT);
12974 RExC_parse += numlen;
12977 value = UCHARAT(RExC_parse++);
12980 && RExC_parse < RExC_end
12981 && POSIXCC(UCHARAT(RExC_parse)))
12983 namedclass = regpposixcc(pRExC_state, value, strict);
12985 else if (value == '\\') {
12987 value = utf8n_to_uvchr((U8*)RExC_parse,
12988 RExC_end - RExC_parse,
12989 &numlen, UTF8_ALLOW_DEFAULT);
12990 RExC_parse += numlen;
12993 value = UCHARAT(RExC_parse++);
12995 /* Some compilers cannot handle switching on 64-bit integer
12996 * values, therefore value cannot be an UV. Yes, this will
12997 * be a problem later if we want switch on Unicode.
12998 * A similar issue a little bit later when switching on
12999 * namedclass. --jhi */
13001 /* If the \ is escaping white space when white space is being
13002 * skipped, it means that that white space is wanted literally, and
13003 * is already in 'value'. Otherwise, need to translate the escape
13004 * into what it signifies. */
13005 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13007 case 'w': namedclass = ANYOF_WORDCHAR; break;
13008 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13009 case 's': namedclass = ANYOF_SPACE; break;
13010 case 'S': namedclass = ANYOF_NSPACE; break;
13011 case 'd': namedclass = ANYOF_DIGIT; break;
13012 case 'D': namedclass = ANYOF_NDIGIT; break;
13013 case 'v': namedclass = ANYOF_VERTWS; break;
13014 case 'V': namedclass = ANYOF_NVERTWS; break;
13015 case 'h': namedclass = ANYOF_HORIZWS; break;
13016 case 'H': namedclass = ANYOF_NHORIZWS; break;
13017 case 'N': /* Handle \N{NAME} in class */
13019 /* We only pay attention to the first char of
13020 multichar strings being returned. I kinda wonder
13021 if this makes sense as it does change the behaviour
13022 from earlier versions, OTOH that behaviour was broken
13024 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13025 TRUE, /* => charclass */
13028 if (*flagp & RESTART_UTF8)
13029 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13039 /* We will handle any undefined properties ourselves */
13040 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13041 /* And we actually would prefer to get
13042 * the straight inversion list of the
13043 * swash, since we will be accessing it
13044 * anyway, to save a little time */
13045 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13047 if (RExC_parse >= RExC_end)
13048 vFAIL2("Empty \\%c{}", (U8)value);
13049 if (*RExC_parse == '{') {
13050 const U8 c = (U8)value;
13051 e = strchr(RExC_parse++, '}');
13053 vFAIL2("Missing right brace on \\%c{}", c);
13054 while (isSPACE(UCHARAT(RExC_parse)))
13056 if (e == RExC_parse)
13057 vFAIL2("Empty \\%c{}", c);
13058 n = e - RExC_parse;
13059 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13071 if (UCHARAT(RExC_parse) == '^') {
13074 /* toggle. (The rhs xor gets the single bit that
13075 * differs between P and p; the other xor inverts just
13077 value ^= 'P' ^ 'p';
13079 while (isSPACE(UCHARAT(RExC_parse))) {
13084 /* Try to get the definition of the property into
13085 * <invlist>. If /i is in effect, the effective property
13086 * will have its name be <__NAME_i>. The design is
13087 * discussed in commit
13088 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13089 formatted = Perl_form(aTHX_
13091 (FOLD) ? "__" : "",
13096 name = savepvn(formatted, strlen(formatted));
13098 /* Look up the property name, and get its swash and
13099 * inversion list, if the property is found */
13101 SvREFCNT_dec_NN(swash);
13103 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13106 NULL, /* No inversion list */
13109 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13111 SvREFCNT_dec_NN(swash);
13115 /* Here didn't find it. It could be a user-defined
13116 * property that will be available at run-time. If we
13117 * accept only compile-time properties, is an error;
13118 * otherwise add it to the list for run-time look up */
13120 RExC_parse = e + 1;
13122 "Property '%"UTF8f"' is unknown",
13123 UTF8fARG(UTF, n, name));
13125 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13126 (value == 'p' ? '+' : '!'),
13127 UTF8fARG(UTF, n, name));
13128 has_user_defined_property = TRUE;
13130 /* We don't know yet, so have to assume that the
13131 * property could match something in the Latin1 range,
13132 * hence something that isn't utf8. Note that this
13133 * would cause things in <depends_list> to match
13134 * inappropriately, except that any \p{}, including
13135 * this one forces Unicode semantics, which means there
13136 * is <no depends_list> */
13137 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13141 /* Here, did get the swash and its inversion list. If
13142 * the swash is from a user-defined property, then this
13143 * whole character class should be regarded as such */
13144 has_user_defined_property =
13146 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
13148 /* Invert if asking for the complement */
13149 if (value == 'P') {
13150 _invlist_union_complement_2nd(properties,
13154 /* The swash can't be used as-is, because we've
13155 * inverted things; delay removing it to here after
13156 * have copied its invlist above */
13157 SvREFCNT_dec_NN(swash);
13161 _invlist_union(properties, invlist, &properties);
13166 RExC_parse = e + 1;
13167 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13170 /* \p means they want Unicode semantics */
13171 RExC_uni_semantics = 1;
13174 case 'n': value = '\n'; break;
13175 case 'r': value = '\r'; break;
13176 case 't': value = '\t'; break;
13177 case 'f': value = '\f'; break;
13178 case 'b': value = '\b'; break;
13179 case 'e': value = ASCII_TO_NATIVE('\033');break;
13180 case 'a': value = '\a'; break;
13182 RExC_parse--; /* function expects to be pointed at the 'o' */
13184 const char* error_msg;
13185 bool valid = grok_bslash_o(&RExC_parse,
13188 SIZE_ONLY, /* warnings in pass
13191 silence_non_portable,
13197 if (PL_encoding && value < 0x100) {
13198 goto recode_encoding;
13202 RExC_parse--; /* function expects to be pointed at the 'x' */
13204 const char* error_msg;
13205 bool valid = grok_bslash_x(&RExC_parse,
13208 TRUE, /* Output warnings */
13210 silence_non_portable,
13216 if (PL_encoding && value < 0x100)
13217 goto recode_encoding;
13220 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13222 case '0': case '1': case '2': case '3': case '4':
13223 case '5': case '6': case '7':
13225 /* Take 1-3 octal digits */
13226 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13227 numlen = (strict) ? 4 : 3;
13228 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13229 RExC_parse += numlen;
13232 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13233 vFAIL("Need exactly 3 octal digits");
13235 else if (! SIZE_ONLY /* like \08, \178 */
13237 && RExC_parse < RExC_end
13238 && isDIGIT(*RExC_parse)
13239 && ckWARN(WARN_REGEXP))
13241 SAVEFREESV(RExC_rx_sv);
13242 reg_warn_non_literal_string(
13244 form_short_octal_warning(RExC_parse, numlen));
13245 (void)ReREFCNT_inc(RExC_rx_sv);
13248 if (PL_encoding && value < 0x100)
13249 goto recode_encoding;
13253 if (! RExC_override_recoding) {
13254 SV* enc = PL_encoding;
13255 value = reg_recode((const char)(U8)value, &enc);
13258 vFAIL("Invalid escape in the specified encoding");
13260 else if (SIZE_ONLY) {
13261 ckWARNreg(RExC_parse,
13262 "Invalid escape in the specified encoding");
13268 /* Allow \_ to not give an error */
13269 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13271 vFAIL2("Unrecognized escape \\%c in character class",
13275 SAVEFREESV(RExC_rx_sv);
13276 ckWARN2reg(RExC_parse,
13277 "Unrecognized escape \\%c in character class passed through",
13279 (void)ReREFCNT_inc(RExC_rx_sv);
13283 } /* End of switch on char following backslash */
13284 } /* end of handling backslash escape sequences */
13287 literal_endpoint++;
13290 /* Here, we have the current token in 'value' */
13292 /* What matches in a locale is not known until runtime. This includes
13293 * what the Posix classes (like \w, [:space:]) match. Room must be
13294 * reserved (one time per outer bracketed class) to store such classes,
13295 * either if Perl is compiled so that locale nodes always should have
13296 * this space, or if there is such posix class info to be stored. The
13297 * space will contain a bit for each named class that is to be matched
13298 * against. This isn't needed for \p{} and pseudo-classes, as they are
13299 * not affected by locale, and hence are dealt with separately */
13302 && (ANYOF_LOCALE == ANYOF_POSIXL
13303 || (namedclass > OOB_NAMEDCLASS
13304 && namedclass < ANYOF_POSIXL_MAX)))
13308 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13311 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13313 ANYOF_POSIXL_ZERO(ret);
13314 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13317 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13320 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13321 * literal, as is the character that began the false range, i.e.
13322 * the 'a' in the examples */
13325 const int w = (RExC_parse >= rangebegin)
13326 ? RExC_parse - rangebegin
13330 "False [] range \"%"UTF8f"\"",
13331 UTF8fARG(UTF, w, rangebegin));
13334 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13335 ckWARN2reg(RExC_parse,
13336 "False [] range \"%"UTF8f"\"",
13337 UTF8fARG(UTF, w, rangebegin));
13338 (void)ReREFCNT_inc(RExC_rx_sv);
13339 cp_list = add_cp_to_invlist(cp_list, '-');
13340 cp_list = add_cp_to_invlist(cp_list, prevvalue);
13344 range = 0; /* this was not a true range */
13345 element_count += 2; /* So counts for three values */
13348 classnum = namedclass_to_classnum(namedclass);
13350 if (LOC && namedclass < ANYOF_POSIXL_MAX
13351 #ifndef HAS_ISASCII
13352 && classnum != _CC_ASCII
13354 #ifndef HAS_ISBLANK
13355 && classnum != _CC_BLANK
13358 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13359 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13363 posixl_matches_all = TRUE;
13366 ANYOF_POSIXL_SET(ret, namedclass);
13368 /* XXX After have made all the posix classes known at compile time
13369 * we can move the LOC handling below to above */
13372 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13373 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13375 /* Here, should be \h, \H, \v, or \V. Neither /d nor
13376 * /l make a difference in what these match. There
13377 * would be problems if these characters had folds
13378 * other than themselves, as cp_list is subject to
13380 if (classnum != _CC_VERTSPACE) {
13381 assert( namedclass == ANYOF_HORIZWS
13382 || namedclass == ANYOF_NHORIZWS);
13384 /* It turns out that \h is just a synonym for
13386 classnum = _CC_BLANK;
13389 _invlist_union_maybe_complement_2nd(
13391 PL_XPosix_ptrs[classnum],
13392 cBOOL(namedclass % 2), /* Complement if odd
13393 (NHORIZWS, NVERTWS)
13398 else if (classnum == _CC_ASCII) {
13401 ANYOF_POSIXL_SET(ret, namedclass);
13404 #endif /* Not isascii(); just use the hard-coded definition for it */
13406 _invlist_union_maybe_complement_2nd(
13408 PL_Posix_ptrs[_CC_ASCII],
13409 cBOOL(namedclass % 2), /* Complement if odd
13413 /* The code points 128-255 added above will be
13414 * subtracted out below under /d, so the flag needs to
13416 if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) {
13417 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13421 else { /* Garden variety class */
13423 /* The ascii range inversion list */
13424 SV* ascii_source = PL_Posix_ptrs[classnum];
13426 /* The full Latin1 range inversion list */
13427 SV* l1_source = PL_L1Posix_ptrs[classnum];
13429 /* This code is structured into two major clauses. The
13430 * first is for classes whose complete definitions may not
13431 * already be known. If not, the Latin1 definition
13432 * (guaranteed to already known) is used plus code is
13433 * generated to load the rest at run-time (only if needed).
13434 * If the complete definition is known, it drops down to
13435 * the second clause, where the complete definition is
13438 if (classnum < _FIRST_NON_SWASH_CC) {
13440 /* Here, the class has a swash, which may or not
13441 * already be loaded */
13443 /* The name of the property to use to match the full
13444 * eXtended Unicode range swash for this character
13446 const char *Xname = swash_property_names[classnum];
13448 /* If returning the inversion list, we can't defer
13449 * getting this until runtime */
13450 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
13451 PL_utf8_swash_ptrs[classnum] =
13452 _core_swash_init("utf8", Xname, &PL_sv_undef,
13455 NULL, /* No inversion list */
13456 NULL /* No flags */
13458 assert(PL_utf8_swash_ptrs[classnum]);
13460 if ( ! PL_utf8_swash_ptrs[classnum]) {
13461 if (namedclass % 2 == 0) { /* A non-complemented
13463 /* If not /a matching, there are code points we
13464 * don't know at compile time. Arrange for the
13465 * unknown matches to be loaded at run-time, if
13467 if (! AT_LEAST_ASCII_RESTRICTED) {
13468 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13471 if (LOC) { /* Under locale, set run-time
13473 ANYOF_POSIXL_SET(ret, namedclass);
13476 /* Add the current class's code points to
13477 * the running total */
13478 _invlist_union(posixes,
13479 (AT_LEAST_ASCII_RESTRICTED)
13485 else { /* A complemented class */
13486 if (AT_LEAST_ASCII_RESTRICTED) {
13487 /* Under /a should match everything above
13488 * ASCII, plus the complement of the set's
13490 _invlist_union_complement_2nd(posixes,
13495 /* Arrange for the unknown matches to be
13496 * loaded at run-time, if needed */
13497 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13499 runtime_posix_matches_above_Unicode = TRUE;
13501 ANYOF_POSIXL_SET(ret, namedclass);
13505 /* We want to match everything in
13506 * Latin1, except those things that
13507 * l1_source matches */
13508 SV* scratch_list = NULL;
13509 _invlist_subtract(PL_Latin1, l1_source,
13512 /* Add the list from this class to the
13515 posixes = scratch_list;
13518 _invlist_union(posixes,
13521 SvREFCNT_dec_NN(scratch_list);
13523 if (DEPENDS_SEMANTICS) {
13525 |= ANYOF_NON_UTF8_LATIN1_ALL;
13530 goto namedclass_done;
13533 /* Here, there is a swash loaded for the class. If no
13534 * inversion list for it yet, get it */
13535 if (! PL_XPosix_ptrs[classnum]) {
13536 PL_XPosix_ptrs[classnum]
13537 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13541 /* Here there is an inversion list already loaded for the
13544 if (namedclass % 2 == 0) { /* A non-complemented class,
13545 like ANYOF_PUNCT */
13547 /* For non-locale, just add it to any existing list
13549 _invlist_union(posixes,
13550 (AT_LEAST_ASCII_RESTRICTED)
13552 : PL_XPosix_ptrs[classnum],
13555 else { /* Locale */
13556 SV* scratch_list = NULL;
13558 /* For above Latin1 code points, we use the full
13560 _invlist_intersection(PL_AboveLatin1,
13561 PL_XPosix_ptrs[classnum],
13563 /* And set the output to it, adding instead if
13564 * there already is an output. Checking if
13565 * 'posixes' is NULL first saves an extra clone.
13566 * Its reference count will be decremented at the
13567 * next union, etc, or if this is the only
13568 * instance, at the end of the routine */
13570 posixes = scratch_list;
13573 _invlist_union(posixes, scratch_list, &posixes);
13574 SvREFCNT_dec_NN(scratch_list);
13577 #ifndef HAS_ISBLANK
13578 if (namedclass != ANYOF_BLANK) {
13580 /* Set this class in the node for runtime
13582 ANYOF_POSIXL_SET(ret, namedclass);
13583 #ifndef HAS_ISBLANK
13586 /* No isblank(), use the hard-coded ASCII-range
13587 * blanks, adding them to the running total. */
13589 _invlist_union(posixes, ascii_source, &posixes);
13594 else { /* A complemented class, like ANYOF_NPUNCT */
13596 _invlist_union_complement_2nd(
13598 (AT_LEAST_ASCII_RESTRICTED)
13600 : PL_XPosix_ptrs[classnum],
13602 /* Under /d, everything in the upper half of the
13603 * Latin1 range matches this complement */
13604 if (DEPENDS_SEMANTICS) {
13605 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13608 else { /* Locale */
13609 SV* scratch_list = NULL;
13610 _invlist_subtract(PL_AboveLatin1,
13611 PL_XPosix_ptrs[classnum],
13614 posixes = scratch_list;
13617 _invlist_union(posixes, scratch_list, &posixes);
13618 SvREFCNT_dec_NN(scratch_list);
13620 #ifndef HAS_ISBLANK
13621 if (namedclass != ANYOF_NBLANK) {
13623 ANYOF_POSIXL_SET(ret, namedclass);
13624 #ifndef HAS_ISBLANK
13627 /* Get the list of all code points in Latin1
13628 * that are not ASCII blanks, and add them to
13629 * the running total */
13630 _invlist_subtract(PL_Latin1, ascii_source,
13632 _invlist_union(posixes, scratch_list, &posixes);
13633 SvREFCNT_dec_NN(scratch_list);
13640 continue; /* Go get next character */
13642 } /* end of namedclass \blah */
13644 /* Here, we have a single value. If 'range' is set, it is the ending
13645 * of a range--check its validity. Later, we will handle each
13646 * individual code point in the range. If 'range' isn't set, this
13647 * could be the beginning of a range, so check for that by looking
13648 * ahead to see if the next real character to be processed is the range
13649 * indicator--the minus sign */
13652 RExC_parse = regpatws(pRExC_state, RExC_parse,
13653 FALSE /* means don't recognize comments */);
13657 if (prevvalue > value) /* b-a */ {
13658 const int w = RExC_parse - rangebegin;
13660 "Invalid [] range \"%"UTF8f"\"",
13661 UTF8fARG(UTF, w, rangebegin));
13662 range = 0; /* not a valid range */
13666 prevvalue = value; /* save the beginning of the potential range */
13667 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13668 && *RExC_parse == '-')
13670 char* next_char_ptr = RExC_parse + 1;
13671 if (skip_white) { /* Get the next real char after the '-' */
13672 next_char_ptr = regpatws(pRExC_state,
13674 FALSE); /* means don't recognize
13678 /* If the '-' is at the end of the class (just before the ']',
13679 * it is a literal minus; otherwise it is a range */
13680 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13681 RExC_parse = next_char_ptr;
13683 /* a bad range like \w-, [:word:]- ? */
13684 if (namedclass > OOB_NAMEDCLASS) {
13685 if (strict || ckWARN(WARN_REGEXP)) {
13687 RExC_parse >= rangebegin ?
13688 RExC_parse - rangebegin : 0;
13690 vFAIL4("False [] range \"%*.*s\"",
13695 "False [] range \"%*.*s\"",
13700 cp_list = add_cp_to_invlist(cp_list, '-');
13704 range = 1; /* yeah, it's a range! */
13705 continue; /* but do it the next time */
13710 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13713 /* non-Latin1 code point implies unicode semantics. Must be set in
13714 * pass1 so is there for the whole of pass 2 */
13716 RExC_uni_semantics = 1;
13719 /* Ready to process either the single value, or the completed range.
13720 * For single-valued non-inverted ranges, we consider the possibility
13721 * of multi-char folds. (We made a conscious decision to not do this
13722 * for the other cases because it can often lead to non-intuitive
13723 * results. For example, you have the peculiar case that:
13724 * "s s" =~ /^[^\xDF]+$/i => Y
13725 * "ss" =~ /^[^\xDF]+$/i => N
13727 * See [perl #89750] */
13728 if (FOLD && allow_multi_folds && value == prevvalue) {
13729 if (value == LATIN_SMALL_LETTER_SHARP_S
13730 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13733 /* Here <value> is indeed a multi-char fold. Get what it is */
13735 U8 foldbuf[UTF8_MAXBYTES_CASE];
13738 UV folded = _to_uni_fold_flags(
13743 | ((LOC) ? FOLD_FLAGS_LOCALE
13744 : (ASCII_FOLD_RESTRICTED)
13745 ? FOLD_FLAGS_NOMIX_ASCII
13749 /* Here, <folded> should be the first character of the
13750 * multi-char fold of <value>, with <foldbuf> containing the
13751 * whole thing. But, if this fold is not allowed (because of
13752 * the flags), <fold> will be the same as <value>, and should
13753 * be processed like any other character, so skip the special
13755 if (folded != value) {
13757 /* Skip if we are recursed, currently parsing the class
13758 * again. Otherwise add this character to the list of
13759 * multi-char folds. */
13760 if (! RExC_in_multi_char_class) {
13761 AV** this_array_ptr;
13763 STRLEN cp_count = utf8_length(foldbuf,
13764 foldbuf + foldlen);
13765 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13767 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13770 if (! multi_char_matches) {
13771 multi_char_matches = newAV();
13774 /* <multi_char_matches> is actually an array of arrays.
13775 * There will be one or two top-level elements: [2],
13776 * and/or [3]. The [2] element is an array, each
13777 * element thereof is a character which folds to TWO
13778 * characters; [3] is for folds to THREE characters.
13779 * (Unicode guarantees a maximum of 3 characters in any
13780 * fold.) When we rewrite the character class below,
13781 * we will do so such that the longest folds are
13782 * written first, so that it prefers the longest
13783 * matching strings first. This is done even if it
13784 * turns out that any quantifier is non-greedy, out of
13785 * programmer laziness. Tom Christiansen has agreed
13786 * that this is ok. This makes the test for the
13787 * ligature 'ffi' come before the test for 'ff' */
13788 if (av_exists(multi_char_matches, cp_count)) {
13789 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13791 this_array = *this_array_ptr;
13794 this_array = newAV();
13795 av_store(multi_char_matches, cp_count,
13798 av_push(this_array, multi_fold);
13801 /* This element should not be processed further in this
13804 value = save_value;
13805 prevvalue = save_prevvalue;
13811 /* Deal with this element of the class */
13814 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13816 SV* this_range = _new_invlist(1);
13817 _append_range_to_invlist(this_range, prevvalue, value);
13819 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13820 * If this range was specified using something like 'i-j', we want
13821 * to include only the 'i' and the 'j', and not anything in
13822 * between, so exclude non-ASCII, non-alphabetics from it.
13823 * However, if the range was specified with something like
13824 * [\x89-\x91] or [\x89-j], all code points within it should be
13825 * included. literal_endpoint==2 means both ends of the range used
13826 * a literal character, not \x{foo} */
13827 if (literal_endpoint == 2
13828 && ((prevvalue >= 'a' && value <= 'z')
13829 || (prevvalue >= 'A' && value <= 'Z')))
13831 _invlist_intersection(this_range, PL_ASCII,
13833 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13836 _invlist_union(cp_list, this_range, &cp_list);
13837 literal_endpoint = 0;
13841 range = 0; /* this range (if it was one) is done now */
13842 } /* End of loop through all the text within the brackets */
13844 /* If anything in the class expands to more than one character, we have to
13845 * deal with them by building up a substitute parse string, and recursively
13846 * calling reg() on it, instead of proceeding */
13847 if (multi_char_matches) {
13848 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13851 char *save_end = RExC_end;
13852 char *save_parse = RExC_parse;
13853 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13858 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13859 because too confusing */
13861 sv_catpv(substitute_parse, "(?:");
13865 /* Look at the longest folds first */
13866 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13868 if (av_exists(multi_char_matches, cp_count)) {
13869 AV** this_array_ptr;
13872 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13874 while ((this_sequence = av_pop(*this_array_ptr)) !=
13877 if (! first_time) {
13878 sv_catpv(substitute_parse, "|");
13880 first_time = FALSE;
13882 sv_catpv(substitute_parse, SvPVX(this_sequence));
13887 /* If the character class contains anything else besides these
13888 * multi-character folds, have to include it in recursive parsing */
13889 if (element_count) {
13890 sv_catpv(substitute_parse, "|[");
13891 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13892 sv_catpv(substitute_parse, "]");
13895 sv_catpv(substitute_parse, ")");
13898 /* This is a way to get the parse to skip forward a whole named
13899 * sequence instead of matching the 2nd character when it fails the
13901 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13905 RExC_parse = SvPV(substitute_parse, len);
13906 RExC_end = RExC_parse + len;
13907 RExC_in_multi_char_class = 1;
13908 RExC_emit = (regnode *)orig_emit;
13910 ret = reg(pRExC_state, 1, ®_flags, depth+1);
13912 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13914 RExC_parse = save_parse;
13915 RExC_end = save_end;
13916 RExC_in_multi_char_class = 0;
13917 SvREFCNT_dec_NN(multi_char_matches);
13921 /* If the character class contains only a single element, it may be
13922 * optimizable into another node type which is smaller and runs faster.
13923 * Check if this is the case for this class */
13924 if ((element_count == 1 && ! ret_invlist)
13925 || UNLIKELY(posixl_matches_all))
13930 if (UNLIKELY(posixl_matches_all)) {
13933 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13934 \w or [:digit:] or \p{foo}
13937 /* All named classes are mapped into POSIXish nodes, with its FLAG
13938 * argument giving which class it is */
13939 switch ((I32)namedclass) {
13940 case ANYOF_UNIPROP:
13943 /* These don't depend on the charset modifiers. They always
13944 * match under /u rules */
13945 case ANYOF_NHORIZWS:
13946 case ANYOF_HORIZWS:
13947 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13950 case ANYOF_NVERTWS:
13955 /* The actual POSIXish node for all the rest depends on the
13956 * charset modifier. The ones in the first set depend only on
13957 * ASCII or, if available on this platform, locale */
13961 op = (LOC) ? POSIXL : POSIXA;
13972 /* under /a could be alpha */
13974 if (ASCII_RESTRICTED) {
13975 namedclass = ANYOF_ALPHA + (namedclass % 2);
13983 /* The rest have more possibilities depending on the charset.
13984 * We take advantage of the enum ordering of the charset
13985 * modifiers to get the exact node type, */
13987 op = POSIXD + get_regex_charset(RExC_flags);
13988 if (op > POSIXA) { /* /aa is same as /a */
13991 #ifndef HAS_ISBLANK
13993 && (namedclass == ANYOF_BLANK
13994 || namedclass == ANYOF_NBLANK))
14001 /* The odd numbered ones are the complements of the
14002 * next-lower even number one */
14003 if (namedclass % 2 == 1) {
14007 arg = namedclass_to_classnum(namedclass);
14011 else if (value == prevvalue) {
14013 /* Here, the class consists of just a single code point */
14016 if (! LOC && value == '\n') {
14017 op = REG_ANY; /* Optimize [^\n] */
14018 *flagp |= HASWIDTH|SIMPLE;
14022 else if (value < 256 || UTF) {
14024 /* Optimize a single value into an EXACTish node, but not if it
14025 * would require converting the pattern to UTF-8. */
14026 op = compute_EXACTish(pRExC_state);
14028 } /* Otherwise is a range */
14029 else if (! LOC) { /* locale could vary these */
14030 if (prevvalue == '0') {
14031 if (value == '9') {
14038 /* Here, we have changed <op> away from its initial value iff we found
14039 * an optimization */
14042 /* Throw away this ANYOF regnode, and emit the calculated one,
14043 * which should correspond to the beginning, not current, state of
14045 const char * cur_parse = RExC_parse;
14046 RExC_parse = (char *)orig_parse;
14050 /* To get locale nodes to not use the full ANYOF size would
14051 * require moving the code above that writes the portions
14052 * of it that aren't in other nodes to after this point.
14053 * e.g. ANYOF_POSIXL_SET */
14054 RExC_size = orig_size;
14058 RExC_emit = (regnode *)orig_emit;
14059 if (PL_regkind[op] == POSIXD) {
14061 op += NPOSIXD - POSIXD;
14066 ret = reg_node(pRExC_state, op);
14068 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14072 *flagp |= HASWIDTH|SIMPLE;
14074 else if (PL_regkind[op] == EXACT) {
14075 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14078 RExC_parse = (char *) cur_parse;
14080 SvREFCNT_dec(posixes);
14081 SvREFCNT_dec(cp_list);
14088 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14090 /* If folding, we calculate all characters that could fold to or from the
14091 * ones already on the list */
14092 if (FOLD && cp_list) {
14093 UV start, end; /* End points of code point ranges */
14095 SV* fold_intersection = NULL;
14097 /* If the highest code point is within Latin1, we can use the
14098 * compiled-in Alphas list, and not have to go out to disk. This
14099 * yields two false positives, the masculine and feminine ordinal
14100 * indicators, which are weeded out below using the
14101 * IS_IN_SOME_FOLD_L1() macro */
14102 if (invlist_highest(cp_list) < 256) {
14103 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
14104 &fold_intersection);
14108 /* Here, there are non-Latin1 code points, so we will have to go
14109 * fetch the list of all the characters that participate in folds
14111 if (! PL_utf8_foldable) {
14112 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14113 &PL_sv_undef, 1, 0);
14114 PL_utf8_foldable = _get_swash_invlist(swash);
14115 SvREFCNT_dec_NN(swash);
14118 /* This is a hash that for a particular fold gives all characters
14119 * that are involved in it */
14120 if (! PL_utf8_foldclosures) {
14122 /* If we were unable to find any folds, then we likely won't be
14123 * able to find the closures. So just create an empty list.
14124 * Folding will effectively be restricted to the non-Unicode
14125 * rules hard-coded into Perl. (This case happens legitimately
14126 * during compilation of Perl itself before the Unicode tables
14127 * are generated) */
14128 if (_invlist_len(PL_utf8_foldable) == 0) {
14129 PL_utf8_foldclosures = newHV();
14132 /* If the folds haven't been read in, call a fold function
14134 if (! PL_utf8_tofold) {
14135 U8 dummy[UTF8_MAXBYTES_CASE+1];
14137 /* This string is just a short named one above \xff */
14138 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14139 assert(PL_utf8_tofold); /* Verify that worked */
14141 PL_utf8_foldclosures =
14142 _swash_inversion_hash(PL_utf8_tofold);
14146 /* Only the characters in this class that participate in folds need
14147 * be checked. Get the intersection of this class and all the
14148 * possible characters that are foldable. This can quickly narrow
14149 * down a large class */
14150 _invlist_intersection(PL_utf8_foldable, cp_list,
14151 &fold_intersection);
14154 /* Now look at the foldable characters in this class individually */
14155 invlist_iterinit(fold_intersection);
14156 while (invlist_iternext(fold_intersection, &start, &end)) {
14159 /* Locale folding for Latin1 characters is deferred until runtime */
14160 if (LOC && start < 256) {
14164 /* Look at every character in the range */
14165 for (j = start; j <= end; j++) {
14167 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14173 /* We have the latin1 folding rules hard-coded here so that
14174 * an innocent-looking character class, like /[ks]/i won't
14175 * have to go out to disk to find the possible matches.
14176 * XXX It would be better to generate these via regen, in
14177 * case a new version of the Unicode standard adds new
14178 * mappings, though that is not really likely, and may be
14179 * caught by the default: case of the switch below. */
14181 if (IS_IN_SOME_FOLD_L1(j)) {
14183 /* ASCII is always matched; non-ASCII is matched only
14184 * under Unicode rules */
14185 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14187 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14191 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14195 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14196 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14198 /* Certain Latin1 characters have matches outside
14199 * Latin1. To get here, <j> is one of those
14200 * characters. None of these matches is valid for
14201 * ASCII characters under /aa, which is why the 'if'
14202 * just above excludes those. These matches only
14203 * happen when the target string is utf8. The code
14204 * below adds the single fold closures for <j> to the
14205 * inversion list. */
14210 add_cp_to_invlist(cp_list, KELVIN_SIGN);
14214 cp_list = add_cp_to_invlist(cp_list,
14215 LATIN_SMALL_LETTER_LONG_S);
14218 cp_list = add_cp_to_invlist(cp_list,
14219 GREEK_CAPITAL_LETTER_MU);
14220 cp_list = add_cp_to_invlist(cp_list,
14221 GREEK_SMALL_LETTER_MU);
14223 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14224 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14226 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14228 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14229 cp_list = add_cp_to_invlist(cp_list,
14230 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14232 case LATIN_SMALL_LETTER_SHARP_S:
14233 cp_list = add_cp_to_invlist(cp_list,
14234 LATIN_CAPITAL_LETTER_SHARP_S);
14236 case 'F': case 'f':
14237 case 'I': case 'i':
14238 case 'L': case 'l':
14239 case 'T': case 't':
14240 case 'A': case 'a':
14241 case 'H': case 'h':
14242 case 'J': case 'j':
14243 case 'N': case 'n':
14244 case 'W': case 'w':
14245 case 'Y': case 'y':
14246 /* These all are targets of multi-character
14247 * folds from code points that require UTF8 to
14248 * express, so they can't match unless the
14249 * target string is in UTF-8, so no action here
14250 * is necessary, as regexec.c properly handles
14251 * the general case for UTF-8 matching and
14252 * multi-char folds */
14255 /* Use deprecated warning to increase the
14256 * chances of this being output */
14257 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14264 /* Here is an above Latin1 character. We don't have the rules
14265 * hard-coded for it. First, get its fold. This is the simple
14266 * fold, as the multi-character folds have been handled earlier
14267 * and separated out */
14268 _to_uni_fold_flags(j, foldbuf, &foldlen,
14270 ? FOLD_FLAGS_LOCALE
14271 : (ASCII_FOLD_RESTRICTED)
14272 ? FOLD_FLAGS_NOMIX_ASCII
14275 /* Single character fold of above Latin1. Add everything in
14276 * its fold closure to the list that this node should match.
14277 * The fold closures data structure is a hash with the keys
14278 * being the UTF-8 of every character that is folded to, like
14279 * 'k', and the values each an array of all code points that
14280 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14281 * Multi-character folds are not included */
14282 if ((listp = hv_fetch(PL_utf8_foldclosures,
14283 (char *) foldbuf, foldlen, FALSE)))
14285 AV* list = (AV*) *listp;
14287 for (k = 0; k <= av_len(list); k++) {
14288 SV** c_p = av_fetch(list, k, FALSE);
14291 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14295 /* /aa doesn't allow folds between ASCII and non-; /l
14296 * doesn't allow them between above and below 256 */
14297 if ((ASCII_FOLD_RESTRICTED
14298 && (isASCII(c) != isASCII(j)))
14299 || (LOC && c < 256)) {
14303 /* Folds involving non-ascii Latin1 characters
14304 * under /d are added to a separate list */
14305 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14307 cp_list = add_cp_to_invlist(cp_list, c);
14310 depends_list = add_cp_to_invlist(depends_list, c);
14316 SvREFCNT_dec_NN(fold_intersection);
14319 /* And combine the result (if any) with any inversion list from posix
14320 * classes. The lists are kept separate up to now because we don't want to
14321 * fold the classes (folding of those is automatically handled by the swash
14322 * fetching code) */
14324 if (! DEPENDS_SEMANTICS) {
14326 _invlist_union(cp_list, posixes, &cp_list);
14327 SvREFCNT_dec_NN(posixes);
14334 /* Under /d, we put into a separate list the Latin1 things that
14335 * match only when the target string is utf8 */
14336 SV* nonascii_but_latin1_properties = NULL;
14337 _invlist_intersection(posixes, PL_UpperLatin1,
14338 &nonascii_but_latin1_properties);
14339 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14342 _invlist_union(cp_list, posixes, &cp_list);
14343 SvREFCNT_dec_NN(posixes);
14349 if (depends_list) {
14350 _invlist_union(depends_list, nonascii_but_latin1_properties,
14352 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14355 depends_list = nonascii_but_latin1_properties;
14360 /* And combine the result (if any) with any inversion list from properties.
14361 * The lists are kept separate up to now so that we can distinguish the two
14362 * in regards to matching above-Unicode. A run-time warning is generated
14363 * if a Unicode property is matched against a non-Unicode code point. But,
14364 * we allow user-defined properties to match anything, without any warning,
14365 * and we also suppress the warning if there is a portion of the character
14366 * class that isn't a Unicode property, and which matches above Unicode, \W
14367 * or [\x{110000}] for example.
14368 * (Note that in this case, unlike the Posix one above, there is no
14369 * <depends_list>, because having a Unicode property forces Unicode
14372 bool warn_super = ! has_user_defined_property;
14375 /* If it matters to the final outcome, see if a non-property
14376 * component of the class matches above Unicode. If so, the
14377 * warning gets suppressed. This is true even if just a single
14378 * such code point is specified, as though not strictly correct if
14379 * another such code point is matched against, the fact that they
14380 * are using above-Unicode code points indicates they should know
14381 * the issues involved */
14383 bool non_prop_matches_above_Unicode =
14384 runtime_posix_matches_above_Unicode
14385 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14387 non_prop_matches_above_Unicode =
14388 ! non_prop_matches_above_Unicode;
14390 warn_super = ! non_prop_matches_above_Unicode;
14393 _invlist_union(properties, cp_list, &cp_list);
14394 SvREFCNT_dec_NN(properties);
14397 cp_list = properties;
14401 OP(ret) = ANYOF_WARN_SUPER;
14405 /* Here, we have calculated what code points should be in the character
14408 * Now we can see about various optimizations. Fold calculation (which we
14409 * did above) needs to take place before inversion. Otherwise /[^k]/i
14410 * would invert to include K, which under /i would match k, which it
14411 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14412 * folded until runtime */
14414 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14415 * at compile time. Besides not inverting folded locale now, we can't
14416 * invert if there are things such as \w, which aren't known until runtime
14419 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14421 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14423 _invlist_invert(cp_list);
14425 /* Any swash can't be used as-is, because we've inverted things */
14427 SvREFCNT_dec_NN(swash);
14431 /* Clear the invert flag since have just done it here */
14436 *ret_invlist = cp_list;
14437 SvREFCNT_dec(swash);
14439 /* Discard the generated node */
14441 RExC_size = orig_size;
14444 RExC_emit = orig_emit;
14449 /* If we didn't do folding, it's because some information isn't available
14450 * until runtime; set the run-time fold flag for these. (We don't have to
14451 * worry about properties folding, as that is taken care of by the swash
14455 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14458 /* Some character classes are equivalent to other nodes. Such nodes take
14459 * up less room and generally fewer operations to execute than ANYOF nodes.
14460 * Above, we checked for and optimized into some such equivalents for
14461 * certain common classes that are easy to test. Getting to this point in
14462 * the code means that the class didn't get optimized there. Since this
14463 * code is only executed in Pass 2, it is too late to save space--it has
14464 * been allocated in Pass 1, and currently isn't given back. But turning
14465 * things into an EXACTish node can allow the optimizer to join it to any
14466 * adjacent such nodes. And if the class is equivalent to things like /./,
14467 * expensive run-time swashes can be avoided. Now that we have more
14468 * complete information, we can find things necessarily missed by the
14469 * earlier code. I (khw) am not sure how much to look for here. It would
14470 * be easy, but perhaps too slow, to check any candidates against all the
14471 * node types they could possibly match using _invlistEQ(). */
14476 && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14477 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14480 U8 op = END; /* The optimzation node-type */
14481 const char * cur_parse= RExC_parse;
14483 invlist_iterinit(cp_list);
14484 if (! invlist_iternext(cp_list, &start, &end)) {
14486 /* Here, the list is empty. This happens, for example, when a
14487 * Unicode property is the only thing in the character class, and
14488 * it doesn't match anything. (perluniprops.pod notes such
14491 *flagp |= HASWIDTH|SIMPLE;
14493 else if (start == end) { /* The range is a single code point */
14494 if (! invlist_iternext(cp_list, &start, &end)
14496 /* Don't do this optimization if it would require changing
14497 * the pattern to UTF-8 */
14498 && (start < 256 || UTF))
14500 /* Here, the list contains a single code point. Can optimize
14501 * into an EXACT node */
14510 /* A locale node under folding with one code point can be
14511 * an EXACTFL, as its fold won't be calculated until
14517 /* Here, we are generally folding, but there is only one
14518 * code point to match. If we have to, we use an EXACT
14519 * node, but it would be better for joining with adjacent
14520 * nodes in the optimization pass if we used the same
14521 * EXACTFish node that any such are likely to be. We can
14522 * do this iff the code point doesn't participate in any
14523 * folds. For example, an EXACTF of a colon is the same as
14524 * an EXACT one, since nothing folds to or from a colon. */
14526 if (IS_IN_SOME_FOLD_L1(value)) {
14531 if (! PL_utf8_foldable) {
14532 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14533 &PL_sv_undef, 1, 0);
14534 PL_utf8_foldable = _get_swash_invlist(swash);
14535 SvREFCNT_dec_NN(swash);
14537 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14542 /* If we haven't found the node type, above, it means we
14543 * can use the prevailing one */
14545 op = compute_EXACTish(pRExC_state);
14550 else if (start == 0) {
14551 if (end == UV_MAX) {
14553 *flagp |= HASWIDTH|SIMPLE;
14556 else if (end == '\n' - 1
14557 && invlist_iternext(cp_list, &start, &end)
14558 && start == '\n' + 1 && end == UV_MAX)
14561 *flagp |= HASWIDTH|SIMPLE;
14565 invlist_iterfinish(cp_list);
14568 RExC_parse = (char *)orig_parse;
14569 RExC_emit = (regnode *)orig_emit;
14571 ret = reg_node(pRExC_state, op);
14573 RExC_parse = (char *)cur_parse;
14575 if (PL_regkind[op] == EXACT) {
14576 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14579 SvREFCNT_dec_NN(cp_list);
14584 /* Here, <cp_list> contains all the code points we can determine at
14585 * compile time that match under all conditions. Go through it, and
14586 * for things that belong in the bitmap, put them there, and delete from
14587 * <cp_list>. While we are at it, see if everything above 255 is in the
14588 * list, and if so, set a flag to speed up execution */
14590 populate_ANYOF_from_invlist(ret, &cp_list);
14593 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14596 /* Here, the bitmap has been populated with all the Latin1 code points that
14597 * always match. Can now add to the overall list those that match only
14598 * when the target string is UTF-8 (<depends_list>). */
14599 if (depends_list) {
14601 _invlist_union(cp_list, depends_list, &cp_list);
14602 SvREFCNT_dec_NN(depends_list);
14605 cp_list = depends_list;
14609 /* If there is a swash and more than one element, we can't use the swash in
14610 * the optimization below. */
14611 if (swash && element_count > 1) {
14612 SvREFCNT_dec_NN(swash);
14616 set_ANYOF_arg(pRExC_state, ret, cp_list,
14617 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14619 swash, has_user_defined_property);
14621 *flagp |= HASWIDTH|SIMPLE;
14625 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14628 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14629 regnode* const node,
14631 SV* const runtime_defns,
14633 const bool has_user_defined_property)
14635 /* Sets the arg field of an ANYOF-type node 'node', using information about
14636 * the node passed-in. If there is nothing outside the node's bitmap, the
14637 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14638 * the count returned by add_data(), having allocated and stored an array,
14639 * av, that that count references, as follows:
14640 * av[0] stores the character class description in its textual form.
14641 * This is used later (regexec.c:Perl_regclass_swash()) to
14642 * initialize the appropriate swash, and is also useful for dumping
14643 * the regnode. This is set to &PL_sv_undef if the textual
14644 * description is not needed at run-time (as happens if the other
14645 * elements completely define the class)
14646 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14647 * computed from av[0]. But if no further computation need be done,
14648 * the swash is stored here now (and av[0] is &PL_sv_undef).
14649 * av[2] stores the cp_list inversion list for use in addition or instead
14650 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14651 * (Otherwise everything needed is already in av[0] and av[1])
14652 * av[3] is set if any component of the class is from a user-defined
14653 * property; used only if av[2] exists */
14657 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14659 if (! cp_list && ! runtime_defns) {
14660 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14663 AV * const av = newAV();
14666 av_store(av, 0, (runtime_defns)
14667 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14669 av_store(av, 1, swash);
14670 SvREFCNT_dec_NN(cp_list);
14673 av_store(av, 1, &PL_sv_undef);
14675 av_store(av, 2, cp_list);
14676 av_store(av, 3, newSVuv(has_user_defined_property));
14680 rv = newRV_noinc(MUTABLE_SV(av));
14681 n = add_data(pRExC_state, STR_WITH_LEN("s"));
14682 RExC_rxi->data->data[n] = (void*)rv;
14688 /* reg_skipcomment()
14690 Absorbs an /x style # comments from the input stream.
14691 Returns true if there is more text remaining in the stream.
14692 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14693 terminates the pattern without including a newline.
14695 Note its the callers responsibility to ensure that we are
14696 actually in /x mode
14701 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14705 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14707 while (RExC_parse < RExC_end)
14708 if (*RExC_parse++ == '\n') {
14713 /* we ran off the end of the pattern without ending
14714 the comment, so we have to add an \n when wrapping */
14715 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14723 Advances the parse position, and optionally absorbs
14724 "whitespace" from the inputstream.
14726 Without /x "whitespace" means (?#...) style comments only,
14727 with /x this means (?#...) and # comments and whitespace proper.
14729 Returns the RExC_parse point from BEFORE the scan occurs.
14731 This is the /x friendly way of saying RExC_parse++.
14735 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14737 char* const retval = RExC_parse++;
14739 PERL_ARGS_ASSERT_NEXTCHAR;
14742 if (RExC_end - RExC_parse >= 3
14743 && *RExC_parse == '('
14744 && RExC_parse[1] == '?'
14745 && RExC_parse[2] == '#')
14747 while (*RExC_parse != ')') {
14748 if (RExC_parse == RExC_end)
14749 FAIL("Sequence (?#... not terminated");
14755 if (RExC_flags & RXf_PMf_EXTENDED) {
14756 if (isSPACE(*RExC_parse)) {
14760 else if (*RExC_parse == '#') {
14761 if ( reg_skipcomment( pRExC_state ) )
14770 - reg_node - emit a node
14772 STATIC regnode * /* Location. */
14773 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14777 regnode * const ret = RExC_emit;
14778 GET_RE_DEBUG_FLAGS_DECL;
14780 PERL_ARGS_ASSERT_REG_NODE;
14783 SIZE_ALIGN(RExC_size);
14787 if (RExC_emit >= RExC_emit_bound)
14788 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14789 op, RExC_emit, RExC_emit_bound);
14791 NODE_ALIGN_FILL(ret);
14793 FILL_ADVANCE_NODE(ptr, op);
14794 #ifdef RE_TRACK_PATTERN_OFFSETS
14795 if (RExC_offsets) { /* MJD */
14796 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14797 "reg_node", __LINE__,
14799 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14800 ? "Overwriting end of array!\n" : "OK",
14801 (UV)(RExC_emit - RExC_emit_start),
14802 (UV)(RExC_parse - RExC_start),
14803 (UV)RExC_offsets[0]));
14804 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14812 - reganode - emit a node with an argument
14814 STATIC regnode * /* Location. */
14815 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14819 regnode * const ret = RExC_emit;
14820 GET_RE_DEBUG_FLAGS_DECL;
14822 PERL_ARGS_ASSERT_REGANODE;
14825 SIZE_ALIGN(RExC_size);
14830 assert(2==regarglen[op]+1);
14832 Anything larger than this has to allocate the extra amount.
14833 If we changed this to be:
14835 RExC_size += (1 + regarglen[op]);
14837 then it wouldn't matter. Its not clear what side effect
14838 might come from that so its not done so far.
14843 if (RExC_emit >= RExC_emit_bound)
14844 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14845 op, RExC_emit, RExC_emit_bound);
14847 NODE_ALIGN_FILL(ret);
14849 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14850 #ifdef RE_TRACK_PATTERN_OFFSETS
14851 if (RExC_offsets) { /* MJD */
14852 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14856 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14857 "Overwriting end of array!\n" : "OK",
14858 (UV)(RExC_emit - RExC_emit_start),
14859 (UV)(RExC_parse - RExC_start),
14860 (UV)RExC_offsets[0]));
14861 Set_Cur_Node_Offset;
14869 - reguni - emit (if appropriate) a Unicode character
14872 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14876 PERL_ARGS_ASSERT_REGUNI;
14878 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14882 - reginsert - insert an operator in front of already-emitted operand
14884 * Means relocating the operand.
14887 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14893 const int offset = regarglen[(U8)op];
14894 const int size = NODE_STEP_REGNODE + offset;
14895 GET_RE_DEBUG_FLAGS_DECL;
14897 PERL_ARGS_ASSERT_REGINSERT;
14898 PERL_UNUSED_ARG(depth);
14899 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14900 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14909 if (RExC_open_parens) {
14911 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14912 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14913 if ( RExC_open_parens[paren] >= opnd ) {
14914 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14915 RExC_open_parens[paren] += size;
14917 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14919 if ( RExC_close_parens[paren] >= opnd ) {
14920 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14921 RExC_close_parens[paren] += size;
14923 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14928 while (src > opnd) {
14929 StructCopy(--src, --dst, regnode);
14930 #ifdef RE_TRACK_PATTERN_OFFSETS
14931 if (RExC_offsets) { /* MJD 20010112 */
14932 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14936 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14937 ? "Overwriting end of array!\n" : "OK",
14938 (UV)(src - RExC_emit_start),
14939 (UV)(dst - RExC_emit_start),
14940 (UV)RExC_offsets[0]));
14941 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14942 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14948 place = opnd; /* Op node, where operand used to be. */
14949 #ifdef RE_TRACK_PATTERN_OFFSETS
14950 if (RExC_offsets) { /* MJD */
14951 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14955 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14956 ? "Overwriting end of array!\n" : "OK",
14957 (UV)(place - RExC_emit_start),
14958 (UV)(RExC_parse - RExC_start),
14959 (UV)RExC_offsets[0]));
14960 Set_Node_Offset(place, RExC_parse);
14961 Set_Node_Length(place, 1);
14964 src = NEXTOPER(place);
14965 FILL_ADVANCE_NODE(place, op);
14966 Zero(src, offset, regnode);
14970 - regtail - set the next-pointer at the end of a node chain of p to val.
14971 - SEE ALSO: regtail_study
14973 /* TODO: All three parms should be const */
14975 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14979 GET_RE_DEBUG_FLAGS_DECL;
14981 PERL_ARGS_ASSERT_REGTAIL;
14983 PERL_UNUSED_ARG(depth);
14989 /* Find last node. */
14992 regnode * const temp = regnext(scan);
14994 SV * const mysv=sv_newmortal();
14995 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14996 regprop(RExC_rx, mysv, scan);
14997 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14998 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14999 (temp == NULL ? "->" : ""),
15000 (temp == NULL ? PL_reg_name[OP(val)] : "")
15008 if (reg_off_by_arg[OP(scan)]) {
15009 ARG_SET(scan, val - scan);
15012 NEXT_OFF(scan) = val - scan;
15018 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15019 - Look for optimizable sequences at the same time.
15020 - currently only looks for EXACT chains.
15022 This is experimental code. The idea is to use this routine to perform
15023 in place optimizations on branches and groups as they are constructed,
15024 with the long term intention of removing optimization from study_chunk so
15025 that it is purely analytical.
15027 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15028 to control which is which.
15031 /* TODO: All four parms should be const */
15034 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15039 #ifdef EXPERIMENTAL_INPLACESCAN
15042 GET_RE_DEBUG_FLAGS_DECL;
15044 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15050 /* Find last node. */
15054 regnode * const temp = regnext(scan);
15055 #ifdef EXPERIMENTAL_INPLACESCAN
15056 if (PL_regkind[OP(scan)] == EXACT) {
15057 bool has_exactf_sharp_s; /* Unexamined in this routine */
15058 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
15063 switch (OP(scan)) {
15066 case EXACTFA_NO_TRIE:
15071 if( exact == PSEUDO )
15073 else if ( exact != OP(scan) )
15082 SV * const mysv=sv_newmortal();
15083 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15084 regprop(RExC_rx, mysv, scan);
15085 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15086 SvPV_nolen_const(mysv),
15087 REG_NODE_NUM(scan),
15088 PL_reg_name[exact]);
15095 SV * const mysv_val=sv_newmortal();
15096 DEBUG_PARSE_MSG("");
15097 regprop(RExC_rx, mysv_val, val);
15098 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15099 SvPV_nolen_const(mysv_val),
15100 (IV)REG_NODE_NUM(val),
15104 if (reg_off_by_arg[OP(scan)]) {
15105 ARG_SET(scan, val - scan);
15108 NEXT_OFF(scan) = val - scan;
15116 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15121 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15126 for (bit=0; bit<32; bit++) {
15127 if (flags & (1<<bit)) {
15128 if (!set++ && lead)
15129 PerlIO_printf(Perl_debug_log, "%s",lead);
15130 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15135 PerlIO_printf(Perl_debug_log, "\n");
15137 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15142 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15148 for (bit=0; bit<32; bit++) {
15149 if (flags & (1<<bit)) {
15150 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15153 if (!set++ && lead)
15154 PerlIO_printf(Perl_debug_log, "%s",lead);
15155 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15158 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15159 if (!set++ && lead) {
15160 PerlIO_printf(Perl_debug_log, "%s",lead);
15163 case REGEX_UNICODE_CHARSET:
15164 PerlIO_printf(Perl_debug_log, "UNICODE");
15166 case REGEX_LOCALE_CHARSET:
15167 PerlIO_printf(Perl_debug_log, "LOCALE");
15169 case REGEX_ASCII_RESTRICTED_CHARSET:
15170 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15172 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15173 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15176 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15182 PerlIO_printf(Perl_debug_log, "\n");
15184 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15190 Perl_regdump(pTHX_ const regexp *r)
15194 SV * const sv = sv_newmortal();
15195 SV *dsv= sv_newmortal();
15196 RXi_GET_DECL(r,ri);
15197 GET_RE_DEBUG_FLAGS_DECL;
15199 PERL_ARGS_ASSERT_REGDUMP;
15201 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15203 /* Header fields of interest. */
15204 if (r->anchored_substr) {
15205 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15206 RE_SV_DUMPLEN(r->anchored_substr), 30);
15207 PerlIO_printf(Perl_debug_log,
15208 "anchored %s%s at %"IVdf" ",
15209 s, RE_SV_TAIL(r->anchored_substr),
15210 (IV)r->anchored_offset);
15211 } else if (r->anchored_utf8) {
15212 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15213 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15214 PerlIO_printf(Perl_debug_log,
15215 "anchored utf8 %s%s at %"IVdf" ",
15216 s, RE_SV_TAIL(r->anchored_utf8),
15217 (IV)r->anchored_offset);
15219 if (r->float_substr) {
15220 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15221 RE_SV_DUMPLEN(r->float_substr), 30);
15222 PerlIO_printf(Perl_debug_log,
15223 "floating %s%s at %"IVdf"..%"UVuf" ",
15224 s, RE_SV_TAIL(r->float_substr),
15225 (IV)r->float_min_offset, (UV)r->float_max_offset);
15226 } else if (r->float_utf8) {
15227 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15228 RE_SV_DUMPLEN(r->float_utf8), 30);
15229 PerlIO_printf(Perl_debug_log,
15230 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15231 s, RE_SV_TAIL(r->float_utf8),
15232 (IV)r->float_min_offset, (UV)r->float_max_offset);
15234 if (r->check_substr || r->check_utf8)
15235 PerlIO_printf(Perl_debug_log,
15237 (r->check_substr == r->float_substr
15238 && r->check_utf8 == r->float_utf8
15239 ? "(checking floating" : "(checking anchored"));
15240 if (r->extflags & RXf_NOSCAN)
15241 PerlIO_printf(Perl_debug_log, " noscan");
15242 if (r->extflags & RXf_CHECK_ALL)
15243 PerlIO_printf(Perl_debug_log, " isall");
15244 if (r->check_substr || r->check_utf8)
15245 PerlIO_printf(Perl_debug_log, ") ");
15247 if (ri->regstclass) {
15248 regprop(r, sv, ri->regstclass);
15249 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15251 if (r->extflags & RXf_ANCH) {
15252 PerlIO_printf(Perl_debug_log, "anchored");
15253 if (r->extflags & RXf_ANCH_BOL)
15254 PerlIO_printf(Perl_debug_log, "(BOL)");
15255 if (r->extflags & RXf_ANCH_MBOL)
15256 PerlIO_printf(Perl_debug_log, "(MBOL)");
15257 if (r->extflags & RXf_ANCH_SBOL)
15258 PerlIO_printf(Perl_debug_log, "(SBOL)");
15259 if (r->extflags & RXf_ANCH_GPOS)
15260 PerlIO_printf(Perl_debug_log, "(GPOS)");
15261 PerlIO_putc(Perl_debug_log, ' ');
15263 if (r->extflags & RXf_GPOS_SEEN)
15264 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15265 if (r->intflags & PREGf_SKIP)
15266 PerlIO_printf(Perl_debug_log, "plus ");
15267 if (r->intflags & PREGf_IMPLICIT)
15268 PerlIO_printf(Perl_debug_log, "implicit ");
15269 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15270 if (r->extflags & RXf_EVAL_SEEN)
15271 PerlIO_printf(Perl_debug_log, "with eval ");
15272 PerlIO_printf(Perl_debug_log, "\n");
15274 regdump_extflags("r->extflags: ",r->extflags);
15275 regdump_intflags("r->intflags: ",r->intflags);
15278 PERL_ARGS_ASSERT_REGDUMP;
15279 PERL_UNUSED_CONTEXT;
15280 PERL_UNUSED_ARG(r);
15281 #endif /* DEBUGGING */
15285 - regprop - printable representation of opcode
15289 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15295 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15296 static const char * const anyofs[] = {
15297 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15298 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15299 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15300 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15301 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15302 || _CC_VERTSPACE != 16
15303 #error Need to adjust order of anyofs[]
15340 RXi_GET_DECL(prog,progi);
15341 GET_RE_DEBUG_FLAGS_DECL;
15343 PERL_ARGS_ASSERT_REGPROP;
15347 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15348 /* It would be nice to FAIL() here, but this may be called from
15349 regexec.c, and it would be hard to supply pRExC_state. */
15350 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15351 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15353 k = PL_regkind[OP(o)];
15356 sv_catpvs(sv, " ");
15357 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15358 * is a crude hack but it may be the best for now since
15359 * we have no flag "this EXACTish node was UTF-8"
15361 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15362 PERL_PV_ESCAPE_UNI_DETECT |
15363 PERL_PV_ESCAPE_NONASCII |
15364 PERL_PV_PRETTY_ELLIPSES |
15365 PERL_PV_PRETTY_LTGT |
15366 PERL_PV_PRETTY_NOCLEAR
15368 } else if (k == TRIE) {
15369 /* print the details of the trie in dumpuntil instead, as
15370 * progi->data isn't available here */
15371 const char op = OP(o);
15372 const U32 n = ARG(o);
15373 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15374 (reg_ac_data *)progi->data->data[n] :
15376 const reg_trie_data * const trie
15377 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15379 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15380 DEBUG_TRIE_COMPILE_r(
15381 Perl_sv_catpvf(aTHX_ sv,
15382 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15383 (UV)trie->startstate,
15384 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15385 (UV)trie->wordcount,
15388 (UV)TRIE_CHARCOUNT(trie),
15389 (UV)trie->uniquecharcount
15392 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15393 sv_catpvs(sv, "[");
15394 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15396 : TRIE_BITMAP(trie));
15397 sv_catpvs(sv, "]");
15400 } else if (k == CURLY) {
15401 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15402 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15403 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15405 else if (k == WHILEM && o->flags) /* Ordinal/of */
15406 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15407 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15408 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15409 if ( RXp_PAREN_NAMES(prog) ) {
15410 if ( k != REF || (OP(o) < NREF)) {
15411 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15412 SV **name= av_fetch(list, ARG(o), 0 );
15414 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15417 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15418 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15419 I32 *nums=(I32*)SvPVX(sv_dat);
15420 SV **name= av_fetch(list, nums[0], 0 );
15423 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15424 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15425 (n ? "," : ""), (IV)nums[n]);
15427 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15431 } else if (k == GOSUB)
15432 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15433 else if (k == VERB) {
15435 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15436 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15437 } else if (k == LOGICAL)
15438 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
15439 else if (k == ANYOF) {
15440 const U8 flags = ANYOF_FLAGS(o);
15444 if (flags & ANYOF_LOCALE)
15445 sv_catpvs(sv, "{loc}");
15446 if (flags & ANYOF_LOC_FOLD)
15447 sv_catpvs(sv, "{i}");
15448 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15449 if (flags & ANYOF_INVERT)
15450 sv_catpvs(sv, "^");
15452 /* output what the standard cp 0-255 bitmap matches */
15453 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15455 /* output any special charclass tests (used entirely under use
15457 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15459 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15460 if (ANYOF_POSIXL_TEST(o,i)) {
15461 sv_catpv(sv, anyofs[i]);
15467 if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15468 || ANYOF_NONBITMAP(o))
15471 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15472 if (flags & ANYOF_INVERT)
15473 /*make sure the invert info is in each */
15474 sv_catpvs(sv, "^");
15477 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15478 sv_catpvs(sv, "{non-utf8-latin1-all}");
15481 /* output information about the unicode matching */
15482 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15483 sv_catpvs(sv, "{unicode_all}");
15484 else if (ANYOF_NONBITMAP(o)) {
15485 SV *lv; /* Set if there is something outside the bit map. */
15486 bool byte_output = FALSE; /* If something in the bitmap has been
15489 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15490 sv_catpvs(sv, "{outside bitmap}");
15493 sv_catpvs(sv, "{utf8}");
15496 /* Get the stuff that wasn't in the bitmap */
15497 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15498 if (lv && lv != &PL_sv_undef) {
15499 char *s = savesvpv(lv);
15500 char * const origs = s;
15502 while (*s && *s != '\n')
15506 const char * const t = ++s;
15509 sv_catpvs(sv, " ");
15515 /* Truncate very long output */
15516 if (s - origs > 256) {
15517 Perl_sv_catpvf(aTHX_ sv,
15519 (int) (s - origs - 1),
15525 else if (*s == '\t') {
15539 SvREFCNT_dec_NN(lv);
15544 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15546 else if (k == POSIXD || k == NPOSIXD) {
15547 U8 index = FLAGS(o) * 2;
15548 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15549 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15552 if (*anyofs[index] != '[') {
15555 sv_catpv(sv, anyofs[index]);
15556 if (*anyofs[index] != '[') {
15561 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15562 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15564 PERL_UNUSED_CONTEXT;
15565 PERL_UNUSED_ARG(sv);
15566 PERL_UNUSED_ARG(o);
15567 PERL_UNUSED_ARG(prog);
15568 #endif /* DEBUGGING */
15572 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15573 { /* Assume that RE_INTUIT is set */
15575 struct regexp *const prog = ReANY(r);
15576 GET_RE_DEBUG_FLAGS_DECL;
15578 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15579 PERL_UNUSED_CONTEXT;
15583 const char * const s = SvPV_nolen_const(prog->check_substr
15584 ? prog->check_substr : prog->check_utf8);
15586 if (!PL_colorset) reginitcolors();
15587 PerlIO_printf(Perl_debug_log,
15588 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15590 prog->check_substr ? "" : "utf8 ",
15591 PL_colors[5],PL_colors[0],
15594 (strlen(s) > 60 ? "..." : ""));
15597 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15603 handles refcounting and freeing the perl core regexp structure. When
15604 it is necessary to actually free the structure the first thing it
15605 does is call the 'free' method of the regexp_engine associated to
15606 the regexp, allowing the handling of the void *pprivate; member
15607 first. (This routine is not overridable by extensions, which is why
15608 the extensions free is called first.)
15610 See regdupe and regdupe_internal if you change anything here.
15612 #ifndef PERL_IN_XSUB_RE
15614 Perl_pregfree(pTHX_ REGEXP *r)
15620 Perl_pregfree2(pTHX_ REGEXP *rx)
15623 struct regexp *const r = ReANY(rx);
15624 GET_RE_DEBUG_FLAGS_DECL;
15626 PERL_ARGS_ASSERT_PREGFREE2;
15628 if (r->mother_re) {
15629 ReREFCNT_dec(r->mother_re);
15631 CALLREGFREE_PVT(rx); /* free the private data */
15632 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15633 Safefree(r->xpv_len_u.xpvlenu_pv);
15636 SvREFCNT_dec(r->anchored_substr);
15637 SvREFCNT_dec(r->anchored_utf8);
15638 SvREFCNT_dec(r->float_substr);
15639 SvREFCNT_dec(r->float_utf8);
15640 Safefree(r->substrs);
15642 RX_MATCH_COPY_FREE(rx);
15643 #ifdef PERL_ANY_COW
15644 SvREFCNT_dec(r->saved_copy);
15647 SvREFCNT_dec(r->qr_anoncv);
15648 rx->sv_u.svu_rx = 0;
15653 This is a hacky workaround to the structural issue of match results
15654 being stored in the regexp structure which is in turn stored in
15655 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15656 could be PL_curpm in multiple contexts, and could require multiple
15657 result sets being associated with the pattern simultaneously, such
15658 as when doing a recursive match with (??{$qr})
15660 The solution is to make a lightweight copy of the regexp structure
15661 when a qr// is returned from the code executed by (??{$qr}) this
15662 lightweight copy doesn't actually own any of its data except for
15663 the starp/end and the actual regexp structure itself.
15669 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15671 struct regexp *ret;
15672 struct regexp *const r = ReANY(rx);
15673 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15675 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15678 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15680 SvOK_off((SV *)ret_x);
15682 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15683 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15684 made both spots point to the same regexp body.) */
15685 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15686 assert(!SvPVX(ret_x));
15687 ret_x->sv_u.svu_rx = temp->sv_any;
15688 temp->sv_any = NULL;
15689 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15690 SvREFCNT_dec_NN(temp);
15691 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15692 ing below will not set it. */
15693 SvCUR_set(ret_x, SvCUR(rx));
15696 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15697 sv_force_normal(sv) is called. */
15699 ret = ReANY(ret_x);
15701 SvFLAGS(ret_x) |= SvUTF8(rx);
15702 /* We share the same string buffer as the original regexp, on which we
15703 hold a reference count, incremented when mother_re is set below.
15704 The string pointer is copied here, being part of the regexp struct.
15706 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15707 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15709 const I32 npar = r->nparens+1;
15710 Newx(ret->offs, npar, regexp_paren_pair);
15711 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15714 Newx(ret->substrs, 1, struct reg_substr_data);
15715 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15717 SvREFCNT_inc_void(ret->anchored_substr);
15718 SvREFCNT_inc_void(ret->anchored_utf8);
15719 SvREFCNT_inc_void(ret->float_substr);
15720 SvREFCNT_inc_void(ret->float_utf8);
15722 /* check_substr and check_utf8, if non-NULL, point to either their
15723 anchored or float namesakes, and don't hold a second reference. */
15725 RX_MATCH_COPIED_off(ret_x);
15726 #ifdef PERL_ANY_COW
15727 ret->saved_copy = NULL;
15729 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15730 SvREFCNT_inc_void(ret->qr_anoncv);
15736 /* regfree_internal()
15738 Free the private data in a regexp. This is overloadable by
15739 extensions. Perl takes care of the regexp structure in pregfree(),
15740 this covers the *pprivate pointer which technically perl doesn't
15741 know about, however of course we have to handle the
15742 regexp_internal structure when no extension is in use.
15744 Note this is called before freeing anything in the regexp
15749 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15752 struct regexp *const r = ReANY(rx);
15753 RXi_GET_DECL(r,ri);
15754 GET_RE_DEBUG_FLAGS_DECL;
15756 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15762 SV *dsv= sv_newmortal();
15763 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15764 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15765 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15766 PL_colors[4],PL_colors[5],s);
15769 #ifdef RE_TRACK_PATTERN_OFFSETS
15771 Safefree(ri->u.offsets); /* 20010421 MJD */
15773 if (ri->code_blocks) {
15775 for (n = 0; n < ri->num_code_blocks; n++)
15776 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15777 Safefree(ri->code_blocks);
15781 int n = ri->data->count;
15784 /* If you add a ->what type here, update the comment in regcomp.h */
15785 switch (ri->data->what[n]) {
15791 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15794 Safefree(ri->data->data[n]);
15800 { /* Aho Corasick add-on structure for a trie node.
15801 Used in stclass optimization only */
15803 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15805 refcount = --aho->refcount;
15808 PerlMemShared_free(aho->states);
15809 PerlMemShared_free(aho->fail);
15810 /* do this last!!!! */
15811 PerlMemShared_free(ri->data->data[n]);
15812 PerlMemShared_free(ri->regstclass);
15818 /* trie structure. */
15820 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15822 refcount = --trie->refcount;
15825 PerlMemShared_free(trie->charmap);
15826 PerlMemShared_free(trie->states);
15827 PerlMemShared_free(trie->trans);
15829 PerlMemShared_free(trie->bitmap);
15831 PerlMemShared_free(trie->jump);
15832 PerlMemShared_free(trie->wordinfo);
15833 /* do this last!!!! */
15834 PerlMemShared_free(ri->data->data[n]);
15839 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15842 Safefree(ri->data->what);
15843 Safefree(ri->data);
15849 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15850 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15851 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15854 re_dup - duplicate a regexp.
15856 This routine is expected to clone a given regexp structure. It is only
15857 compiled under USE_ITHREADS.
15859 After all of the core data stored in struct regexp is duplicated
15860 the regexp_engine.dupe method is used to copy any private data
15861 stored in the *pprivate pointer. This allows extensions to handle
15862 any duplication it needs to do.
15864 See pregfree() and regfree_internal() if you change anything here.
15866 #if defined(USE_ITHREADS)
15867 #ifndef PERL_IN_XSUB_RE
15869 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15873 const struct regexp *r = ReANY(sstr);
15874 struct regexp *ret = ReANY(dstr);
15876 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15878 npar = r->nparens+1;
15879 Newx(ret->offs, npar, regexp_paren_pair);
15880 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15882 if (ret->substrs) {
15883 /* Do it this way to avoid reading from *r after the StructCopy().
15884 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15885 cache, it doesn't matter. */
15886 const bool anchored = r->check_substr
15887 ? r->check_substr == r->anchored_substr
15888 : r->check_utf8 == r->anchored_utf8;
15889 Newx(ret->substrs, 1, struct reg_substr_data);
15890 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15892 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15893 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15894 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15895 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15897 /* check_substr and check_utf8, if non-NULL, point to either their
15898 anchored or float namesakes, and don't hold a second reference. */
15900 if (ret->check_substr) {
15902 assert(r->check_utf8 == r->anchored_utf8);
15903 ret->check_substr = ret->anchored_substr;
15904 ret->check_utf8 = ret->anchored_utf8;
15906 assert(r->check_substr == r->float_substr);
15907 assert(r->check_utf8 == r->float_utf8);
15908 ret->check_substr = ret->float_substr;
15909 ret->check_utf8 = ret->float_utf8;
15911 } else if (ret->check_utf8) {
15913 ret->check_utf8 = ret->anchored_utf8;
15915 ret->check_utf8 = ret->float_utf8;
15920 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15921 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15924 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15926 if (RX_MATCH_COPIED(dstr))
15927 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15929 ret->subbeg = NULL;
15930 #ifdef PERL_ANY_COW
15931 ret->saved_copy = NULL;
15934 /* Whether mother_re be set or no, we need to copy the string. We
15935 cannot refrain from copying it when the storage points directly to
15936 our mother regexp, because that's
15937 1: a buffer in a different thread
15938 2: something we no longer hold a reference on
15939 so we need to copy it locally. */
15940 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15941 ret->mother_re = NULL;
15943 #endif /* PERL_IN_XSUB_RE */
15948 This is the internal complement to regdupe() which is used to copy
15949 the structure pointed to by the *pprivate pointer in the regexp.
15950 This is the core version of the extension overridable cloning hook.
15951 The regexp structure being duplicated will be copied by perl prior
15952 to this and will be provided as the regexp *r argument, however
15953 with the /old/ structures pprivate pointer value. Thus this routine
15954 may override any copying normally done by perl.
15956 It returns a pointer to the new regexp_internal structure.
15960 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15963 struct regexp *const r = ReANY(rx);
15964 regexp_internal *reti;
15966 RXi_GET_DECL(r,ri);
15968 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15972 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15973 Copy(ri->program, reti->program, len+1, regnode);
15975 reti->num_code_blocks = ri->num_code_blocks;
15976 if (ri->code_blocks) {
15978 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15979 struct reg_code_block);
15980 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15981 struct reg_code_block);
15982 for (n = 0; n < ri->num_code_blocks; n++)
15983 reti->code_blocks[n].src_regex = (REGEXP*)
15984 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15987 reti->code_blocks = NULL;
15989 reti->regstclass = NULL;
15992 struct reg_data *d;
15993 const int count = ri->data->count;
15996 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15997 char, struct reg_data);
15998 Newx(d->what, count, U8);
16001 for (i = 0; i < count; i++) {
16002 d->what[i] = ri->data->what[i];
16003 switch (d->what[i]) {
16004 /* see also regcomp.h and regfree_internal() */
16005 case 'a': /* actually an AV, but the dup function is identical. */
16009 case 'u': /* actually an HV, but the dup function is identical. */
16010 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16013 /* This is cheating. */
16014 Newx(d->data[i], 1, regnode_ssc);
16015 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16016 reti->regstclass = (regnode*)d->data[i];
16019 /* Trie stclasses are readonly and can thus be shared
16020 * without duplication. We free the stclass in pregfree
16021 * when the corresponding reg_ac_data struct is freed.
16023 reti->regstclass= ri->regstclass;
16027 ((reg_trie_data*)ri->data->data[i])->refcount++;
16032 d->data[i] = ri->data->data[i];
16035 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
16044 reti->name_list_idx = ri->name_list_idx;
16046 #ifdef RE_TRACK_PATTERN_OFFSETS
16047 if (ri->u.offsets) {
16048 Newx(reti->u.offsets, 2*len+1, U32);
16049 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16052 SetProgLen(reti,len);
16055 return (void*)reti;
16058 #endif /* USE_ITHREADS */
16060 #ifndef PERL_IN_XSUB_RE
16063 - regnext - dig the "next" pointer out of a node
16066 Perl_regnext(pTHX_ regnode *p)
16074 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16075 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
16078 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16087 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16090 STRLEN l1 = strlen(pat1);
16091 STRLEN l2 = strlen(pat2);
16094 const char *message;
16096 PERL_ARGS_ASSERT_RE_CROAK2;
16102 Copy(pat1, buf, l1 , char);
16103 Copy(pat2, buf + l1, l2 , char);
16104 buf[l1 + l2] = '\n';
16105 buf[l1 + l2 + 1] = '\0';
16106 va_start(args, pat2);
16107 msv = vmess(buf, &args);
16109 message = SvPV_const(msv,l1);
16112 Copy(message, buf, l1 , char);
16113 /* l1-1 to avoid \n */
16114 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16117 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16119 #ifndef PERL_IN_XSUB_RE
16121 Perl_save_re_context(pTHX)
16125 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16127 const REGEXP * const rx = PM_GETRE(PL_curpm);
16130 for (i = 1; i <= RX_NPARENS(rx); i++) {
16131 char digits[TYPE_CHARS(long)];
16132 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
16133 GV *const *const gvp
16134 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16137 GV * const gv = *gvp;
16138 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16150 S_put_byte(pTHX_ SV *sv, int c)
16152 PERL_ARGS_ASSERT_PUT_BYTE;
16156 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16157 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16158 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16159 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16160 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16163 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16168 const char string = c;
16169 if (c == '-' || c == ']' || c == '\\' || c == '^')
16170 sv_catpvs(sv, "\\");
16171 sv_catpvn(sv, &string, 1);
16176 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16178 /* Appends to 'sv' a displayable version of the innards of the bracketed
16179 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16180 * output anything */
16183 int rangestart = -1;
16184 bool has_output_anything = FALSE;
16186 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16188 for (i = 0; i <= 256; i++) {
16189 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16190 if (rangestart == -1)
16192 } else if (rangestart != -1) {
16194 if (i <= rangestart + 3) { /* Individual chars in short ranges */
16195 for (; rangestart < i; rangestart++)
16196 put_byte(sv, rangestart);
16199 || ! isALPHANUMERIC(rangestart)
16200 || ! isALPHANUMERIC(j)
16201 || isDIGIT(rangestart) != isDIGIT(j)
16202 || isUPPER(rangestart) != isUPPER(j)
16203 || isLOWER(rangestart) != isLOWER(j)
16205 /* This final test should get optimized out except
16206 * on EBCDIC platforms, where it causes ranges that
16207 * cross discontinuities like i/j to be shown as hex
16208 * instead of the misleading, e.g. H-K (since that
16209 * range includes more than H, I, J, K). */
16210 || (j - rangestart)
16211 != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16213 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16215 (j < 256) ? j : 255);
16217 else { /* Here, the ends of the range are both digits, or both
16218 uppercase, or both lowercase; and there's no
16219 discontinuity in the range (which could happen on EBCDIC
16221 put_byte(sv, rangestart);
16222 sv_catpvs(sv, "-");
16226 has_output_anything = TRUE;
16230 return has_output_anything;
16233 #define CLEAR_OPTSTART \
16234 if (optstart) STMT_START { \
16235 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16239 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16241 STATIC const regnode *
16242 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16243 const regnode *last, const regnode *plast,
16244 SV* sv, I32 indent, U32 depth)
16247 U8 op = PSEUDO; /* Arbitrary non-END op. */
16248 const regnode *next;
16249 const regnode *optstart= NULL;
16251 RXi_GET_DECL(r,ri);
16252 GET_RE_DEBUG_FLAGS_DECL;
16254 PERL_ARGS_ASSERT_DUMPUNTIL;
16256 #ifdef DEBUG_DUMPUNTIL
16257 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16258 last ? last-start : 0,plast ? plast-start : 0);
16261 if (plast && plast < last)
16264 while (PL_regkind[op] != END && (!last || node < last)) {
16265 /* While that wasn't END last time... */
16268 if (op == CLOSE || op == WHILEM)
16270 next = regnext((regnode *)node);
16273 if (OP(node) == OPTIMIZED) {
16274 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16281 regprop(r, sv, node);
16282 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16283 (int)(2*indent + 1), "", SvPVX_const(sv));
16285 if (OP(node) != OPTIMIZED) {
16286 if (next == NULL) /* Next ptr. */
16287 PerlIO_printf(Perl_debug_log, " (0)");
16288 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16289 PerlIO_printf(Perl_debug_log, " (FAIL)");
16291 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16292 (void)PerlIO_putc(Perl_debug_log, '\n');
16296 if (PL_regkind[(U8)op] == BRANCHJ) {
16299 const regnode *nnode = (OP(next) == LONGJMP
16300 ? regnext((regnode *)next)
16302 if (last && nnode > last)
16304 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16307 else if (PL_regkind[(U8)op] == BRANCH) {
16309 DUMPUNTIL(NEXTOPER(node), next);
16311 else if ( PL_regkind[(U8)op] == TRIE ) {
16312 const regnode *this_trie = node;
16313 const char op = OP(node);
16314 const U32 n = ARG(node);
16315 const reg_ac_data * const ac = op>=AHOCORASICK ?
16316 (reg_ac_data *)ri->data->data[n] :
16318 const reg_trie_data * const trie =
16319 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16321 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16323 const regnode *nextbranch= NULL;
16326 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16327 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16329 PerlIO_printf(Perl_debug_log, "%*s%s ",
16330 (int)(2*(indent+3)), "",
16331 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16332 PL_colors[0], PL_colors[1],
16333 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16334 PERL_PV_PRETTY_ELLIPSES |
16335 PERL_PV_PRETTY_LTGT
16340 U16 dist= trie->jump[word_idx+1];
16341 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16342 (UV)((dist ? this_trie + dist : next) - start));
16345 nextbranch= this_trie + trie->jump[0];
16346 DUMPUNTIL(this_trie + dist, nextbranch);
16348 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16349 nextbranch= regnext((regnode *)nextbranch);
16351 PerlIO_printf(Perl_debug_log, "\n");
16354 if (last && next > last)
16359 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16360 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16361 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16363 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16365 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16367 else if ( op == PLUS || op == STAR) {
16368 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16370 else if (PL_regkind[(U8)op] == ANYOF) {
16371 /* arglen 1 + class block */
16372 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16373 ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16374 node = NEXTOPER(node);
16376 else if (PL_regkind[(U8)op] == EXACT) {
16377 /* Literal string, where present. */
16378 node += NODE_SZ_STR(node) - 1;
16379 node = NEXTOPER(node);
16382 node = NEXTOPER(node);
16383 node += regarglen[(U8)op];
16385 if (op == CURLYX || op == OPEN)
16389 #ifdef DEBUG_DUMPUNTIL
16390 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16395 #endif /* DEBUGGING */
16399 * c-indentation-style: bsd
16400 * c-basic-offset: 4
16401 * indent-tabs-mode: nil
16404 * ex: set ts=8 sts=4 sw=4 et: