5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 struct RExC_state_t {
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
106 char *precomp; /* uncompiled string. */
107 REGEXP *rx_sv; /* The SV that is the regexp. */
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
110 char *start; /* Start of input for compile */
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
113 SSize_t whilem_seen; /* number of WHILEM in this expr */
114 regnode *emit_start; /* Start of emitted-code area */
115 regnode *emit_bound; /* First regnode outside of the allocated space */
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
125 SSize_t size; /* Code size. */
126 I32 npar; /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
127 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
140 HV *paren_names; /* Paren names */
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
144 U8 *study_chunk_recursed; /* bitmap of which parens we have moved through */
145 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
149 I32 override_recoding;
150 I32 in_multi_char_class;
151 struct reg_code_block *code_blocks; /* positions of literal (?{})
153 int num_code_blocks; /* size of code_blocks[] */
154 int code_index; /* next code_blocks[] slot */
156 char *starttry; /* -Dr: where regtry was called. */
157 #define RExC_starttry (pRExC_state->starttry)
159 SV *runtime_code_qr; /* qr with the runtime code blocks */
161 const char *lastparse;
163 AV *paren_name_list; /* idx -> name */
164 #define RExC_lastparse (pRExC_state->lastparse)
165 #define RExC_lastnum (pRExC_state->lastnum)
166 #define RExC_paren_name_list (pRExC_state->paren_name_list)
170 #define RExC_flags (pRExC_state->flags)
171 #define RExC_pm_flags (pRExC_state->pm_flags)
172 #define RExC_precomp (pRExC_state->precomp)
173 #define RExC_rx_sv (pRExC_state->rx_sv)
174 #define RExC_rx (pRExC_state->rx)
175 #define RExC_rxi (pRExC_state->rxi)
176 #define RExC_start (pRExC_state->start)
177 #define RExC_end (pRExC_state->end)
178 #define RExC_parse (pRExC_state->parse)
179 #define RExC_whilem_seen (pRExC_state->whilem_seen)
180 #ifdef RE_TRACK_PATTERN_OFFSETS
181 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
183 #define RExC_emit (pRExC_state->emit)
184 #define RExC_emit_dummy (pRExC_state->emit_dummy)
185 #define RExC_emit_start (pRExC_state->emit_start)
186 #define RExC_emit_bound (pRExC_state->emit_bound)
187 #define RExC_naughty (pRExC_state->naughty)
188 #define RExC_sawback (pRExC_state->sawback)
189 #define RExC_seen (pRExC_state->seen)
190 #define RExC_size (pRExC_state->size)
191 #define RExC_npar (pRExC_state->npar)
192 #define RExC_nestroot (pRExC_state->nestroot)
193 #define RExC_extralen (pRExC_state->extralen)
194 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
195 #define RExC_utf8 (pRExC_state->utf8)
196 #define RExC_uni_semantics (pRExC_state->uni_semantics)
197 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
198 #define RExC_open_parens (pRExC_state->open_parens)
199 #define RExC_close_parens (pRExC_state->close_parens)
200 #define RExC_opend (pRExC_state->opend)
201 #define RExC_paren_names (pRExC_state->paren_names)
202 #define RExC_recurse (pRExC_state->recurse)
203 #define RExC_recurse_count (pRExC_state->recurse_count)
204 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
205 #define RExC_study_chunk_recursed_bytes (pRExC_state->study_chunk_recursed_bytes)
206 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
207 #define RExC_contains_locale (pRExC_state->contains_locale)
208 #define RExC_contains_i (pRExC_state->contains_i)
209 #define RExC_override_recoding (pRExC_state->override_recoding)
210 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
213 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
214 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
215 ((*s) == '{' && regcurly(s, FALSE)))
218 * Flags to be passed up and down.
220 #define WORST 0 /* Worst case. */
221 #define HASWIDTH 0x01 /* Known to match non-null strings. */
223 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
224 * character. (There needs to be a case: in the switch statement in regexec.c
225 * for any node marked SIMPLE.) Note that this is not the same thing as
228 #define SPSTART 0x04 /* Starts with * or + */
229 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
230 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
231 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
233 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
235 /* whether trie related optimizations are enabled */
236 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
237 #define TRIE_STUDY_OPT
238 #define FULL_TRIE_STUDY
244 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
245 #define PBITVAL(paren) (1 << ((paren) & 7))
246 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
247 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
248 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
250 #define REQUIRE_UTF8 STMT_START { \
252 *flagp = RESTART_UTF8; \
257 /* This converts the named class defined in regcomp.h to its equivalent class
258 * number defined in handy.h. */
259 #define namedclass_to_classnum(class) ((int) ((class) / 2))
260 #define classnum_to_namedclass(classnum) ((classnum) * 2)
262 #define _invlist_union_complement_2nd(a, b, output) \
263 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
264 #define _invlist_intersection_complement_2nd(a, b, output) \
265 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
267 /* About scan_data_t.
269 During optimisation we recurse through the regexp program performing
270 various inplace (keyhole style) optimisations. In addition study_chunk
271 and scan_commit populate this data structure with information about
272 what strings MUST appear in the pattern. We look for the longest
273 string that must appear at a fixed location, and we look for the
274 longest string that may appear at a floating location. So for instance
279 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
280 strings (because they follow a .* construct). study_chunk will identify
281 both FOO and BAR as being the longest fixed and floating strings respectively.
283 The strings can be composites, for instance
287 will result in a composite fixed substring 'foo'.
289 For each string some basic information is maintained:
291 - offset or min_offset
292 This is the position the string must appear at, or not before.
293 It also implicitly (when combined with minlenp) tells us how many
294 characters must match before the string we are searching for.
295 Likewise when combined with minlenp and the length of the string it
296 tells us how many characters must appear after the string we have
300 Only used for floating strings. This is the rightmost point that
301 the string can appear at. If set to SSize_t_MAX it indicates that the
302 string can occur infinitely far to the right.
305 A pointer to the minimum number of characters of the pattern that the
306 string was found inside. This is important as in the case of positive
307 lookahead or positive lookbehind we can have multiple patterns
312 The minimum length of the pattern overall is 3, the minimum length
313 of the lookahead part is 3, but the minimum length of the part that
314 will actually match is 1. So 'FOO's minimum length is 3, but the
315 minimum length for the F is 1. This is important as the minimum length
316 is used to determine offsets in front of and behind the string being
317 looked for. Since strings can be composites this is the length of the
318 pattern at the time it was committed with a scan_commit. Note that
319 the length is calculated by study_chunk, so that the minimum lengths
320 are not known until the full pattern has been compiled, thus the
321 pointer to the value.
325 In the case of lookbehind the string being searched for can be
326 offset past the start point of the final matching string.
327 If this value was just blithely removed from the min_offset it would
328 invalidate some of the calculations for how many chars must match
329 before or after (as they are derived from min_offset and minlen and
330 the length of the string being searched for).
331 When the final pattern is compiled and the data is moved from the
332 scan_data_t structure into the regexp structure the information
333 about lookbehind is factored in, with the information that would
334 have been lost precalculated in the end_shift field for the
337 The fields pos_min and pos_delta are used to store the minimum offset
338 and the delta to the maximum offset at the current point in the pattern.
342 typedef struct scan_data_t {
343 /*I32 len_min; unused */
344 /*I32 len_delta; unused */
348 SSize_t last_end; /* min value, <0 unless valid. */
349 SSize_t last_start_min;
350 SSize_t last_start_max;
351 SV **longest; /* Either &l_fixed, or &l_float. */
352 SV *longest_fixed; /* longest fixed string found in pattern */
353 SSize_t offset_fixed; /* offset where it starts */
354 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
355 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
356 SV *longest_float; /* longest floating string found in pattern */
357 SSize_t offset_float_min; /* earliest point in string it can appear */
358 SSize_t offset_float_max; /* latest point in string it can appear */
359 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
360 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
363 SSize_t *last_closep;
364 regnode_ssc *start_class;
367 /* The below is perhaps overboard, but this allows us to save a test at the
368 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
369 * and 'a' differ by a single bit; the same with the upper and lower case of
370 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
371 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
372 * then inverts it to form a mask, with just a single 0, in the bit position
373 * where the upper- and lowercase differ. XXX There are about 40 other
374 * instances in the Perl core where this micro-optimization could be used.
375 * Should decide if maintenance cost is worse, before changing those
377 * Returns a boolean as to whether or not 'v' is either a lowercase or
378 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
379 * compile-time constant, the generated code is better than some optimizing
380 * compilers figure out, amounting to a mask and test. The results are
381 * meaningless if 'c' is not one of [A-Za-z] */
382 #define isARG2_lower_or_UPPER_ARG1(c, v) \
383 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
386 * Forward declarations for pregcomp()'s friends.
389 static const scan_data_t zero_scan_data =
390 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
392 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
393 #define SF_BEFORE_SEOL 0x0001
394 #define SF_BEFORE_MEOL 0x0002
395 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
396 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
398 #define SF_FIX_SHIFT_EOL (+2)
399 #define SF_FL_SHIFT_EOL (+4)
401 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
402 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
404 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
405 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
406 #define SF_IS_INF 0x0040
407 #define SF_HAS_PAR 0x0080
408 #define SF_IN_PAR 0x0100
409 #define SF_HAS_EVAL 0x0200
410 #define SCF_DO_SUBSTR 0x0400
411 #define SCF_DO_STCLASS_AND 0x0800
412 #define SCF_DO_STCLASS_OR 0x1000
413 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
414 #define SCF_WHILEM_VISITED_POS 0x2000
416 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
417 #define SCF_SEEN_ACCEPT 0x8000
418 #define SCF_TRIE_DOING_RESTUDY 0x10000
420 #define UTF cBOOL(RExC_utf8)
422 /* The enums for all these are ordered so things work out correctly */
423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
425 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
426 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
427 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
428 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
429 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
431 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
433 #define OOB_NAMEDCLASS -1
435 /* There is no code point that is out-of-bounds, so this is problematic. But
436 * its only current use is to initialize a variable that is always set before
438 #define OOB_UNICODE 0xDEADBEEF
440 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
441 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
444 /* length of regex to show in messages that don't mark a position within */
445 #define RegexLengthToShowInErrorMessages 127
448 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
449 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
450 * op/pragma/warn/regcomp.
452 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
453 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
455 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
457 #define REPORT_LOCATION_ARGS(offset) \
458 UTF8fARG(UTF, offset, RExC_precomp), \
459 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
462 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
463 * arg. Show regex, up to a maximum length. If it's too long, chop and add
466 #define _FAIL(code) STMT_START { \
467 const char *ellipses = ""; \
468 IV len = RExC_end - RExC_precomp; \
471 SAVEFREESV(RExC_rx_sv); \
472 if (len > RegexLengthToShowInErrorMessages) { \
473 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
474 len = RegexLengthToShowInErrorMessages - 10; \
480 #define FAIL(msg) _FAIL( \
481 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
482 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
484 #define FAIL2(msg,arg) _FAIL( \
485 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
486 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
489 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
491 #define Simple_vFAIL(m) STMT_START { \
492 const IV offset = RExC_parse - RExC_precomp; \
493 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
494 m, REPORT_LOCATION_ARGS(offset)); \
498 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
500 #define vFAIL(m) STMT_START { \
502 SAVEFREESV(RExC_rx_sv); \
507 * Like Simple_vFAIL(), but accepts two arguments.
509 #define Simple_vFAIL2(m,a1) STMT_START { \
510 const IV offset = RExC_parse - RExC_precomp; \
511 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
512 REPORT_LOCATION_ARGS(offset)); \
516 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
518 #define vFAIL2(m,a1) STMT_START { \
520 SAVEFREESV(RExC_rx_sv); \
521 Simple_vFAIL2(m, a1); \
526 * Like Simple_vFAIL(), but accepts three arguments.
528 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
529 const IV offset = RExC_parse - RExC_precomp; \
530 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
531 REPORT_LOCATION_ARGS(offset)); \
535 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
537 #define vFAIL3(m,a1,a2) STMT_START { \
539 SAVEFREESV(RExC_rx_sv); \
540 Simple_vFAIL3(m, a1, a2); \
544 * Like Simple_vFAIL(), but accepts four arguments.
546 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
547 const IV offset = RExC_parse - RExC_precomp; \
548 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
549 REPORT_LOCATION_ARGS(offset)); \
552 #define vFAIL4(m,a1,a2,a3) STMT_START { \
554 SAVEFREESV(RExC_rx_sv); \
555 Simple_vFAIL4(m, a1, a2, a3); \
558 /* A specialized version of vFAIL2 that works with UTF8f */
559 #define vFAIL2utf8f(m, a1) STMT_START { \
560 const IV offset = RExC_parse - RExC_precomp; \
562 SAVEFREESV(RExC_rx_sv); \
563 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
564 REPORT_LOCATION_ARGS(offset)); \
568 /* m is not necessarily a "literal string", in this macro */
569 #define reg_warn_non_literal_string(loc, m) STMT_START { \
570 const IV offset = loc - RExC_precomp; \
571 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
572 m, REPORT_LOCATION_ARGS(offset)); \
575 #define ckWARNreg(loc,m) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
578 REPORT_LOCATION_ARGS(offset)); \
581 #define vWARN_dep(loc, m) STMT_START { \
582 const IV offset = loc - RExC_precomp; \
583 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
584 REPORT_LOCATION_ARGS(offset)); \
587 #define ckWARNdep(loc,m) STMT_START { \
588 const IV offset = loc - RExC_precomp; \
589 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
591 REPORT_LOCATION_ARGS(offset)); \
594 #define ckWARNregdep(loc,m) STMT_START { \
595 const IV offset = loc - RExC_precomp; \
596 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
598 REPORT_LOCATION_ARGS(offset)); \
601 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
602 const IV offset = loc - RExC_precomp; \
603 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
605 a1, REPORT_LOCATION_ARGS(offset)); \
608 #define ckWARN2reg(loc, m, a1) STMT_START { \
609 const IV offset = loc - RExC_precomp; \
610 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611 a1, REPORT_LOCATION_ARGS(offset)); \
614 #define vWARN3(loc, m, a1, a2) STMT_START { \
615 const IV offset = loc - RExC_precomp; \
616 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
617 a1, a2, REPORT_LOCATION_ARGS(offset)); \
620 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
621 const IV offset = loc - RExC_precomp; \
622 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
623 a1, a2, REPORT_LOCATION_ARGS(offset)); \
626 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
627 const IV offset = loc - RExC_precomp; \
628 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
629 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
632 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
638 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
639 const IV offset = loc - RExC_precomp; \
640 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
641 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
645 /* Allow for side effects in s */
646 #define REGC(c,s) STMT_START { \
647 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
650 /* Macros for recording node offsets. 20001227 mjd@plover.com
651 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
652 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
653 * Element 0 holds the number n.
654 * Position is 1 indexed.
656 #ifndef RE_TRACK_PATTERN_OFFSETS
657 #define Set_Node_Offset_To_R(node,byte)
658 #define Set_Node_Offset(node,byte)
659 #define Set_Cur_Node_Offset
660 #define Set_Node_Length_To_R(node,len)
661 #define Set_Node_Length(node,len)
662 #define Set_Node_Cur_Length(node,start)
663 #define Node_Offset(n)
664 #define Node_Length(n)
665 #define Set_Node_Offset_Length(node,offset,len)
666 #define ProgLen(ri) ri->u.proglen
667 #define SetProgLen(ri,x) ri->u.proglen = x
669 #define ProgLen(ri) ri->u.offsets[0]
670 #define SetProgLen(ri,x) ri->u.offsets[0] = x
671 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
673 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
674 __LINE__, (int)(node), (int)(byte))); \
676 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
678 RExC_offsets[2*(node)-1] = (byte); \
683 #define Set_Node_Offset(node,byte) \
684 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
685 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
687 #define Set_Node_Length_To_R(node,len) STMT_START { \
689 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
690 __LINE__, (int)(node), (int)(len))); \
692 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
694 RExC_offsets[2*(node)] = (len); \
699 #define Set_Node_Length(node,len) \
700 Set_Node_Length_To_R((node)-RExC_emit_start, len)
701 #define Set_Node_Cur_Length(node, start) \
702 Set_Node_Length(node, RExC_parse - start)
704 /* Get offsets and lengths */
705 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
706 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
708 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
709 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
710 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
714 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
715 #define EXPERIMENTAL_INPLACESCAN
716 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
718 #define DEBUG_RExC_seen() \
719 DEBUG_OPTIMISE_MORE_r({ \
720 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
722 if (RExC_seen & REG_SEEN_ZERO_LEN) \
723 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \
725 if (RExC_seen & REG_SEEN_LOOKBEHIND) \
726 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \
728 if (RExC_seen & REG_SEEN_GPOS) \
729 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \
731 if (RExC_seen & REG_SEEN_CANY) \
732 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \
734 if (RExC_seen & REG_SEEN_RECURSE) \
735 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \
737 if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \
738 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \
740 if (RExC_seen & REG_SEEN_VERBARG) \
741 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \
743 if (RExC_seen & REG_SEEN_CUTGROUP) \
744 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \
746 if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \
747 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \
749 if (RExC_seen & REG_SEEN_EXACTF_SHARP_S) \
750 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S "); \
752 if (RExC_seen & REG_SEEN_GOSTART) \
753 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \
755 PerlIO_printf(Perl_debug_log,"\n"); \
758 #define DEBUG_STUDYDATA(str,data,depth) \
759 DEBUG_OPTIMISE_MORE_r(if(data){ \
760 PerlIO_printf(Perl_debug_log, \
761 "%*s" str "Pos:%"IVdf"/%"IVdf \
762 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
763 (int)(depth)*2, "", \
764 (IV)((data)->pos_min), \
765 (IV)((data)->pos_delta), \
766 (UV)((data)->flags), \
767 (IV)((data)->whilem_c), \
768 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
769 is_inf ? "INF " : "" \
771 if ((data)->last_found) \
772 PerlIO_printf(Perl_debug_log, \
773 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
774 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
775 SvPVX_const((data)->last_found), \
776 (IV)((data)->last_end), \
777 (IV)((data)->last_start_min), \
778 (IV)((data)->last_start_max), \
779 ((data)->longest && \
780 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
781 SvPVX_const((data)->longest_fixed), \
782 (IV)((data)->offset_fixed), \
783 ((data)->longest && \
784 (data)->longest==&((data)->longest_float)) ? "*" : "", \
785 SvPVX_const((data)->longest_float), \
786 (IV)((data)->offset_float_min), \
787 (IV)((data)->offset_float_max) \
789 PerlIO_printf(Perl_debug_log,"\n"); \
792 /* Mark that we cannot extend a found fixed substring at this point.
793 Update the longest found anchored substring and the longest found
794 floating substrings if needed. */
797 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
798 SSize_t *minlenp, int is_inf)
800 const STRLEN l = CHR_SVLEN(data->last_found);
801 const STRLEN old_l = CHR_SVLEN(*data->longest);
802 GET_RE_DEBUG_FLAGS_DECL;
804 PERL_ARGS_ASSERT_SCAN_COMMIT;
806 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
807 SvSetMagicSV(*data->longest, data->last_found);
808 if (*data->longest == data->longest_fixed) {
809 data->offset_fixed = l ? data->last_start_min : data->pos_min;
810 if (data->flags & SF_BEFORE_EOL)
812 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
814 data->flags &= ~SF_FIX_BEFORE_EOL;
815 data->minlen_fixed=minlenp;
816 data->lookbehind_fixed=0;
818 else { /* *data->longest == data->longest_float */
819 data->offset_float_min = l ? data->last_start_min : data->pos_min;
820 data->offset_float_max = (l
821 ? data->last_start_max
822 : (data->pos_delta == SSize_t_MAX
824 : data->pos_min + data->pos_delta));
826 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
827 data->offset_float_max = SSize_t_MAX;
828 if (data->flags & SF_BEFORE_EOL)
830 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
832 data->flags &= ~SF_FL_BEFORE_EOL;
833 data->minlen_float=minlenp;
834 data->lookbehind_float=0;
837 SvCUR_set(data->last_found, 0);
839 SV * const sv = data->last_found;
840 if (SvUTF8(sv) && SvMAGICAL(sv)) {
841 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
847 data->flags &= ~SF_BEFORE_EOL;
848 DEBUG_STUDYDATA("commit: ",data,0);
851 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
852 * list that describes which code points it matches */
855 S_ssc_anything(pTHX_ regnode_ssc *ssc)
857 /* Set the SSC 'ssc' to match an empty string or any code point */
859 PERL_ARGS_ASSERT_SSC_ANYTHING;
861 assert(OP(ssc) == ANYOF_SYNTHETIC);
863 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
864 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
865 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
869 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
871 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
872 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
873 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
874 * in any way, so there's no point in using it */
879 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
881 assert(OP(ssc) == ANYOF_SYNTHETIC);
883 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
887 /* See if the list consists solely of the range 0 - Infinity */
888 invlist_iterinit(ssc->invlist);
889 ret = invlist_iternext(ssc->invlist, &start, &end)
893 invlist_iterfinish(ssc->invlist);
899 /* If e.g., both \w and \W are set, matches everything */
900 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
902 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
903 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
913 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
915 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
916 * string, any code point, or any posix class under locale */
918 PERL_ARGS_ASSERT_SSC_INIT;
920 Zero(ssc, 1, regnode_ssc);
921 OP(ssc) = ANYOF_SYNTHETIC;
922 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
925 /* If any portion of the regex is to operate under locale rules,
926 * initialization includes it. The reason this isn't done for all regexes
927 * is that the optimizer was written under the assumption that locale was
928 * all-or-nothing. Given the complexity and lack of documentation in the
929 * optimizer, and that there are inadequate test cases for locale, many
930 * parts of it may not work properly, it is safest to avoid locale unless
932 if (RExC_contains_locale) {
933 ANYOF_POSIXL_SETALL(ssc);
934 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
935 if (RExC_contains_i) {
936 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
940 ANYOF_POSIXL_ZERO(ssc);
945 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
946 const regnode_ssc *ssc)
948 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
949 * to the list of code points matched, and locale posix classes; hence does
950 * not check its flags) */
955 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
957 assert(OP(ssc) == ANYOF_SYNTHETIC);
959 invlist_iterinit(ssc->invlist);
960 ret = invlist_iternext(ssc->invlist, &start, &end)
964 invlist_iterfinish(ssc->invlist);
970 if (RExC_contains_locale) {
971 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
972 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
973 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
977 if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
986 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
987 const regnode_charclass_posixl* const node)
989 /* Returns a mortal inversion list defining which code points are matched
990 * by 'node', which is of type ANYOF. Handles complementing the result if
991 * appropriate. If some code points aren't knowable at this time, the
992 * returned list must, and will, contain every possible code point. */
994 SV* invlist = sv_2mortal(_new_invlist(0));
996 const U32 n = ARG(node);
998 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1000 /* Look at the data structure created by S_set_ANYOF_arg() */
1001 if (n != ANYOF_NONBITMAP_EMPTY) {
1002 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1003 AV * const av = MUTABLE_AV(SvRV(rv));
1004 SV **const ary = AvARRAY(av);
1005 assert(RExC_rxi->data->what[n] == 's');
1007 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1008 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1010 else if (ary[0] && ary[0] != &PL_sv_undef) {
1012 /* Here, no compile-time swash, and there are things that won't be
1013 * known until runtime -- we have to assume it could be anything */
1014 return _add_range_to_invlist(invlist, 0, UV_MAX);
1018 /* Here no compile-time swash, and no run-time only data. Use the
1019 * node's inversion list */
1020 invlist = sv_2mortal(invlist_clone(ary[2]));
1024 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1025 * inversion list for the others, but if there are code points that should
1026 * match only conditionally on the target string being UTF-8, those are
1027 * placed in the inversion list, and not the bitmap. Since there are
1028 * circumstances under which they could match, they are included in the
1029 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1030 * here, so that when we invert below, the end result actually does include
1031 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1032 * before we add the unconditionally matched code points */
1033 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1034 _invlist_intersection_complement_2nd(invlist,
1039 /* Add in the points from the bit map */
1040 for (i = 0; i < 256; i++) {
1041 if (ANYOF_BITMAP_TEST(node, i)) {
1042 invlist = add_cp_to_invlist(invlist, i);
1046 /* If this can match all upper Latin1 code points, have to add them
1048 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1049 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1052 /* Similarly for these */
1053 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1054 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1057 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1058 _invlist_invert(invlist);
1064 /* These two functions currently do the exact same thing */
1065 #define ssc_init_zero ssc_init
1067 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1068 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1071 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1073 /* Take the flags 'and_with' and accumulate them anded into the flags for
1074 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1075 * The flags 'and_with' should not come from another SSC (otherwise the
1076 * EMPTY_STRING flag won't work) */
1078 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
1080 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1082 /* Use just the SSC-related flags from 'and_with' */
1083 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
1084 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1087 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1088 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1089 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1092 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1093 const regnode_ssc *and_with)
1095 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1096 * another SSC or a regular ANYOF class. Can create false positives. */
1101 PERL_ARGS_ASSERT_SSC_AND;
1103 assert(OP(ssc) == ANYOF_SYNTHETIC);
1105 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1106 * the code point inversion list and just the relevant flags */
1107 if (OP(and_with) == ANYOF_SYNTHETIC) {
1108 anded_cp_list = and_with->invlist;
1109 anded_flags = ANYOF_FLAGS(and_with);
1112 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1113 (regnode_charclass_posixl*) and_with);
1114 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
1117 ANYOF_FLAGS(ssc) &= anded_flags;
1119 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1120 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1121 * 'and_with' may be inverted. When not inverted, we have the situation of
1123 * (C1 | P1) & (C2 | P2)
1124 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1125 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1126 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1127 * <= ((C1 & C2) | P1 | P2)
1128 * Alternatively, the last few steps could be:
1129 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1130 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1131 * <= (C1 | C2 | (P1 & P2))
1132 * We favor the second approach if either P1 or P2 is non-empty. This is
1133 * because these components are a barrier to doing optimizations, as what
1134 * they match cannot be known until the moment of matching as they are
1135 * dependent on the current locale, 'AND"ing them likely will reduce or
1137 * But we can do better if we know that C1,P1 are in their initial state (a
1138 * frequent occurrence), each matching everything:
1139 * (<everything>) & (C2 | P2) = C2 | P2
1140 * Similarly, if C2,P2 are in their initial state (again a frequent
1141 * occurrence), the result is a no-op
1142 * (C1 | P1) & (<everything>) = C1 | P1
1145 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1146 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1147 * <= (C1 & ~C2) | (P1 & ~P2)
1150 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1151 && OP(and_with) != ANYOF_SYNTHETIC)
1155 ssc_intersection(ssc,
1157 FALSE /* Has already been inverted */
1160 /* If either P1 or P2 is empty, the intersection will be also; can skip
1162 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1163 ANYOF_POSIXL_ZERO(ssc);
1165 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1167 /* Note that the Posix class component P from 'and_with' actually
1169 * P = Pa | Pb | ... | Pn
1170 * where each component is one posix class, such as in [\w\s].
1172 * ~P = ~(Pa | Pb | ... | Pn)
1173 * = ~Pa & ~Pb & ... & ~Pn
1174 * <= ~Pa | ~Pb | ... | ~Pn
1175 * The last is something we can easily calculate, but unfortunately
1176 * is likely to have many false positives. We could do better
1177 * in some (but certainly not all) instances if two classes in
1178 * P have known relationships. For example
1179 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1181 * :lower: & :print: = :lower:
1182 * And similarly for classes that must be disjoint. For example,
1183 * since \s and \w can have no elements in common based on rules in
1184 * the POSIX standard,
1185 * \w & ^\S = nothing
1186 * Unfortunately, some vendor locales do not meet the Posix
1187 * standard, in particular almost everything by Microsoft.
1188 * The loop below just changes e.g., \w into \W and vice versa */
1190 regnode_charclass_posixl temp;
1191 int add = 1; /* To calculate the index of the complement */
1193 ANYOF_POSIXL_ZERO(&temp);
1194 for (i = 0; i < ANYOF_MAX; i++) {
1196 || ! ANYOF_POSIXL_TEST(and_with, i)
1197 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1199 if (ANYOF_POSIXL_TEST(and_with, i)) {
1200 ANYOF_POSIXL_SET(&temp, i + add);
1202 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1204 ANYOF_POSIXL_AND(&temp, ssc);
1206 } /* else ssc already has no posixes */
1207 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1208 in its initial state */
1209 else if (OP(and_with) != ANYOF_SYNTHETIC
1210 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1212 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1213 * copy it over 'ssc' */
1214 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1215 if (OP(and_with) == ANYOF_SYNTHETIC) {
1216 StructCopy(and_with, ssc, regnode_ssc);
1219 ssc->invlist = anded_cp_list;
1220 ANYOF_POSIXL_ZERO(ssc);
1221 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1222 ANYOF_POSIXL_OR(and_with, ssc);
1226 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1227 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1229 /* One or the other of P1, P2 is non-empty. */
1230 ANYOF_POSIXL_AND(and_with, ssc);
1231 ssc_union(ssc, anded_cp_list, FALSE);
1233 else { /* P1 = P2 = empty */
1234 ssc_intersection(ssc, anded_cp_list, FALSE);
1240 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1241 const regnode_ssc *or_with)
1243 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1244 * another SSC or a regular ANYOF class. Can create false positives if
1245 * 'or_with' is to be inverted. */
1250 PERL_ARGS_ASSERT_SSC_OR;
1252 assert(OP(ssc) == ANYOF_SYNTHETIC);
1254 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1255 * the code point inversion list and just the relevant flags */
1256 if (OP(or_with) == ANYOF_SYNTHETIC) {
1257 ored_cp_list = or_with->invlist;
1258 ored_flags = ANYOF_FLAGS(or_with);
1261 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1262 (regnode_charclass_posixl*) or_with);
1263 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
1266 ANYOF_FLAGS(ssc) |= ored_flags;
1268 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1269 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1270 * 'or_with' may be inverted. When not inverted, we have the simple
1271 * situation of computing:
1272 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1273 * If P1|P2 yields a situation with both a class and its complement are
1274 * set, like having both \w and \W, this matches all code points, and we
1275 * can delete these from the P component of the ssc going forward. XXX We
1276 * might be able to delete all the P components, but I (khw) am not certain
1277 * about this, and it is better to be safe.
1280 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1281 * <= (C1 | P1) | ~C2
1282 * <= (C1 | ~C2) | P1
1283 * (which results in actually simpler code than the non-inverted case)
1286 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1287 && OP(or_with) != ANYOF_SYNTHETIC)
1289 /* We ignore P2, leaving P1 going forward */
1291 else { /* Not inverted */
1292 ANYOF_POSIXL_OR(or_with, ssc);
1293 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1295 for (i = 0; i < ANYOF_MAX; i += 2) {
1296 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1298 ssc_match_all_cp(ssc);
1299 ANYOF_POSIXL_CLEAR(ssc, i);
1300 ANYOF_POSIXL_CLEAR(ssc, i+1);
1301 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1302 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1311 FALSE /* Already has been inverted */
1315 PERL_STATIC_INLINE void
1316 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1318 PERL_ARGS_ASSERT_SSC_UNION;
1320 assert(OP(ssc) == ANYOF_SYNTHETIC);
1322 _invlist_union_maybe_complement_2nd(ssc->invlist,
1328 PERL_STATIC_INLINE void
1329 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1331 const bool invert2nd)
1333 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1335 assert(OP(ssc) == ANYOF_SYNTHETIC);
1337 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1343 PERL_STATIC_INLINE void
1344 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1346 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1348 assert(OP(ssc) == ANYOF_SYNTHETIC);
1350 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1353 PERL_STATIC_INLINE void
1354 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1356 /* AND just the single code point 'cp' into the SSC 'ssc' */
1358 SV* cp_list = _new_invlist(2);
1360 PERL_ARGS_ASSERT_SSC_CP_AND;
1362 assert(OP(ssc) == ANYOF_SYNTHETIC);
1364 cp_list = add_cp_to_invlist(cp_list, cp);
1365 ssc_intersection(ssc, cp_list,
1366 FALSE /* Not inverted */
1368 SvREFCNT_dec_NN(cp_list);
1371 PERL_STATIC_INLINE void
1372 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1374 /* Set the SSC 'ssc' to not match any locale things */
1376 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1378 assert(OP(ssc) == ANYOF_SYNTHETIC);
1380 ANYOF_POSIXL_ZERO(ssc);
1381 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1385 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1387 /* The inversion list in the SSC is marked mortal; now we need a more
1388 * permanent copy, which is stored the same way that is done in a regular
1389 * ANYOF node, with the first 256 code points in a bit map */
1391 SV* invlist = invlist_clone(ssc->invlist);
1393 PERL_ARGS_ASSERT_SSC_FINALIZE;
1395 assert(OP(ssc) == ANYOF_SYNTHETIC);
1397 /* The code in this file assumes that all but these flags aren't relevant
1398 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1399 * time we reach here */
1400 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
1402 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1404 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1406 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1409 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1410 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1411 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1412 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1417 dump_trie(trie,widecharmap,revcharmap)
1418 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1419 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1421 These routines dump out a trie in a somewhat readable format.
1422 The _interim_ variants are used for debugging the interim
1423 tables that are used to generate the final compressed
1424 representation which is what dump_trie expects.
1426 Part of the reason for their existence is to provide a form
1427 of documentation as to how the different representations function.
1432 Dumps the final compressed table form of the trie to Perl_debug_log.
1433 Used for debugging make_trie().
1437 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1438 AV *revcharmap, U32 depth)
1441 SV *sv=sv_newmortal();
1442 int colwidth= widecharmap ? 6 : 4;
1444 GET_RE_DEBUG_FLAGS_DECL;
1446 PERL_ARGS_ASSERT_DUMP_TRIE;
1448 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1449 (int)depth * 2 + 2,"",
1450 "Match","Base","Ofs" );
1452 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1453 SV ** const tmp = av_fetch( revcharmap, state, 0);
1455 PerlIO_printf( Perl_debug_log, "%*s",
1457 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1458 PL_colors[0], PL_colors[1],
1459 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1460 PERL_PV_ESCAPE_FIRSTCHAR
1465 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1466 (int)depth * 2 + 2,"");
1468 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1469 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1470 PerlIO_printf( Perl_debug_log, "\n");
1472 for( state = 1 ; state < trie->statecount ; state++ ) {
1473 const U32 base = trie->states[ state ].trans.base;
1475 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1477 if ( trie->states[ state ].wordnum ) {
1478 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1480 PerlIO_printf( Perl_debug_log, "%6s", "" );
1483 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1488 while( ( base + ofs < trie->uniquecharcount ) ||
1489 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1490 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1493 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1495 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1496 if ( ( base + ofs >= trie->uniquecharcount ) &&
1497 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1498 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1500 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1502 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1504 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1508 PerlIO_printf( Perl_debug_log, "]");
1511 PerlIO_printf( Perl_debug_log, "\n" );
1513 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1514 for (word=1; word <= trie->wordcount; word++) {
1515 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1516 (int)word, (int)(trie->wordinfo[word].prev),
1517 (int)(trie->wordinfo[word].len));
1519 PerlIO_printf(Perl_debug_log, "\n" );
1522 Dumps a fully constructed but uncompressed trie in list form.
1523 List tries normally only are used for construction when the number of
1524 possible chars (trie->uniquecharcount) is very high.
1525 Used for debugging make_trie().
1528 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1529 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1533 SV *sv=sv_newmortal();
1534 int colwidth= widecharmap ? 6 : 4;
1535 GET_RE_DEBUG_FLAGS_DECL;
1537 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1539 /* print out the table precompression. */
1540 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1541 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1542 "------:-----+-----------------\n" );
1544 for( state=1 ; state < next_alloc ; state ++ ) {
1547 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1548 (int)depth * 2 + 2,"", (UV)state );
1549 if ( ! trie->states[ state ].wordnum ) {
1550 PerlIO_printf( Perl_debug_log, "%5s| ","");
1552 PerlIO_printf( Perl_debug_log, "W%4x| ",
1553 trie->states[ state ].wordnum
1556 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1557 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1559 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1561 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1562 PL_colors[0], PL_colors[1],
1563 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1564 PERL_PV_ESCAPE_FIRSTCHAR
1566 TRIE_LIST_ITEM(state,charid).forid,
1567 (UV)TRIE_LIST_ITEM(state,charid).newstate
1570 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1571 (int)((depth * 2) + 14), "");
1574 PerlIO_printf( Perl_debug_log, "\n");
1579 Dumps a fully constructed but uncompressed trie in table form.
1580 This is the normal DFA style state transition table, with a few
1581 twists to facilitate compression later.
1582 Used for debugging make_trie().
1585 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1586 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1591 SV *sv=sv_newmortal();
1592 int colwidth= widecharmap ? 6 : 4;
1593 GET_RE_DEBUG_FLAGS_DECL;
1595 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1598 print out the table precompression so that we can do a visual check
1599 that they are identical.
1602 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1604 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1605 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1607 PerlIO_printf( Perl_debug_log, "%*s",
1609 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1610 PL_colors[0], PL_colors[1],
1611 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1612 PERL_PV_ESCAPE_FIRSTCHAR
1618 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1620 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1621 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1624 PerlIO_printf( Perl_debug_log, "\n" );
1626 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1628 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1629 (int)depth * 2 + 2,"",
1630 (UV)TRIE_NODENUM( state ) );
1632 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1633 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1635 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1637 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1639 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1640 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1642 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1643 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1651 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1652 startbranch: the first branch in the whole branch sequence
1653 first : start branch of sequence of branch-exact nodes.
1654 May be the same as startbranch
1655 last : Thing following the last branch.
1656 May be the same as tail.
1657 tail : item following the branch sequence
1658 count : words in the sequence
1659 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1660 depth : indent depth
1662 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1664 A trie is an N'ary tree where the branches are determined by digital
1665 decomposition of the key. IE, at the root node you look up the 1st character and
1666 follow that branch repeat until you find the end of the branches. Nodes can be
1667 marked as "accepting" meaning they represent a complete word. Eg:
1671 would convert into the following structure. Numbers represent states, letters
1672 following numbers represent valid transitions on the letter from that state, if
1673 the number is in square brackets it represents an accepting state, otherwise it
1674 will be in parenthesis.
1676 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1680 (1) +-i->(6)-+-s->[7]
1682 +-s->(3)-+-h->(4)-+-e->[5]
1684 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1686 This shows that when matching against the string 'hers' we will begin at state 1
1687 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1688 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1689 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1690 single traverse. We store a mapping from accepting to state to which word was
1691 matched, and then when we have multiple possibilities we try to complete the
1692 rest of the regex in the order in which they occured in the alternation.
1694 The only prior NFA like behaviour that would be changed by the TRIE support is
1695 the silent ignoring of duplicate alternations which are of the form:
1697 / (DUPE|DUPE) X? (?{ ... }) Y /x
1699 Thus EVAL blocks following a trie may be called a different number of times with
1700 and without the optimisation. With the optimisations dupes will be silently
1701 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1702 the following demonstrates:
1704 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1706 which prints out 'word' three times, but
1708 'words'=~/(word|word|word)(?{ print $1 })S/
1710 which doesnt print it out at all. This is due to other optimisations kicking in.
1712 Example of what happens on a structural level:
1714 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1716 1: CURLYM[1] {1,32767}(18)
1727 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1728 and should turn into:
1730 1: CURLYM[1] {1,32767}(18)
1732 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1740 Cases where tail != last would be like /(?foo|bar)baz/:
1750 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1751 and would end up looking like:
1754 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1761 d = uvchr_to_utf8_flags(d, uv, 0);
1763 is the recommended Unicode-aware way of saying
1768 #define TRIE_STORE_REVCHAR(val) \
1771 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1772 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1773 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1774 SvCUR_set(zlopp, kapow - flrbbbbb); \
1777 av_push(revcharmap, zlopp); \
1779 char ooooff = (char)val; \
1780 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1784 /* This gets the next character from the input, folding it if not already
1786 #define TRIE_READ_CHAR STMT_START { \
1789 /* if it is UTF then it is either already folded, or does not need \
1791 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1793 else if (folder == PL_fold_latin1) { \
1794 /* This folder implies Unicode rules, which in the range expressible \
1795 * by not UTF is the lower case, with the two exceptions, one of \
1796 * which should have been taken care of before calling this */ \
1797 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1798 uvc = toLOWER_L1(*uc); \
1799 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1802 /* raw data, will be folded later if needed */ \
1810 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1811 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1812 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1813 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1815 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1816 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1817 TRIE_LIST_CUR( state )++; \
1820 #define TRIE_LIST_NEW(state) STMT_START { \
1821 Newxz( trie->states[ state ].trans.list, \
1822 4, reg_trie_trans_le ); \
1823 TRIE_LIST_CUR( state ) = 1; \
1824 TRIE_LIST_LEN( state ) = 4; \
1827 #define TRIE_HANDLE_WORD(state) STMT_START { \
1828 U16 dupe= trie->states[ state ].wordnum; \
1829 regnode * const noper_next = regnext( noper ); \
1832 /* store the word for dumping */ \
1834 if (OP(noper) != NOTHING) \
1835 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1837 tmp = newSVpvn_utf8( "", 0, UTF ); \
1838 av_push( trie_words, tmp ); \
1842 trie->wordinfo[curword].prev = 0; \
1843 trie->wordinfo[curword].len = wordlen; \
1844 trie->wordinfo[curword].accept = state; \
1846 if ( noper_next < tail ) { \
1848 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1849 trie->jump[curword] = (U16)(noper_next - convert); \
1851 jumper = noper_next; \
1853 nextbranch= regnext(cur); \
1857 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1858 /* chain, so that when the bits of chain are later */\
1859 /* linked together, the dups appear in the chain */\
1860 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1861 trie->wordinfo[dupe].prev = curword; \
1863 /* we haven't inserted this word yet. */ \
1864 trie->states[ state ].wordnum = curword; \
1869 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1870 ( ( base + charid >= ucharcount \
1871 && base + charid < ubound \
1872 && state == trie->trans[ base - ucharcount + charid ].check \
1873 && trie->trans[ base - ucharcount + charid ].next ) \
1874 ? trie->trans[ base - ucharcount + charid ].next \
1875 : ( state==1 ? special : 0 ) \
1879 #define MADE_JUMP_TRIE 2
1880 #define MADE_EXACT_TRIE 4
1883 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1886 /* first pass, loop through and scan words */
1887 reg_trie_data *trie;
1888 HV *widecharmap = NULL;
1889 AV *revcharmap = newAV();
1895 regnode *jumper = NULL;
1896 regnode *nextbranch = NULL;
1897 regnode *convert = NULL;
1898 U32 *prev_states; /* temp array mapping each state to previous one */
1899 /* we just use folder as a flag in utf8 */
1900 const U8 * folder = NULL;
1903 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1904 AV *trie_words = NULL;
1905 /* along with revcharmap, this only used during construction but both are
1906 * useful during debugging so we store them in the struct when debugging.
1909 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1910 STRLEN trie_charcount=0;
1912 SV *re_trie_maxbuff;
1913 GET_RE_DEBUG_FLAGS_DECL;
1915 PERL_ARGS_ASSERT_MAKE_TRIE;
1917 PERL_UNUSED_ARG(depth);
1924 case EXACTFU: folder = PL_fold_latin1; break;
1925 case EXACTF: folder = PL_fold; break;
1926 case EXACTFL: folder = PL_fold_locale; break;
1927 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1930 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1932 trie->startstate = 1;
1933 trie->wordcount = word_count;
1934 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1935 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1937 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1938 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1939 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1942 trie_words = newAV();
1945 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1946 if (!SvIOK(re_trie_maxbuff)) {
1947 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1949 DEBUG_TRIE_COMPILE_r({
1950 PerlIO_printf( Perl_debug_log,
1951 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1952 (int)depth * 2 + 2, "",
1953 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1954 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1958 /* Find the node we are going to overwrite */
1959 if ( first == startbranch && OP( last ) != BRANCH ) {
1960 /* whole branch chain */
1963 /* branch sub-chain */
1964 convert = NEXTOPER( first );
1967 /* -- First loop and Setup --
1969 We first traverse the branches and scan each word to determine if it
1970 contains widechars, and how many unique chars there are, this is
1971 important as we have to build a table with at least as many columns as we
1974 We use an array of integers to represent the character codes 0..255
1975 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1976 native representation of the character value as the key and IV's for the
1979 *TODO* If we keep track of how many times each character is used we can
1980 remap the columns so that the table compression later on is more
1981 efficient in terms of memory by ensuring the most common value is in the
1982 middle and the least common are on the outside. IMO this would be better
1983 than a most to least common mapping as theres a decent chance the most
1984 common letter will share a node with the least common, meaning the node
1985 will not be compressible. With a middle is most common approach the worst
1986 case is when we have the least common nodes twice.
1990 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1991 regnode *noper = NEXTOPER( cur );
1992 const U8 *uc = (U8*)STRING( noper );
1993 const U8 *e = uc + STR_LEN( noper );
1995 U32 wordlen = 0; /* required init */
1996 STRLEN minbytes = 0;
1997 STRLEN maxbytes = 0;
1998 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
2000 if (OP(noper) == NOTHING) {
2001 regnode *noper_next= regnext(noper);
2002 if (noper_next != tail && OP(noper_next) == flags) {
2004 uc= (U8*)STRING(noper);
2005 e= uc + STR_LEN(noper);
2006 trie->minlen= STR_LEN(noper);
2013 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2014 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2015 regardless of encoding */
2016 if (OP( noper ) == EXACTFU_SS) {
2017 /* false positives are ok, so just set this */
2018 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2021 for ( ; uc < e ; uc += len ) {
2022 TRIE_CHARCOUNT(trie)++;
2025 /* Acummulate to the current values, the range in the number of
2026 * bytes that this character could match. The max is presumed to
2027 * be the same as the folded input (which TRIE_READ_CHAR returns),
2028 * except that when this is not in UTF-8, it could be matched
2029 * against a string which is UTF-8, and the variant characters
2030 * could be 2 bytes instead of the 1 here. Likewise, for the
2031 * minimum number of bytes when not folded. When folding, the min
2032 * is assumed to be 1 byte could fold to match the single character
2033 * here, or in the case of a multi-char fold, 1 byte can fold to
2034 * the whole sequence. 'foldlen' is used to denote whether we are
2035 * in such a sequence, skipping the min setting if so. XXX TODO
2036 * Use the exact list of what folds to each character, from
2037 * PL_utf8_foldclosures */
2039 maxbytes += UTF8SKIP(uc);
2041 /* A non-UTF-8 string could be 1 byte to match our 2 */
2042 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2048 foldlen -= UTF8SKIP(uc);
2051 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2057 maxbytes += (UNI_IS_INVARIANT(*uc))
2068 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2075 U8 folded= folder[ (U8) uvc ];
2076 if ( !trie->charmap[ folded ] ) {
2077 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2078 TRIE_STORE_REVCHAR( folded );
2081 if ( !trie->charmap[ uvc ] ) {
2082 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2083 TRIE_STORE_REVCHAR( uvc );
2086 /* store the codepoint in the bitmap, and its folded
2088 TRIE_BITMAP_SET(trie, uvc);
2090 /* store the folded codepoint */
2091 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2094 /* store first byte of utf8 representation of
2095 variant codepoints */
2096 if (! UVCHR_IS_INVARIANT(uvc)) {
2097 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2100 set_bit = 0; /* We've done our bit :-) */
2105 widecharmap = newHV();
2107 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2110 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2112 if ( !SvTRUE( *svpp ) ) {
2113 sv_setiv( *svpp, ++trie->uniquecharcount );
2114 TRIE_STORE_REVCHAR(uvc);
2118 if( cur == first ) {
2119 trie->minlen = minbytes;
2120 trie->maxlen = maxbytes;
2121 } else if (minbytes < trie->minlen) {
2122 trie->minlen = minbytes;
2123 } else if (maxbytes > trie->maxlen) {
2124 trie->maxlen = maxbytes;
2126 } /* end first pass */
2127 DEBUG_TRIE_COMPILE_r(
2128 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2129 (int)depth * 2 + 2,"",
2130 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2131 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2132 (int)trie->minlen, (int)trie->maxlen )
2136 We now know what we are dealing with in terms of unique chars and
2137 string sizes so we can calculate how much memory a naive
2138 representation using a flat table will take. If it's over a reasonable
2139 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2140 conservative but potentially much slower representation using an array
2143 At the end we convert both representations into the same compressed
2144 form that will be used in regexec.c for matching with. The latter
2145 is a form that cannot be used to construct with but has memory
2146 properties similar to the list form and access properties similar
2147 to the table form making it both suitable for fast searches and
2148 small enough that its feasable to store for the duration of a program.
2150 See the comment in the code where the compressed table is produced
2151 inplace from the flat tabe representation for an explanation of how
2152 the compression works.
2157 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2160 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2162 Second Pass -- Array Of Lists Representation
2164 Each state will be represented by a list of charid:state records
2165 (reg_trie_trans_le) the first such element holds the CUR and LEN
2166 points of the allocated array. (See defines above).
2168 We build the initial structure using the lists, and then convert
2169 it into the compressed table form which allows faster lookups
2170 (but cant be modified once converted).
2173 STRLEN transcount = 1;
2175 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2176 "%*sCompiling trie using list compiler\n",
2177 (int)depth * 2 + 2, ""));
2179 trie->states = (reg_trie_state *)
2180 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2181 sizeof(reg_trie_state) );
2185 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2187 regnode *noper = NEXTOPER( cur );
2188 U8 *uc = (U8*)STRING( noper );
2189 const U8 *e = uc + STR_LEN( noper );
2190 U32 state = 1; /* required init */
2191 U16 charid = 0; /* sanity init */
2192 U32 wordlen = 0; /* required init */
2194 if (OP(noper) == NOTHING) {
2195 regnode *noper_next= regnext(noper);
2196 if (noper_next != tail && OP(noper_next) == flags) {
2198 uc= (U8*)STRING(noper);
2199 e= uc + STR_LEN(noper);
2203 if (OP(noper) != NOTHING) {
2204 for ( ; uc < e ; uc += len ) {
2209 charid = trie->charmap[ uvc ];
2211 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2215 charid=(U16)SvIV( *svpp );
2218 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2225 if ( !trie->states[ state ].trans.list ) {
2226 TRIE_LIST_NEW( state );
2228 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2229 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2230 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2235 newstate = next_alloc++;
2236 prev_states[newstate] = state;
2237 TRIE_LIST_PUSH( state, charid, newstate );
2242 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2246 TRIE_HANDLE_WORD(state);
2248 } /* end second pass */
2250 /* next alloc is the NEXT state to be allocated */
2251 trie->statecount = next_alloc;
2252 trie->states = (reg_trie_state *)
2253 PerlMemShared_realloc( trie->states,
2255 * sizeof(reg_trie_state) );
2257 /* and now dump it out before we compress it */
2258 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2259 revcharmap, next_alloc,
2263 trie->trans = (reg_trie_trans *)
2264 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2271 for( state=1 ; state < next_alloc ; state ++ ) {
2275 DEBUG_TRIE_COMPILE_MORE_r(
2276 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2280 if (trie->states[state].trans.list) {
2281 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2285 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2286 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2287 if ( forid < minid ) {
2289 } else if ( forid > maxid ) {
2293 if ( transcount < tp + maxid - minid + 1) {
2295 trie->trans = (reg_trie_trans *)
2296 PerlMemShared_realloc( trie->trans,
2298 * sizeof(reg_trie_trans) );
2299 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2301 base = trie->uniquecharcount + tp - minid;
2302 if ( maxid == minid ) {
2304 for ( ; zp < tp ; zp++ ) {
2305 if ( ! trie->trans[ zp ].next ) {
2306 base = trie->uniquecharcount + zp - minid;
2307 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2308 trie->trans[ zp ].check = state;
2314 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2315 trie->trans[ tp ].check = state;
2320 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2321 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2322 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2323 trie->trans[ tid ].check = state;
2325 tp += ( maxid - minid + 1 );
2327 Safefree(trie->states[ state ].trans.list);
2330 DEBUG_TRIE_COMPILE_MORE_r(
2331 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2334 trie->states[ state ].trans.base=base;
2336 trie->lasttrans = tp + 1;
2340 Second Pass -- Flat Table Representation.
2342 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2343 each. We know that we will need Charcount+1 trans at most to store
2344 the data (one row per char at worst case) So we preallocate both
2345 structures assuming worst case.
2347 We then construct the trie using only the .next slots of the entry
2350 We use the .check field of the first entry of the node temporarily
2351 to make compression both faster and easier by keeping track of how
2352 many non zero fields are in the node.
2354 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2357 There are two terms at use here: state as a TRIE_NODEIDX() which is
2358 a number representing the first entry of the node, and state as a
2359 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2360 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2361 if there are 2 entrys per node. eg:
2369 The table is internally in the right hand, idx form. However as we
2370 also have to deal with the states array which is indexed by nodenum
2371 we have to use TRIE_NODENUM() to convert.
2374 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2375 "%*sCompiling trie using table compiler\n",
2376 (int)depth * 2 + 2, ""));
2378 trie->trans = (reg_trie_trans *)
2379 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2380 * trie->uniquecharcount + 1,
2381 sizeof(reg_trie_trans) );
2382 trie->states = (reg_trie_state *)
2383 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2384 sizeof(reg_trie_state) );
2385 next_alloc = trie->uniquecharcount + 1;
2388 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2390 regnode *noper = NEXTOPER( cur );
2391 const U8 *uc = (U8*)STRING( noper );
2392 const U8 *e = uc + STR_LEN( noper );
2394 U32 state = 1; /* required init */
2396 U16 charid = 0; /* sanity init */
2397 U32 accept_state = 0; /* sanity init */
2399 U32 wordlen = 0; /* required init */
2401 if (OP(noper) == NOTHING) {
2402 regnode *noper_next= regnext(noper);
2403 if (noper_next != tail && OP(noper_next) == flags) {
2405 uc= (U8*)STRING(noper);
2406 e= uc + STR_LEN(noper);
2410 if ( OP(noper) != NOTHING ) {
2411 for ( ; uc < e ; uc += len ) {
2416 charid = trie->charmap[ uvc ];
2418 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2419 charid = svpp ? (U16)SvIV(*svpp) : 0;
2423 if ( !trie->trans[ state + charid ].next ) {
2424 trie->trans[ state + charid ].next = next_alloc;
2425 trie->trans[ state ].check++;
2426 prev_states[TRIE_NODENUM(next_alloc)]
2427 = TRIE_NODENUM(state);
2428 next_alloc += trie->uniquecharcount;
2430 state = trie->trans[ state + charid ].next;
2432 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2434 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2437 accept_state = TRIE_NODENUM( state );
2438 TRIE_HANDLE_WORD(accept_state);
2440 } /* end second pass */
2442 /* and now dump it out before we compress it */
2443 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2445 next_alloc, depth+1));
2449 * Inplace compress the table.*
2451 For sparse data sets the table constructed by the trie algorithm will
2452 be mostly 0/FAIL transitions or to put it another way mostly empty.
2453 (Note that leaf nodes will not contain any transitions.)
2455 This algorithm compresses the tables by eliminating most such
2456 transitions, at the cost of a modest bit of extra work during lookup:
2458 - Each states[] entry contains a .base field which indicates the
2459 index in the state[] array wheres its transition data is stored.
2461 - If .base is 0 there are no valid transitions from that node.
2463 - If .base is nonzero then charid is added to it to find an entry in
2466 -If trans[states[state].base+charid].check!=state then the
2467 transition is taken to be a 0/Fail transition. Thus if there are fail
2468 transitions at the front of the node then the .base offset will point
2469 somewhere inside the previous nodes data (or maybe even into a node
2470 even earlier), but the .check field determines if the transition is
2474 The following process inplace converts the table to the compressed
2475 table: We first do not compress the root node 1,and mark all its
2476 .check pointers as 1 and set its .base pointer as 1 as well. This
2477 allows us to do a DFA construction from the compressed table later,
2478 and ensures that any .base pointers we calculate later are greater
2481 - We set 'pos' to indicate the first entry of the second node.
2483 - We then iterate over the columns of the node, finding the first and
2484 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2485 and set the .check pointers accordingly, and advance pos
2486 appropriately and repreat for the next node. Note that when we copy
2487 the next pointers we have to convert them from the original
2488 NODEIDX form to NODENUM form as the former is not valid post
2491 - If a node has no transitions used we mark its base as 0 and do not
2492 advance the pos pointer.
2494 - If a node only has one transition we use a second pointer into the
2495 structure to fill in allocated fail transitions from other states.
2496 This pointer is independent of the main pointer and scans forward
2497 looking for null transitions that are allocated to a state. When it
2498 finds one it writes the single transition into the "hole". If the
2499 pointer doesnt find one the single transition is appended as normal.
2501 - Once compressed we can Renew/realloc the structures to release the
2504 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2505 specifically Fig 3.47 and the associated pseudocode.
2509 const U32 laststate = TRIE_NODENUM( next_alloc );
2512 trie->statecount = laststate;
2514 for ( state = 1 ; state < laststate ; state++ ) {
2516 const U32 stateidx = TRIE_NODEIDX( state );
2517 const U32 o_used = trie->trans[ stateidx ].check;
2518 U32 used = trie->trans[ stateidx ].check;
2519 trie->trans[ stateidx ].check = 0;
2521 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2522 if ( flag || trie->trans[ stateidx + charid ].next ) {
2523 if ( trie->trans[ stateidx + charid ].next ) {
2525 for ( ; zp < pos ; zp++ ) {
2526 if ( ! trie->trans[ zp ].next ) {
2530 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2531 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2532 trie->trans[ zp ].check = state;
2533 if ( ++zp > pos ) pos = zp;
2540 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2542 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2543 trie->trans[ pos ].check = state;
2548 trie->lasttrans = pos + 1;
2549 trie->states = (reg_trie_state *)
2550 PerlMemShared_realloc( trie->states, laststate
2551 * sizeof(reg_trie_state) );
2552 DEBUG_TRIE_COMPILE_MORE_r(
2553 PerlIO_printf( Perl_debug_log,
2554 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2555 (int)depth * 2 + 2,"",
2556 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2559 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2562 } /* end table compress */
2564 DEBUG_TRIE_COMPILE_MORE_r(
2565 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2566 (int)depth * 2 + 2, "",
2567 (UV)trie->statecount,
2568 (UV)trie->lasttrans)
2570 /* resize the trans array to remove unused space */
2571 trie->trans = (reg_trie_trans *)
2572 PerlMemShared_realloc( trie->trans, trie->lasttrans
2573 * sizeof(reg_trie_trans) );
2575 { /* Modify the program and insert the new TRIE node */
2576 U8 nodetype =(U8)(flags & 0xFF);
2580 regnode *optimize = NULL;
2581 #ifdef RE_TRACK_PATTERN_OFFSETS
2584 U32 mjd_nodelen = 0;
2585 #endif /* RE_TRACK_PATTERN_OFFSETS */
2586 #endif /* DEBUGGING */
2588 This means we convert either the first branch or the first Exact,
2589 depending on whether the thing following (in 'last') is a branch
2590 or not and whther first is the startbranch (ie is it a sub part of
2591 the alternation or is it the whole thing.)
2592 Assuming its a sub part we convert the EXACT otherwise we convert
2593 the whole branch sequence, including the first.
2595 /* Find the node we are going to overwrite */
2596 if ( first != startbranch || OP( last ) == BRANCH ) {
2597 /* branch sub-chain */
2598 NEXT_OFF( first ) = (U16)(last - first);
2599 #ifdef RE_TRACK_PATTERN_OFFSETS
2601 mjd_offset= Node_Offset((convert));
2602 mjd_nodelen= Node_Length((convert));
2605 /* whole branch chain */
2607 #ifdef RE_TRACK_PATTERN_OFFSETS
2610 const regnode *nop = NEXTOPER( convert );
2611 mjd_offset= Node_Offset((nop));
2612 mjd_nodelen= Node_Length((nop));
2616 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2617 (int)depth * 2 + 2, "",
2618 (UV)mjd_offset, (UV)mjd_nodelen)
2621 /* But first we check to see if there is a common prefix we can
2622 split out as an EXACT and put in front of the TRIE node. */
2623 trie->startstate= 1;
2624 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2626 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2630 const U32 base = trie->states[ state ].trans.base;
2632 if ( trie->states[state].wordnum )
2635 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2636 if ( ( base + ofs >= trie->uniquecharcount ) &&
2637 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2638 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2640 if ( ++count > 1 ) {
2641 SV **tmp = av_fetch( revcharmap, ofs, 0);
2642 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2643 if ( state == 1 ) break;
2645 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2647 PerlIO_printf(Perl_debug_log,
2648 "%*sNew Start State=%"UVuf" Class: [",
2649 (int)depth * 2 + 2, "",
2652 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2653 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2655 TRIE_BITMAP_SET(trie,*ch);
2657 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2659 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2663 TRIE_BITMAP_SET(trie,*ch);
2665 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2666 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2672 SV **tmp = av_fetch( revcharmap, idx, 0);
2674 char *ch = SvPV( *tmp, len );
2676 SV *sv=sv_newmortal();
2677 PerlIO_printf( Perl_debug_log,
2678 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2679 (int)depth * 2 + 2, "",
2681 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2682 PL_colors[0], PL_colors[1],
2683 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2684 PERL_PV_ESCAPE_FIRSTCHAR
2689 OP( convert ) = nodetype;
2690 str=STRING(convert);
2693 STR_LEN(convert) += len;
2699 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2704 trie->prefixlen = (state-1);
2706 regnode *n = convert+NODE_SZ_STR(convert);
2707 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2708 trie->startstate = state;
2709 trie->minlen -= (state - 1);
2710 trie->maxlen -= (state - 1);
2712 /* At least the UNICOS C compiler choked on this
2713 * being argument to DEBUG_r(), so let's just have
2716 #ifdef PERL_EXT_RE_BUILD
2722 regnode *fix = convert;
2723 U32 word = trie->wordcount;
2725 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2726 while( ++fix < n ) {
2727 Set_Node_Offset_Length(fix, 0, 0);
2730 SV ** const tmp = av_fetch( trie_words, word, 0 );
2732 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2733 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2735 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2743 NEXT_OFF(convert) = (U16)(tail - convert);
2744 DEBUG_r(optimize= n);
2750 if ( trie->maxlen ) {
2751 NEXT_OFF( convert ) = (U16)(tail - convert);
2752 ARG_SET( convert, data_slot );
2753 /* Store the offset to the first unabsorbed branch in
2754 jump[0], which is otherwise unused by the jump logic.
2755 We use this when dumping a trie and during optimisation. */
2757 trie->jump[0] = (U16)(nextbranch - convert);
2759 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2760 * and there is a bitmap
2761 * and the first "jump target" node we found leaves enough room
2762 * then convert the TRIE node into a TRIEC node, with the bitmap
2763 * embedded inline in the opcode - this is hypothetically faster.
2765 if ( !trie->states[trie->startstate].wordnum
2767 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2769 OP( convert ) = TRIEC;
2770 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2771 PerlMemShared_free(trie->bitmap);
2774 OP( convert ) = TRIE;
2776 /* store the type in the flags */
2777 convert->flags = nodetype;
2781 + regarglen[ OP( convert ) ];
2783 /* XXX We really should free up the resource in trie now,
2784 as we won't use them - (which resources?) dmq */
2786 /* needed for dumping*/
2787 DEBUG_r(if (optimize) {
2788 regnode *opt = convert;
2790 while ( ++opt < optimize) {
2791 Set_Node_Offset_Length(opt,0,0);
2794 Try to clean up some of the debris left after the
2797 while( optimize < jumper ) {
2798 mjd_nodelen += Node_Length((optimize));
2799 OP( optimize ) = OPTIMIZED;
2800 Set_Node_Offset_Length(optimize,0,0);
2803 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2805 } /* end node insert */
2807 /* Finish populating the prev field of the wordinfo array. Walk back
2808 * from each accept state until we find another accept state, and if
2809 * so, point the first word's .prev field at the second word. If the
2810 * second already has a .prev field set, stop now. This will be the
2811 * case either if we've already processed that word's accept state,
2812 * or that state had multiple words, and the overspill words were
2813 * already linked up earlier.
2820 for (word=1; word <= trie->wordcount; word++) {
2822 if (trie->wordinfo[word].prev)
2824 state = trie->wordinfo[word].accept;
2826 state = prev_states[state];
2829 prev = trie->states[state].wordnum;
2833 trie->wordinfo[word].prev = prev;
2835 Safefree(prev_states);
2839 /* and now dump out the compressed format */
2840 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2842 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2844 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2845 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2847 SvREFCNT_dec_NN(revcharmap);
2851 : trie->startstate>1
2857 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2859 /* The Trie is constructed and compressed now so we can build a fail array if
2862 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2864 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2868 We find the fail state for each state in the trie, this state is the longest
2869 proper suffix of the current state's 'word' that is also a proper prefix of
2870 another word in our trie. State 1 represents the word '' and is thus the
2871 default fail state. This allows the DFA not to have to restart after its
2872 tried and failed a word at a given point, it simply continues as though it
2873 had been matching the other word in the first place.
2875 'abcdgu'=~/abcdefg|cdgu/
2876 When we get to 'd' we are still matching the first word, we would encounter
2877 'g' which would fail, which would bring us to the state representing 'd' in
2878 the second word where we would try 'g' and succeed, proceeding to match
2881 /* add a fail transition */
2882 const U32 trie_offset = ARG(source);
2883 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2885 const U32 ucharcount = trie->uniquecharcount;
2886 const U32 numstates = trie->statecount;
2887 const U32 ubound = trie->lasttrans + ucharcount;
2891 U32 base = trie->states[ 1 ].trans.base;
2894 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2895 GET_RE_DEBUG_FLAGS_DECL;
2897 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2899 PERL_UNUSED_ARG(depth);
2903 ARG_SET( stclass, data_slot );
2904 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2905 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2906 aho->trie=trie_offset;
2907 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2908 Copy( trie->states, aho->states, numstates, reg_trie_state );
2909 Newxz( q, numstates, U32);
2910 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2913 /* initialize fail[0..1] to be 1 so that we always have
2914 a valid final fail state */
2915 fail[ 0 ] = fail[ 1 ] = 1;
2917 for ( charid = 0; charid < ucharcount ; charid++ ) {
2918 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2920 q[ q_write ] = newstate;
2921 /* set to point at the root */
2922 fail[ q[ q_write++ ] ]=1;
2925 while ( q_read < q_write) {
2926 const U32 cur = q[ q_read++ % numstates ];
2927 base = trie->states[ cur ].trans.base;
2929 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2930 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2932 U32 fail_state = cur;
2935 fail_state = fail[ fail_state ];
2936 fail_base = aho->states[ fail_state ].trans.base;
2937 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2939 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2940 fail[ ch_state ] = fail_state;
2941 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2943 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2945 q[ q_write++ % numstates] = ch_state;
2949 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2950 when we fail in state 1, this allows us to use the
2951 charclass scan to find a valid start char. This is based on the principle
2952 that theres a good chance the string being searched contains lots of stuff
2953 that cant be a start char.
2955 fail[ 0 ] = fail[ 1 ] = 0;
2956 DEBUG_TRIE_COMPILE_r({
2957 PerlIO_printf(Perl_debug_log,
2958 "%*sStclass Failtable (%"UVuf" states): 0",
2959 (int)(depth * 2), "", (UV)numstates
2961 for( q_read=1; q_read<numstates; q_read++ ) {
2962 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2964 PerlIO_printf(Perl_debug_log, "\n");
2967 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2971 #define DEBUG_PEEP(str,scan,depth) \
2972 DEBUG_OPTIMISE_r({if (scan){ \
2973 SV * const mysv=sv_newmortal(); \
2974 regnode *Next = regnext(scan); \
2975 regprop(RExC_rx, mysv, scan); \
2976 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2977 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2978 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2982 /* The below joins as many adjacent EXACTish nodes as possible into a single
2983 * one. The regop may be changed if the node(s) contain certain sequences that
2984 * require special handling. The joining is only done if:
2985 * 1) there is room in the current conglomerated node to entirely contain the
2987 * 2) they are the exact same node type
2989 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2990 * these get optimized out
2992 * If a node is to match under /i (folded), the number of characters it matches
2993 * can be different than its character length if it contains a multi-character
2994 * fold. *min_subtract is set to the total delta of the input nodes.
2996 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2997 * and contains LATIN SMALL LETTER SHARP S
2999 * This is as good a place as any to discuss the design of handling these
3000 * multi-character fold sequences. It's been wrong in Perl for a very long
3001 * time. There are three code points in Unicode whose multi-character folds
3002 * were long ago discovered to mess things up. The previous designs for
3003 * dealing with these involved assigning a special node for them. This
3004 * approach doesn't work, as evidenced by this example:
3005 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3006 * Both these fold to "sss", but if the pattern is parsed to create a node that
3007 * would match just the \xDF, it won't be able to handle the case where a
3008 * successful match would have to cross the node's boundary. The new approach
3009 * that hopefully generally solves the problem generates an EXACTFU_SS node
3012 * It turns out that there are problems with all multi-character folds, and not
3013 * just these three. Now the code is general, for all such cases. The
3014 * approach taken is:
3015 * 1) This routine examines each EXACTFish node that could contain multi-
3016 * character fold sequences. It returns in *min_subtract how much to
3017 * subtract from the the actual length of the string to get a real minimum
3018 * match length; it is 0 if there are no multi-char folds. This delta is
3019 * used by the caller to adjust the min length of the match, and the delta
3020 * between min and max, so that the optimizer doesn't reject these
3021 * possibilities based on size constraints.
3022 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3023 * is used for an EXACTFU node that contains at least one "ss" sequence in
3024 * it. For non-UTF-8 patterns and strings, this is the only case where
3025 * there is a possible fold length change. That means that a regular
3026 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3027 * with length changes, and so can be processed faster. regexec.c takes
3028 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3029 * pre-folded by regcomp.c. This saves effort in regex matching.
3030 * However, the pre-folding isn't done for non-UTF8 patterns because the
3031 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3032 * down by forcing the pattern into UTF8 unless necessary. Also what
3033 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
3034 * possibilities for the non-UTF8 patterns are quite simple, except for
3035 * the sharp s. All the ones that don't involve a UTF-8 target string are
3036 * members of a fold-pair, and arrays are set up for all of them so that
3037 * the other member of the pair can be found quickly. Code elsewhere in
3038 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3039 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3040 * described in the next item.
3041 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
3042 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3043 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
3044 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
3045 * character in the pattern corresponds to at most a single character in
3046 * the target string. (And I do mean character, and not byte here, unlike
3047 * other parts of the documentation that have never been updated to
3048 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
3049 * two character string 'ss'; in EXACTFA nodes it can match
3050 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
3051 * instances where it is violated. I'm reluctant to try to change the
3052 * assumption, as the code involved is impenetrable to me (khw), so
3053 * instead the code here punts. This routine examines (when the pattern
3054 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3055 * boolean indicating whether or not the node contains a sharp s. When it
3056 * is true, the caller sets a flag that later causes the optimizer in this
3057 * file to not set values for the floating and fixed string lengths, and
3058 * thus avoids the optimizer code in regexec.c that makes the invalid
3059 * assumption. Thus, there is no optimization based on string lengths for
3060 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3061 * (The reason the assumption is wrong only in these two cases is that all
3062 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3063 * other folds to their expanded versions. We can't prefold sharp s to
3064 * 'ss' in EXACTF nodes because we don't know at compile time if it
3065 * actually matches 'ss' or not. It will match iff the target string is
3066 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3067 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3068 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3069 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3070 * require the pattern to be forced into UTF-8, the overhead of which we
3073 * Similarly, the code that generates tries doesn't currently handle
3074 * not-already-folded multi-char folds, and it looks like a pain to change
3075 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3076 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3077 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3078 * using /iaa matching will be doing so almost entirely with ASCII
3079 * strings, so this should rarely be encountered in practice */
3081 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3082 if (PL_regkind[OP(scan)] == EXACT) \
3083 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3086 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3087 /* Merge several consecutive EXACTish nodes into one. */
3088 regnode *n = regnext(scan);
3090 regnode *next = scan + NODE_SZ_STR(scan);
3094 regnode *stop = scan;
3095 GET_RE_DEBUG_FLAGS_DECL;
3097 PERL_UNUSED_ARG(depth);
3100 PERL_ARGS_ASSERT_JOIN_EXACT;
3101 #ifndef EXPERIMENTAL_INPLACESCAN
3102 PERL_UNUSED_ARG(flags);
3103 PERL_UNUSED_ARG(val);
3105 DEBUG_PEEP("join",scan,depth);
3107 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3108 * EXACT ones that are mergeable to the current one. */
3110 && (PL_regkind[OP(n)] == NOTHING
3111 || (stringok && OP(n) == OP(scan)))
3113 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3116 if (OP(n) == TAIL || n > next)
3118 if (PL_regkind[OP(n)] == NOTHING) {
3119 DEBUG_PEEP("skip:",n,depth);
3120 NEXT_OFF(scan) += NEXT_OFF(n);
3121 next = n + NODE_STEP_REGNODE;
3128 else if (stringok) {
3129 const unsigned int oldl = STR_LEN(scan);
3130 regnode * const nnext = regnext(n);
3132 /* XXX I (khw) kind of doubt that this works on platforms where
3133 * U8_MAX is above 255 because of lots of other assumptions */
3134 /* Don't join if the sum can't fit into a single node */
3135 if (oldl + STR_LEN(n) > U8_MAX)
3138 DEBUG_PEEP("merg",n,depth);
3141 NEXT_OFF(scan) += NEXT_OFF(n);
3142 STR_LEN(scan) += STR_LEN(n);
3143 next = n + NODE_SZ_STR(n);
3144 /* Now we can overwrite *n : */
3145 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3153 #ifdef EXPERIMENTAL_INPLACESCAN
3154 if (flags && !NEXT_OFF(n)) {
3155 DEBUG_PEEP("atch", val, depth);
3156 if (reg_off_by_arg[OP(n)]) {
3157 ARG_SET(n, val - n);
3160 NEXT_OFF(n) = val - n;
3168 *has_exactf_sharp_s = FALSE;
3170 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3171 * can now analyze for sequences of problematic code points. (Prior to
3172 * this final joining, sequences could have been split over boundaries, and
3173 * hence missed). The sequences only happen in folding, hence for any
3174 * non-EXACT EXACTish node */
3175 if (OP(scan) != EXACT) {
3176 const U8 * const s0 = (U8*) STRING(scan);
3178 const U8 * const s_end = s0 + STR_LEN(scan);
3180 /* One pass is made over the node's string looking for all the
3181 * possibilities. to avoid some tests in the loop, there are two main
3182 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3186 /* Examine the string for a multi-character fold sequence. UTF-8
3187 * patterns have all characters pre-folded by the time this code is
3189 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3190 length sequence we are looking for is 2 */
3193 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3194 if (! len) { /* Not a multi-char fold: get next char */
3199 /* Nodes with 'ss' require special handling, except for EXACTFL
3200 * and EXACTFA-ish for which there is no multi-char fold to
3202 if (len == 2 && *s == 's' && *(s+1) == 's'
3203 && OP(scan) != EXACTFL
3204 && OP(scan) != EXACTFA
3205 && OP(scan) != EXACTFA_NO_TRIE)
3208 OP(scan) = EXACTFU_SS;
3211 else { /* Here is a generic multi-char fold. */
3212 const U8* multi_end = s + len;
3214 /* Count how many characters in it. In the case of /l and
3215 * /aa, no folds which contain ASCII code points are
3216 * allowed, so check for those, and skip if found. (In
3217 * EXACTFL, no folds are allowed to any Latin1 code point,
3218 * not just ASCII. But there aren't any of these
3219 * currently, nor ever likely, so don't take the time to
3220 * test for them. The code that generates the
3221 * is_MULTI_foo() macros croaks should one actually get put
3222 * into Unicode .) */
3223 if (OP(scan) != EXACTFL
3224 && OP(scan) != EXACTFA
3225 && OP(scan) != EXACTFA_NO_TRIE)
3227 count = utf8_length(s, multi_end);
3231 while (s < multi_end) {
3234 goto next_iteration;
3244 /* The delta is how long the sequence is minus 1 (1 is how long
3245 * the character that folds to the sequence is) */
3246 *min_subtract += count - 1;
3250 else if (OP(scan) == EXACTFA) {
3252 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3253 * fold to the ASCII range (and there are no existing ones in the
3254 * upper latin1 range). But, as outlined in the comments preceding
3255 * this function, we need to flag any occurrences of the sharp s.
3256 * This character forbids trie formation (because of added
3259 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3260 OP(scan) = EXACTFA_NO_TRIE;
3261 *has_exactf_sharp_s = TRUE;
3268 else if (OP(scan) != EXACTFL) {
3270 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3271 * multi-char folds that are all Latin1. (This code knows that
3272 * there are no current multi-char folds possible with EXACTFL,
3273 * relying on fold_grind.t to catch any errors if the very unlikely
3274 * event happens that some get added in future Unicode versions.)
3275 * As explained in the comments preceding this function, we look
3276 * also for the sharp s in EXACTF nodes; it can be in the final
3277 * position. Otherwise we can stop looking 1 byte earlier because
3278 * have to find at least two characters for a multi-fold */
3279 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3282 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3283 if (! len) { /* Not a multi-char fold. */
3284 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3286 *has_exactf_sharp_s = TRUE;
3293 && isARG2_lower_or_UPPER_ARG1('s', *s)
3294 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3297 /* EXACTF nodes need to know that the minimum length
3298 * changed so that a sharp s in the string can match this
3299 * ss in the pattern, but they remain EXACTF nodes, as they
3300 * won't match this unless the target string is is UTF-8,
3301 * which we don't know until runtime */
3302 if (OP(scan) != EXACTF) {
3303 OP(scan) = EXACTFU_SS;
3307 *min_subtract += len - 1;
3314 /* Allow dumping but overwriting the collection of skipped
3315 * ops and/or strings with fake optimized ops */
3316 n = scan + NODE_SZ_STR(scan);
3324 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3328 /* REx optimizer. Converts nodes into quicker variants "in place".
3329 Finds fixed substrings. */
3331 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3332 to the position after last scanned or to NULL. */
3334 #define INIT_AND_WITHP \
3335 assert(!and_withp); \
3336 Newx(and_withp,1, regnode_ssc); \
3337 SAVEFREEPV(and_withp)
3339 /* this is a chain of data about sub patterns we are processing that
3340 need to be handled separately/specially in study_chunk. Its so
3341 we can simulate recursion without losing state. */
3343 typedef struct scan_frame {
3344 regnode *last; /* last node to process in this frame */
3345 regnode *next; /* next node to process when last is reached */
3346 struct scan_frame *prev; /*previous frame*/
3347 U32 prev_recursed_depth;
3348 I32 stop; /* what stopparen do we use */
3352 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3355 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3356 SSize_t *minlenp, SSize_t *deltap,
3361 regnode_ssc *and_withp,
3362 U32 flags, U32 depth)
3363 /* scanp: Start here (read-write). */
3364 /* deltap: Write maxlen-minlen here. */
3365 /* last: Stop before this one. */
3366 /* data: string data about the pattern */
3367 /* stopparen: treat close N as END */
3368 /* recursed: which subroutines have we recursed into */
3369 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3372 /* There must be at least this number of characters to match */
3375 regnode *scan = *scanp, *next;
3377 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3378 int is_inf_internal = 0; /* The studied chunk is infinite */
3379 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3380 scan_data_t data_fake;
3381 SV *re_trie_maxbuff = NULL;
3382 regnode *first_non_open = scan;
3383 SSize_t stopmin = SSize_t_MAX;
3384 scan_frame *frame = NULL;
3385 GET_RE_DEBUG_FLAGS_DECL;
3387 PERL_ARGS_ASSERT_STUDY_CHUNK;
3390 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3393 while (first_non_open && OP(first_non_open) == OPEN)
3394 first_non_open=regnext(first_non_open);
3399 while ( scan && OP(scan) != END && scan < last ){
3400 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3401 node length to get a real minimum (because
3402 the folded version may be shorter) */
3403 bool has_exactf_sharp_s = FALSE;
3404 /* Peephole optimizer: */
3405 DEBUG_OPTIMISE_MORE_r(
3407 PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%d depth=%u recursed_depth=%u ",
3408 (depth*2), "", stopparen, depth, recursed_depth);
3409 if (recursed_depth) {
3412 for ( j = 0 ; j < recursed_depth ; j++ ) {
3413 PerlIO_printf(Perl_debug_log,"[");
3414 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3415 PerlIO_printf(Perl_debug_log,"%d",
3416 PAREN_TEST(RExC_study_chunk_recursed +
3417 (j * RExC_study_chunk_recursed_bytes), i)
3420 PerlIO_printf(Perl_debug_log,"]");
3423 PerlIO_printf(Perl_debug_log,"\n");
3426 DEBUG_STUDYDATA("Peep:", data, depth);
3427 DEBUG_PEEP("Peep", scan, depth);
3430 /* Its not clear to khw or hv why this is done here, and not in the
3431 * clauses that deal with EXACT nodes. khw's guess is that it's
3432 * because of a previous design */
3433 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3435 /* Follow the next-chain of the current node and optimize
3436 away all the NOTHINGs from it. */
3437 if (OP(scan) != CURLYX) {
3438 const int max = (reg_off_by_arg[OP(scan)]
3440 /* I32 may be smaller than U16 on CRAYs! */
3441 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3442 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3446 /* Skip NOTHING and LONGJMP. */
3447 while ((n = regnext(n))
3448 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3449 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3450 && off + noff < max)
3452 if (reg_off_by_arg[OP(scan)])
3455 NEXT_OFF(scan) = off;
3460 /* The principal pseudo-switch. Cannot be a switch, since we
3461 look into several different things. */
3462 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3463 || OP(scan) == IFTHEN) {
3464 next = regnext(scan);
3466 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3468 if (OP(next) == code || code == IFTHEN) {
3469 /* NOTE - There is similar code to this block below for
3470 * handling TRIE nodes on a re-study. If you change stuff here
3471 * check there too. */
3472 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3474 regnode * const startbranch=scan;
3476 if (flags & SCF_DO_SUBSTR)
3477 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3478 if (flags & SCF_DO_STCLASS)
3479 ssc_init_zero(pRExC_state, &accum);
3481 while (OP(scan) == code) {
3482 SSize_t deltanext, minnext, fake;
3484 regnode_ssc this_class;
3487 data_fake.flags = 0;
3489 data_fake.whilem_c = data->whilem_c;
3490 data_fake.last_closep = data->last_closep;
3493 data_fake.last_closep = &fake;
3495 data_fake.pos_delta = delta;
3496 next = regnext(scan);
3497 scan = NEXTOPER(scan);
3499 scan = NEXTOPER(scan);
3500 if (flags & SCF_DO_STCLASS) {
3501 ssc_init(pRExC_state, &this_class);
3502 data_fake.start_class = &this_class;
3503 f = SCF_DO_STCLASS_AND;
3505 if (flags & SCF_WHILEM_VISITED_POS)
3506 f |= SCF_WHILEM_VISITED_POS;
3508 /* we suppose the run is continuous, last=next...*/
3509 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3511 stopparen, recursed_depth, NULL, f,depth+1);
3514 if (deltanext == SSize_t_MAX) {
3515 is_inf = is_inf_internal = 1;
3517 } else if (max1 < minnext + deltanext)
3518 max1 = minnext + deltanext;
3520 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3522 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3523 if ( stopmin > minnext)
3524 stopmin = min + min1;
3525 flags &= ~SCF_DO_SUBSTR;
3527 data->flags |= SCF_SEEN_ACCEPT;
3530 if (data_fake.flags & SF_HAS_EVAL)
3531 data->flags |= SF_HAS_EVAL;
3532 data->whilem_c = data_fake.whilem_c;
3534 if (flags & SCF_DO_STCLASS)
3535 ssc_or(pRExC_state, &accum, &this_class);
3537 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3539 if (flags & SCF_DO_SUBSTR) {
3540 data->pos_min += min1;
3541 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3542 data->pos_delta = SSize_t_MAX;
3544 data->pos_delta += max1 - min1;
3545 if (max1 != min1 || is_inf)
3546 data->longest = &(data->longest_float);
3549 if (delta == SSize_t_MAX
3550 || SSize_t_MAX - delta - (max1 - min1) < 0)
3551 delta = SSize_t_MAX;
3553 delta += max1 - min1;
3554 if (flags & SCF_DO_STCLASS_OR) {
3555 ssc_or(pRExC_state, data->start_class, &accum);
3557 ssc_and(pRExC_state, data->start_class, and_withp);
3558 flags &= ~SCF_DO_STCLASS;
3561 else if (flags & SCF_DO_STCLASS_AND) {
3563 ssc_and(pRExC_state, data->start_class, &accum);
3564 flags &= ~SCF_DO_STCLASS;
3567 /* Switch to OR mode: cache the old value of
3568 * data->start_class */
3570 StructCopy(data->start_class, and_withp, regnode_ssc);
3571 flags &= ~SCF_DO_STCLASS_AND;
3572 StructCopy(&accum, data->start_class, regnode_ssc);
3573 flags |= SCF_DO_STCLASS_OR;
3577 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3580 Assuming this was/is a branch we are dealing with: 'scan'
3581 now points at the item that follows the branch sequence,
3582 whatever it is. We now start at the beginning of the
3583 sequence and look for subsequences of
3589 which would be constructed from a pattern like
3592 If we can find such a subsequence we need to turn the first
3593 element into a trie and then add the subsequent branch exact
3594 strings to the trie.
3598 1. patterns where the whole set of branches can be
3601 2. patterns where only a subset can be converted.
3603 In case 1 we can replace the whole set with a single regop
3604 for the trie. In case 2 we need to keep the start and end
3607 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3608 becomes BRANCH TRIE; BRANCH X;
3610 There is an additional case, that being where there is a
3611 common prefix, which gets split out into an EXACT like node
3612 preceding the TRIE node.
3614 If x(1..n)==tail then we can do a simple trie, if not we make
3615 a "jump" trie, such that when we match the appropriate word
3616 we "jump" to the appropriate tail node. Essentially we turn
3617 a nested if into a case structure of sorts.
3622 if (!re_trie_maxbuff) {
3623 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3624 if (!SvIOK(re_trie_maxbuff))
3625 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3627 if ( SvIV(re_trie_maxbuff)>=0 ) {
3629 regnode *first = (regnode *)NULL;
3630 regnode *last = (regnode *)NULL;
3631 regnode *tail = scan;
3636 SV * const mysv = sv_newmortal(); /* for dumping */
3638 /* var tail is used because there may be a TAIL
3639 regop in the way. Ie, the exacts will point to the
3640 thing following the TAIL, but the last branch will
3641 point at the TAIL. So we advance tail. If we
3642 have nested (?:) we may have to move through several
3646 while ( OP( tail ) == TAIL ) {
3647 /* this is the TAIL generated by (?:) */
3648 tail = regnext( tail );
3652 DEBUG_TRIE_COMPILE_r({
3653 regprop(RExC_rx, mysv, tail );
3654 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3655 (int)depth * 2 + 2, "",
3656 "Looking for TRIE'able sequences. Tail node is: ",
3657 SvPV_nolen_const( mysv )
3663 Step through the branches
3664 cur represents each branch,
3665 noper is the first thing to be matched as part
3667 noper_next is the regnext() of that node.
3669 We normally handle a case like this
3670 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3671 support building with NOJUMPTRIE, which restricts
3672 the trie logic to structures like /FOO|BAR/.
3674 If noper is a trieable nodetype then the branch is
3675 a possible optimization target. If we are building
3676 under NOJUMPTRIE then we require that noper_next is
3677 the same as scan (our current position in the regex
3680 Once we have two or more consecutive such branches
3681 we can create a trie of the EXACT's contents and
3682 stitch it in place into the program.
3684 If the sequence represents all of the branches in
3685 the alternation we replace the entire thing with a
3688 Otherwise when it is a subsequence we need to
3689 stitch it in place and replace only the relevant
3690 branches. This means the first branch has to remain
3691 as it is used by the alternation logic, and its
3692 next pointer, and needs to be repointed at the item
3693 on the branch chain following the last branch we
3694 have optimized away.
3696 This could be either a BRANCH, in which case the
3697 subsequence is internal, or it could be the item
3698 following the branch sequence in which case the
3699 subsequence is at the end (which does not
3700 necessarily mean the first node is the start of the
3703 TRIE_TYPE(X) is a define which maps the optype to a
3707 ----------------+-----------
3711 EXACTFU_SS | EXACTFU
3716 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3717 ( EXACT == (X) ) ? EXACT : \
3718 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3719 ( EXACTFA == (X) ) ? EXACTFA : \
3722 /* dont use tail as the end marker for this traverse */
3723 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3724 regnode * const noper = NEXTOPER( cur );
3725 U8 noper_type = OP( noper );
3726 U8 noper_trietype = TRIE_TYPE( noper_type );
3727 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3728 regnode * const noper_next = regnext( noper );
3729 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3730 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3733 DEBUG_TRIE_COMPILE_r({
3734 regprop(RExC_rx, mysv, cur);
3735 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3736 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3738 regprop(RExC_rx, mysv, noper);
3739 PerlIO_printf( Perl_debug_log, " -> %s",
3740 SvPV_nolen_const(mysv));
3743 regprop(RExC_rx, mysv, noper_next );
3744 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3745 SvPV_nolen_const(mysv));
3747 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3748 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3749 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3753 /* Is noper a trieable nodetype that can be merged
3754 * with the current trie (if there is one)? */
3758 ( noper_trietype == NOTHING)
3759 || ( trietype == NOTHING )
3760 || ( trietype == noper_trietype )
3763 && noper_next == tail
3767 /* Handle mergable triable node Either we are
3768 * the first node in a new trieable sequence,
3769 * in which case we do some bookkeeping,
3770 * otherwise we update the end pointer. */
3773 if ( noper_trietype == NOTHING ) {
3774 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3775 regnode * const noper_next = regnext( noper );
3776 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3777 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3780 if ( noper_next_trietype ) {
3781 trietype = noper_next_trietype;
3782 } else if (noper_next_type) {
3783 /* a NOTHING regop is 1 regop wide.
3784 * We need at least two for a trie
3785 * so we can't merge this in */
3789 trietype = noper_trietype;
3792 if ( trietype == NOTHING )
3793 trietype = noper_trietype;
3798 } /* end handle mergable triable node */
3800 /* handle unmergable node -
3801 * noper may either be a triable node which can
3802 * not be tried together with the current trie,
3803 * or a non triable node */
3805 /* If last is set and trietype is not
3806 * NOTHING then we have found at least two
3807 * triable branch sequences in a row of a
3808 * similar trietype so we can turn them
3809 * into a trie. If/when we allow NOTHING to
3810 * start a trie sequence this condition
3811 * will be required, and it isn't expensive
3812 * so we leave it in for now. */
3813 if ( trietype && trietype != NOTHING )
3814 make_trie( pRExC_state,
3815 startbranch, first, cur, tail, count,
3816 trietype, depth+1 );
3817 last = NULL; /* note: we clear/update
3818 first, trietype etc below,
3819 so we dont do it here */
3823 && noper_next == tail
3826 /* noper is triable, so we can start a new
3830 trietype = noper_trietype;
3832 /* if we already saw a first but the
3833 * current node is not triable then we have
3834 * to reset the first information. */
3839 } /* end handle unmergable node */
3840 } /* loop over branches */
3841 DEBUG_TRIE_COMPILE_r({
3842 regprop(RExC_rx, mysv, cur);
3843 PerlIO_printf( Perl_debug_log,
3844 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3845 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3848 if ( last && trietype ) {
3849 if ( trietype != NOTHING ) {
3850 /* the last branch of the sequence was part of
3851 * a trie, so we have to construct it here
3852 * outside of the loop */
3853 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3854 #ifdef TRIE_STUDY_OPT
3855 if ( ((made == MADE_EXACT_TRIE &&
3856 startbranch == first)
3857 || ( first_non_open == first )) &&
3859 flags |= SCF_TRIE_RESTUDY;
3860 if ( startbranch == first
3863 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3868 /* at this point we know whatever we have is a
3869 * NOTHING sequence/branch AND if 'startbranch'
3870 * is 'first' then we can turn the whole thing
3873 if ( startbranch == first ) {
3875 /* the entire thing is a NOTHING sequence,
3876 * something like this: (?:|) So we can
3877 * turn it into a plain NOTHING op. */
3878 DEBUG_TRIE_COMPILE_r({
3879 regprop(RExC_rx, mysv, cur);
3880 PerlIO_printf( Perl_debug_log,
3881 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3882 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3885 OP(startbranch)= NOTHING;
3886 NEXT_OFF(startbranch)= tail - startbranch;
3887 for ( opt= startbranch + 1; opt < tail ; opt++ )
3891 } /* end if ( last) */
3892 } /* TRIE_MAXBUF is non zero */
3897 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3898 scan = NEXTOPER(NEXTOPER(scan));
3899 } else /* single branch is optimized. */
3900 scan = NEXTOPER(scan);
3902 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3903 scan_frame *newframe = NULL;
3907 U32 my_recursed_depth= recursed_depth;
3909 if (OP(scan) != SUSPEND) {
3910 /* set the pointer */
3911 if (OP(scan) == GOSUB) {
3913 RExC_recurse[ARG2L(scan)] = scan;
3914 start = RExC_open_parens[paren-1];
3915 end = RExC_close_parens[paren-1];
3918 start = RExC_rxi->program + 1;
3923 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
3925 if (!recursed_depth) {
3926 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
3928 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
3929 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
3930 RExC_study_chunk_recursed_bytes, U8);
3932 /* we havent recursed into this paren yet, so recurse into it */
3933 DEBUG_STUDYDATA("set:", data,depth);
3934 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
3935 my_recursed_depth= recursed_depth + 1;
3936 Newx(newframe,1,scan_frame);
3938 DEBUG_STUDYDATA("inf:", data,depth);
3939 /* some form of infinite recursion, assume infinite length */
3940 if (flags & SCF_DO_SUBSTR) {
3941 SCAN_COMMIT(pRExC_state,data,minlenp);
3942 data->longest = &(data->longest_float);
3944 is_inf = is_inf_internal = 1;
3945 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3946 ssc_anything(data->start_class);
3947 flags &= ~SCF_DO_STCLASS;
3950 Newx(newframe,1,scan_frame);
3953 end = regnext(scan);
3958 SAVEFREEPV(newframe);
3959 newframe->next = regnext(scan);
3960 newframe->last = last;
3961 newframe->stop = stopparen;
3962 newframe->prev = frame;
3963 newframe->prev_recursed_depth = recursed_depth;
3965 DEBUG_STUDYDATA("frame-new:",data,depth);
3966 DEBUG_PEEP("fnew", scan, depth);
3973 recursed_depth= my_recursed_depth;
3978 else if (OP(scan) == EXACT) {
3979 SSize_t l = STR_LEN(scan);
3982 const U8 * const s = (U8*)STRING(scan);
3983 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3984 l = utf8_length(s, s + l);
3986 uc = *((U8*)STRING(scan));
3989 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3990 /* The code below prefers earlier match for fixed
3991 offset, later match for variable offset. */
3992 if (data->last_end == -1) { /* Update the start info. */
3993 data->last_start_min = data->pos_min;
3994 data->last_start_max = is_inf
3995 ? SSize_t_MAX : data->pos_min + data->pos_delta;
3997 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3999 SvUTF8_on(data->last_found);
4001 SV * const sv = data->last_found;
4002 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4003 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4004 if (mg && mg->mg_len >= 0)
4005 mg->mg_len += utf8_length((U8*)STRING(scan),
4006 (U8*)STRING(scan)+STR_LEN(scan));
4008 data->last_end = data->pos_min + l;
4009 data->pos_min += l; /* As in the first entry. */
4010 data->flags &= ~SF_BEFORE_EOL;
4013 /* ANDing the code point leaves at most it, and not in locale, and
4014 * can't match null string */
4015 if (flags & SCF_DO_STCLASS_AND) {
4016 ssc_cp_and(data->start_class, uc);
4017 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4018 ssc_clear_locale(data->start_class);
4020 else if (flags & SCF_DO_STCLASS_OR) {
4021 ssc_add_cp(data->start_class, uc);
4022 ssc_and(pRExC_state, data->start_class, and_withp);
4024 flags &= ~SCF_DO_STCLASS;
4026 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4027 SSize_t l = STR_LEN(scan);
4028 UV uc = *((U8*)STRING(scan));
4029 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4030 separate code points */
4032 /* Search for fixed substrings supports EXACT only. */
4033 if (flags & SCF_DO_SUBSTR) {
4035 SCAN_COMMIT(pRExC_state, data, minlenp);
4038 const U8 * const s = (U8 *)STRING(scan);
4039 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4040 l = utf8_length(s, s + l);
4042 if (has_exactf_sharp_s) {
4043 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
4045 min += l - min_subtract;
4047 delta += min_subtract;
4048 if (flags & SCF_DO_SUBSTR) {
4049 data->pos_min += l - min_subtract;
4050 if (data->pos_min < 0) {
4053 data->pos_delta += min_subtract;
4055 data->longest = &(data->longest_float);
4058 if (OP(scan) == EXACTFL) {
4059 if (flags & SCF_DO_STCLASS_AND) {
4060 ssc_flags_and(data->start_class,
4061 ANYOF_LOCALE|ANYOF_LOC_FOLD);
4063 else if (flags & SCF_DO_STCLASS_OR) {
4064 ANYOF_FLAGS(data->start_class)
4065 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
4068 /* We don't know what the folds are; it could be anything. XXX
4069 * Actually, we only support UTF-8 encoding for code points
4070 * above Latin1, so we could know what those folds are. */
4071 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4075 else { /* Non-locale EXACTFish */
4076 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4077 if (flags & SCF_DO_STCLASS_AND) {
4078 ssc_clear_locale(data->start_class);
4080 if (uc < 256) { /* We know what the Latin1 folds are ... */
4081 if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
4082 know if anything folds
4084 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4085 PL_fold_latin1[uc]);
4086 if (OP(scan) != EXACTFA) { /* The folds below aren't
4088 if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4090 = add_cp_to_invlist(EXACTF_invlist,
4091 LATIN_SMALL_LETTER_SHARP_S);
4093 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4095 = add_cp_to_invlist(EXACTF_invlist, 's');
4097 = add_cp_to_invlist(EXACTF_invlist, 'S');
4101 /* We also know if there are above-Latin1 code points
4102 * that fold to this (none legal for ASCII and /iaa) */
4103 if ((! isASCII(uc) || OP(scan) != EXACTFA)
4104 && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4106 /* XXX We could know exactly what does fold to this
4107 * if the reverse folds are loaded, as currently in
4109 _invlist_union(EXACTF_invlist,
4115 else { /* Non-locale, above Latin1. XXX We don't currently
4116 know what participates in folds with this, so have
4117 to assume anything could */
4119 /* XXX We could know exactly what does fold to this if the
4120 * reverse folds are loaded, as currently in S_regclass().
4121 * But we do know that under /iaa nothing in the ASCII
4122 * range can participate */
4123 if (OP(scan) == EXACTFA) {
4124 _invlist_union_complement_2nd(EXACTF_invlist,
4125 PL_Posix_ptrs[_CC_ASCII],
4129 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4134 if (flags & SCF_DO_STCLASS_AND) {
4135 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4136 ANYOF_POSIXL_ZERO(data->start_class);
4137 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4139 else if (flags & SCF_DO_STCLASS_OR) {
4140 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4141 ssc_and(pRExC_state, data->start_class, and_withp);
4143 flags &= ~SCF_DO_STCLASS;
4144 SvREFCNT_dec(EXACTF_invlist);
4146 else if (REGNODE_VARIES(OP(scan))) {
4147 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4148 I32 fl = 0, f = flags;
4149 regnode * const oscan = scan;
4150 regnode_ssc this_class;
4151 regnode_ssc *oclass = NULL;
4152 I32 next_is_eval = 0;
4154 switch (PL_regkind[OP(scan)]) {
4155 case WHILEM: /* End of (?:...)* . */
4156 scan = NEXTOPER(scan);
4159 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4160 next = NEXTOPER(scan);
4161 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4163 maxcount = REG_INFTY;
4164 next = regnext(scan);
4165 scan = NEXTOPER(scan);
4169 if (flags & SCF_DO_SUBSTR)
4174 if (flags & SCF_DO_STCLASS) {
4176 maxcount = REG_INFTY;
4177 next = regnext(scan);
4178 scan = NEXTOPER(scan);
4181 is_inf = is_inf_internal = 1;
4182 scan = regnext(scan);
4183 if (flags & SCF_DO_SUBSTR) {
4184 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4185 data->longest = &(data->longest_float);
4187 goto optimize_curly_tail;
4189 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4190 && (scan->flags == stopparen))
4195 mincount = ARG1(scan);
4196 maxcount = ARG2(scan);
4198 next = regnext(scan);
4199 if (OP(scan) == CURLYX) {
4200 I32 lp = (data ? *(data->last_closep) : 0);
4201 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4203 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4204 next_is_eval = (OP(scan) == EVAL);
4206 if (flags & SCF_DO_SUBSTR) {
4207 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4208 pos_before = data->pos_min;
4212 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4214 data->flags |= SF_IS_INF;
4216 if (flags & SCF_DO_STCLASS) {
4217 ssc_init(pRExC_state, &this_class);
4218 oclass = data->start_class;
4219 data->start_class = &this_class;
4220 f |= SCF_DO_STCLASS_AND;
4221 f &= ~SCF_DO_STCLASS_OR;
4223 /* Exclude from super-linear cache processing any {n,m}
4224 regops for which the combination of input pos and regex
4225 pos is not enough information to determine if a match
4228 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4229 regex pos at the \s*, the prospects for a match depend not
4230 only on the input position but also on how many (bar\s*)
4231 repeats into the {4,8} we are. */
4232 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4233 f &= ~SCF_WHILEM_VISITED_POS;
4235 /* This will finish on WHILEM, setting scan, or on NULL: */
4236 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4237 last, data, stopparen, recursed_depth, NULL,
4239 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4241 if (flags & SCF_DO_STCLASS)
4242 data->start_class = oclass;
4243 if (mincount == 0 || minnext == 0) {
4244 if (flags & SCF_DO_STCLASS_OR) {
4245 ssc_or(pRExC_state, data->start_class, &this_class);
4247 else if (flags & SCF_DO_STCLASS_AND) {
4248 /* Switch to OR mode: cache the old value of
4249 * data->start_class */
4251 StructCopy(data->start_class, and_withp, regnode_ssc);
4252 flags &= ~SCF_DO_STCLASS_AND;
4253 StructCopy(&this_class, data->start_class, regnode_ssc);
4254 flags |= SCF_DO_STCLASS_OR;
4255 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4257 } else { /* Non-zero len */
4258 if (flags & SCF_DO_STCLASS_OR) {
4259 ssc_or(pRExC_state, data->start_class, &this_class);
4260 ssc_and(pRExC_state, data->start_class, and_withp);
4262 else if (flags & SCF_DO_STCLASS_AND)
4263 ssc_and(pRExC_state, data->start_class, &this_class);
4264 flags &= ~SCF_DO_STCLASS;
4266 if (!scan) /* It was not CURLYX, but CURLY. */
4268 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4269 /* ? quantifier ok, except for (?{ ... }) */
4270 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4271 && (minnext == 0) && (deltanext == 0)
4272 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4273 && maxcount <= REG_INFTY/3) /* Complement check for big count */
4275 /* Fatal warnings may leak the regexp without this: */
4276 SAVEFREESV(RExC_rx_sv);
4277 ckWARNreg(RExC_parse,
4278 "Quantifier unexpected on zero-length expression");
4279 (void)ReREFCNT_inc(RExC_rx_sv);
4282 min += minnext * mincount;
4283 is_inf_internal |= deltanext == SSize_t_MAX
4284 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4285 is_inf |= is_inf_internal;
4287 delta = SSize_t_MAX;
4289 delta += (minnext + deltanext) * maxcount - minnext * mincount;
4291 /* Try powerful optimization CURLYX => CURLYN. */
4292 if ( OP(oscan) == CURLYX && data
4293 && data->flags & SF_IN_PAR
4294 && !(data->flags & SF_HAS_EVAL)
4295 && !deltanext && minnext == 1 ) {
4296 /* Try to optimize to CURLYN. */
4297 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4298 regnode * const nxt1 = nxt;
4305 if (!REGNODE_SIMPLE(OP(nxt))
4306 && !(PL_regkind[OP(nxt)] == EXACT
4307 && STR_LEN(nxt) == 1))
4313 if (OP(nxt) != CLOSE)
4315 if (RExC_open_parens) {
4316 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4317 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4319 /* Now we know that nxt2 is the only contents: */
4320 oscan->flags = (U8)ARG(nxt);
4322 OP(nxt1) = NOTHING; /* was OPEN. */
4325 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4326 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4327 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4328 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4329 OP(nxt + 1) = OPTIMIZED; /* was count. */
4330 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4335 /* Try optimization CURLYX => CURLYM. */
4336 if ( OP(oscan) == CURLYX && data
4337 && !(data->flags & SF_HAS_PAR)
4338 && !(data->flags & SF_HAS_EVAL)
4339 && !deltanext /* atom is fixed width */
4340 && minnext != 0 /* CURLYM can't handle zero width */
4341 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4343 /* XXXX How to optimize if data == 0? */
4344 /* Optimize to a simpler form. */
4345 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4349 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4350 && (OP(nxt2) != WHILEM))
4352 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4353 /* Need to optimize away parenths. */
4354 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4355 /* Set the parenth number. */
4356 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4358 oscan->flags = (U8)ARG(nxt);
4359 if (RExC_open_parens) {
4360 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4361 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4363 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4364 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4367 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4368 OP(nxt + 1) = OPTIMIZED; /* was count. */
4369 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4370 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4373 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4374 regnode *nnxt = regnext(nxt1);
4376 if (reg_off_by_arg[OP(nxt1)])
4377 ARG_SET(nxt1, nxt2 - nxt1);
4378 else if (nxt2 - nxt1 < U16_MAX)
4379 NEXT_OFF(nxt1) = nxt2 - nxt1;
4381 OP(nxt) = NOTHING; /* Cannot beautify */
4386 /* Optimize again: */
4387 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4388 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4393 else if ((OP(oscan) == CURLYX)
4394 && (flags & SCF_WHILEM_VISITED_POS)
4395 /* See the comment on a similar expression above.
4396 However, this time it's not a subexpression
4397 we care about, but the expression itself. */
4398 && (maxcount == REG_INFTY)
4399 && data && ++data->whilem_c < 16) {
4400 /* This stays as CURLYX, we can put the count/of pair. */
4401 /* Find WHILEM (as in regexec.c) */
4402 regnode *nxt = oscan + NEXT_OFF(oscan);
4404 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4406 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4407 | (RExC_whilem_seen << 4)); /* On WHILEM */
4409 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4411 if (flags & SCF_DO_SUBSTR) {
4412 SV *last_str = NULL;
4413 int counted = mincount != 0;
4415 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4416 SSize_t b = pos_before >= data->last_start_min
4417 ? pos_before : data->last_start_min;
4419 const char * const s = SvPV_const(data->last_found, l);
4420 SSize_t old = b - data->last_start_min;
4423 old = utf8_hop((U8*)s, old) - (U8*)s;
4425 /* Get the added string: */
4426 last_str = newSVpvn_utf8(s + old, l, UTF);
4427 if (deltanext == 0 && pos_before == b) {
4428 /* What was added is a constant string */
4430 SvGROW(last_str, (mincount * l) + 1);
4431 repeatcpy(SvPVX(last_str) + l,
4432 SvPVX_const(last_str), l, mincount - 1);
4433 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4434 /* Add additional parts. */
4435 SvCUR_set(data->last_found,
4436 SvCUR(data->last_found) - l);
4437 sv_catsv(data->last_found, last_str);
4439 SV * sv = data->last_found;
4441 SvUTF8(sv) && SvMAGICAL(sv) ?
4442 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4443 if (mg && mg->mg_len >= 0)
4444 mg->mg_len += CHR_SVLEN(last_str) - l;
4446 data->last_end += l * (mincount - 1);
4449 /* start offset must point into the last copy */
4450 data->last_start_min += minnext * (mincount - 1);
4451 data->last_start_max += is_inf ? SSize_t_MAX
4452 : (maxcount - 1) * (minnext + data->pos_delta);
4455 /* It is counted once already... */
4456 data->pos_min += minnext * (mincount - counted);
4458 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4459 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4460 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4461 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4463 if (deltanext != SSize_t_MAX)
4464 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4465 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4466 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4468 if (deltanext == SSize_t_MAX ||
4469 -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4470 data->pos_delta = SSize_t_MAX;
4472 data->pos_delta += - counted * deltanext +
4473 (minnext + deltanext) * maxcount - minnext * mincount;
4474 if (mincount != maxcount) {
4475 /* Cannot extend fixed substrings found inside
4477 SCAN_COMMIT(pRExC_state,data,minlenp);
4478 if (mincount && last_str) {
4479 SV * const sv = data->last_found;
4480 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4481 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4485 sv_setsv(sv, last_str);
4486 data->last_end = data->pos_min;
4487 data->last_start_min =
4488 data->pos_min - CHR_SVLEN(last_str);
4489 data->last_start_max = is_inf
4491 : data->pos_min + data->pos_delta
4492 - CHR_SVLEN(last_str);
4494 data->longest = &(data->longest_float);
4496 SvREFCNT_dec(last_str);
4498 if (data && (fl & SF_HAS_EVAL))
4499 data->flags |= SF_HAS_EVAL;
4500 optimize_curly_tail:
4501 if (OP(oscan) != CURLYX) {
4502 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4504 NEXT_OFF(oscan) += NEXT_OFF(next);
4510 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4515 if (flags & SCF_DO_SUBSTR) {
4516 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4517 data->longest = &(data->longest_float);
4519 is_inf = is_inf_internal = 1;
4520 if (flags & SCF_DO_STCLASS_OR) {
4521 if (OP(scan) == CLUMP) {
4522 /* Actually is any start char, but very few code points
4523 * aren't start characters */
4524 ssc_match_all_cp(data->start_class);
4527 ssc_anything(data->start_class);
4530 flags &= ~SCF_DO_STCLASS;
4534 else if (OP(scan) == LNBREAK) {
4535 if (flags & SCF_DO_STCLASS) {
4536 if (flags & SCF_DO_STCLASS_AND) {
4537 ssc_intersection(data->start_class,
4538 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4539 ssc_clear_locale(data->start_class);
4540 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4542 else if (flags & SCF_DO_STCLASS_OR) {
4543 ssc_union(data->start_class,
4544 PL_XPosix_ptrs[_CC_VERTSPACE],
4546 ssc_and(pRExC_state, data->start_class, and_withp);
4548 flags &= ~SCF_DO_STCLASS;
4551 delta++; /* Because of the 2 char string cr-lf */
4552 if (flags & SCF_DO_SUBSTR) {
4553 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4555 data->pos_delta += 1;
4556 data->longest = &(data->longest_float);
4559 else if (REGNODE_SIMPLE(OP(scan))) {
4561 if (flags & SCF_DO_SUBSTR) {
4562 SCAN_COMMIT(pRExC_state,data,minlenp);
4566 if (flags & SCF_DO_STCLASS) {
4568 SV* my_invlist = sv_2mortal(_new_invlist(0));
4572 if (flags & SCF_DO_STCLASS_AND) {
4573 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4576 /* Some of the logic below assumes that switching
4577 locale on will only add false positives. */
4582 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4586 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4587 ssc_match_all_cp(data->start_class);
4592 SV* REG_ANY_invlist = _new_invlist(2);
4593 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4595 if (flags & SCF_DO_STCLASS_OR) {
4596 ssc_union(data->start_class,
4598 TRUE /* TRUE => invert, hence all but \n
4602 else if (flags & SCF_DO_STCLASS_AND) {
4603 ssc_intersection(data->start_class,
4605 TRUE /* TRUE => invert */
4607 ssc_clear_locale(data->start_class);
4609 SvREFCNT_dec_NN(REG_ANY_invlist);
4613 case ANYOF_WARN_SUPER:
4615 if (flags & SCF_DO_STCLASS_AND)
4616 ssc_and(pRExC_state, data->start_class,
4617 (regnode_ssc*) scan);
4619 ssc_or(pRExC_state, data->start_class,
4620 (regnode_ssc*)scan);
4628 classnum = FLAGS(scan);
4629 namedclass = classnum_to_namedclass(classnum) + invert;
4630 if (flags & SCF_DO_STCLASS_AND) {
4631 bool was_there = cBOOL(
4632 ANYOF_POSIXL_TEST(data->start_class,
4634 ANYOF_POSIXL_ZERO(data->start_class);
4635 if (was_there) { /* Do an AND */
4636 ANYOF_POSIXL_SET(data->start_class, namedclass);
4638 /* No individual code points can now match */
4639 data->start_class->invlist
4640 = sv_2mortal(_new_invlist(0));
4643 int complement = namedclass + ((invert) ? -1 : 1);
4645 assert(flags & SCF_DO_STCLASS_OR);
4647 /* If the complement of this class was already there,
4648 * the result is that they match all code points,
4649 * (\d + \D == everything). Remove the classes from
4650 * future consideration. Locale is not relevant in
4652 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4653 ssc_match_all_cp(data->start_class);
4654 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4655 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4656 if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4658 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4661 else { /* The usual case; just add this class to the
4663 ANYOF_POSIXL_SET(data->start_class, namedclass);
4664 ANYOF_FLAGS(data->start_class)
4665 |= ANYOF_LOCALE|ANYOF_POSIXL;
4670 case NPOSIXA: /* For these, we always know the exact set of
4675 classnum = FLAGS(scan);
4676 my_invlist = PL_Posix_ptrs[classnum];
4685 classnum = FLAGS(scan);
4687 /* If we know all the code points that match the class, use
4688 * that; otherwise use the Latin1 code points, plus we have
4689 * to assume that it could match anything above Latin1 */
4690 if (PL_XPosix_ptrs[classnum]) {
4691 my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4694 _invlist_union(PL_L1Posix_ptrs[classnum],
4695 PL_AboveLatin1, &my_invlist);
4698 /* NPOSIXD matches all upper Latin1 code points unless the
4699 * target string being matched is UTF-8, which is
4700 * unknowable until match time */
4701 if (PL_regkind[OP(scan)] == NPOSIXD) {
4702 _invlist_union_complement_2nd(my_invlist,
4703 PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4708 if (flags & SCF_DO_STCLASS_AND) {
4709 ssc_intersection(data->start_class, my_invlist, invert);
4710 ssc_clear_locale(data->start_class);
4713 assert(flags & SCF_DO_STCLASS_OR);
4714 ssc_union(data->start_class, my_invlist, invert);
4717 if (flags & SCF_DO_STCLASS_OR)
4718 ssc_and(pRExC_state, data->start_class, and_withp);
4719 flags &= ~SCF_DO_STCLASS;
4722 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4723 data->flags |= (OP(scan) == MEOL
4726 SCAN_COMMIT(pRExC_state, data, minlenp);
4729 else if ( PL_regkind[OP(scan)] == BRANCHJ
4730 /* Lookbehind, or need to calculate parens/evals/stclass: */
4731 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4732 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4733 if ( OP(scan) == UNLESSM &&
4735 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4736 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4739 regnode *upto= regnext(scan);
4741 SV * const mysv_val=sv_newmortal();
4742 DEBUG_STUDYDATA("OPFAIL",data,depth);
4744 /*DEBUG_PARSE_MSG("opfail");*/
4745 regprop(RExC_rx, mysv_val, upto);
4746 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4747 SvPV_nolen_const(mysv_val),
4748 (IV)REG_NODE_NUM(upto),
4753 NEXT_OFF(scan) = upto - scan;
4754 for (opt= scan + 1; opt < upto ; opt++)
4755 OP(opt) = OPTIMIZED;
4759 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4760 || OP(scan) == UNLESSM )
4762 /* Negative Lookahead/lookbehind
4763 In this case we can't do fixed string optimisation.
4766 SSize_t deltanext, minnext, fake = 0;
4771 data_fake.flags = 0;
4773 data_fake.whilem_c = data->whilem_c;
4774 data_fake.last_closep = data->last_closep;
4777 data_fake.last_closep = &fake;
4778 data_fake.pos_delta = delta;
4779 if ( flags & SCF_DO_STCLASS && !scan->flags
4780 && OP(scan) == IFMATCH ) { /* Lookahead */
4781 ssc_init(pRExC_state, &intrnl);
4782 data_fake.start_class = &intrnl;
4783 f |= SCF_DO_STCLASS_AND;
4785 if (flags & SCF_WHILEM_VISITED_POS)
4786 f |= SCF_WHILEM_VISITED_POS;
4787 next = regnext(scan);
4788 nscan = NEXTOPER(NEXTOPER(scan));
4789 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4790 last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1);
4793 FAIL("Variable length lookbehind not implemented");
4795 else if (minnext > (I32)U8_MAX) {
4796 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4798 scan->flags = (U8)minnext;
4801 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4803 if (data_fake.flags & SF_HAS_EVAL)
4804 data->flags |= SF_HAS_EVAL;
4805 data->whilem_c = data_fake.whilem_c;
4807 if (f & SCF_DO_STCLASS_AND) {
4808 if (flags & SCF_DO_STCLASS_OR) {
4809 /* OR before, AND after: ideally we would recurse with
4810 * data_fake to get the AND applied by study of the
4811 * remainder of the pattern, and then derecurse;
4812 * *** HACK *** for now just treat as "no information".
4813 * See [perl #56690].
4815 ssc_init(pRExC_state, data->start_class);
4817 /* AND before and after: combine and continue */
4818 ssc_and(pRExC_state, data->start_class, &intrnl);
4822 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4824 /* Positive Lookahead/lookbehind
4825 In this case we can do fixed string optimisation,
4826 but we must be careful about it. Note in the case of
4827 lookbehind the positions will be offset by the minimum
4828 length of the pattern, something we won't know about
4829 until after the recurse.
4831 SSize_t deltanext, fake = 0;
4835 /* We use SAVEFREEPV so that when the full compile
4836 is finished perl will clean up the allocated
4837 minlens when it's all done. This way we don't
4838 have to worry about freeing them when we know
4839 they wont be used, which would be a pain.
4842 Newx( minnextp, 1, SSize_t );
4843 SAVEFREEPV(minnextp);
4846 StructCopy(data, &data_fake, scan_data_t);
4847 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4850 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4851 data_fake.last_found=newSVsv(data->last_found);
4855 data_fake.last_closep = &fake;
4856 data_fake.flags = 0;
4857 data_fake.pos_delta = delta;
4859 data_fake.flags |= SF_IS_INF;
4860 if ( flags & SCF_DO_STCLASS && !scan->flags
4861 && OP(scan) == IFMATCH ) { /* Lookahead */
4862 ssc_init(pRExC_state, &intrnl);
4863 data_fake.start_class = &intrnl;
4864 f |= SCF_DO_STCLASS_AND;
4866 if (flags & SCF_WHILEM_VISITED_POS)
4867 f |= SCF_WHILEM_VISITED_POS;
4868 next = regnext(scan);
4869 nscan = NEXTOPER(NEXTOPER(scan));
4871 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4872 last, &data_fake, stopparen, recursed_depth, NULL, f,depth+1);
4875 FAIL("Variable length lookbehind not implemented");
4877 else if (*minnextp > (I32)U8_MAX) {
4878 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4880 scan->flags = (U8)*minnextp;
4885 if (f & SCF_DO_STCLASS_AND) {
4886 ssc_and(pRExC_state, data->start_class, &intrnl);
4889 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4891 if (data_fake.flags & SF_HAS_EVAL)
4892 data->flags |= SF_HAS_EVAL;
4893 data->whilem_c = data_fake.whilem_c;
4894 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4895 if (RExC_rx->minlen<*minnextp)
4896 RExC_rx->minlen=*minnextp;
4897 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4898 SvREFCNT_dec_NN(data_fake.last_found);
4900 if ( data_fake.minlen_fixed != minlenp )
4902 data->offset_fixed= data_fake.offset_fixed;
4903 data->minlen_fixed= data_fake.minlen_fixed;
4904 data->lookbehind_fixed+= scan->flags;
4906 if ( data_fake.minlen_float != minlenp )
4908 data->minlen_float= data_fake.minlen_float;
4909 data->offset_float_min=data_fake.offset_float_min;
4910 data->offset_float_max=data_fake.offset_float_max;
4911 data->lookbehind_float+= scan->flags;
4918 else if (OP(scan) == OPEN) {
4919 if (stopparen != (I32)ARG(scan))
4922 else if (OP(scan) == CLOSE) {
4923 if (stopparen == (I32)ARG(scan)) {
4926 if ((I32)ARG(scan) == is_par) {
4927 next = regnext(scan);
4929 if ( next && (OP(next) != WHILEM) && next < last)
4930 is_par = 0; /* Disable optimization */
4933 *(data->last_closep) = ARG(scan);
4935 else if (OP(scan) == EVAL) {
4937 data->flags |= SF_HAS_EVAL;
4939 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4940 if (flags & SCF_DO_SUBSTR) {
4941 SCAN_COMMIT(pRExC_state,data,minlenp);
4942 flags &= ~SCF_DO_SUBSTR;
4944 if (data && OP(scan)==ACCEPT) {
4945 data->flags |= SCF_SEEN_ACCEPT;
4950 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4952 if (flags & SCF_DO_SUBSTR) {
4953 SCAN_COMMIT(pRExC_state,data,minlenp);
4954 data->longest = &(data->longest_float);
4956 is_inf = is_inf_internal = 1;
4957 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4958 ssc_anything(data->start_class);
4959 flags &= ~SCF_DO_STCLASS;
4961 else if (OP(scan) == GPOS) {
4962 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4963 !(delta || is_inf || (data && data->pos_delta)))
4965 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4966 RExC_rx->extflags |= RXf_ANCH_GPOS;
4967 if (RExC_rx->gofs < (STRLEN)min)
4968 RExC_rx->gofs = min;
4970 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4974 #ifdef TRIE_STUDY_OPT
4975 #ifdef FULL_TRIE_STUDY
4976 else if (PL_regkind[OP(scan)] == TRIE) {
4977 /* NOTE - There is similar code to this block above for handling
4978 BRANCH nodes on the initial study. If you change stuff here
4980 regnode *trie_node= scan;
4981 regnode *tail= regnext(scan);
4982 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4983 SSize_t max1 = 0, min1 = SSize_t_MAX;
4986 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4987 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4988 if (flags & SCF_DO_STCLASS)
4989 ssc_init_zero(pRExC_state, &accum);
4995 const regnode *nextbranch= NULL;
4998 for ( word=1 ; word <= trie->wordcount ; word++)
5000 SSize_t deltanext=0, minnext=0, f = 0, fake;
5001 regnode_ssc this_class;
5003 data_fake.flags = 0;
5005 data_fake.whilem_c = data->whilem_c;
5006 data_fake.last_closep = data->last_closep;
5009 data_fake.last_closep = &fake;
5010 data_fake.pos_delta = delta;
5011 if (flags & SCF_DO_STCLASS) {
5012 ssc_init(pRExC_state, &this_class);
5013 data_fake.start_class = &this_class;
5014 f = SCF_DO_STCLASS_AND;
5016 if (flags & SCF_WHILEM_VISITED_POS)
5017 f |= SCF_WHILEM_VISITED_POS;
5019 if (trie->jump[word]) {
5021 nextbranch = trie_node + trie->jump[0];
5022 scan= trie_node + trie->jump[word];
5023 /* We go from the jump point to the branch that follows
5024 it. Note this means we need the vestigal unused branches
5025 even though they arent otherwise used.
5027 minnext = study_chunk(pRExC_state, &scan, minlenp,
5028 &deltanext, (regnode *)nextbranch, &data_fake,
5029 stopparen, recursed_depth, NULL, f,depth+1);
5031 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5032 nextbranch= regnext((regnode*)nextbranch);
5034 if (min1 > (SSize_t)(minnext + trie->minlen))
5035 min1 = minnext + trie->minlen;
5036 if (deltanext == SSize_t_MAX) {
5037 is_inf = is_inf_internal = 1;
5039 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5040 max1 = minnext + deltanext + trie->maxlen;
5042 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5044 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5045 if ( stopmin > min + min1)
5046 stopmin = min + min1;
5047 flags &= ~SCF_DO_SUBSTR;
5049 data->flags |= SCF_SEEN_ACCEPT;
5052 if (data_fake.flags & SF_HAS_EVAL)
5053 data->flags |= SF_HAS_EVAL;
5054 data->whilem_c = data_fake.whilem_c;
5056 if (flags & SCF_DO_STCLASS)
5057 ssc_or(pRExC_state, &accum, &this_class);
5060 if (flags & SCF_DO_SUBSTR) {
5061 data->pos_min += min1;
5062 data->pos_delta += max1 - min1;
5063 if (max1 != min1 || is_inf)
5064 data->longest = &(data->longest_float);
5067 delta += max1 - min1;
5068 if (flags & SCF_DO_STCLASS_OR) {
5069 ssc_or(pRExC_state, data->start_class, &accum);
5071 ssc_and(pRExC_state, data->start_class, and_withp);
5072 flags &= ~SCF_DO_STCLASS;
5075 else if (flags & SCF_DO_STCLASS_AND) {
5077 ssc_and(pRExC_state, data->start_class, &accum);
5078 flags &= ~SCF_DO_STCLASS;
5081 /* Switch to OR mode: cache the old value of
5082 * data->start_class */
5084 StructCopy(data->start_class, and_withp, regnode_ssc);
5085 flags &= ~SCF_DO_STCLASS_AND;
5086 StructCopy(&accum, data->start_class, regnode_ssc);
5087 flags |= SCF_DO_STCLASS_OR;
5094 else if (PL_regkind[OP(scan)] == TRIE) {
5095 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5098 min += trie->minlen;
5099 delta += (trie->maxlen - trie->minlen);
5100 flags &= ~SCF_DO_STCLASS; /* xxx */
5101 if (flags & SCF_DO_SUBSTR) {
5102 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
5103 data->pos_min += trie->minlen;
5104 data->pos_delta += (trie->maxlen - trie->minlen);
5105 if (trie->maxlen != trie->minlen)
5106 data->longest = &(data->longest_float);
5108 if (trie->jump) /* no more substrings -- for now /grr*/
5109 flags &= ~SCF_DO_SUBSTR;
5111 #endif /* old or new */
5112 #endif /* TRIE_STUDY_OPT */
5114 /* Else: zero-length, ignore. */
5115 scan = regnext(scan);
5117 /* If we are exiting a recursion we can unset its recursed bit
5118 * and allow ourselves to enter it again - no danger of an
5119 * infinite loop there.
5120 if (stopparen > -1 && recursed) {
5121 DEBUG_STUDYDATA("unset:", data,depth);
5122 PAREN_UNSET( recursed, stopparen);
5126 DEBUG_STUDYDATA("frame-end:",data,depth);
5127 DEBUG_PEEP("fend", scan, depth);
5128 /* restore previous context */
5131 stopparen = frame->stop;
5132 recursed_depth = frame->prev_recursed_depth;
5135 frame = frame->prev;
5136 goto fake_study_recurse;
5141 DEBUG_STUDYDATA("pre-fin:",data,depth);
5144 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5145 if (flags & SCF_DO_SUBSTR && is_inf)
5146 data->pos_delta = SSize_t_MAX - data->pos_min;
5147 if (is_par > (I32)U8_MAX)
5149 if (is_par && pars==1 && data) {
5150 data->flags |= SF_IN_PAR;
5151 data->flags &= ~SF_HAS_PAR;
5153 else if (pars && data) {
5154 data->flags |= SF_HAS_PAR;
5155 data->flags &= ~SF_IN_PAR;
5157 if (flags & SCF_DO_STCLASS_OR)
5158 ssc_and(pRExC_state, data->start_class, and_withp);
5159 if (flags & SCF_TRIE_RESTUDY)
5160 data->flags |= SCF_TRIE_RESTUDY;
5162 DEBUG_STUDYDATA("post-fin:",data,depth);
5164 return min < stopmin ? min : stopmin;
5168 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5170 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5172 PERL_ARGS_ASSERT_ADD_DATA;
5174 Renewc(RExC_rxi->data,
5175 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5176 char, struct reg_data);
5178 Renew(RExC_rxi->data->what, count + n, U8);
5180 Newx(RExC_rxi->data->what, n, U8);
5181 RExC_rxi->data->count = count + n;
5182 Copy(s, RExC_rxi->data->what + count, n, U8);
5186 /*XXX: todo make this not included in a non debugging perl */
5187 #ifndef PERL_IN_XSUB_RE
5189 Perl_reginitcolors(pTHX)
5192 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5194 char *t = savepv(s);
5198 t = strchr(t, '\t');
5204 PL_colors[i] = t = (char *)"";
5209 PL_colors[i++] = (char *)"";
5216 #ifdef TRIE_STUDY_OPT
5217 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5220 (data.flags & SCF_TRIE_RESTUDY) \
5228 #define CHECK_RESTUDY_GOTO_butfirst
5232 * pregcomp - compile a regular expression into internal code
5234 * Decides which engine's compiler to call based on the hint currently in
5238 #ifndef PERL_IN_XSUB_RE
5240 /* return the currently in-scope regex engine (or the default if none) */
5242 regexp_engine const *
5243 Perl_current_re_engine(pTHX)
5247 if (IN_PERL_COMPILETIME) {
5248 HV * const table = GvHV(PL_hintgv);
5251 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5252 return &PL_core_reg_engine;
5253 ptr = hv_fetchs(table, "regcomp", FALSE);
5254 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5255 return &PL_core_reg_engine;
5256 return INT2PTR(regexp_engine*,SvIV(*ptr));
5260 if (!PL_curcop->cop_hints_hash)
5261 return &PL_core_reg_engine;
5262 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5263 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5264 return &PL_core_reg_engine;
5265 return INT2PTR(regexp_engine*,SvIV(ptr));
5271 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5274 regexp_engine const *eng = current_re_engine();
5275 GET_RE_DEBUG_FLAGS_DECL;
5277 PERL_ARGS_ASSERT_PREGCOMP;
5279 /* Dispatch a request to compile a regexp to correct regexp engine. */
5281 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5284 return CALLREGCOMP_ENG(eng, pattern, flags);
5288 /* public(ish) entry point for the perl core's own regex compiling code.
5289 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5290 * pattern rather than a list of OPs, and uses the internal engine rather
5291 * than the current one */
5294 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5296 SV *pat = pattern; /* defeat constness! */
5297 PERL_ARGS_ASSERT_RE_COMPILE;
5298 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5299 #ifdef PERL_IN_XSUB_RE
5302 &PL_core_reg_engine,
5304 NULL, NULL, rx_flags, 0);
5308 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5309 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5310 * point to the realloced string and length.
5312 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5316 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5317 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5319 U8 *const src = (U8*)*pat_p;
5322 STRLEN s = 0, d = 0;
5324 GET_RE_DEBUG_FLAGS_DECL;
5326 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5327 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5329 Newx(dst, *plen_p * 2 + 1, U8);
5331 while (s < *plen_p) {
5332 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5335 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5336 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5338 if (n < num_code_blocks) {
5339 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5340 pRExC_state->code_blocks[n].start = d;
5341 assert(dst[d] == '(');
5344 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5345 pRExC_state->code_blocks[n].end = d;
5346 assert(dst[d] == ')');
5356 *pat_p = (char*) dst;
5358 RExC_orig_utf8 = RExC_utf8 = 1;
5363 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5364 * while recording any code block indices, and handling overloading,
5365 * nested qr// objects etc. If pat is null, it will allocate a new
5366 * string, or just return the first arg, if there's only one.
5368 * Returns the malloced/updated pat.
5369 * patternp and pat_count is the array of SVs to be concatted;
5370 * oplist is the optional list of ops that generated the SVs;
5371 * recompile_p is a pointer to a boolean that will be set if
5372 * the regex will need to be recompiled.
5373 * delim, if non-null is an SV that will be inserted between each element
5377 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5378 SV *pat, SV ** const patternp, int pat_count,
5379 OP *oplist, bool *recompile_p, SV *delim)
5383 bool use_delim = FALSE;
5384 bool alloced = FALSE;
5386 /* if we know we have at least two args, create an empty string,
5387 * then concatenate args to that. For no args, return an empty string */
5388 if (!pat && pat_count != 1) {
5389 pat = newSVpvn("", 0);
5394 for (svp = patternp; svp < patternp + pat_count; svp++) {
5397 STRLEN orig_patlen = 0;
5399 SV *msv = use_delim ? delim : *svp;
5400 if (!msv) msv = &PL_sv_undef;
5402 /* if we've got a delimiter, we go round the loop twice for each
5403 * svp slot (except the last), using the delimiter the second
5412 if (SvTYPE(msv) == SVt_PVAV) {
5413 /* we've encountered an interpolated array within
5414 * the pattern, e.g. /...@a..../. Expand the list of elements,
5415 * then recursively append elements.
5416 * The code in this block is based on S_pushav() */
5418 AV *const av = (AV*)msv;
5419 const SSize_t maxarg = AvFILL(av) + 1;
5423 assert(oplist->op_type == OP_PADAV
5424 || oplist->op_type == OP_RV2AV);
5425 oplist = oplist->op_sibling;;
5428 if (SvRMAGICAL(av)) {
5431 Newx(array, maxarg, SV*);
5433 for (i=0; i < maxarg; i++) {
5434 SV ** const svp = av_fetch(av, i, FALSE);
5435 array[i] = svp ? *svp : &PL_sv_undef;
5439 array = AvARRAY(av);
5441 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5442 array, maxarg, NULL, recompile_p,
5444 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5450 /* we make the assumption here that each op in the list of
5451 * op_siblings maps to one SV pushed onto the stack,
5452 * except for code blocks, with have both an OP_NULL and
5454 * This allows us to match up the list of SVs against the
5455 * list of OPs to find the next code block.
5457 * Note that PUSHMARK PADSV PADSV ..
5459 * PADRANGE PADSV PADSV ..
5460 * so the alignment still works. */
5463 if (oplist->op_type == OP_NULL
5464 && (oplist->op_flags & OPf_SPECIAL))
5466 assert(n < pRExC_state->num_code_blocks);
5467 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5468 pRExC_state->code_blocks[n].block = oplist;
5469 pRExC_state->code_blocks[n].src_regex = NULL;
5472 oplist = oplist->op_sibling; /* skip CONST */
5475 oplist = oplist->op_sibling;;
5478 /* apply magic and QR overloading to arg */
5481 if (SvROK(msv) && SvAMAGIC(msv)) {
5482 SV *sv = AMG_CALLunary(msv, regexp_amg);
5486 if (SvTYPE(sv) != SVt_REGEXP)
5487 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5492 /* try concatenation overload ... */
5493 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5494 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5497 /* overloading involved: all bets are off over literal
5498 * code. Pretend we haven't seen it */
5499 pRExC_state->num_code_blocks -= n;
5503 /* ... or failing that, try "" overload */
5504 while (SvAMAGIC(msv)
5505 && (sv = AMG_CALLunary(msv, string_amg))
5509 && SvRV(msv) == SvRV(sv))
5514 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5518 /* this is a partially unrolled
5519 * sv_catsv_nomg(pat, msv);
5520 * that allows us to adjust code block indices if
5523 char *dst = SvPV_force_nomg(pat, dlen);
5525 if (SvUTF8(msv) && !SvUTF8(pat)) {
5526 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5527 sv_setpvn(pat, dst, dlen);
5530 sv_catsv_nomg(pat, msv);
5537 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5540 /* extract any code blocks within any embedded qr//'s */
5541 if (rx && SvTYPE(rx) == SVt_REGEXP
5542 && RX_ENGINE((REGEXP*)rx)->op_comp)
5545 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5546 if (ri->num_code_blocks) {
5548 /* the presence of an embedded qr// with code means
5549 * we should always recompile: the text of the
5550 * qr// may not have changed, but it may be a
5551 * different closure than last time */
5553 Renew(pRExC_state->code_blocks,
5554 pRExC_state->num_code_blocks + ri->num_code_blocks,
5555 struct reg_code_block);
5556 pRExC_state->num_code_blocks += ri->num_code_blocks;
5558 for (i=0; i < ri->num_code_blocks; i++) {
5559 struct reg_code_block *src, *dst;
5560 STRLEN offset = orig_patlen
5561 + ReANY((REGEXP *)rx)->pre_prefix;
5562 assert(n < pRExC_state->num_code_blocks);
5563 src = &ri->code_blocks[i];
5564 dst = &pRExC_state->code_blocks[n];
5565 dst->start = src->start + offset;
5566 dst->end = src->end + offset;
5567 dst->block = src->block;
5568 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5577 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5586 /* see if there are any run-time code blocks in the pattern.
5587 * False positives are allowed */
5590 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5591 char *pat, STRLEN plen)
5596 for (s = 0; s < plen; s++) {
5597 if (n < pRExC_state->num_code_blocks
5598 && s == pRExC_state->code_blocks[n].start)
5600 s = pRExC_state->code_blocks[n].end;
5604 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5606 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5608 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5615 /* Handle run-time code blocks. We will already have compiled any direct
5616 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5617 * copy of it, but with any literal code blocks blanked out and
5618 * appropriate chars escaped; then feed it into
5620 * eval "qr'modified_pattern'"
5624 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5628 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5630 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5631 * and merge them with any code blocks of the original regexp.
5633 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5634 * instead, just save the qr and return FALSE; this tells our caller that
5635 * the original pattern needs upgrading to utf8.
5639 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5640 char *pat, STRLEN plen)
5644 GET_RE_DEBUG_FLAGS_DECL;
5646 if (pRExC_state->runtime_code_qr) {
5647 /* this is the second time we've been called; this should
5648 * only happen if the main pattern got upgraded to utf8
5649 * during compilation; re-use the qr we compiled first time
5650 * round (which should be utf8 too)
5652 qr = pRExC_state->runtime_code_qr;
5653 pRExC_state->runtime_code_qr = NULL;
5654 assert(RExC_utf8 && SvUTF8(qr));
5660 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5664 /* determine how many extra chars we need for ' and \ escaping */
5665 for (s = 0; s < plen; s++) {
5666 if (pat[s] == '\'' || pat[s] == '\\')
5670 Newx(newpat, newlen, char);
5672 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5674 for (s = 0; s < plen; s++) {
5675 if (n < pRExC_state->num_code_blocks
5676 && s == pRExC_state->code_blocks[n].start)
5678 /* blank out literal code block */
5679 assert(pat[s] == '(');
5680 while (s <= pRExC_state->code_blocks[n].end) {
5688 if (pat[s] == '\'' || pat[s] == '\\')
5693 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5697 PerlIO_printf(Perl_debug_log,
5698 "%sre-parsing pattern for runtime code:%s %s\n",
5699 PL_colors[4],PL_colors[5],newpat);
5702 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5708 PUSHSTACKi(PERLSI_REQUIRE);
5709 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5710 * parsing qr''; normally only q'' does this. It also alters
5712 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5713 SvREFCNT_dec_NN(sv);
5718 SV * const errsv = ERRSV;
5719 if (SvTRUE_NN(errsv))
5721 Safefree(pRExC_state->code_blocks);
5722 /* use croak_sv ? */
5723 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5726 assert(SvROK(qr_ref));
5728 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5729 /* the leaving below frees the tmp qr_ref.
5730 * Give qr a life of its own */
5738 if (!RExC_utf8 && SvUTF8(qr)) {
5739 /* first time through; the pattern got upgraded; save the
5740 * qr for the next time through */
5741 assert(!pRExC_state->runtime_code_qr);
5742 pRExC_state->runtime_code_qr = qr;
5747 /* extract any code blocks within the returned qr// */
5750 /* merge the main (r1) and run-time (r2) code blocks into one */
5752 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5753 struct reg_code_block *new_block, *dst;
5754 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5757 if (!r2->num_code_blocks) /* we guessed wrong */
5759 SvREFCNT_dec_NN(qr);
5764 r1->num_code_blocks + r2->num_code_blocks,
5765 struct reg_code_block);
5768 while ( i1 < r1->num_code_blocks
5769 || i2 < r2->num_code_blocks)
5771 struct reg_code_block *src;
5774 if (i1 == r1->num_code_blocks) {
5775 src = &r2->code_blocks[i2++];
5778 else if (i2 == r2->num_code_blocks)
5779 src = &r1->code_blocks[i1++];
5780 else if ( r1->code_blocks[i1].start
5781 < r2->code_blocks[i2].start)
5783 src = &r1->code_blocks[i1++];
5784 assert(src->end < r2->code_blocks[i2].start);
5787 assert( r1->code_blocks[i1].start
5788 > r2->code_blocks[i2].start);
5789 src = &r2->code_blocks[i2++];
5791 assert(src->end < r1->code_blocks[i1].start);
5794 assert(pat[src->start] == '(');
5795 assert(pat[src->end] == ')');
5796 dst->start = src->start;
5797 dst->end = src->end;
5798 dst->block = src->block;
5799 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5803 r1->num_code_blocks += r2->num_code_blocks;
5804 Safefree(r1->code_blocks);
5805 r1->code_blocks = new_block;
5808 SvREFCNT_dec_NN(qr);
5814 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5815 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5817 /* This is the common code for setting up the floating and fixed length
5818 * string data extracted from Perl_re_op_compile() below. Returns a boolean
5819 * as to whether succeeded or not */
5824 if (! (longest_length
5825 || (eol /* Can't have SEOL and MULTI */
5826 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5828 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5829 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5834 /* copy the information about the longest from the reg_scan_data
5835 over to the program. */
5836 if (SvUTF8(sv_longest)) {
5837 *rx_utf8 = sv_longest;
5840 *rx_substr = sv_longest;
5843 /* end_shift is how many chars that must be matched that
5844 follow this item. We calculate it ahead of time as once the
5845 lookbehind offset is added in we lose the ability to correctly
5847 ml = minlen ? *(minlen) : (SSize_t)longest_length;
5848 *rx_end_shift = ml - offset
5849 - longest_length + (SvTAIL(sv_longest) != 0)
5852 t = (eol/* Can't have SEOL and MULTI */
5853 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5854 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5860 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5861 * regular expression into internal code.
5862 * The pattern may be passed either as:
5863 * a list of SVs (patternp plus pat_count)
5864 * a list of OPs (expr)
5865 * If both are passed, the SV list is used, but the OP list indicates
5866 * which SVs are actually pre-compiled code blocks
5868 * The SVs in the list have magic and qr overloading applied to them (and
5869 * the list may be modified in-place with replacement SVs in the latter
5872 * If the pattern hasn't changed from old_re, then old_re will be
5875 * eng is the current engine. If that engine has an op_comp method, then
5876 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5877 * do the initial concatenation of arguments and pass on to the external
5880 * If is_bare_re is not null, set it to a boolean indicating whether the
5881 * arg list reduced (after overloading) to a single bare regex which has
5882 * been returned (i.e. /$qr/).
5884 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5886 * pm_flags contains the PMf_* flags, typically based on those from the
5887 * pm_flags field of the related PMOP. Currently we're only interested in
5888 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5890 * We can't allocate space until we know how big the compiled form will be,
5891 * but we can't compile it (and thus know how big it is) until we've got a
5892 * place to put the code. So we cheat: we compile it twice, once with code
5893 * generation turned off and size counting turned on, and once "for real".
5894 * This also means that we don't allocate space until we are sure that the
5895 * thing really will compile successfully, and we never have to move the
5896 * code and thus invalidate pointers into it. (Note that it has to be in
5897 * one piece because free() must be able to free it all.) [NB: not true in perl]
5899 * Beware that the optimization-preparation code in here knows about some
5900 * of the structure of the compiled regexp. [I'll say.]
5904 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5905 OP *expr, const regexp_engine* eng, REGEXP *old_re,
5906 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5911 regexp_internal *ri;
5919 SV *code_blocksv = NULL;
5920 SV** new_patternp = patternp;
5922 /* these are all flags - maybe they should be turned
5923 * into a single int with different bit masks */
5924 I32 sawlookahead = 0;
5929 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5931 bool runtime_code = 0;
5933 RExC_state_t RExC_state;
5934 RExC_state_t * const pRExC_state = &RExC_state;
5935 #ifdef TRIE_STUDY_OPT
5937 RExC_state_t copyRExC_state;
5939 GET_RE_DEBUG_FLAGS_DECL;
5941 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5943 DEBUG_r(if (!PL_colorset) reginitcolors());
5945 #ifndef PERL_IN_XSUB_RE
5946 /* Initialize these here instead of as-needed, as is quick and avoids
5947 * having to test them each time otherwise */
5948 if (! PL_AboveLatin1) {
5949 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5950 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5951 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5953 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5954 PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5955 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5957 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5958 = _new_invlist_C_array(L1PosixAlnum_invlist);
5959 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5960 = _new_invlist_C_array(PosixAlnum_invlist);
5962 PL_L1Posix_ptrs[_CC_ALPHA]
5963 = _new_invlist_C_array(L1PosixAlpha_invlist);
5964 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5966 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5967 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5969 /* Cased is the same as Alpha in the ASCII range */
5970 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5971 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5973 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5974 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5976 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5977 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5979 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5980 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5982 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5983 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5985 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5986 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5988 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5989 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5991 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5992 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5993 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5994 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5996 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5997 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5999 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
6001 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
6002 PL_L1Posix_ptrs[_CC_WORDCHAR]
6003 = _new_invlist_C_array(L1PosixWord_invlist);
6005 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
6006 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
6008 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
6012 pRExC_state->code_blocks = NULL;
6013 pRExC_state->num_code_blocks = 0;
6016 *is_bare_re = FALSE;
6018 if (expr && (expr->op_type == OP_LIST ||
6019 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6020 /* allocate code_blocks if needed */
6024 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6025 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6026 ncode++; /* count of DO blocks */
6028 pRExC_state->num_code_blocks = ncode;
6029 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6034 /* compile-time pattern with just OP_CONSTs and DO blocks */
6039 /* find how many CONSTs there are */
6042 if (expr->op_type == OP_CONST)
6045 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6046 if (o->op_type == OP_CONST)
6050 /* fake up an SV array */
6052 assert(!new_patternp);
6053 Newx(new_patternp, n, SV*);
6054 SAVEFREEPV(new_patternp);
6058 if (expr->op_type == OP_CONST)
6059 new_patternp[n] = cSVOPx_sv(expr);
6061 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6062 if (o->op_type == OP_CONST)
6063 new_patternp[n++] = cSVOPo_sv;
6068 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6069 "Assembling pattern from %d elements%s\n", pat_count,
6070 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6072 /* set expr to the first arg op */
6074 if (pRExC_state->num_code_blocks
6075 && expr->op_type != OP_CONST)
6077 expr = cLISTOPx(expr)->op_first;
6078 assert( expr->op_type == OP_PUSHMARK
6079 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6080 || expr->op_type == OP_PADRANGE);
6081 expr = expr->op_sibling;
6084 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6085 expr, &recompile, NULL);
6087 /* handle bare (possibly after overloading) regex: foo =~ $re */
6092 if (SvTYPE(re) == SVt_REGEXP) {
6096 Safefree(pRExC_state->code_blocks);
6097 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6098 "Precompiled pattern%s\n",
6099 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6105 exp = SvPV_nomg(pat, plen);
6107 if (!eng->op_comp) {
6108 if ((SvUTF8(pat) && IN_BYTES)
6109 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6111 /* make a temporary copy; either to convert to bytes,
6112 * or to avoid repeating get-magic / overloaded stringify */
6113 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6114 (IN_BYTES ? 0 : SvUTF8(pat)));
6116 Safefree(pRExC_state->code_blocks);
6117 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6120 /* ignore the utf8ness if the pattern is 0 length */
6121 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6122 RExC_uni_semantics = 0;
6123 RExC_contains_locale = 0;
6124 RExC_contains_i = 0;
6125 pRExC_state->runtime_code_qr = NULL;
6128 SV *dsv= sv_newmortal();
6129 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6130 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6131 PL_colors[4],PL_colors[5],s);
6135 /* we jump here if we upgrade the pattern to utf8 and have to
6138 if ((pm_flags & PMf_USE_RE_EVAL)
6139 /* this second condition covers the non-regex literal case,
6140 * i.e. $foo =~ '(?{})'. */
6141 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6143 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6145 /* return old regex if pattern hasn't changed */
6146 /* XXX: note in the below we have to check the flags as well as the pattern.
6148 * Things get a touch tricky as we have to compare the utf8 flag independently
6149 * from the compile flags.
6154 && !!RX_UTF8(old_re) == !!RExC_utf8
6155 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6156 && RX_PRECOMP(old_re)
6157 && RX_PRELEN(old_re) == plen
6158 && memEQ(RX_PRECOMP(old_re), exp, plen)
6159 && !runtime_code /* with runtime code, always recompile */ )
6161 Safefree(pRExC_state->code_blocks);
6165 rx_flags = orig_rx_flags;
6167 if (rx_flags & PMf_FOLD) {
6168 RExC_contains_i = 1;
6170 if (initial_charset == REGEX_LOCALE_CHARSET) {
6171 RExC_contains_locale = 1;
6173 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6175 /* Set to use unicode semantics if the pattern is in utf8 and has the
6176 * 'depends' charset specified, as it means unicode when utf8 */
6177 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6181 RExC_flags = rx_flags;
6182 RExC_pm_flags = pm_flags;
6185 if (TAINTING_get && TAINT_get)
6186 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6188 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6189 /* whoops, we have a non-utf8 pattern, whilst run-time code
6190 * got compiled as utf8. Try again with a utf8 pattern */
6191 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6192 pRExC_state->num_code_blocks);
6193 goto redo_first_pass;
6196 assert(!pRExC_state->runtime_code_qr);
6201 RExC_in_lookbehind = 0;
6202 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6204 RExC_override_recoding = 0;
6205 RExC_in_multi_char_class = 0;
6207 /* First pass: determine size, legality. */
6210 RExC_end = exp + plen;
6215 RExC_emit = (regnode *) &RExC_emit_dummy;
6216 RExC_whilem_seen = 0;
6217 RExC_open_parens = NULL;
6218 RExC_close_parens = NULL;
6220 RExC_paren_names = NULL;
6222 RExC_paren_name_list = NULL;
6224 RExC_recurse = NULL;
6225 RExC_study_chunk_recursed = NULL;
6226 RExC_study_chunk_recursed_bytes= 0;
6227 RExC_recurse_count = 0;
6228 pRExC_state->code_index = 0;
6230 #if 0 /* REGC() is (currently) a NOP at the first pass.
6231 * Clever compilers notice this and complain. --jhi */
6232 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6235 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6237 RExC_lastparse=NULL;
6239 /* reg may croak on us, not giving us a chance to free
6240 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6241 need it to survive as long as the regexp (qr/(?{})/).
6242 We must check that code_blocksv is not already set, because we may
6243 have jumped back to restart the sizing pass. */
6244 if (pRExC_state->code_blocks && !code_blocksv) {
6245 code_blocksv = newSV_type(SVt_PV);
6246 SAVEFREESV(code_blocksv);
6247 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6248 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6250 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6251 /* It's possible to write a regexp in ascii that represents Unicode
6252 codepoints outside of the byte range, such as via \x{100}. If we
6253 detect such a sequence we have to convert the entire pattern to utf8
6254 and then recompile, as our sizing calculation will have been based
6255 on 1 byte == 1 character, but we will need to use utf8 to encode
6256 at least some part of the pattern, and therefore must convert the whole
6259 if (flags & RESTART_UTF8) {
6260 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6261 pRExC_state->num_code_blocks);
6262 goto redo_first_pass;
6264 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6267 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6270 PerlIO_printf(Perl_debug_log,
6271 "Required size %"IVdf" nodes\n"
6272 "Starting second pass (creation)\n",
6275 RExC_lastparse=NULL;
6278 /* The first pass could have found things that force Unicode semantics */
6279 if ((RExC_utf8 || RExC_uni_semantics)
6280 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6282 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6285 /* Small enough for pointer-storage convention?
6286 If extralen==0, this means that we will not need long jumps. */
6287 if (RExC_size >= 0x10000L && RExC_extralen)
6288 RExC_size += RExC_extralen;
6291 if (RExC_whilem_seen > 15)
6292 RExC_whilem_seen = 15;
6294 /* Allocate space and zero-initialize. Note, the two step process
6295 of zeroing when in debug mode, thus anything assigned has to
6296 happen after that */
6297 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6299 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6300 char, regexp_internal);
6301 if ( r == NULL || ri == NULL )
6302 FAIL("Regexp out of space");
6304 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6305 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6307 /* bulk initialize base fields with 0. */
6308 Zero(ri, sizeof(regexp_internal), char);
6311 /* non-zero initialization begins here */
6314 r->extflags = rx_flags;
6315 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6317 if (pm_flags & PMf_IS_QR) {
6318 ri->code_blocks = pRExC_state->code_blocks;
6319 ri->num_code_blocks = pRExC_state->num_code_blocks;
6324 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6325 if (pRExC_state->code_blocks[n].src_regex)
6326 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6327 SAVEFREEPV(pRExC_state->code_blocks);
6331 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6332 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6334 /* The caret is output if there are any defaults: if not all the STD
6335 * flags are set, or if no character set specifier is needed */
6337 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6339 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6340 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6341 >> RXf_PMf_STD_PMMOD_SHIFT);
6342 const char *fptr = STD_PAT_MODS; /*"msix"*/
6344 /* Allocate for the worst case, which is all the std flags are turned
6345 * on. If more precision is desired, we could do a population count of
6346 * the flags set. This could be done with a small lookup table, or by
6347 * shifting, masking and adding, or even, when available, assembly
6348 * language for a machine-language population count.
6349 * We never output a minus, as all those are defaults, so are
6350 * covered by the caret */
6351 const STRLEN wraplen = plen + has_p + has_runon
6352 + has_default /* If needs a caret */
6354 /* If needs a character set specifier */
6355 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6356 + (sizeof(STD_PAT_MODS) - 1)
6357 + (sizeof("(?:)") - 1);
6359 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6360 r->xpv_len_u.xpvlenu_pv = p;
6362 SvFLAGS(rx) |= SVf_UTF8;
6365 /* If a default, cover it using the caret */
6367 *p++= DEFAULT_PAT_MOD;
6371 const char* const name = get_regex_charset_name(r->extflags, &len);
6372 Copy(name, p, len, char);
6376 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6379 while((ch = *fptr++)) {
6387 Copy(RExC_precomp, p, plen, char);
6388 assert ((RX_WRAPPED(rx) - p) < 16);
6389 r->pre_prefix = p - RX_WRAPPED(rx);
6395 SvCUR_set(rx, p - RX_WRAPPED(rx));
6399 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6401 /* setup various meta data about recursion, this all requires
6402 * RExC_npar to be correctly set, and a bit later on we clear it */
6403 if (RExC_seen & REG_SEEN_RECURSE) {
6404 Newxz(RExC_open_parens, RExC_npar,regnode *);
6405 SAVEFREEPV(RExC_open_parens);
6406 Newxz(RExC_close_parens,RExC_npar,regnode *);
6407 SAVEFREEPV(RExC_close_parens);
6409 if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) {
6410 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6411 * So its 1 if there are no parens. */
6412 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6413 ((RExC_npar & 0x07) != 0);
6414 Newx(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6415 SAVEFREEPV(RExC_study_chunk_recursed);
6418 /* Useful during FAIL. */
6419 #ifdef RE_TRACK_PATTERN_OFFSETS
6420 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6421 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6422 "%s %"UVuf" bytes for offset annotations.\n",
6423 ri->u.offsets ? "Got" : "Couldn't get",
6424 (UV)((2*RExC_size+1) * sizeof(U32))));
6426 SetProgLen(ri,RExC_size);
6431 /* Second pass: emit code. */
6432 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6433 RExC_pm_flags = pm_flags;
6435 RExC_end = exp + plen;
6438 RExC_emit_start = ri->program;
6439 RExC_emit = ri->program;
6440 RExC_emit_bound = ri->program + RExC_size + 1;
6441 pRExC_state->code_index = 0;
6443 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6444 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6446 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6448 /* XXXX To minimize changes to RE engine we always allocate
6449 3-units-long substrs field. */
6450 Newx(r->substrs, 1, struct reg_substr_data);
6451 if (RExC_recurse_count) {
6452 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6453 SAVEFREEPV(RExC_recurse);
6457 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6458 Zero(r->substrs, 1, struct reg_substr_data);
6459 if (RExC_study_chunk_recursed)
6460 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6462 #ifdef TRIE_STUDY_OPT
6464 StructCopy(&zero_scan_data, &data, scan_data_t);
6465 copyRExC_state = RExC_state;
6468 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6470 RExC_state = copyRExC_state;
6471 if (seen & REG_TOP_LEVEL_BRANCHES)
6472 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6474 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6475 StructCopy(&zero_scan_data, &data, scan_data_t);
6478 StructCopy(&zero_scan_data, &data, scan_data_t);
6481 /* Dig out information for optimizations. */
6482 r->extflags = RExC_flags; /* was pm_op */
6483 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6486 SvUTF8_on(rx); /* Unicode in it? */
6487 ri->regstclass = NULL;
6488 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6489 r->intflags |= PREGf_NAUGHTY;
6490 scan = ri->program + 1; /* First BRANCH. */
6492 /* testing for BRANCH here tells us whether there is "must appear"
6493 data in the pattern. If there is then we can use it for optimisations */
6494 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6496 STRLEN longest_float_length, longest_fixed_length;
6497 regnode_ssc ch_class; /* pointed to by data */
6499 SSize_t last_close = 0; /* pointed to by data */
6500 regnode *first= scan;
6501 regnode *first_next= regnext(first);
6503 * Skip introductions and multiplicators >= 1
6504 * so that we can extract the 'meat' of the pattern that must
6505 * match in the large if() sequence following.
6506 * NOTE that EXACT is NOT covered here, as it is normally
6507 * picked up by the optimiser separately.
6509 * This is unfortunate as the optimiser isnt handling lookahead
6510 * properly currently.
6513 while ((OP(first) == OPEN && (sawopen = 1)) ||
6514 /* An OR of *one* alternative - should not happen now. */
6515 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6516 /* for now we can't handle lookbehind IFMATCH*/
6517 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6518 (OP(first) == PLUS) ||
6519 (OP(first) == MINMOD) ||
6520 /* An {n,m} with n>0 */
6521 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6522 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6525 * the only op that could be a regnode is PLUS, all the rest
6526 * will be regnode_1 or regnode_2.
6528 * (yves doesn't think this is true)
6530 if (OP(first) == PLUS)
6533 if (OP(first) == MINMOD)
6535 first += regarglen[OP(first)];
6537 first = NEXTOPER(first);
6538 first_next= regnext(first);
6541 /* Starting-point info. */
6543 DEBUG_PEEP("first:",first,0);
6544 /* Ignore EXACT as we deal with it later. */
6545 if (PL_regkind[OP(first)] == EXACT) {
6546 if (OP(first) == EXACT)
6547 NOOP; /* Empty, get anchored substr later. */
6549 ri->regstclass = first;
6552 else if (PL_regkind[OP(first)] == TRIE &&
6553 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6556 /* this can happen only on restudy */
6557 if ( OP(first) == TRIE ) {
6558 struct regnode_1 *trieop = (struct regnode_1 *)
6559 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6560 StructCopy(first,trieop,struct regnode_1);
6561 trie_op=(regnode *)trieop;
6563 struct regnode_charclass *trieop = (struct regnode_charclass *)
6564 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6565 StructCopy(first,trieop,struct regnode_charclass);
6566 trie_op=(regnode *)trieop;
6569 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6570 ri->regstclass = trie_op;
6573 else if (REGNODE_SIMPLE(OP(first)))
6574 ri->regstclass = first;
6575 else if (PL_regkind[OP(first)] == BOUND ||
6576 PL_regkind[OP(first)] == NBOUND)
6577 ri->regstclass = first;
6578 else if (PL_regkind[OP(first)] == BOL) {
6579 r->extflags |= (OP(first) == MBOL
6581 : (OP(first) == SBOL
6584 first = NEXTOPER(first);
6587 else if (OP(first) == GPOS) {
6588 r->extflags |= RXf_ANCH_GPOS;
6589 first = NEXTOPER(first);
6592 else if ((!sawopen || !RExC_sawback) &&
6593 (OP(first) == STAR &&
6594 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6595 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6597 /* turn .* into ^.* with an implied $*=1 */
6599 (OP(NEXTOPER(first)) == REG_ANY)
6602 r->extflags |= type;
6603 r->intflags |= PREGf_IMPLICIT;
6604 first = NEXTOPER(first);
6607 if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6608 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6609 /* x+ must match at the 1st pos of run of x's */
6610 r->intflags |= PREGf_SKIP;
6612 /* Scan is after the zeroth branch, first is atomic matcher. */
6613 #ifdef TRIE_STUDY_OPT
6616 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6617 (IV)(first - scan + 1))
6621 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6622 (IV)(first - scan + 1))
6628 * If there's something expensive in the r.e., find the
6629 * longest literal string that must appear and make it the
6630 * regmust. Resolve ties in favor of later strings, since
6631 * the regstart check works with the beginning of the r.e.
6632 * and avoiding duplication strengthens checking. Not a
6633 * strong reason, but sufficient in the absence of others.
6634 * [Now we resolve ties in favor of the earlier string if
6635 * it happens that c_offset_min has been invalidated, since the
6636 * earlier string may buy us something the later one won't.]
6639 data.longest_fixed = newSVpvs("");
6640 data.longest_float = newSVpvs("");
6641 data.last_found = newSVpvs("");
6642 data.longest = &(data.longest_fixed);
6643 ENTER_with_name("study_chunk");
6644 SAVEFREESV(data.longest_fixed);
6645 SAVEFREESV(data.longest_float);
6646 SAVEFREESV(data.last_found);
6648 if (!ri->regstclass) {
6649 ssc_init(pRExC_state, &ch_class);
6650 data.start_class = &ch_class;
6651 stclass_flag = SCF_DO_STCLASS_AND;
6652 } else /* XXXX Check for BOUND? */
6654 data.last_closep = &last_close;
6657 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6659 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6660 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6664 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6667 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6668 && data.last_start_min == 0 && data.last_end > 0
6669 && !RExC_seen_zerolen
6670 && !(RExC_seen & REG_SEEN_VERBARG)
6671 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6672 r->extflags |= RXf_CHECK_ALL;
6673 scan_commit(pRExC_state, &data,&minlen,0);
6675 longest_float_length = CHR_SVLEN(data.longest_float);
6677 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6678 && data.offset_fixed == data.offset_float_min
6679 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6680 && S_setup_longest (aTHX_ pRExC_state,
6684 &(r->float_end_shift),
6685 data.lookbehind_float,
6686 data.offset_float_min,
6688 longest_float_length,
6689 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6690 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6692 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6693 r->float_max_offset = data.offset_float_max;
6694 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6695 r->float_max_offset -= data.lookbehind_float;
6696 SvREFCNT_inc_simple_void_NN(data.longest_float);
6699 r->float_substr = r->float_utf8 = NULL;
6700 longest_float_length = 0;
6703 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6705 if (S_setup_longest (aTHX_ pRExC_state,
6707 &(r->anchored_utf8),
6708 &(r->anchored_substr),
6709 &(r->anchored_end_shift),
6710 data.lookbehind_fixed,
6713 longest_fixed_length,
6714 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6715 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6717 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6718 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6721 r->anchored_substr = r->anchored_utf8 = NULL;
6722 longest_fixed_length = 0;
6724 LEAVE_with_name("study_chunk");
6727 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6728 ri->regstclass = NULL;
6730 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6732 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6733 && !ssc_is_anything(data.start_class))
6735 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6737 ssc_finalize(pRExC_state, data.start_class);
6739 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6740 StructCopy(data.start_class,
6741 (regnode_ssc*)RExC_rxi->data->data[n],
6743 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6744 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6745 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6746 regprop(r, sv, (regnode*)data.start_class);
6747 PerlIO_printf(Perl_debug_log,
6748 "synthetic stclass \"%s\".\n",
6749 SvPVX_const(sv));});
6750 data.start_class = NULL;
6753 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6754 if (longest_fixed_length > longest_float_length) {
6755 r->check_end_shift = r->anchored_end_shift;
6756 r->check_substr = r->anchored_substr;
6757 r->check_utf8 = r->anchored_utf8;
6758 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6759 if (r->extflags & RXf_ANCH_SINGLE)
6760 r->extflags |= RXf_NOSCAN;
6763 r->check_end_shift = r->float_end_shift;
6764 r->check_substr = r->float_substr;
6765 r->check_utf8 = r->float_utf8;
6766 r->check_offset_min = r->float_min_offset;
6767 r->check_offset_max = r->float_max_offset;
6769 if ((r->check_substr || r->check_utf8) ) {
6770 r->extflags |= RXf_USE_INTUIT;
6771 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6772 r->extflags |= RXf_INTUIT_TAIL;
6774 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6775 if ( (STRLEN)minlen < longest_float_length )
6776 minlen= longest_float_length;
6777 if ( (STRLEN)minlen < longest_fixed_length )
6778 minlen= longest_fixed_length;
6782 /* Several toplevels. Best we can is to set minlen. */
6784 regnode_ssc ch_class;
6785 SSize_t last_close = 0;
6787 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6789 scan = ri->program + 1;
6790 ssc_init(pRExC_state, &ch_class);
6791 data.start_class = &ch_class;
6792 data.last_closep = &last_close;
6795 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6797 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6798 |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6801 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6803 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6804 = r->float_substr = r->float_utf8 = NULL;
6806 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6807 && ! ssc_is_anything(data.start_class))
6809 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6811 ssc_finalize(pRExC_state, data.start_class);
6813 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6814 StructCopy(data.start_class,
6815 (regnode_ssc*)RExC_rxi->data->data[n],
6817 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6818 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6819 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6820 regprop(r, sv, (regnode*)data.start_class);
6821 PerlIO_printf(Perl_debug_log,
6822 "synthetic stclass \"%s\".\n",
6823 SvPVX_const(sv));});
6824 data.start_class = NULL;
6828 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6829 the "real" pattern. */
6831 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6832 (IV)minlen, (IV)r->minlen);
6834 r->minlenret = minlen;
6835 if (r->minlen < minlen)
6838 if (RExC_seen & REG_SEEN_GPOS)
6839 r->extflags |= RXf_GPOS_SEEN;
6840 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6841 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6842 if (pRExC_state->num_code_blocks)
6843 r->extflags |= RXf_EVAL_SEEN;
6844 if (RExC_seen & REG_SEEN_CANY)
6845 r->extflags |= RXf_CANY_SEEN;
6846 if (RExC_seen & REG_SEEN_VERBARG)
6848 r->intflags |= PREGf_VERBARG_SEEN;
6849 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6851 if (RExC_seen & REG_SEEN_CUTGROUP)
6852 r->intflags |= PREGf_CUTGROUP_SEEN;
6853 if (pm_flags & PMf_USE_RE_EVAL)
6854 r->intflags |= PREGf_USE_RE_EVAL;
6855 if (RExC_paren_names)
6856 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6858 RXp_PAREN_NAMES(r) = NULL;
6861 regnode *first = ri->program + 1;
6863 regnode *next = NEXTOPER(first);
6866 if (PL_regkind[fop] == NOTHING && nop == END)
6867 r->extflags |= RXf_NULL;
6868 else if (PL_regkind[fop] == BOL && nop == END)
6869 r->extflags |= RXf_START_ONLY;
6870 else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6871 r->extflags |= RXf_WHITE;
6872 else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6873 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6877 if (RExC_paren_names) {
6878 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6879 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6882 ri->name_list_idx = 0;
6884 if (RExC_recurse_count) {
6885 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6886 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6887 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6890 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6891 /* assume we don't need to swap parens around before we match */
6895 PerlIO_printf(Perl_debug_log,"Final program:\n");
6898 #ifdef RE_TRACK_PATTERN_OFFSETS
6899 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6900 const STRLEN len = ri->u.offsets[0];
6902 GET_RE_DEBUG_FLAGS_DECL;
6903 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6904 for (i = 1; i <= len; i++) {
6905 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6906 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6907 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6909 PerlIO_printf(Perl_debug_log, "\n");
6914 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6915 * by setting the regexp SV to readonly-only instead. If the
6916 * pattern's been recompiled, the USEDness should remain. */
6917 if (old_re && SvREADONLY(old_re))
6925 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6928 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6930 PERL_UNUSED_ARG(value);
6932 if (flags & RXapif_FETCH) {
6933 return reg_named_buff_fetch(rx, key, flags);
6934 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6935 Perl_croak_no_modify();
6937 } else if (flags & RXapif_EXISTS) {
6938 return reg_named_buff_exists(rx, key, flags)
6941 } else if (flags & RXapif_REGNAMES) {
6942 return reg_named_buff_all(rx, flags);
6943 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6944 return reg_named_buff_scalar(rx, flags);
6946 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6952 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6955 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6956 PERL_UNUSED_ARG(lastkey);
6958 if (flags & RXapif_FIRSTKEY)
6959 return reg_named_buff_firstkey(rx, flags);
6960 else if (flags & RXapif_NEXTKEY)
6961 return reg_named_buff_nextkey(rx, flags);
6963 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6969 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6972 AV *retarray = NULL;
6974 struct regexp *const rx = ReANY(r);
6976 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6978 if (flags & RXapif_ALL)
6981 if (rx && RXp_PAREN_NAMES(rx)) {
6982 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6985 SV* sv_dat=HeVAL(he_str);
6986 I32 *nums=(I32*)SvPVX(sv_dat);
6987 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6988 if ((I32)(rx->nparens) >= nums[i]
6989 && rx->offs[nums[i]].start != -1
6990 && rx->offs[nums[i]].end != -1)
6993 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6998 ret = newSVsv(&PL_sv_undef);
7001 av_push(retarray, ret);
7004 return newRV_noinc(MUTABLE_SV(retarray));
7011 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7014 struct regexp *const rx = ReANY(r);
7016 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7018 if (rx && RXp_PAREN_NAMES(rx)) {
7019 if (flags & RXapif_ALL) {
7020 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7022 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7024 SvREFCNT_dec_NN(sv);
7036 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7038 struct regexp *const rx = ReANY(r);
7040 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7042 if ( rx && RXp_PAREN_NAMES(rx) ) {
7043 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7045 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7052 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7054 struct regexp *const rx = ReANY(r);
7055 GET_RE_DEBUG_FLAGS_DECL;
7057 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7059 if (rx && RXp_PAREN_NAMES(rx)) {
7060 HV *hv = RXp_PAREN_NAMES(rx);
7062 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7065 SV* sv_dat = HeVAL(temphe);
7066 I32 *nums = (I32*)SvPVX(sv_dat);
7067 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7068 if ((I32)(rx->lastparen) >= nums[i] &&
7069 rx->offs[nums[i]].start != -1 &&
7070 rx->offs[nums[i]].end != -1)
7076 if (parno || flags & RXapif_ALL) {
7077 return newSVhek(HeKEY_hek(temphe));
7085 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7090 struct regexp *const rx = ReANY(r);
7092 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7094 if (rx && RXp_PAREN_NAMES(rx)) {
7095 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7096 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7097 } else if (flags & RXapif_ONE) {
7098 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7099 av = MUTABLE_AV(SvRV(ret));
7100 length = av_len(av);
7101 SvREFCNT_dec_NN(ret);
7102 return newSViv(length + 1);
7104 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
7108 return &PL_sv_undef;
7112 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7114 struct regexp *const rx = ReANY(r);
7117 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7119 if (rx && RXp_PAREN_NAMES(rx)) {
7120 HV *hv= RXp_PAREN_NAMES(rx);
7122 (void)hv_iterinit(hv);
7123 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7126 SV* sv_dat = HeVAL(temphe);
7127 I32 *nums = (I32*)SvPVX(sv_dat);
7128 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7129 if ((I32)(rx->lastparen) >= nums[i] &&
7130 rx->offs[nums[i]].start != -1 &&
7131 rx->offs[nums[i]].end != -1)
7137 if (parno || flags & RXapif_ALL) {
7138 av_push(av, newSVhek(HeKEY_hek(temphe)));
7143 return newRV_noinc(MUTABLE_SV(av));
7147 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7150 struct regexp *const rx = ReANY(r);
7156 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7158 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7159 || n == RX_BUFF_IDX_CARET_FULLMATCH
7160 || n == RX_BUFF_IDX_CARET_POSTMATCH
7163 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7165 /* on something like
7168 * the KEEPCOPY is set on the PMOP rather than the regex */
7169 if (PL_curpm && r == PM_GETRE(PL_curpm))
7170 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7179 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7180 /* no need to distinguish between them any more */
7181 n = RX_BUFF_IDX_FULLMATCH;
7183 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7184 && rx->offs[0].start != -1)
7186 /* $`, ${^PREMATCH} */
7187 i = rx->offs[0].start;
7191 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7192 && rx->offs[0].end != -1)
7194 /* $', ${^POSTMATCH} */
7195 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7196 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7199 if ( 0 <= n && n <= (I32)rx->nparens &&
7200 (s1 = rx->offs[n].start) != -1 &&
7201 (t1 = rx->offs[n].end) != -1)
7203 /* $&, ${^MATCH}, $1 ... */
7205 s = rx->subbeg + s1 - rx->suboffset;
7210 assert(s >= rx->subbeg);
7211 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7213 #if NO_TAINT_SUPPORT
7214 sv_setpvn(sv, s, i);
7216 const int oldtainted = TAINT_get;
7218 sv_setpvn(sv, s, i);
7219 TAINT_set(oldtainted);
7221 if ( (rx->extflags & RXf_CANY_SEEN)
7222 ? (RXp_MATCH_UTF8(rx)
7223 && (!i || is_utf8_string((U8*)s, i)))
7224 : (RXp_MATCH_UTF8(rx)) )
7231 if (RXp_MATCH_TAINTED(rx)) {
7232 if (SvTYPE(sv) >= SVt_PVMG) {
7233 MAGIC* const mg = SvMAGIC(sv);
7236 SvMAGIC_set(sv, mg->mg_moremagic);
7238 if ((mgt = SvMAGIC(sv))) {
7239 mg->mg_moremagic = mgt;
7240 SvMAGIC_set(sv, mg);
7251 sv_setsv(sv,&PL_sv_undef);
7257 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7258 SV const * const value)
7260 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7262 PERL_UNUSED_ARG(rx);
7263 PERL_UNUSED_ARG(paren);
7264 PERL_UNUSED_ARG(value);
7267 Perl_croak_no_modify();
7271 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7274 struct regexp *const rx = ReANY(r);
7278 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7280 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7281 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7282 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7285 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7287 /* on something like
7290 * the KEEPCOPY is set on the PMOP rather than the regex */
7291 if (PL_curpm && r == PM_GETRE(PL_curpm))
7292 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7298 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7300 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7301 case RX_BUFF_IDX_PREMATCH: /* $` */
7302 if (rx->offs[0].start != -1) {
7303 i = rx->offs[0].start;
7312 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7313 case RX_BUFF_IDX_POSTMATCH: /* $' */
7314 if (rx->offs[0].end != -1) {
7315 i = rx->sublen - rx->offs[0].end;
7317 s1 = rx->offs[0].end;
7324 default: /* $& / ${^MATCH}, $1, $2, ... */
7325 if (paren <= (I32)rx->nparens &&
7326 (s1 = rx->offs[paren].start) != -1 &&
7327 (t1 = rx->offs[paren].end) != -1)
7333 if (ckWARN(WARN_UNINITIALIZED))
7334 report_uninit((const SV *)sv);
7339 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7340 const char * const s = rx->subbeg - rx->suboffset + s1;
7345 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7352 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7354 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7355 PERL_UNUSED_ARG(rx);
7359 return newSVpvs("Regexp");
7362 /* Scans the name of a named buffer from the pattern.
7363 * If flags is REG_RSN_RETURN_NULL returns null.
7364 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7365 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7366 * to the parsed name as looked up in the RExC_paren_names hash.
7367 * If there is an error throws a vFAIL().. type exception.
7370 #define REG_RSN_RETURN_NULL 0
7371 #define REG_RSN_RETURN_NAME 1
7372 #define REG_RSN_RETURN_DATA 2
7375 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7377 char *name_start = RExC_parse;
7379 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7381 assert (RExC_parse <= RExC_end);
7382 if (RExC_parse == RExC_end) NOOP;
7383 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7384 /* skip IDFIRST by using do...while */
7387 RExC_parse += UTF8SKIP(RExC_parse);
7388 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7392 } while (isWORDCHAR(*RExC_parse));
7394 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7395 vFAIL("Group name must start with a non-digit word character");
7399 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7400 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7401 if ( flags == REG_RSN_RETURN_NAME)
7403 else if (flags==REG_RSN_RETURN_DATA) {
7406 if ( ! sv_name ) /* should not happen*/
7407 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7408 if (RExC_paren_names)
7409 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7411 sv_dat = HeVAL(he_str);
7413 vFAIL("Reference to nonexistent named group");
7417 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7418 (unsigned long) flags);
7420 assert(0); /* NOT REACHED */
7425 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7426 int rem=(int)(RExC_end - RExC_parse); \
7435 if (RExC_lastparse!=RExC_parse) \
7436 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7439 iscut ? "..." : "<" \
7442 PerlIO_printf(Perl_debug_log,"%16s",""); \
7445 num = RExC_size + 1; \
7447 num=REG_NODE_NUM(RExC_emit); \
7448 if (RExC_lastnum!=num) \
7449 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7451 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7452 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7453 (int)((depth*2)), "", \
7457 RExC_lastparse=RExC_parse; \
7462 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7463 DEBUG_PARSE_MSG((funcname)); \
7464 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7466 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7467 DEBUG_PARSE_MSG((funcname)); \
7468 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7471 /* This section of code defines the inversion list object and its methods. The
7472 * interfaces are highly subject to change, so as much as possible is static to
7473 * this file. An inversion list is here implemented as a malloc'd C UV array
7474 * as an SVt_INVLIST scalar.
7476 * An inversion list for Unicode is an array of code points, sorted by ordinal
7477 * number. The zeroth element is the first code point in the list. The 1th
7478 * element is the first element beyond that not in the list. In other words,
7479 * the first range is
7480 * invlist[0]..(invlist[1]-1)
7481 * The other ranges follow. Thus every element whose index is divisible by two
7482 * marks the beginning of a range that is in the list, and every element not
7483 * divisible by two marks the beginning of a range not in the list. A single
7484 * element inversion list that contains the single code point N generally
7485 * consists of two elements
7488 * (The exception is when N is the highest representable value on the
7489 * machine, in which case the list containing just it would be a single
7490 * element, itself. By extension, if the last range in the list extends to
7491 * infinity, then the first element of that range will be in the inversion list
7492 * at a position that is divisible by two, and is the final element in the
7494 * Taking the complement (inverting) an inversion list is quite simple, if the
7495 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7496 * This implementation reserves an element at the beginning of each inversion
7497 * list to always contain 0; there is an additional flag in the header which
7498 * indicates if the list begins at the 0, or is offset to begin at the next
7501 * More about inversion lists can be found in "Unicode Demystified"
7502 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7503 * More will be coming when functionality is added later.
7505 * The inversion list data structure is currently implemented as an SV pointing
7506 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7507 * array of UV whose memory management is automatically handled by the existing
7508 * facilities for SV's.
7510 * Some of the methods should always be private to the implementation, and some
7511 * should eventually be made public */
7513 /* The header definitions are in F<inline_invlist.c> */
7515 PERL_STATIC_INLINE UV*
7516 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7518 /* Returns a pointer to the first element in the inversion list's array.
7519 * This is called upon initialization of an inversion list. Where the
7520 * array begins depends on whether the list has the code point U+0000 in it
7521 * or not. The other parameter tells it whether the code that follows this
7522 * call is about to put a 0 in the inversion list or not. The first
7523 * element is either the element reserved for 0, if TRUE, or the element
7524 * after it, if FALSE */
7526 bool* offset = get_invlist_offset_addr(invlist);
7527 UV* zero_addr = (UV *) SvPVX(invlist);
7529 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7532 assert(! _invlist_len(invlist));
7536 /* 1^1 = 0; 1^0 = 1 */
7537 *offset = 1 ^ will_have_0;
7538 return zero_addr + *offset;
7541 PERL_STATIC_INLINE UV*
7542 S_invlist_array(pTHX_ SV* const invlist)
7544 /* Returns the pointer to the inversion list's array. Every time the
7545 * length changes, this needs to be called in case malloc or realloc moved
7548 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7550 /* Must not be empty. If these fail, you probably didn't check for <len>
7551 * being non-zero before trying to get the array */
7552 assert(_invlist_len(invlist));
7554 /* The very first element always contains zero, The array begins either
7555 * there, or if the inversion list is offset, at the element after it.
7556 * The offset header field determines which; it contains 0 or 1 to indicate
7557 * how much additionally to add */
7558 assert(0 == *(SvPVX(invlist)));
7559 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7562 PERL_STATIC_INLINE void
7563 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7565 /* Sets the current number of elements stored in the inversion list.
7566 * Updates SvCUR correspondingly */
7568 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7570 assert(SvTYPE(invlist) == SVt_INVLIST);
7575 : TO_INTERNAL_SIZE(len + offset));
7576 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7579 PERL_STATIC_INLINE IV*
7580 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7582 /* Return the address of the IV that is reserved to hold the cached index
7585 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7587 assert(SvTYPE(invlist) == SVt_INVLIST);
7589 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7592 PERL_STATIC_INLINE IV
7593 S_invlist_previous_index(pTHX_ SV* const invlist)
7595 /* Returns cached index of previous search */
7597 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7599 return *get_invlist_previous_index_addr(invlist);
7602 PERL_STATIC_INLINE void
7603 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7605 /* Caches <index> for later retrieval */
7607 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7609 assert(index == 0 || index < (int) _invlist_len(invlist));
7611 *get_invlist_previous_index_addr(invlist) = index;
7614 PERL_STATIC_INLINE UV
7615 S_invlist_max(pTHX_ SV* const invlist)
7617 /* Returns the maximum number of elements storable in the inversion list's
7618 * array, without having to realloc() */
7620 PERL_ARGS_ASSERT_INVLIST_MAX;
7622 assert(SvTYPE(invlist) == SVt_INVLIST);
7624 /* Assumes worst case, in which the 0 element is not counted in the
7625 * inversion list, so subtracts 1 for that */
7626 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7627 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7628 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7631 #ifndef PERL_IN_XSUB_RE
7633 Perl__new_invlist(pTHX_ IV initial_size)
7636 /* Return a pointer to a newly constructed inversion list, with enough
7637 * space to store 'initial_size' elements. If that number is negative, a
7638 * system default is used instead */
7642 if (initial_size < 0) {
7646 /* Allocate the initial space */
7647 new_list = newSV_type(SVt_INVLIST);
7649 /* First 1 is in case the zero element isn't in the list; second 1 is for
7651 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7652 invlist_set_len(new_list, 0, 0);
7654 /* Force iterinit() to be used to get iteration to work */
7655 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7657 *get_invlist_previous_index_addr(new_list) = 0;
7664 S__new_invlist_C_array(pTHX_ const UV* const list)
7666 /* Return a pointer to a newly constructed inversion list, initialized to
7667 * point to <list>, which has to be in the exact correct inversion list
7668 * form, including internal fields. Thus this is a dangerous routine that
7669 * should not be used in the wrong hands. The passed in 'list' contains
7670 * several header fields at the beginning that are not part of the
7671 * inversion list body proper */
7673 const STRLEN length = (STRLEN) list[0];
7674 const UV version_id = list[1];
7675 const bool offset = cBOOL(list[2]);
7676 #define HEADER_LENGTH 3
7677 /* If any of the above changes in any way, you must change HEADER_LENGTH
7678 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7679 * perl -E 'say int(rand 2**31-1)'
7681 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7682 data structure type, so that one being
7683 passed in can be validated to be an
7684 inversion list of the correct vintage.
7687 SV* invlist = newSV_type(SVt_INVLIST);
7689 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7691 if (version_id != INVLIST_VERSION_ID) {
7692 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7695 /* The generated array passed in includes header elements that aren't part
7696 * of the list proper, so start it just after them */
7697 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7699 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7700 shouldn't touch it */
7702 *(get_invlist_offset_addr(invlist)) = offset;
7704 /* The 'length' passed to us is the physical number of elements in the
7705 * inversion list. But if there is an offset the logical number is one
7707 invlist_set_len(invlist, length - offset, offset);
7709 invlist_set_previous_index(invlist, 0);
7711 /* Initialize the iteration pointer. */
7712 invlist_iterfinish(invlist);
7718 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7720 /* Grow the maximum size of an inversion list */
7722 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7724 assert(SvTYPE(invlist) == SVt_INVLIST);
7726 /* Add one to account for the zero element at the beginning which may not
7727 * be counted by the calling parameters */
7728 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7731 PERL_STATIC_INLINE void
7732 S_invlist_trim(pTHX_ SV* const invlist)
7734 PERL_ARGS_ASSERT_INVLIST_TRIM;
7736 assert(SvTYPE(invlist) == SVt_INVLIST);
7738 /* Change the length of the inversion list to how many entries it currently
7740 SvPV_shrink_to_cur((SV *) invlist);
7744 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7746 /* Subject to change or removal. Append the range from 'start' to 'end' at
7747 * the end of the inversion list. The range must be above any existing
7751 UV max = invlist_max(invlist);
7752 UV len = _invlist_len(invlist);
7755 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7757 if (len == 0) { /* Empty lists must be initialized */
7758 offset = start != 0;
7759 array = _invlist_array_init(invlist, ! offset);
7762 /* Here, the existing list is non-empty. The current max entry in the
7763 * list is generally the first value not in the set, except when the
7764 * set extends to the end of permissible values, in which case it is
7765 * the first entry in that final set, and so this call is an attempt to
7766 * append out-of-order */
7768 UV final_element = len - 1;
7769 array = invlist_array(invlist);
7770 if (array[final_element] > start
7771 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7773 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7774 array[final_element], start,
7775 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7778 /* Here, it is a legal append. If the new range begins with the first
7779 * value not in the set, it is extending the set, so the new first
7780 * value not in the set is one greater than the newly extended range.
7782 offset = *get_invlist_offset_addr(invlist);
7783 if (array[final_element] == start) {
7784 if (end != UV_MAX) {
7785 array[final_element] = end + 1;
7788 /* But if the end is the maximum representable on the machine,
7789 * just let the range that this would extend to have no end */
7790 invlist_set_len(invlist, len - 1, offset);
7796 /* Here the new range doesn't extend any existing set. Add it */
7798 len += 2; /* Includes an element each for the start and end of range */
7800 /* If wll overflow the existing space, extend, which may cause the array to
7803 invlist_extend(invlist, len);
7805 /* Have to set len here to avoid assert failure in invlist_array() */
7806 invlist_set_len(invlist, len, offset);
7808 array = invlist_array(invlist);
7811 invlist_set_len(invlist, len, offset);
7814 /* The next item on the list starts the range, the one after that is
7815 * one past the new range. */
7816 array[len - 2] = start;
7817 if (end != UV_MAX) {
7818 array[len - 1] = end + 1;
7821 /* But if the end is the maximum representable on the machine, just let
7822 * the range have no end */
7823 invlist_set_len(invlist, len - 1, offset);
7827 #ifndef PERL_IN_XSUB_RE
7830 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7832 /* Searches the inversion list for the entry that contains the input code
7833 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7834 * return value is the index into the list's array of the range that
7839 IV high = _invlist_len(invlist);
7840 const IV highest_element = high - 1;
7843 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7845 /* If list is empty, return failure. */
7850 /* (We can't get the array unless we know the list is non-empty) */
7851 array = invlist_array(invlist);
7853 mid = invlist_previous_index(invlist);
7854 assert(mid >=0 && mid <= highest_element);
7856 /* <mid> contains the cache of the result of the previous call to this
7857 * function (0 the first time). See if this call is for the same result,
7858 * or if it is for mid-1. This is under the theory that calls to this
7859 * function will often be for related code points that are near each other.
7860 * And benchmarks show that caching gives better results. We also test
7861 * here if the code point is within the bounds of the list. These tests
7862 * replace others that would have had to be made anyway to make sure that
7863 * the array bounds were not exceeded, and these give us extra information
7864 * at the same time */
7865 if (cp >= array[mid]) {
7866 if (cp >= array[highest_element]) {
7867 return highest_element;
7870 /* Here, array[mid] <= cp < array[highest_element]. This means that
7871 * the final element is not the answer, so can exclude it; it also
7872 * means that <mid> is not the final element, so can refer to 'mid + 1'
7874 if (cp < array[mid + 1]) {
7880 else { /* cp < aray[mid] */
7881 if (cp < array[0]) { /* Fail if outside the array */
7885 if (cp >= array[mid - 1]) {
7890 /* Binary search. What we are looking for is <i> such that
7891 * array[i] <= cp < array[i+1]
7892 * The loop below converges on the i+1. Note that there may not be an
7893 * (i+1)th element in the array, and things work nonetheless */
7894 while (low < high) {
7895 mid = (low + high) / 2;
7896 assert(mid <= highest_element);
7897 if (array[mid] <= cp) { /* cp >= array[mid] */
7900 /* We could do this extra test to exit the loop early.
7901 if (cp < array[low]) {
7906 else { /* cp < array[mid] */
7913 invlist_set_previous_index(invlist, high);
7918 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7920 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7921 * but is used when the swash has an inversion list. This makes this much
7922 * faster, as it uses a binary search instead of a linear one. This is
7923 * intimately tied to that function, and perhaps should be in utf8.c,
7924 * except it is intimately tied to inversion lists as well. It assumes
7925 * that <swatch> is all 0's on input */
7928 const IV len = _invlist_len(invlist);
7932 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7934 if (len == 0) { /* Empty inversion list */
7938 array = invlist_array(invlist);
7940 /* Find which element it is */
7941 i = _invlist_search(invlist, start);
7943 /* We populate from <start> to <end> */
7944 while (current < end) {
7947 /* The inversion list gives the results for every possible code point
7948 * after the first one in the list. Only those ranges whose index is
7949 * even are ones that the inversion list matches. For the odd ones,
7950 * and if the initial code point is not in the list, we have to skip
7951 * forward to the next element */
7952 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7954 if (i >= len) { /* Finished if beyond the end of the array */
7958 if (current >= end) { /* Finished if beyond the end of what we
7960 if (LIKELY(end < UV_MAX)) {
7964 /* We get here when the upper bound is the maximum
7965 * representable on the machine, and we are looking for just
7966 * that code point. Have to special case it */
7968 goto join_end_of_list;
7971 assert(current >= start);
7973 /* The current range ends one below the next one, except don't go past
7976 upper = (i < len && array[i] < end) ? array[i] : end;
7978 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7979 * for each code point in it */
7980 for (; current < upper; current++) {
7981 const STRLEN offset = (STRLEN)(current - start);
7982 swatch[offset >> 3] |= 1 << (offset & 7);
7987 /* Quit if at the end of the list */
7990 /* But first, have to deal with the highest possible code point on
7991 * the platform. The previous code assumes that <end> is one
7992 * beyond where we want to populate, but that is impossible at the
7993 * platform's infinity, so have to handle it specially */
7994 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7996 const STRLEN offset = (STRLEN)(end - start);
7997 swatch[offset >> 3] |= 1 << (offset & 7);
8002 /* Advance to the next range, which will be for code points not in the
8011 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
8013 /* Take the union of two inversion lists and point <output> to it. *output
8014 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8015 * the reference count to that list will be decremented if not already a
8016 * temporary (mortal); otherwise *output will be made correspondingly
8017 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8018 * second list is returned. If <complement_b> is TRUE, the union is taken
8019 * of the complement (inversion) of <b> instead of b itself.
8021 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8022 * Richard Gillam, published by Addison-Wesley, and explained at some
8023 * length there. The preface says to incorporate its examples into your
8024 * code at your own risk.
8026 * The algorithm is like a merge sort.
8028 * XXX A potential performance improvement is to keep track as we go along
8029 * if only one of the inputs contributes to the result, meaning the other
8030 * is a subset of that one. In that case, we can skip the final copy and
8031 * return the larger of the input lists, but then outside code might need
8032 * to keep track of whether to free the input list or not */
8034 const UV* array_a; /* a's array */
8036 UV len_a; /* length of a's array */
8039 SV* u; /* the resulting union */
8043 UV i_a = 0; /* current index into a's array */
8047 /* running count, as explained in the algorithm source book; items are
8048 * stopped accumulating and are output when the count changes to/from 0.
8049 * The count is incremented when we start a range that's in the set, and
8050 * decremented when we start a range that's not in the set. So its range
8051 * is 0 to 2. Only when the count is zero is something not in the set.
8055 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8058 /* If either one is empty, the union is the other one */
8059 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8060 bool make_temp = FALSE; /* Should we mortalize the result? */
8064 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8070 *output = invlist_clone(b);
8072 _invlist_invert(*output);
8074 } /* else *output already = b; */
8077 sv_2mortal(*output);
8081 else if ((len_b = _invlist_len(b)) == 0) {
8082 bool make_temp = FALSE;
8084 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8089 /* The complement of an empty list is a list that has everything in it,
8090 * so the union with <a> includes everything too */
8093 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8097 *output = _new_invlist(1);
8098 _append_range_to_invlist(*output, 0, UV_MAX);
8100 else if (*output != a) {
8101 *output = invlist_clone(a);
8103 /* else *output already = a; */
8106 sv_2mortal(*output);
8111 /* Here both lists exist and are non-empty */
8112 array_a = invlist_array(a);
8113 array_b = invlist_array(b);
8115 /* If are to take the union of 'a' with the complement of b, set it
8116 * up so are looking at b's complement. */
8119 /* To complement, we invert: if the first element is 0, remove it. To
8120 * do this, we just pretend the array starts one later */
8121 if (array_b[0] == 0) {
8127 /* But if the first element is not zero, we pretend the list starts
8128 * at the 0 that is always stored immediately before the array. */
8134 /* Size the union for the worst case: that the sets are completely
8136 u = _new_invlist(len_a + len_b);
8138 /* Will contain U+0000 if either component does */
8139 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8140 || (len_b > 0 && array_b[0] == 0));
8142 /* Go through each list item by item, stopping when exhausted one of
8144 while (i_a < len_a && i_b < len_b) {
8145 UV cp; /* The element to potentially add to the union's array */
8146 bool cp_in_set; /* is it in the the input list's set or not */
8148 /* We need to take one or the other of the two inputs for the union.
8149 * Since we are merging two sorted lists, we take the smaller of the
8150 * next items. In case of a tie, we take the one that is in its set
8151 * first. If we took one not in the set first, it would decrement the
8152 * count, possibly to 0 which would cause it to be output as ending the
8153 * range, and the next time through we would take the same number, and
8154 * output it again as beginning the next range. By doing it the
8155 * opposite way, there is no possibility that the count will be
8156 * momentarily decremented to 0, and thus the two adjoining ranges will
8157 * be seamlessly merged. (In a tie and both are in the set or both not
8158 * in the set, it doesn't matter which we take first.) */
8159 if (array_a[i_a] < array_b[i_b]
8160 || (array_a[i_a] == array_b[i_b]
8161 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8163 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8167 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8168 cp = array_b[i_b++];
8171 /* Here, have chosen which of the two inputs to look at. Only output
8172 * if the running count changes to/from 0, which marks the
8173 * beginning/end of a range in that's in the set */
8176 array_u[i_u++] = cp;
8183 array_u[i_u++] = cp;
8188 /* Here, we are finished going through at least one of the lists, which
8189 * means there is something remaining in at most one. We check if the list
8190 * that hasn't been exhausted is positioned such that we are in the middle
8191 * of a range in its set or not. (i_a and i_b point to the element beyond
8192 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8193 * is potentially more to output.
8194 * There are four cases:
8195 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8196 * in the union is entirely from the non-exhausted set.
8197 * 2) Both were in their sets, count is 2. Nothing further should
8198 * be output, as everything that remains will be in the exhausted
8199 * list's set, hence in the union; decrementing to 1 but not 0 insures
8201 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8202 * Nothing further should be output because the union includes
8203 * everything from the exhausted set. Not decrementing ensures that.
8204 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8205 * decrementing to 0 insures that we look at the remainder of the
8206 * non-exhausted set */
8207 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8208 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8213 /* The final length is what we've output so far, plus what else is about to
8214 * be output. (If 'count' is non-zero, then the input list we exhausted
8215 * has everything remaining up to the machine's limit in its set, and hence
8216 * in the union, so there will be no further output. */
8219 /* At most one of the subexpressions will be non-zero */
8220 len_u += (len_a - i_a) + (len_b - i_b);
8223 /* Set result to final length, which can change the pointer to array_u, so
8225 if (len_u != _invlist_len(u)) {
8226 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8228 array_u = invlist_array(u);
8231 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8232 * the other) ended with everything above it not in its set. That means
8233 * that the remaining part of the union is precisely the same as the
8234 * non-exhausted list, so can just copy it unchanged. (If both list were
8235 * exhausted at the same time, then the operations below will be both 0.)
8238 IV copy_count; /* At most one will have a non-zero copy count */
8239 if ((copy_count = len_a - i_a) > 0) {
8240 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8242 else if ((copy_count = len_b - i_b) > 0) {
8243 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8247 /* We may be removing a reference to one of the inputs. If so, the output
8248 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8249 * count decremented) */
8250 if (a == *output || b == *output) {
8251 assert(! invlist_is_iterating(*output));
8252 if ((SvTEMP(*output))) {
8256 SvREFCNT_dec_NN(*output);
8266 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8268 /* Take the intersection of two inversion lists and point <i> to it. *i
8269 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8270 * the reference count to that list will be decremented if not already a
8271 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8272 * The first list, <a>, may be NULL, in which case an empty list is
8273 * returned. If <complement_b> is TRUE, the result will be the
8274 * intersection of <a> and the complement (or inversion) of <b> instead of
8277 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8278 * Richard Gillam, published by Addison-Wesley, and explained at some
8279 * length there. The preface says to incorporate its examples into your
8280 * code at your own risk. In fact, it had bugs
8282 * The algorithm is like a merge sort, and is essentially the same as the
8286 const UV* array_a; /* a's array */
8288 UV len_a; /* length of a's array */
8291 SV* r; /* the resulting intersection */
8295 UV i_a = 0; /* current index into a's array */
8299 /* running count, as explained in the algorithm source book; items are
8300 * stopped accumulating and are output when the count changes to/from 2.
8301 * The count is incremented when we start a range that's in the set, and
8302 * decremented when we start a range that's not in the set. So its range
8303 * is 0 to 2. Only when the count is 2 is something in the intersection.
8307 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8310 /* Special case if either one is empty */
8311 len_a = (a == NULL) ? 0 : _invlist_len(a);
8312 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8313 bool make_temp = FALSE;
8315 if (len_a != 0 && complement_b) {
8317 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8318 * be empty. Here, also we are using 'b's complement, which hence
8319 * must be every possible code point. Thus the intersection is
8323 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8328 *i = invlist_clone(a);
8330 /* else *i is already 'a' */
8338 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8339 * intersection must be empty */
8341 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8346 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8350 *i = _new_invlist(0);
8358 /* Here both lists exist and are non-empty */
8359 array_a = invlist_array(a);
8360 array_b = invlist_array(b);
8362 /* If are to take the intersection of 'a' with the complement of b, set it
8363 * up so are looking at b's complement. */
8366 /* To complement, we invert: if the first element is 0, remove it. To
8367 * do this, we just pretend the array starts one later */
8368 if (array_b[0] == 0) {
8374 /* But if the first element is not zero, we pretend the list starts
8375 * at the 0 that is always stored immediately before the array. */
8381 /* Size the intersection for the worst case: that the intersection ends up
8382 * fragmenting everything to be completely disjoint */
8383 r= _new_invlist(len_a + len_b);
8385 /* Will contain U+0000 iff both components do */
8386 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8387 && len_b > 0 && array_b[0] == 0);
8389 /* Go through each list item by item, stopping when exhausted one of
8391 while (i_a < len_a && i_b < len_b) {
8392 UV cp; /* The element to potentially add to the intersection's
8394 bool cp_in_set; /* Is it in the input list's set or not */
8396 /* We need to take one or the other of the two inputs for the
8397 * intersection. Since we are merging two sorted lists, we take the
8398 * smaller of the next items. In case of a tie, we take the one that
8399 * is not in its set first (a difference from the union algorithm). If
8400 * we took one in the set first, it would increment the count, possibly
8401 * to 2 which would cause it to be output as starting a range in the
8402 * intersection, and the next time through we would take that same
8403 * number, and output it again as ending the set. By doing it the
8404 * opposite of this, there is no possibility that the count will be
8405 * momentarily incremented to 2. (In a tie and both are in the set or
8406 * both not in the set, it doesn't matter which we take first.) */
8407 if (array_a[i_a] < array_b[i_b]
8408 || (array_a[i_a] == array_b[i_b]
8409 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8411 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8415 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8419 /* Here, have chosen which of the two inputs to look at. Only output
8420 * if the running count changes to/from 2, which marks the
8421 * beginning/end of a range that's in the intersection */
8425 array_r[i_r++] = cp;
8430 array_r[i_r++] = cp;
8436 /* Here, we are finished going through at least one of the lists, which
8437 * means there is something remaining in at most one. We check if the list
8438 * that has been exhausted is positioned such that we are in the middle
8439 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8440 * the ones we care about.) There are four cases:
8441 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8442 * nothing left in the intersection.
8443 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8444 * above 2. What should be output is exactly that which is in the
8445 * non-exhausted set, as everything it has is also in the intersection
8446 * set, and everything it doesn't have can't be in the intersection
8447 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8448 * gets incremented to 2. Like the previous case, the intersection is
8449 * everything that remains in the non-exhausted set.
8450 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8451 * remains 1. And the intersection has nothing more. */
8452 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8453 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8458 /* The final length is what we've output so far plus what else is in the
8459 * intersection. At most one of the subexpressions below will be non-zero */
8462 len_r += (len_a - i_a) + (len_b - i_b);
8465 /* Set result to final length, which can change the pointer to array_r, so
8467 if (len_r != _invlist_len(r)) {
8468 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8470 array_r = invlist_array(r);
8473 /* Finish outputting any remaining */
8474 if (count >= 2) { /* At most one will have a non-zero copy count */
8476 if ((copy_count = len_a - i_a) > 0) {
8477 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8479 else if ((copy_count = len_b - i_b) > 0) {
8480 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8484 /* We may be removing a reference to one of the inputs. If so, the output
8485 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8486 * count decremented) */
8487 if (a == *i || b == *i) {
8488 assert(! invlist_is_iterating(*i));
8493 SvREFCNT_dec_NN(*i);
8503 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8505 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8506 * set. A pointer to the inversion list is returned. This may actually be
8507 * a new list, in which case the passed in one has been destroyed. The
8508 * passed in inversion list can be NULL, in which case a new one is created
8509 * with just the one range in it */
8514 if (invlist == NULL) {
8515 invlist = _new_invlist(2);
8519 len = _invlist_len(invlist);
8522 /* If comes after the final entry actually in the list, can just append it
8525 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8526 && start >= invlist_array(invlist)[len - 1]))
8528 _append_range_to_invlist(invlist, start, end);
8532 /* Here, can't just append things, create and return a new inversion list
8533 * which is the union of this range and the existing inversion list */
8534 range_invlist = _new_invlist(2);
8535 _append_range_to_invlist(range_invlist, start, end);
8537 _invlist_union(invlist, range_invlist, &invlist);
8539 /* The temporary can be freed */
8540 SvREFCNT_dec_NN(range_invlist);
8547 PERL_STATIC_INLINE SV*
8548 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8549 return _add_range_to_invlist(invlist, cp, cp);
8552 #ifndef PERL_IN_XSUB_RE
8554 Perl__invlist_invert(pTHX_ SV* const invlist)
8556 /* Complement the input inversion list. This adds a 0 if the list didn't
8557 * have a zero; removes it otherwise. As described above, the data
8558 * structure is set up so that this is very efficient */
8560 PERL_ARGS_ASSERT__INVLIST_INVERT;
8562 assert(! invlist_is_iterating(invlist));
8564 /* The inverse of matching nothing is matching everything */
8565 if (_invlist_len(invlist) == 0) {
8566 _append_range_to_invlist(invlist, 0, UV_MAX);
8570 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8574 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8576 /* Complement the input inversion list (which must be a Unicode property,
8577 * all of which don't match above the Unicode maximum code point.) And
8578 * Perl has chosen to not have the inversion match above that either. This
8579 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8585 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8587 _invlist_invert(invlist);
8589 len = _invlist_len(invlist);
8591 if (len != 0) { /* If empty do nothing */
8592 array = invlist_array(invlist);
8593 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8594 /* Add 0x110000. First, grow if necessary */
8596 if (invlist_max(invlist) < len) {
8597 invlist_extend(invlist, len);
8598 array = invlist_array(invlist);
8600 invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8601 array[len - 1] = PERL_UNICODE_MAX + 1;
8603 else { /* Remove the 0x110000 */
8604 invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8612 PERL_STATIC_INLINE SV*
8613 S_invlist_clone(pTHX_ SV* const invlist)
8616 /* Return a new inversion list that is a copy of the input one, which is
8617 * unchanged. The new list will not be mortal even if the old one was. */
8619 /* Need to allocate extra space to accommodate Perl's addition of a
8620 * trailing NUL to SvPV's, since it thinks they are always strings */
8621 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8622 STRLEN physical_length = SvCUR(invlist);
8623 bool offset = *(get_invlist_offset_addr(invlist));
8625 PERL_ARGS_ASSERT_INVLIST_CLONE;
8627 *(get_invlist_offset_addr(new_invlist)) = offset;
8628 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8629 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8634 PERL_STATIC_INLINE STRLEN*
8635 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8637 /* Return the address of the UV that contains the current iteration
8640 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8642 assert(SvTYPE(invlist) == SVt_INVLIST);
8644 return &(((XINVLIST*) SvANY(invlist))->iterator);
8647 PERL_STATIC_INLINE void
8648 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8650 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8652 *get_invlist_iter_addr(invlist) = 0;
8655 PERL_STATIC_INLINE void
8656 S_invlist_iterfinish(pTHX_ SV* invlist)
8658 /* Terminate iterator for invlist. This is to catch development errors.
8659 * Any iteration that is interrupted before completed should call this
8660 * function. Functions that add code points anywhere else but to the end
8661 * of an inversion list assert that they are not in the middle of an
8662 * iteration. If they were, the addition would make the iteration
8663 * problematical: if the iteration hadn't reached the place where things
8664 * were being added, it would be ok */
8666 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8668 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8672 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8674 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8675 * This call sets in <*start> and <*end>, the next range in <invlist>.
8676 * Returns <TRUE> if successful and the next call will return the next
8677 * range; <FALSE> if was already at the end of the list. If the latter,
8678 * <*start> and <*end> are unchanged, and the next call to this function
8679 * will start over at the beginning of the list */
8681 STRLEN* pos = get_invlist_iter_addr(invlist);
8682 UV len = _invlist_len(invlist);
8685 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8688 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8692 array = invlist_array(invlist);
8694 *start = array[(*pos)++];
8700 *end = array[(*pos)++] - 1;
8706 PERL_STATIC_INLINE bool
8707 S_invlist_is_iterating(pTHX_ SV* const invlist)
8709 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8711 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8714 PERL_STATIC_INLINE UV
8715 S_invlist_highest(pTHX_ SV* const invlist)
8717 /* Returns the highest code point that matches an inversion list. This API
8718 * has an ambiguity, as it returns 0 under either the highest is actually
8719 * 0, or if the list is empty. If this distinction matters to you, check
8720 * for emptiness before calling this function */
8722 UV len = _invlist_len(invlist);
8725 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8731 array = invlist_array(invlist);
8733 /* The last element in the array in the inversion list always starts a
8734 * range that goes to infinity. That range may be for code points that are
8735 * matched in the inversion list, or it may be for ones that aren't
8736 * matched. In the latter case, the highest code point in the set is one
8737 * less than the beginning of this range; otherwise it is the final element
8738 * of this range: infinity */
8739 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8741 : array[len - 1] - 1;
8744 #ifndef PERL_IN_XSUB_RE
8746 Perl__invlist_contents(pTHX_ SV* const invlist)
8748 /* Get the contents of an inversion list into a string SV so that they can
8749 * be printed out. It uses the format traditionally done for debug tracing
8753 SV* output = newSVpvs("\n");
8755 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8757 assert(! invlist_is_iterating(invlist));
8759 invlist_iterinit(invlist);
8760 while (invlist_iternext(invlist, &start, &end)) {
8761 if (end == UV_MAX) {
8762 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8764 else if (end != start) {
8765 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8769 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8777 #ifndef PERL_IN_XSUB_RE
8779 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8781 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8782 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8783 * the string 'indent'. The output looks like this:
8784 [0] 0x000A .. 0x000D
8786 [4] 0x2028 .. 0x2029
8787 [6] 0x3104 .. INFINITY
8788 * This means that the first range of code points matched by the list are
8789 * 0xA through 0xD; the second range contains only the single code point
8790 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8791 * are used to define each range (except if the final range extends to
8792 * infinity, only a single element is needed). The array index of the
8793 * first element for the corresponding range is given in brackets. */
8798 PERL_ARGS_ASSERT__INVLIST_DUMP;
8800 if (invlist_is_iterating(invlist)) {
8801 Perl_dump_indent(aTHX_ level, file,
8802 "%sCan't dump inversion list because is in middle of iterating\n",
8807 invlist_iterinit(invlist);
8808 while (invlist_iternext(invlist, &start, &end)) {
8809 if (end == UV_MAX) {
8810 Perl_dump_indent(aTHX_ level, file,
8811 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8812 indent, (UV)count, start);
8814 else if (end != start) {
8815 Perl_dump_indent(aTHX_ level, file,
8816 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8817 indent, (UV)count, start, end);
8820 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8821 indent, (UV)count, start);
8828 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8830 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8832 /* Return a boolean as to if the two passed in inversion lists are
8833 * identical. The final argument, if TRUE, says to take the complement of
8834 * the second inversion list before doing the comparison */
8836 const UV* array_a = invlist_array(a);
8837 const UV* array_b = invlist_array(b);
8838 UV len_a = _invlist_len(a);
8839 UV len_b = _invlist_len(b);
8841 UV i = 0; /* current index into the arrays */
8842 bool retval = TRUE; /* Assume are identical until proven otherwise */
8844 PERL_ARGS_ASSERT__INVLISTEQ;
8846 /* If are to compare 'a' with the complement of b, set it
8847 * up so are looking at b's complement. */
8850 /* The complement of nothing is everything, so <a> would have to have
8851 * just one element, starting at zero (ending at infinity) */
8853 return (len_a == 1 && array_a[0] == 0);
8855 else if (array_b[0] == 0) {
8857 /* Otherwise, to complement, we invert. Here, the first element is
8858 * 0, just remove it. To do this, we just pretend the array starts
8866 /* But if the first element is not zero, we pretend the list starts
8867 * at the 0 that is always stored immediately before the array. */
8873 /* Make sure that the lengths are the same, as well as the final element
8874 * before looping through the remainder. (Thus we test the length, final,
8875 * and first elements right off the bat) */
8876 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8879 else for (i = 0; i < len_a - 1; i++) {
8880 if (array_a[i] != array_b[i]) {
8890 #undef HEADER_LENGTH
8891 #undef TO_INTERNAL_SIZE
8892 #undef FROM_INTERNAL_SIZE
8893 #undef INVLIST_VERSION_ID
8895 /* End of inversion list object */
8898 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8900 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8901 * constructs, and updates RExC_flags with them. On input, RExC_parse
8902 * should point to the first flag; it is updated on output to point to the
8903 * final ')' or ':'. There needs to be at least one flag, or this will
8906 /* for (?g), (?gc), and (?o) warnings; warning
8907 about (?c) will warn about (?g) -- japhy */
8909 #define WASTED_O 0x01
8910 #define WASTED_G 0x02
8911 #define WASTED_C 0x04
8912 #define WASTED_GC (WASTED_G|WASTED_C)
8913 I32 wastedflags = 0x00;
8914 U32 posflags = 0, negflags = 0;
8915 U32 *flagsp = &posflags;
8916 char has_charset_modifier = '\0';
8918 bool has_use_defaults = FALSE;
8919 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8921 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8923 /* '^' as an initial flag sets certain defaults */
8924 if (UCHARAT(RExC_parse) == '^') {
8926 has_use_defaults = TRUE;
8927 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8928 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8929 ? REGEX_UNICODE_CHARSET
8930 : REGEX_DEPENDS_CHARSET);
8933 cs = get_regex_charset(RExC_flags);
8934 if (cs == REGEX_DEPENDS_CHARSET
8935 && (RExC_utf8 || RExC_uni_semantics))
8937 cs = REGEX_UNICODE_CHARSET;
8940 while (*RExC_parse) {
8941 /* && strchr("iogcmsx", *RExC_parse) */
8942 /* (?g), (?gc) and (?o) are useless here
8943 and must be globally applied -- japhy */
8944 switch (*RExC_parse) {
8946 /* Code for the imsx flags */
8947 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8949 case LOCALE_PAT_MOD:
8950 if (has_charset_modifier) {
8951 goto excess_modifier;
8953 else if (flagsp == &negflags) {
8956 cs = REGEX_LOCALE_CHARSET;
8957 has_charset_modifier = LOCALE_PAT_MOD;
8958 RExC_contains_locale = 1;
8960 case UNICODE_PAT_MOD:
8961 if (has_charset_modifier) {
8962 goto excess_modifier;
8964 else if (flagsp == &negflags) {
8967 cs = REGEX_UNICODE_CHARSET;
8968 has_charset_modifier = UNICODE_PAT_MOD;
8970 case ASCII_RESTRICT_PAT_MOD:
8971 if (flagsp == &negflags) {
8974 if (has_charset_modifier) {
8975 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8976 goto excess_modifier;
8978 /* Doubled modifier implies more restricted */
8979 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8982 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8984 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8986 case DEPENDS_PAT_MOD:
8987 if (has_use_defaults) {
8988 goto fail_modifiers;
8990 else if (flagsp == &negflags) {
8993 else if (has_charset_modifier) {
8994 goto excess_modifier;
8997 /* The dual charset means unicode semantics if the
8998 * pattern (or target, not known until runtime) are
8999 * utf8, or something in the pattern indicates unicode
9001 cs = (RExC_utf8 || RExC_uni_semantics)
9002 ? REGEX_UNICODE_CHARSET
9003 : REGEX_DEPENDS_CHARSET;
9004 has_charset_modifier = DEPENDS_PAT_MOD;
9008 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9009 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9011 else if (has_charset_modifier == *(RExC_parse - 1)) {
9012 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9015 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9020 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9022 case ONCE_PAT_MOD: /* 'o' */
9023 case GLOBAL_PAT_MOD: /* 'g' */
9024 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9025 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9026 if (! (wastedflags & wflagbit) ) {
9027 wastedflags |= wflagbit;
9028 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9031 "Useless (%s%c) - %suse /%c modifier",
9032 flagsp == &negflags ? "?-" : "?",
9034 flagsp == &negflags ? "don't " : "",
9041 case CONTINUE_PAT_MOD: /* 'c' */
9042 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9043 if (! (wastedflags & WASTED_C) ) {
9044 wastedflags |= WASTED_GC;
9045 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9048 "Useless (%sc) - %suse /gc modifier",
9049 flagsp == &negflags ? "?-" : "?",
9050 flagsp == &negflags ? "don't " : ""
9055 case KEEPCOPY_PAT_MOD: /* 'p' */
9056 if (flagsp == &negflags) {
9058 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9060 *flagsp |= RXf_PMf_KEEPCOPY;
9064 /* A flag is a default iff it is following a minus, so
9065 * if there is a minus, it means will be trying to
9066 * re-specify a default which is an error */
9067 if (has_use_defaults || flagsp == &negflags) {
9068 goto fail_modifiers;
9071 wastedflags = 0; /* reset so (?g-c) warns twice */
9075 RExC_flags |= posflags;
9076 RExC_flags &= ~negflags;
9077 set_regex_charset(&RExC_flags, cs);
9078 if (RExC_flags & RXf_PMf_FOLD) {
9079 RExC_contains_i = 1;
9085 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9086 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9087 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9088 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9097 - reg - regular expression, i.e. main body or parenthesized thing
9099 * Caller must absorb opening parenthesis.
9101 * Combining parenthesis handling with the base level of regular expression
9102 * is a trifle forced, but the need to tie the tails of the branches to what
9103 * follows makes it hard to avoid.
9105 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9107 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9109 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9112 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9113 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9114 needs to be restarted.
9115 Otherwise would only return NULL if regbranch() returns NULL, which
9118 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9119 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9120 * 2 is like 1, but indicates that nextchar() has been called to advance
9121 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9122 * this flag alerts us to the need to check for that */
9125 regnode *ret; /* Will be the head of the group. */
9128 regnode *ender = NULL;
9131 U32 oregflags = RExC_flags;
9132 bool have_branch = 0;
9134 I32 freeze_paren = 0;
9135 I32 after_freeze = 0;
9137 char * parse_start = RExC_parse; /* MJD */
9138 char * const oregcomp_parse = RExC_parse;
9140 GET_RE_DEBUG_FLAGS_DECL;
9142 PERL_ARGS_ASSERT_REG;
9143 DEBUG_PARSE("reg ");
9145 *flagp = 0; /* Tentatively. */
9148 /* Make an OPEN node, if parenthesized. */
9151 /* Under /x, space and comments can be gobbled up between the '(' and
9152 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9153 * intervening space, as the sequence is a token, and a token should be
9155 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9157 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9158 char *start_verb = RExC_parse;
9159 STRLEN verb_len = 0;
9160 char *start_arg = NULL;
9161 unsigned char op = 0;
9163 int internal_argval = 0; /* internal_argval is only useful if !argok */
9165 if (has_intervening_patws && SIZE_ONLY) {
9166 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9168 while ( *RExC_parse && *RExC_parse != ')' ) {
9169 if ( *RExC_parse == ':' ) {
9170 start_arg = RExC_parse + 1;
9176 verb_len = RExC_parse - start_verb;
9179 while ( *RExC_parse && *RExC_parse != ')' )
9181 if ( *RExC_parse != ')' )
9182 vFAIL("Unterminated verb pattern argument");
9183 if ( RExC_parse == start_arg )
9186 if ( *RExC_parse != ')' )
9187 vFAIL("Unterminated verb pattern");
9190 switch ( *start_verb ) {
9191 case 'A': /* (*ACCEPT) */
9192 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9194 internal_argval = RExC_nestroot;
9197 case 'C': /* (*COMMIT) */
9198 if ( memEQs(start_verb,verb_len,"COMMIT") )
9201 case 'F': /* (*FAIL) */
9202 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9207 case ':': /* (*:NAME) */
9208 case 'M': /* (*MARK:NAME) */
9209 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9214 case 'P': /* (*PRUNE) */
9215 if ( memEQs(start_verb,verb_len,"PRUNE") )
9218 case 'S': /* (*SKIP) */
9219 if ( memEQs(start_verb,verb_len,"SKIP") )
9222 case 'T': /* (*THEN) */
9223 /* [19:06] <TimToady> :: is then */
9224 if ( memEQs(start_verb,verb_len,"THEN") ) {
9226 RExC_seen |= REG_SEEN_CUTGROUP;
9231 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9233 "Unknown verb pattern '%"UTF8f"'",
9234 UTF8fARG(UTF, verb_len, start_verb));
9237 if ( start_arg && internal_argval ) {
9238 vFAIL3("Verb pattern '%.*s' may not have an argument",
9239 verb_len, start_verb);
9240 } else if ( argok < 0 && !start_arg ) {
9241 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9242 verb_len, start_verb);
9244 ret = reganode(pRExC_state, op, internal_argval);
9245 if ( ! internal_argval && ! SIZE_ONLY ) {
9247 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9248 ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9249 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9256 if (!internal_argval)
9257 RExC_seen |= REG_SEEN_VERBARG;
9258 } else if ( start_arg ) {
9259 vFAIL3("Verb pattern '%.*s' may not have an argument",
9260 verb_len, start_verb);
9262 ret = reg_node(pRExC_state, op);
9264 nextchar(pRExC_state);
9267 else if (*RExC_parse == '?') { /* (?...) */
9268 bool is_logical = 0;
9269 const char * const seqstart = RExC_parse;
9270 if (has_intervening_patws && SIZE_ONLY) {
9271 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9275 paren = *RExC_parse++;
9276 ret = NULL; /* For look-ahead/behind. */
9279 case 'P': /* (?P...) variants for those used to PCRE/Python */
9280 paren = *RExC_parse++;
9281 if ( paren == '<') /* (?P<...>) named capture */
9283 else if (paren == '>') { /* (?P>name) named recursion */
9284 goto named_recursion;
9286 else if (paren == '=') { /* (?P=...) named backref */
9287 /* this pretty much dupes the code for \k<NAME> in regatom(), if
9288 you change this make sure you change that */
9289 char* name_start = RExC_parse;
9291 SV *sv_dat = reg_scan_name(pRExC_state,
9292 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9293 if (RExC_parse == name_start || *RExC_parse != ')')
9294 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9295 vFAIL2("Sequence %.3s... not terminated",parse_start);
9298 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9299 RExC_rxi->data->data[num]=(void*)sv_dat;
9300 SvREFCNT_inc_simple_void(sv_dat);
9303 ret = reganode(pRExC_state,
9306 : (ASCII_FOLD_RESTRICTED)
9308 : (AT_LEAST_UNI_SEMANTICS)
9316 Set_Node_Offset(ret, parse_start+1);
9317 Set_Node_Cur_Length(ret, parse_start);
9319 nextchar(pRExC_state);
9323 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9324 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9326 case '<': /* (?<...) */
9327 if (*RExC_parse == '!')
9329 else if (*RExC_parse != '=')
9335 case '\'': /* (?'...') */
9336 name_start= RExC_parse;
9337 svname = reg_scan_name(pRExC_state,
9338 SIZE_ONLY ? /* reverse test from the others */
9339 REG_RSN_RETURN_NAME :
9340 REG_RSN_RETURN_NULL);
9341 if (RExC_parse == name_start || *RExC_parse != paren)
9342 vFAIL2("Sequence (?%c... not terminated",
9343 paren=='>' ? '<' : paren);
9347 if (!svname) /* shouldn't happen */
9349 "panic: reg_scan_name returned NULL");
9350 if (!RExC_paren_names) {
9351 RExC_paren_names= newHV();
9352 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9354 RExC_paren_name_list= newAV();
9355 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9358 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9360 sv_dat = HeVAL(he_str);
9362 /* croak baby croak */
9364 "panic: paren_name hash element allocation failed");
9365 } else if ( SvPOK(sv_dat) ) {
9366 /* (?|...) can mean we have dupes so scan to check
9367 its already been stored. Maybe a flag indicating
9368 we are inside such a construct would be useful,
9369 but the arrays are likely to be quite small, so
9370 for now we punt -- dmq */
9371 IV count = SvIV(sv_dat);
9372 I32 *pv = (I32*)SvPVX(sv_dat);
9374 for ( i = 0 ; i < count ; i++ ) {
9375 if ( pv[i] == RExC_npar ) {
9381 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9382 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9383 pv[count] = RExC_npar;
9384 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9387 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9388 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9390 SvIV_set(sv_dat, 1);
9393 /* Yes this does cause a memory leak in debugging Perls */
9394 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9395 SvREFCNT_dec_NN(svname);
9398 /*sv_dump(sv_dat);*/
9400 nextchar(pRExC_state);
9402 goto capturing_parens;
9404 RExC_seen |= REG_SEEN_LOOKBEHIND;
9405 RExC_in_lookbehind++;
9407 case '=': /* (?=...) */
9408 RExC_seen_zerolen++;
9410 case '!': /* (?!...) */
9411 RExC_seen_zerolen++;
9412 if (*RExC_parse == ')') {
9413 ret=reg_node(pRExC_state, OPFAIL);
9414 nextchar(pRExC_state);
9418 case '|': /* (?|...) */
9419 /* branch reset, behave like a (?:...) except that
9420 buffers in alternations share the same numbers */
9422 after_freeze = freeze_paren = RExC_npar;
9424 case ':': /* (?:...) */
9425 case '>': /* (?>...) */
9427 case '$': /* (?$...) */
9428 case '@': /* (?@...) */
9429 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9431 case '#': /* (?#...) */
9432 /* XXX As soon as we disallow separating the '?' and '*' (by
9433 * spaces or (?#...) comment), it is believed that this case
9434 * will be unreachable and can be removed. See
9436 while (*RExC_parse && *RExC_parse != ')')
9438 if (*RExC_parse != ')')
9439 FAIL("Sequence (?#... not terminated");
9440 nextchar(pRExC_state);
9443 case '0' : /* (?0) */
9444 case 'R' : /* (?R) */
9445 if (*RExC_parse != ')')
9446 FAIL("Sequence (?R) not terminated");
9447 ret = reg_node(pRExC_state, GOSTART);
9448 RExC_seen |= REG_SEEN_GOSTART;
9449 *flagp |= POSTPONED;
9450 nextchar(pRExC_state);
9453 { /* named and numeric backreferences */
9455 case '&': /* (?&NAME) */
9456 parse_start = RExC_parse - 1;
9459 SV *sv_dat = reg_scan_name(pRExC_state,
9460 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9461 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9463 if (RExC_parse == RExC_end || *RExC_parse != ')')
9464 vFAIL("Sequence (?&... not terminated");
9465 goto gen_recurse_regop;
9466 assert(0); /* NOT REACHED */
9468 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9470 vFAIL("Illegal pattern");
9472 goto parse_recursion;
9474 case '-': /* (?-1) */
9475 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9476 RExC_parse--; /* rewind to let it be handled later */
9480 case '1': case '2': case '3': case '4': /* (?1) */
9481 case '5': case '6': case '7': case '8': case '9':
9484 num = atoi(RExC_parse);
9485 parse_start = RExC_parse - 1; /* MJD */
9486 if (*RExC_parse == '-')
9488 while (isDIGIT(*RExC_parse))
9490 if (*RExC_parse!=')')
9491 vFAIL("Expecting close bracket");
9494 if ( paren == '-' ) {
9496 Diagram of capture buffer numbering.
9497 Top line is the normal capture buffer numbers
9498 Bottom line is the negative indexing as from
9502 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9506 num = RExC_npar + num;
9509 vFAIL("Reference to nonexistent group");
9511 } else if ( paren == '+' ) {
9512 num = RExC_npar + num - 1;
9515 ret = reganode(pRExC_state, GOSUB, num);
9517 if (num > (I32)RExC_rx->nparens) {
9519 vFAIL("Reference to nonexistent group");
9521 ARG2L_SET( ret, RExC_recurse_count++);
9523 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9524 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9528 RExC_seen |= REG_SEEN_RECURSE;
9529 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9530 Set_Node_Offset(ret, parse_start); /* MJD */
9532 *flagp |= POSTPONED;
9533 nextchar(pRExC_state);
9535 } /* named and numeric backreferences */
9536 assert(0); /* NOT REACHED */
9538 case '?': /* (??...) */
9540 if (*RExC_parse != '{') {
9542 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9544 "Sequence (%"UTF8f"...) not recognized",
9545 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9548 *flagp |= POSTPONED;
9549 paren = *RExC_parse++;
9551 case '{': /* (?{...}) */
9554 struct reg_code_block *cb;
9556 RExC_seen_zerolen++;
9558 if ( !pRExC_state->num_code_blocks
9559 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9560 || pRExC_state->code_blocks[pRExC_state->code_index].start
9561 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9564 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9565 FAIL("panic: Sequence (?{...}): no code block found\n");
9566 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9568 /* this is a pre-compiled code block (?{...}) */
9569 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9570 RExC_parse = RExC_start + cb->end;
9573 if (cb->src_regex) {
9574 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9575 RExC_rxi->data->data[n] =
9576 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9577 RExC_rxi->data->data[n+1] = (void*)o;
9580 n = add_data(pRExC_state,
9581 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9582 RExC_rxi->data->data[n] = (void*)o;
9585 pRExC_state->code_index++;
9586 nextchar(pRExC_state);
9590 ret = reg_node(pRExC_state, LOGICAL);
9591 eval = reganode(pRExC_state, EVAL, n);
9594 /* for later propagation into (??{}) return value */
9595 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9597 REGTAIL(pRExC_state, ret, eval);
9598 /* deal with the length of this later - MJD */
9601 ret = reganode(pRExC_state, EVAL, n);
9602 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9603 Set_Node_Offset(ret, parse_start);
9606 case '(': /* (?(?{...})...) and (?(?=...)...) */
9609 if (RExC_parse[0] == '?') { /* (?(?...)) */
9610 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9611 || RExC_parse[1] == '<'
9612 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9616 ret = reg_node(pRExC_state, LOGICAL);
9620 tail = reg(pRExC_state, 1, &flag, depth+1);
9621 if (flag & RESTART_UTF8) {
9622 *flagp = RESTART_UTF8;
9625 REGTAIL(pRExC_state, ret, tail);
9629 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9630 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9632 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9633 char *name_start= RExC_parse++;
9635 SV *sv_dat=reg_scan_name(pRExC_state,
9636 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9637 if (RExC_parse == name_start || *RExC_parse != ch)
9638 vFAIL2("Sequence (?(%c... not terminated",
9639 (ch == '>' ? '<' : ch));
9642 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9643 RExC_rxi->data->data[num]=(void*)sv_dat;
9644 SvREFCNT_inc_simple_void(sv_dat);
9646 ret = reganode(pRExC_state,NGROUPP,num);
9647 goto insert_if_check_paren;
9649 else if (RExC_parse[0] == 'D' &&
9650 RExC_parse[1] == 'E' &&
9651 RExC_parse[2] == 'F' &&
9652 RExC_parse[3] == 'I' &&
9653 RExC_parse[4] == 'N' &&
9654 RExC_parse[5] == 'E')
9656 ret = reganode(pRExC_state,DEFINEP,0);
9659 goto insert_if_check_paren;
9661 else if (RExC_parse[0] == 'R') {
9664 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9665 parno = atoi(RExC_parse++);
9666 while (isDIGIT(*RExC_parse))
9668 } else if (RExC_parse[0] == '&') {
9671 sv_dat = reg_scan_name(pRExC_state,
9672 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9673 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9675 ret = reganode(pRExC_state,INSUBP,parno);
9676 goto insert_if_check_paren;
9678 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9682 parno = atoi(RExC_parse++);
9684 while (isDIGIT(*RExC_parse))
9686 ret = reganode(pRExC_state, GROUPP, parno);
9688 insert_if_check_paren:
9689 if (*(tmp = nextchar(pRExC_state)) != ')') {
9690 /* nextchar also skips comments, so undo its work
9691 * and skip over the the next character.
9694 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9695 vFAIL("Switch condition not recognized");
9698 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9699 br = regbranch(pRExC_state, &flags, 1,depth+1);
9701 if (flags & RESTART_UTF8) {
9702 *flagp = RESTART_UTF8;
9705 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9708 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9709 c = *nextchar(pRExC_state);
9714 vFAIL("(?(DEFINE)....) does not allow branches");
9715 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9716 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9717 if (flags & RESTART_UTF8) {
9718 *flagp = RESTART_UTF8;
9721 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9724 REGTAIL(pRExC_state, ret, lastbr);
9727 c = *nextchar(pRExC_state);
9732 vFAIL("Switch (?(condition)... contains too many branches");
9733 ender = reg_node(pRExC_state, TAIL);
9734 REGTAIL(pRExC_state, br, ender);
9736 REGTAIL(pRExC_state, lastbr, ender);
9737 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9740 REGTAIL(pRExC_state, ret, ender);
9741 RExC_size++; /* XXX WHY do we need this?!!
9742 For large programs it seems to be required
9743 but I can't figure out why. -- dmq*/
9747 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9748 vFAIL("Unknown switch condition (?(...))");
9751 case '[': /* (?[ ... ]) */
9752 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9755 RExC_parse--; /* for vFAIL to print correctly */
9756 vFAIL("Sequence (? incomplete");
9758 default: /* e.g., (?i) */
9761 parse_lparen_question_flags(pRExC_state);
9762 if (UCHARAT(RExC_parse) != ':') {
9763 nextchar(pRExC_state);
9768 nextchar(pRExC_state);
9778 ret = reganode(pRExC_state, OPEN, parno);
9781 RExC_nestroot = parno;
9782 if (RExC_seen & REG_SEEN_RECURSE
9783 && !RExC_open_parens[parno-1])
9785 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9786 "Setting open paren #%"IVdf" to %d\n",
9787 (IV)parno, REG_NODE_NUM(ret)));
9788 RExC_open_parens[parno-1]= ret;
9791 Set_Node_Length(ret, 1); /* MJD */
9792 Set_Node_Offset(ret, RExC_parse); /* MJD */
9800 /* Pick up the branches, linking them together. */
9801 parse_start = RExC_parse; /* MJD */
9802 br = regbranch(pRExC_state, &flags, 1,depth+1);
9804 /* branch_len = (paren != 0); */
9807 if (flags & RESTART_UTF8) {
9808 *flagp = RESTART_UTF8;
9811 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9813 if (*RExC_parse == '|') {
9814 if (!SIZE_ONLY && RExC_extralen) {
9815 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9818 reginsert(pRExC_state, BRANCH, br, depth+1);
9819 Set_Node_Length(br, paren != 0);
9820 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9824 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9826 else if (paren == ':') {
9827 *flagp |= flags&SIMPLE;
9829 if (is_open) { /* Starts with OPEN. */
9830 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9832 else if (paren != '?') /* Not Conditional */
9834 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9836 while (*RExC_parse == '|') {
9837 if (!SIZE_ONLY && RExC_extralen) {
9838 ender = reganode(pRExC_state, LONGJMP,0);
9839 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9842 RExC_extralen += 2; /* Account for LONGJMP. */
9843 nextchar(pRExC_state);
9845 if (RExC_npar > after_freeze)
9846 after_freeze = RExC_npar;
9847 RExC_npar = freeze_paren;
9849 br = regbranch(pRExC_state, &flags, 0, depth+1);
9852 if (flags & RESTART_UTF8) {
9853 *flagp = RESTART_UTF8;
9856 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9858 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9860 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9863 if (have_branch || paren != ':') {
9864 /* Make a closing node, and hook it on the end. */
9867 ender = reg_node(pRExC_state, TAIL);
9870 ender = reganode(pRExC_state, CLOSE, parno);
9871 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9872 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9873 "Setting close paren #%"IVdf" to %d\n",
9874 (IV)parno, REG_NODE_NUM(ender)));
9875 RExC_close_parens[parno-1]= ender;
9876 if (RExC_nestroot == parno)
9879 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9880 Set_Node_Length(ender,1); /* MJD */
9886 *flagp &= ~HASWIDTH;
9889 ender = reg_node(pRExC_state, SUCCEED);
9892 ender = reg_node(pRExC_state, END);
9894 assert(!RExC_opend); /* there can only be one! */
9899 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9900 SV * const mysv_val1=sv_newmortal();
9901 SV * const mysv_val2=sv_newmortal();
9902 DEBUG_PARSE_MSG("lsbr");
9903 regprop(RExC_rx, mysv_val1, lastbr);
9904 regprop(RExC_rx, mysv_val2, ender);
9905 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9906 SvPV_nolen_const(mysv_val1),
9907 (IV)REG_NODE_NUM(lastbr),
9908 SvPV_nolen_const(mysv_val2),
9909 (IV)REG_NODE_NUM(ender),
9910 (IV)(ender - lastbr)
9913 REGTAIL(pRExC_state, lastbr, ender);
9915 if (have_branch && !SIZE_ONLY) {
9918 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9920 /* Hook the tails of the branches to the closing node. */
9921 for (br = ret; br; br = regnext(br)) {
9922 const U8 op = PL_regkind[OP(br)];
9924 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9925 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9928 else if (op == BRANCHJ) {
9929 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9930 /* for now we always disable this optimisation * /
9931 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9937 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9938 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9939 SV * const mysv_val1=sv_newmortal();
9940 SV * const mysv_val2=sv_newmortal();
9941 DEBUG_PARSE_MSG("NADA");
9942 regprop(RExC_rx, mysv_val1, ret);
9943 regprop(RExC_rx, mysv_val2, ender);
9944 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9945 SvPV_nolen_const(mysv_val1),
9946 (IV)REG_NODE_NUM(ret),
9947 SvPV_nolen_const(mysv_val2),
9948 (IV)REG_NODE_NUM(ender),
9953 if (OP(ender) == TAIL) {
9958 for ( opt= br + 1; opt < ender ; opt++ )
9960 NEXT_OFF(br)= ender - br;
9968 static const char parens[] = "=!<,>";
9970 if (paren && (p = strchr(parens, paren))) {
9971 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9972 int flag = (p - parens) > 1;
9975 node = SUSPEND, flag = 0;
9976 reginsert(pRExC_state, node,ret, depth+1);
9977 Set_Node_Cur_Length(ret, parse_start);
9978 Set_Node_Offset(ret, parse_start + 1);
9980 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9984 /* Check for proper termination. */
9986 /* restore original flags, but keep (?p) */
9987 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9988 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9989 RExC_parse = oregcomp_parse;
9990 vFAIL("Unmatched (");
9993 else if (!paren && RExC_parse < RExC_end) {
9994 if (*RExC_parse == ')') {
9996 vFAIL("Unmatched )");
9999 FAIL("Junk on end of regexp"); /* "Can't happen". */
10000 assert(0); /* NOTREACHED */
10003 if (RExC_in_lookbehind) {
10004 RExC_in_lookbehind--;
10006 if (after_freeze > RExC_npar)
10007 RExC_npar = after_freeze;
10012 - regbranch - one alternative of an | operator
10014 * Implements the concatenation operator.
10016 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10020 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10024 regnode *chain = NULL;
10026 I32 flags = 0, c = 0;
10027 GET_RE_DEBUG_FLAGS_DECL;
10029 PERL_ARGS_ASSERT_REGBRANCH;
10031 DEBUG_PARSE("brnc");
10036 if (!SIZE_ONLY && RExC_extralen)
10037 ret = reganode(pRExC_state, BRANCHJ,0);
10039 ret = reg_node(pRExC_state, BRANCH);
10040 Set_Node_Length(ret, 1);
10044 if (!first && SIZE_ONLY)
10045 RExC_extralen += 1; /* BRANCHJ */
10047 *flagp = WORST; /* Tentatively. */
10050 nextchar(pRExC_state);
10051 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10052 flags &= ~TRYAGAIN;
10053 latest = regpiece(pRExC_state, &flags,depth+1);
10054 if (latest == NULL) {
10055 if (flags & TRYAGAIN)
10057 if (flags & RESTART_UTF8) {
10058 *flagp = RESTART_UTF8;
10061 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10063 else if (ret == NULL)
10065 *flagp |= flags&(HASWIDTH|POSTPONED);
10066 if (chain == NULL) /* First piece. */
10067 *flagp |= flags&SPSTART;
10070 REGTAIL(pRExC_state, chain, latest);
10075 if (chain == NULL) { /* Loop ran zero times. */
10076 chain = reg_node(pRExC_state, NOTHING);
10081 *flagp |= flags&SIMPLE;
10088 - regpiece - something followed by possible [*+?]
10090 * Note that the branching code sequences used for ? and the general cases
10091 * of * and + are somewhat optimized: they use the same NOTHING node as
10092 * both the endmarker for their branch list and the body of the last branch.
10093 * It might seem that this node could be dispensed with entirely, but the
10094 * endmarker role is not redundant.
10096 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10098 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10102 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10109 const char * const origparse = RExC_parse;
10111 I32 max = REG_INFTY;
10112 #ifdef RE_TRACK_PATTERN_OFFSETS
10115 const char *maxpos = NULL;
10117 /* Save the original in case we change the emitted regop to a FAIL. */
10118 regnode * const orig_emit = RExC_emit;
10120 GET_RE_DEBUG_FLAGS_DECL;
10122 PERL_ARGS_ASSERT_REGPIECE;
10124 DEBUG_PARSE("piec");
10126 ret = regatom(pRExC_state, &flags,depth+1);
10128 if (flags & (TRYAGAIN|RESTART_UTF8))
10129 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10131 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10137 if (op == '{' && regcurly(RExC_parse, FALSE)) {
10139 #ifdef RE_TRACK_PATTERN_OFFSETS
10140 parse_start = RExC_parse; /* MJD */
10142 next = RExC_parse + 1;
10143 while (isDIGIT(*next) || *next == ',') {
10144 if (*next == ',') {
10152 if (*next == '}') { /* got one */
10156 min = atoi(RExC_parse);
10157 if (*maxpos == ',')
10160 maxpos = RExC_parse;
10161 max = atoi(maxpos);
10162 if (!max && *maxpos != '0')
10163 max = REG_INFTY; /* meaning "infinity" */
10164 else if (max >= REG_INFTY)
10165 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10167 nextchar(pRExC_state);
10168 if (max < min) { /* If can't match, warn and optimize to fail
10171 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10173 /* We can't back off the size because we have to reserve
10174 * enough space for all the things we are about to throw
10175 * away, but we can shrink it by the ammount we are about
10176 * to re-use here */
10177 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10180 RExC_emit = orig_emit;
10182 ret = reg_node(pRExC_state, OPFAIL);
10187 if ((flags&SIMPLE)) {
10188 RExC_naughty += 2 + RExC_naughty / 2;
10189 reginsert(pRExC_state, CURLY, ret, depth+1);
10190 Set_Node_Offset(ret, parse_start+1); /* MJD */
10191 Set_Node_Cur_Length(ret, parse_start);
10194 regnode * const w = reg_node(pRExC_state, WHILEM);
10197 REGTAIL(pRExC_state, ret, w);
10198 if (!SIZE_ONLY && RExC_extralen) {
10199 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10200 reginsert(pRExC_state, NOTHING,ret, depth+1);
10201 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10203 reginsert(pRExC_state, CURLYX,ret, depth+1);
10205 Set_Node_Offset(ret, parse_start+1);
10206 Set_Node_Length(ret,
10207 op == '{' ? (RExC_parse - parse_start) : 1);
10209 if (!SIZE_ONLY && RExC_extralen)
10210 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10211 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10213 RExC_whilem_seen++, RExC_extralen += 3;
10214 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10221 *flagp |= HASWIDTH;
10223 ARG1_SET(ret, (U16)min);
10224 ARG2_SET(ret, (U16)max);
10231 if (!ISMULT1(op)) {
10236 #if 0 /* Now runtime fix should be reliable. */
10238 /* if this is reinstated, don't forget to put this back into perldiag:
10240 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10242 (F) The part of the regexp subject to either the * or + quantifier
10243 could match an empty string. The {#} shows in the regular
10244 expression about where the problem was discovered.
10248 if (!(flags&HASWIDTH) && op != '?')
10249 vFAIL("Regexp *+ operand could be empty");
10252 #ifdef RE_TRACK_PATTERN_OFFSETS
10253 parse_start = RExC_parse;
10255 nextchar(pRExC_state);
10257 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10259 if (op == '*' && (flags&SIMPLE)) {
10260 reginsert(pRExC_state, STAR, ret, depth+1);
10264 else if (op == '*') {
10268 else if (op == '+' && (flags&SIMPLE)) {
10269 reginsert(pRExC_state, PLUS, ret, depth+1);
10273 else if (op == '+') {
10277 else if (op == '?') {
10282 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10283 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10284 ckWARN2reg(RExC_parse,
10285 "%"UTF8f" matches null string many times",
10286 UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10288 (void)ReREFCNT_inc(RExC_rx_sv);
10291 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10292 nextchar(pRExC_state);
10293 reginsert(pRExC_state, MINMOD, ret, depth+1);
10294 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10297 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10299 nextchar(pRExC_state);
10300 ender = reg_node(pRExC_state, SUCCEED);
10301 REGTAIL(pRExC_state, ret, ender);
10302 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10304 ender = reg_node(pRExC_state, TAIL);
10305 REGTAIL(pRExC_state, ret, ender);
10308 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10310 vFAIL("Nested quantifiers");
10317 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10318 const bool strict /* Apply stricter parsing rules? */
10322 /* This is expected to be called by a parser routine that has recognized '\N'
10323 and needs to handle the rest. RExC_parse is expected to point at the first
10324 char following the N at the time of the call. On successful return,
10325 RExC_parse has been updated to point to just after the sequence identified
10326 by this routine, and <*flagp> has been updated.
10328 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10331 \N may begin either a named sequence, or if outside a character class, mean
10332 to match a non-newline. For non single-quoted regexes, the tokenizer has
10333 attempted to decide which, and in the case of a named sequence, converted it
10334 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10335 where c1... are the characters in the sequence. For single-quoted regexes,
10336 the tokenizer passes the \N sequence through unchanged; this code will not
10337 attempt to determine this nor expand those, instead raising a syntax error.
10338 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10339 or there is no '}', it signals that this \N occurrence means to match a
10342 Only the \N{U+...} form should occur in a character class, for the same
10343 reason that '.' inside a character class means to just match a period: it
10344 just doesn't make sense.
10346 The function raises an error (via vFAIL), and doesn't return for various
10347 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10348 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10349 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10350 only possible if node_p is non-NULL.
10353 If <valuep> is non-null, it means the caller can accept an input sequence
10354 consisting of a just a single code point; <*valuep> is set to that value
10355 if the input is such.
10357 If <node_p> is non-null it signifies that the caller can accept any other
10358 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10360 1) \N means not-a-NL: points to a newly created REG_ANY node;
10361 2) \N{}: points to a new NOTHING node;
10362 3) otherwise: points to a new EXACT node containing the resolved
10364 Note that FALSE is returned for single code point sequences if <valuep> is
10368 char * endbrace; /* '}' following the name */
10370 char *endchar; /* Points to '.' or '}' ending cur char in the input
10372 bool has_multiple_chars; /* true if the input stream contains a sequence of
10373 more than one character */
10375 GET_RE_DEBUG_FLAGS_DECL;
10377 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10379 GET_RE_DEBUG_FLAGS;
10381 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10383 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10384 * modifier. The other meaning does not, so use a temporary until we find
10385 * out which we are being called with */
10386 p = (RExC_flags & RXf_PMf_EXTENDED)
10387 ? regwhite( pRExC_state, RExC_parse )
10390 /* Disambiguate between \N meaning a named character versus \N meaning
10391 * [^\n]. The former is assumed when it can't be the latter. */
10392 if (*p != '{' || regcurly(p, FALSE)) {
10395 /* no bare \N allowed in a charclass */
10396 if (in_char_class) {
10397 vFAIL("\\N in a character class must be a named character: \\N{...}");
10401 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10403 nextchar(pRExC_state);
10404 *node_p = reg_node(pRExC_state, REG_ANY);
10405 *flagp |= HASWIDTH|SIMPLE;
10407 Set_Node_Length(*node_p, 1); /* MJD */
10411 /* Here, we have decided it should be a named character or sequence */
10413 /* The test above made sure that the next real character is a '{', but
10414 * under the /x modifier, it could be separated by space (or a comment and
10415 * \n) and this is not allowed (for consistency with \x{...} and the
10416 * tokenizer handling of \N{NAME}). */
10417 if (*RExC_parse != '{') {
10418 vFAIL("Missing braces on \\N{}");
10421 RExC_parse++; /* Skip past the '{' */
10423 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10424 || ! (endbrace == RExC_parse /* nothing between the {} */
10425 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
10426 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10428 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10429 vFAIL("\\N{NAME} must be resolved by the lexer");
10432 if (endbrace == RExC_parse) { /* empty: \N{} */
10435 *node_p = reg_node(pRExC_state,NOTHING);
10437 else if (in_char_class) {
10438 if (SIZE_ONLY && in_char_class) {
10440 RExC_parse++; /* Position after the "}" */
10441 vFAIL("Zero length \\N{}");
10444 ckWARNreg(RExC_parse,
10445 "Ignoring zero length \\N{} in character class");
10453 nextchar(pRExC_state);
10457 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10458 RExC_parse += 2; /* Skip past the 'U+' */
10460 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10462 /* Code points are separated by dots. If none, there is only one code
10463 * point, and is terminated by the brace */
10464 has_multiple_chars = (endchar < endbrace);
10466 if (valuep && (! has_multiple_chars || in_char_class)) {
10467 /* We only pay attention to the first char of
10468 multichar strings being returned in char classes. I kinda wonder
10469 if this makes sense as it does change the behaviour
10470 from earlier versions, OTOH that behaviour was broken
10471 as well. XXX Solution is to recharacterize as
10472 [rest-of-class]|multi1|multi2... */
10474 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10475 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10476 | PERL_SCAN_DISALLOW_PREFIX
10477 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10479 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10481 /* The tokenizer should have guaranteed validity, but it's possible to
10482 * bypass it by using single quoting, so check */
10483 if (length_of_hex == 0
10484 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10486 RExC_parse += length_of_hex; /* Includes all the valid */
10487 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10488 ? UTF8SKIP(RExC_parse)
10490 /* Guard against malformed utf8 */
10491 if (RExC_parse >= endchar) {
10492 RExC_parse = endchar;
10494 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10497 if (in_char_class && has_multiple_chars) {
10499 RExC_parse = endbrace;
10500 vFAIL("\\N{} in character class restricted to one character");
10503 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10507 RExC_parse = endbrace + 1;
10509 else if (! node_p || ! has_multiple_chars) {
10511 /* Here, the input is legal, but not according to the caller's
10512 * options. We fail without advancing the parse, so that the
10513 * caller can try again */
10519 /* What is done here is to convert this to a sub-pattern of the form
10520 * (?:\x{char1}\x{char2}...)
10521 * and then call reg recursively. That way, it retains its atomicness,
10522 * while not having to worry about special handling that some code
10523 * points may have. toke.c has converted the original Unicode values
10524 * to native, so that we can just pass on the hex values unchanged. We
10525 * do have to set a flag to keep recoding from happening in the
10528 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10530 char *orig_end = RExC_end;
10533 while (RExC_parse < endbrace) {
10535 /* Convert to notation the rest of the code understands */
10536 sv_catpv(substitute_parse, "\\x{");
10537 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10538 sv_catpv(substitute_parse, "}");
10540 /* Point to the beginning of the next character in the sequence. */
10541 RExC_parse = endchar + 1;
10542 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10544 sv_catpv(substitute_parse, ")");
10546 RExC_parse = SvPV(substitute_parse, len);
10548 /* Don't allow empty number */
10550 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10552 RExC_end = RExC_parse + len;
10554 /* The values are Unicode, and therefore not subject to recoding */
10555 RExC_override_recoding = 1;
10557 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10558 if (flags & RESTART_UTF8) {
10559 *flagp = RESTART_UTF8;
10562 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10565 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10567 RExC_parse = endbrace;
10568 RExC_end = orig_end;
10569 RExC_override_recoding = 0;
10571 nextchar(pRExC_state);
10581 * It returns the code point in utf8 for the value in *encp.
10582 * value: a code value in the source encoding
10583 * encp: a pointer to an Encode object
10585 * If the result from Encode is not a single character,
10586 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10589 S_reg_recode(pTHX_ const char value, SV **encp)
10592 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10593 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10594 const STRLEN newlen = SvCUR(sv);
10595 UV uv = UNICODE_REPLACEMENT;
10597 PERL_ARGS_ASSERT_REG_RECODE;
10601 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10604 if (!newlen || numlen != newlen) {
10605 uv = UNICODE_REPLACEMENT;
10611 PERL_STATIC_INLINE U8
10612 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10616 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10622 op = get_regex_charset(RExC_flags);
10623 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10624 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10625 been, so there is no hole */
10628 return op + EXACTF;
10631 PERL_STATIC_INLINE void
10632 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10634 /* This knows the details about sizing an EXACTish node, setting flags for
10635 * it (by setting <*flagp>, and potentially populating it with a single
10638 * If <len> (the length in bytes) is non-zero, this function assumes that
10639 * the node has already been populated, and just does the sizing. In this
10640 * case <code_point> should be the final code point that has already been
10641 * placed into the node. This value will be ignored except that under some
10642 * circumstances <*flagp> is set based on it.
10644 * If <len> is zero, the function assumes that the node is to contain only
10645 * the single character given by <code_point> and calculates what <len>
10646 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10647 * additionally will populate the node's STRING with <code_point>, if <len>
10648 * is 0. In both cases <*flagp> is appropriately set
10650 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10651 * 255, must be folded (the former only when the rules indicate it can
10654 bool len_passed_in = cBOOL(len != 0);
10655 U8 character[UTF8_MAXBYTES_CASE+1];
10657 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10659 if (! len_passed_in) {
10661 if (FOLD && (! LOC || code_point > 255)) {
10662 _to_uni_fold_flags(code_point,
10665 FOLD_FLAGS_FULL | ((LOC)
10666 ? FOLD_FLAGS_LOCALE
10667 : (ASCII_FOLD_RESTRICTED)
10668 ? FOLD_FLAGS_NOMIX_ASCII
10672 uvchr_to_utf8( character, code_point);
10673 len = UTF8SKIP(character);
10677 || code_point != LATIN_SMALL_LETTER_SHARP_S
10678 || ASCII_FOLD_RESTRICTED
10679 || ! AT_LEAST_UNI_SEMANTICS)
10681 *character = (U8) code_point;
10686 *(character + 1) = 's';
10692 RExC_size += STR_SZ(len);
10695 RExC_emit += STR_SZ(len);
10696 STR_LEN(node) = len;
10697 if (! len_passed_in) {
10698 Copy((char *) character, STRING(node), len, char);
10702 *flagp |= HASWIDTH;
10704 /* A single character node is SIMPLE, except for the special-cased SHARP S
10706 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10707 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10708 || ! FOLD || ! DEPENDS_SEMANTICS))
10715 /* return atoi(p), unless it's too big to sensibly be a backref,
10716 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10719 S_backref_value(char *p)
10723 for (;isDIGIT(*q); q++); /* calculate length of num */
10724 if (q - p == 0 || q - p > 9)
10731 - regatom - the lowest level
10733 Try to identify anything special at the start of the pattern. If there
10734 is, then handle it as required. This may involve generating a single regop,
10735 such as for an assertion; or it may involve recursing, such as to
10736 handle a () structure.
10738 If the string doesn't start with something special then we gobble up
10739 as much literal text as we can.
10741 Once we have been able to handle whatever type of thing started the
10742 sequence, we return.
10744 Note: we have to be careful with escapes, as they can be both literal
10745 and special, and in the case of \10 and friends, context determines which.
10747 A summary of the code structure is:
10749 switch (first_byte) {
10750 cases for each special:
10751 handle this special;
10754 switch (2nd byte) {
10755 cases for each unambiguous special:
10756 handle this special;
10758 cases for each ambigous special/literal:
10760 if (special) handle here
10762 default: // unambiguously literal:
10765 default: // is a literal char
10768 create EXACTish node for literal;
10769 while (more input and node isn't full) {
10770 switch (input_byte) {
10771 cases for each special;
10772 make sure parse pointer is set so that the next call to
10773 regatom will see this special first
10774 goto loopdone; // EXACTish node terminated by prev. char
10776 append char to EXACTISH node;
10778 get next input byte;
10782 return the generated node;
10784 Specifically there are two separate switches for handling
10785 escape sequences, with the one for handling literal escapes requiring
10786 a dummy entry for all of the special escapes that are actually handled
10789 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10791 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10793 Otherwise does not return NULL.
10797 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10800 regnode *ret = NULL;
10802 char *parse_start = RExC_parse;
10806 GET_RE_DEBUG_FLAGS_DECL;
10808 *flagp = WORST; /* Tentatively. */
10810 DEBUG_PARSE("atom");
10812 PERL_ARGS_ASSERT_REGATOM;
10815 switch ((U8)*RExC_parse) {
10817 RExC_seen_zerolen++;
10818 nextchar(pRExC_state);
10819 if (RExC_flags & RXf_PMf_MULTILINE)
10820 ret = reg_node(pRExC_state, MBOL);
10821 else if (RExC_flags & RXf_PMf_SINGLELINE)
10822 ret = reg_node(pRExC_state, SBOL);
10824 ret = reg_node(pRExC_state, BOL);
10825 Set_Node_Length(ret, 1); /* MJD */
10828 nextchar(pRExC_state);
10830 RExC_seen_zerolen++;
10831 if (RExC_flags & RXf_PMf_MULTILINE)
10832 ret = reg_node(pRExC_state, MEOL);
10833 else if (RExC_flags & RXf_PMf_SINGLELINE)
10834 ret = reg_node(pRExC_state, SEOL);
10836 ret = reg_node(pRExC_state, EOL);
10837 Set_Node_Length(ret, 1); /* MJD */
10840 nextchar(pRExC_state);
10841 if (RExC_flags & RXf_PMf_SINGLELINE)
10842 ret = reg_node(pRExC_state, SANY);
10844 ret = reg_node(pRExC_state, REG_ANY);
10845 *flagp |= HASWIDTH|SIMPLE;
10847 Set_Node_Length(ret, 1); /* MJD */
10851 char * const oregcomp_parse = ++RExC_parse;
10852 ret = regclass(pRExC_state, flagp,depth+1,
10853 FALSE, /* means parse the whole char class */
10854 TRUE, /* allow multi-char folds */
10855 FALSE, /* don't silence non-portable warnings. */
10857 if (*RExC_parse != ']') {
10858 RExC_parse = oregcomp_parse;
10859 vFAIL("Unmatched [");
10862 if (*flagp & RESTART_UTF8)
10864 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10867 nextchar(pRExC_state);
10868 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10872 nextchar(pRExC_state);
10873 ret = reg(pRExC_state, 2, &flags,depth+1);
10875 if (flags & TRYAGAIN) {
10876 if (RExC_parse == RExC_end) {
10877 /* Make parent create an empty node if needed. */
10878 *flagp |= TRYAGAIN;
10883 if (flags & RESTART_UTF8) {
10884 *flagp = RESTART_UTF8;
10887 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10889 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10893 if (flags & TRYAGAIN) {
10894 *flagp |= TRYAGAIN;
10897 vFAIL("Internal urp");
10898 /* Supposed to be caught earlier. */
10901 if (!regcurly(RExC_parse, FALSE)) {
10910 vFAIL("Quantifier follows nothing");
10915 This switch handles escape sequences that resolve to some kind
10916 of special regop and not to literal text. Escape sequnces that
10917 resolve to literal text are handled below in the switch marked
10920 Every entry in this switch *must* have a corresponding entry
10921 in the literal escape switch. However, the opposite is not
10922 required, as the default for this switch is to jump to the
10923 literal text handling code.
10925 switch ((U8)*++RExC_parse) {
10927 /* Special Escapes */
10929 RExC_seen_zerolen++;
10930 ret = reg_node(pRExC_state, SBOL);
10932 goto finish_meta_pat;
10934 ret = reg_node(pRExC_state, GPOS);
10935 RExC_seen |= REG_SEEN_GPOS;
10937 goto finish_meta_pat;
10939 RExC_seen_zerolen++;
10940 ret = reg_node(pRExC_state, KEEPS);
10942 /* XXX:dmq : disabling in-place substitution seems to
10943 * be necessary here to avoid cases of memory corruption, as
10944 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10946 RExC_seen |= REG_SEEN_LOOKBEHIND;
10947 goto finish_meta_pat;
10949 ret = reg_node(pRExC_state, SEOL);
10951 RExC_seen_zerolen++; /* Do not optimize RE away */
10952 goto finish_meta_pat;
10954 ret = reg_node(pRExC_state, EOS);
10956 RExC_seen_zerolen++; /* Do not optimize RE away */
10957 goto finish_meta_pat;
10959 ret = reg_node(pRExC_state, CANY);
10960 RExC_seen |= REG_SEEN_CANY;
10961 *flagp |= HASWIDTH|SIMPLE;
10962 goto finish_meta_pat;
10964 ret = reg_node(pRExC_state, CLUMP);
10965 *flagp |= HASWIDTH;
10966 goto finish_meta_pat;
10972 arg = ANYOF_WORDCHAR;
10976 RExC_seen_zerolen++;
10977 RExC_seen |= REG_SEEN_LOOKBEHIND;
10978 op = BOUND + get_regex_charset(RExC_flags);
10979 if (op > BOUNDA) { /* /aa is same as /a */
10982 ret = reg_node(pRExC_state, op);
10983 FLAGS(ret) = get_regex_charset(RExC_flags);
10985 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10986 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10988 goto finish_meta_pat;
10990 RExC_seen_zerolen++;
10991 RExC_seen |= REG_SEEN_LOOKBEHIND;
10992 op = NBOUND + get_regex_charset(RExC_flags);
10993 if (op > NBOUNDA) { /* /aa is same as /a */
10996 ret = reg_node(pRExC_state, op);
10997 FLAGS(ret) = get_regex_charset(RExC_flags);
10999 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11000 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
11002 goto finish_meta_pat;
11012 ret = reg_node(pRExC_state, LNBREAK);
11013 *flagp |= HASWIDTH|SIMPLE;
11014 goto finish_meta_pat;
11022 goto join_posix_op_known;
11028 arg = ANYOF_VERTWS;
11030 goto join_posix_op_known;
11040 op = POSIXD + get_regex_charset(RExC_flags);
11041 if (op > POSIXA) { /* /aa is same as /a */
11045 join_posix_op_known:
11048 op += NPOSIXD - POSIXD;
11051 ret = reg_node(pRExC_state, op);
11053 FLAGS(ret) = namedclass_to_classnum(arg);
11056 *flagp |= HASWIDTH|SIMPLE;
11060 nextchar(pRExC_state);
11061 Set_Node_Length(ret, 2); /* MJD */
11067 char* parse_start = RExC_parse - 2;
11072 ret = regclass(pRExC_state, flagp,depth+1,
11073 TRUE, /* means just parse this element */
11074 FALSE, /* don't allow multi-char folds */
11075 FALSE, /* don't silence non-portable warnings.
11076 It would be a bug if these returned
11079 /* regclass() can only return RESTART_UTF8 if multi-char folds
11082 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11087 Set_Node_Offset(ret, parse_start + 2);
11088 Set_Node_Cur_Length(ret, parse_start);
11089 nextchar(pRExC_state);
11093 /* Handle \N and \N{NAME} with multiple code points here and not
11094 * below because it can be multicharacter. join_exact() will join
11095 * them up later on. Also this makes sure that things like
11096 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11097 * The options to the grok function call causes it to fail if the
11098 * sequence is just a single code point. We then go treat it as
11099 * just another character in the current EXACT node, and hence it
11100 * gets uniform treatment with all the other characters. The
11101 * special treatment for quantifiers is not needed for such single
11102 * character sequences */
11104 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11105 FALSE /* not strict */ )) {
11106 if (*flagp & RESTART_UTF8)
11112 case 'k': /* Handle \k<NAME> and \k'NAME' */
11115 char ch= RExC_parse[1];
11116 if (ch != '<' && ch != '\'' && ch != '{') {
11118 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11119 vFAIL2("Sequence %.2s... not terminated",parse_start);
11121 /* this pretty much dupes the code for (?P=...) in reg(), if
11122 you change this make sure you change that */
11123 char* name_start = (RExC_parse += 2);
11125 SV *sv_dat = reg_scan_name(pRExC_state,
11126 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11127 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11128 if (RExC_parse == name_start || *RExC_parse != ch)
11129 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11130 vFAIL2("Sequence %.3s... not terminated",parse_start);
11133 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11134 RExC_rxi->data->data[num]=(void*)sv_dat;
11135 SvREFCNT_inc_simple_void(sv_dat);
11139 ret = reganode(pRExC_state,
11142 : (ASCII_FOLD_RESTRICTED)
11144 : (AT_LEAST_UNI_SEMANTICS)
11150 *flagp |= HASWIDTH;
11152 /* override incorrect value set in reganode MJD */
11153 Set_Node_Offset(ret, parse_start+1);
11154 Set_Node_Cur_Length(ret, parse_start);
11155 nextchar(pRExC_state);
11161 case '1': case '2': case '3': case '4':
11162 case '5': case '6': case '7': case '8': case '9':
11167 if (*RExC_parse == 'g') {
11171 if (*RExC_parse == '{') {
11175 if (*RExC_parse == '-') {
11179 if (hasbrace && !isDIGIT(*RExC_parse)) {
11180 if (isrel) RExC_parse--;
11182 goto parse_named_seq;
11185 num = S_backref_value(RExC_parse);
11187 vFAIL("Reference to invalid group 0");
11188 else if (num == I32_MAX) {
11189 if (isDIGIT(*RExC_parse))
11190 vFAIL("Reference to nonexistent group");
11192 vFAIL("Unterminated \\g... pattern");
11196 num = RExC_npar - num;
11198 vFAIL("Reference to nonexistent or unclosed group");
11202 num = S_backref_value(RExC_parse);
11203 /* bare \NNN might be backref or octal */
11204 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11205 && *RExC_parse != '8' && *RExC_parse != '9'))
11206 /* Probably a character specified in octal, e.g. \35 */
11210 /* at this point RExC_parse definitely points to a backref
11213 #ifdef RE_TRACK_PATTERN_OFFSETS
11214 char * const parse_start = RExC_parse - 1; /* MJD */
11216 while (isDIGIT(*RExC_parse))
11219 if (*RExC_parse != '}')
11220 vFAIL("Unterminated \\g{...} pattern");
11224 if (num > (I32)RExC_rx->nparens)
11225 vFAIL("Reference to nonexistent group");
11228 ret = reganode(pRExC_state,
11231 : (ASCII_FOLD_RESTRICTED)
11233 : (AT_LEAST_UNI_SEMANTICS)
11239 *flagp |= HASWIDTH;
11241 /* override incorrect value set in reganode MJD */
11242 Set_Node_Offset(ret, parse_start+1);
11243 Set_Node_Cur_Length(ret, parse_start);
11245 nextchar(pRExC_state);
11250 if (RExC_parse >= RExC_end)
11251 FAIL("Trailing \\");
11254 /* Do not generate "unrecognized" warnings here, we fall
11255 back into the quick-grab loop below */
11262 if (RExC_flags & RXf_PMf_EXTENDED) {
11263 if ( reg_skipcomment( pRExC_state ) )
11270 parse_start = RExC_parse - 1;
11279 #define MAX_NODE_STRING_SIZE 127
11280 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11282 U8 upper_parse = MAX_NODE_STRING_SIZE;
11284 U8 node_type = compute_EXACTish(pRExC_state);
11285 bool next_is_quantifier;
11286 char * oldp = NULL;
11288 /* We can convert EXACTF nodes to EXACTFU if they contain only
11289 * characters that match identically regardless of the target
11290 * string's UTF8ness. The reason to do this is that EXACTF is not
11291 * trie-able, EXACTFU is. (We don't need to figure this out until
11293 bool maybe_exactfu = node_type == EXACTF && PASS2;
11295 /* If a folding node contains only code points that don't
11296 * participate in folds, it can be changed into an EXACT node,
11297 * which allows the optimizer more things to look for */
11300 ret = reg_node(pRExC_state, node_type);
11302 /* In pass1, folded, we use a temporary buffer instead of the
11303 * actual node, as the node doesn't exist yet */
11304 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11310 /* We do the EXACTFish to EXACT node only if folding, and not if in
11311 * locale, as whether a character folds or not isn't known until
11312 * runtime. (And we don't need to figure this out until pass 2) */
11313 maybe_exact = FOLD && ! LOC && PASS2;
11315 /* XXX The node can hold up to 255 bytes, yet this only goes to
11316 * 127. I (khw) do not know why. Keeping it somewhat less than
11317 * 255 allows us to not have to worry about overflow due to
11318 * converting to utf8 and fold expansion, but that value is
11319 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11320 * split up by this limit into a single one using the real max of
11321 * 255. Even at 127, this breaks under rare circumstances. If
11322 * folding, we do not want to split a node at a character that is a
11323 * non-final in a multi-char fold, as an input string could just
11324 * happen to want to match across the node boundary. The join
11325 * would solve that problem if the join actually happens. But a
11326 * series of more than two nodes in a row each of 127 would cause
11327 * the first join to succeed to get to 254, but then there wouldn't
11328 * be room for the next one, which could at be one of those split
11329 * multi-char folds. I don't know of any fool-proof solution. One
11330 * could back off to end with only a code point that isn't such a
11331 * non-final, but it is possible for there not to be any in the
11333 for (p = RExC_parse - 1;
11334 len < upper_parse && p < RExC_end;
11339 if (RExC_flags & RXf_PMf_EXTENDED)
11340 p = regwhite( pRExC_state, p );
11351 /* Literal Escapes Switch
11353 This switch is meant to handle escape sequences that
11354 resolve to a literal character.
11356 Every escape sequence that represents something
11357 else, like an assertion or a char class, is handled
11358 in the switch marked 'Special Escapes' above in this
11359 routine, but also has an entry here as anything that
11360 isn't explicitly mentioned here will be treated as
11361 an unescaped equivalent literal.
11364 switch ((U8)*++p) {
11365 /* These are all the special escapes. */
11366 case 'A': /* Start assertion */
11367 case 'b': case 'B': /* Word-boundary assertion*/
11368 case 'C': /* Single char !DANGEROUS! */
11369 case 'd': case 'D': /* digit class */
11370 case 'g': case 'G': /* generic-backref, pos assertion */
11371 case 'h': case 'H': /* HORIZWS */
11372 case 'k': case 'K': /* named backref, keep marker */
11373 case 'p': case 'P': /* Unicode property */
11374 case 'R': /* LNBREAK */
11375 case 's': case 'S': /* space class */
11376 case 'v': case 'V': /* VERTWS */
11377 case 'w': case 'W': /* word class */
11378 case 'X': /* eXtended Unicode "combining character sequence" */
11379 case 'z': case 'Z': /* End of line/string assertion */
11383 /* Anything after here is an escape that resolves to a
11384 literal. (Except digits, which may or may not)
11390 case 'N': /* Handle a single-code point named character. */
11391 /* The options cause it to fail if a multiple code
11392 * point sequence. Handle those in the switch() above
11394 RExC_parse = p + 1;
11395 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11396 flagp, depth, FALSE,
11397 FALSE /* not strict */ ))
11399 if (*flagp & RESTART_UTF8)
11400 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11401 RExC_parse = p = oldp;
11405 if (ender > 0xff) {
11422 ender = ASCII_TO_NATIVE('\033');
11432 const char* error_msg;
11434 bool valid = grok_bslash_o(&p,
11437 TRUE, /* out warnings */
11438 FALSE, /* not strict */
11439 TRUE, /* Output warnings
11444 RExC_parse = p; /* going to die anyway; point
11445 to exact spot of failure */
11449 if (PL_encoding && ender < 0x100) {
11450 goto recode_encoding;
11452 if (ender > 0xff) {
11459 UV result = UV_MAX; /* initialize to erroneous
11461 const char* error_msg;
11463 bool valid = grok_bslash_x(&p,
11466 TRUE, /* out warnings */
11467 FALSE, /* not strict */
11468 TRUE, /* Output warnings
11473 RExC_parse = p; /* going to die anyway; point
11474 to exact spot of failure */
11479 if (PL_encoding && ender < 0x100) {
11480 goto recode_encoding;
11482 if (ender > 0xff) {
11489 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11491 case '8': case '9': /* must be a backreference */
11494 case '1': case '2': case '3':case '4':
11495 case '5': case '6': case '7':
11496 /* When we parse backslash escapes there is ambiguity
11497 * between backreferences and octal escapes. Any escape
11498 * from \1 - \9 is a backreference, any multi-digit
11499 * escape which does not start with 0 and which when
11500 * evaluated as decimal could refer to an already
11501 * parsed capture buffer is a backslash. Anything else
11504 * Note this implies that \118 could be interpreted as
11505 * 118 OR as "\11" . "8" depending on whether there
11506 * were 118 capture buffers defined already in the
11508 if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11509 { /* Not to be treated as an octal constant, go
11516 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11518 ender = grok_oct(p, &numlen, &flags, NULL);
11519 if (ender > 0xff) {
11523 if (SIZE_ONLY /* like \08, \178 */
11526 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11528 reg_warn_non_literal_string(
11530 form_short_octal_warning(p, numlen));
11533 if (PL_encoding && ender < 0x100)
11534 goto recode_encoding;
11537 if (! RExC_override_recoding) {
11538 SV* enc = PL_encoding;
11539 ender = reg_recode((const char)(U8)ender, &enc);
11540 if (!enc && SIZE_ONLY)
11541 ckWARNreg(p, "Invalid escape in the specified encoding");
11547 FAIL("Trailing \\");
11550 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11551 /* Include any { following the alpha to emphasize
11552 * that it could be part of an escape at some point
11554 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11555 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11557 goto normal_default;
11558 } /* End of switch on '\' */
11560 default: /* A literal character */
11563 && RExC_flags & RXf_PMf_EXTENDED
11564 && ckWARN_d(WARN_DEPRECATED)
11565 && is_PATWS_non_low(p, UTF))
11567 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11568 "Escape literal pattern white space under /x");
11572 if (UTF8_IS_START(*p) && UTF) {
11574 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11575 &numlen, UTF8_ALLOW_DEFAULT);
11581 } /* End of switch on the literal */
11583 /* Here, have looked at the literal character and <ender>
11584 * contains its ordinal, <p> points to the character after it
11587 if ( RExC_flags & RXf_PMf_EXTENDED)
11588 p = regwhite( pRExC_state, p );
11590 /* If the next thing is a quantifier, it applies to this
11591 * character only, which means that this character has to be in
11592 * its own node and can't just be appended to the string in an
11593 * existing node, so if there are already other characters in
11594 * the node, close the node with just them, and set up to do
11595 * this character again next time through, when it will be the
11596 * only thing in its new node */
11597 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11605 const STRLEN unilen = reguni(pRExC_state, ender, s);
11611 /* The loop increments <len> each time, as all but this
11612 * path (and one other) through it add a single byte to
11613 * the EXACTish node. But this one has changed len to
11614 * be the correct final value, so subtract one to
11615 * cancel out the increment that follows */
11619 REGC((char)ender, s++);
11622 else /* FOLD */ if (! ( UTF
11623 /* See comments for join_exact() as to why we fold this
11624 * non-UTF at compile time */
11625 || (node_type == EXACTFU
11626 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11628 if (IS_IN_SOME_FOLD_L1(ender)) {
11629 maybe_exact = FALSE;
11631 /* See if the character's fold differs between /d and
11632 * /u. This includes the multi-char fold SHARP S to
11635 && (PL_fold[ender] != PL_fold_latin1[ender]
11636 || ender == LATIN_SMALL_LETTER_SHARP_S
11638 && isARG2_lower_or_UPPER_ARG1('s', ender)
11639 && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11641 maybe_exactfu = FALSE;
11644 *(s++) = (char) ender;
11648 /* Prime the casefolded buffer. Locale rules, which apply
11649 * only to code points < 256, aren't known until execution,
11650 * so for them, just output the original character using
11651 * utf8. If we start to fold non-UTF patterns, be sure to
11652 * update join_exact() */
11653 if (LOC && ender < 256) {
11654 if (UVCHR_IS_INVARIANT(ender)) {
11658 *s = UTF8_TWO_BYTE_HI(ender);
11659 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11664 UV folded = _to_uni_fold_flags(
11669 | ((LOC) ? FOLD_FLAGS_LOCALE
11670 : (ASCII_FOLD_RESTRICTED)
11671 ? FOLD_FLAGS_NOMIX_ASCII
11675 /* If this node only contains non-folding code points
11676 * so far, see if this new one is also non-folding */
11678 if (folded != ender) {
11679 maybe_exact = FALSE;
11682 /* Here the fold is the original; we have
11683 * to check further to see if anything
11685 if (! PL_utf8_foldable) {
11686 SV* swash = swash_init("utf8",
11688 &PL_sv_undef, 1, 0);
11690 _get_swash_invlist(swash);
11691 SvREFCNT_dec_NN(swash);
11693 if (_invlist_contains_cp(PL_utf8_foldable,
11696 maybe_exact = FALSE;
11704 /* The loop increments <len> each time, as all but this
11705 * path (and one other) through it add a single byte to the
11706 * EXACTish node. But this one has changed len to be the
11707 * correct final value, so subtract one to cancel out the
11708 * increment that follows */
11709 len += foldlen - 1;
11712 if (next_is_quantifier) {
11714 /* Here, the next input is a quantifier, and to get here,
11715 * the current character is the only one in the node.
11716 * Also, here <len> doesn't include the final byte for this
11722 } /* End of loop through literal characters */
11724 /* Here we have either exhausted the input or ran out of room in
11725 * the node. (If we encountered a character that can't be in the
11726 * node, transfer is made directly to <loopdone>, and so we
11727 * wouldn't have fallen off the end of the loop.) In the latter
11728 * case, we artificially have to split the node into two, because
11729 * we just don't have enough space to hold everything. This
11730 * creates a problem if the final character participates in a
11731 * multi-character fold in the non-final position, as a match that
11732 * should have occurred won't, due to the way nodes are matched,
11733 * and our artificial boundary. So back off until we find a non-
11734 * problematic character -- one that isn't at the beginning or
11735 * middle of such a fold. (Either it doesn't participate in any
11736 * folds, or appears only in the final position of all the folds it
11737 * does participate in.) A better solution with far fewer false
11738 * positives, and that would fill the nodes more completely, would
11739 * be to actually have available all the multi-character folds to
11740 * test against, and to back-off only far enough to be sure that
11741 * this node isn't ending with a partial one. <upper_parse> is set
11742 * further below (if we need to reparse the node) to include just
11743 * up through that final non-problematic character that this code
11744 * identifies, so when it is set to less than the full node, we can
11745 * skip the rest of this */
11746 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11748 const STRLEN full_len = len;
11750 assert(len >= MAX_NODE_STRING_SIZE);
11752 /* Here, <s> points to the final byte of the final character.
11753 * Look backwards through the string until find a non-
11754 * problematic character */
11758 /* These two have no multi-char folds to non-UTF characters
11760 if (ASCII_FOLD_RESTRICTED || LOC) {
11764 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11768 if (! PL_NonL1NonFinalFold) {
11769 PL_NonL1NonFinalFold = _new_invlist_C_array(
11770 NonL1_Perl_Non_Final_Folds_invlist);
11773 /* Point to the first byte of the final character */
11774 s = (char *) utf8_hop((U8 *) s, -1);
11776 while (s >= s0) { /* Search backwards until find
11777 non-problematic char */
11778 if (UTF8_IS_INVARIANT(*s)) {
11780 /* There are no ascii characters that participate
11781 * in multi-char folds under /aa. In EBCDIC, the
11782 * non-ascii invariants are all control characters,
11783 * so don't ever participate in any folds. */
11784 if (ASCII_FOLD_RESTRICTED
11785 || ! IS_NON_FINAL_FOLD(*s))
11790 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11792 /* No Latin1 characters participate in multi-char
11793 * folds under /l */
11795 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11801 else if (! _invlist_contains_cp(
11802 PL_NonL1NonFinalFold,
11803 valid_utf8_to_uvchr((U8 *) s, NULL)))
11808 /* Here, the current character is problematic in that
11809 * it does occur in the non-final position of some
11810 * fold, so try the character before it, but have to
11811 * special case the very first byte in the string, so
11812 * we don't read outside the string */
11813 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11814 } /* End of loop backwards through the string */
11816 /* If there were only problematic characters in the string,
11817 * <s> will point to before s0, in which case the length
11818 * should be 0, otherwise include the length of the
11819 * non-problematic character just found */
11820 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11823 /* Here, have found the final character, if any, that is
11824 * non-problematic as far as ending the node without splitting
11825 * it across a potential multi-char fold. <len> contains the
11826 * number of bytes in the node up-to and including that
11827 * character, or is 0 if there is no such character, meaning
11828 * the whole node contains only problematic characters. In
11829 * this case, give up and just take the node as-is. We can't
11834 /* If the node ends in an 's' we make sure it stays EXACTF,
11835 * as if it turns into an EXACTFU, it could later get
11836 * joined with another 's' that would then wrongly match
11838 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11840 maybe_exactfu = FALSE;
11844 /* Here, the node does contain some characters that aren't
11845 * problematic. If one such is the final character in the
11846 * node, we are done */
11847 if (len == full_len) {
11850 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11852 /* If the final character is problematic, but the
11853 * penultimate is not, back-off that last character to
11854 * later start a new node with it */
11859 /* Here, the final non-problematic character is earlier
11860 * in the input than the penultimate character. What we do
11861 * is reparse from the beginning, going up only as far as
11862 * this final ok one, thus guaranteeing that the node ends
11863 * in an acceptable character. The reason we reparse is
11864 * that we know how far in the character is, but we don't
11865 * know how to correlate its position with the input parse.
11866 * An alternate implementation would be to build that
11867 * correlation as we go along during the original parse,
11868 * but that would entail extra work for every node, whereas
11869 * this code gets executed only when the string is too
11870 * large for the node, and the final two characters are
11871 * problematic, an infrequent occurrence. Yet another
11872 * possible strategy would be to save the tail of the
11873 * string, and the next time regatom is called, initialize
11874 * with that. The problem with this is that unless you
11875 * back off one more character, you won't be guaranteed
11876 * regatom will get called again, unless regbranch,
11877 * regpiece ... are also changed. If you do back off that
11878 * extra character, so that there is input guaranteed to
11879 * force calling regatom, you can't handle the case where
11880 * just the first character in the node is acceptable. I
11881 * (khw) decided to try this method which doesn't have that
11882 * pitfall; if performance issues are found, we can do a
11883 * combination of the current approach plus that one */
11889 } /* End of verifying node ends with an appropriate char */
11891 loopdone: /* Jumped to when encounters something that shouldn't be in
11894 /* I (khw) don't know if you can get here with zero length, but the
11895 * old code handled this situation by creating a zero-length EXACT
11896 * node. Might as well be NOTHING instead */
11902 /* If 'maybe_exact' is still set here, means there are no
11903 * code points in the node that participate in folds;
11904 * similarly for 'maybe_exactfu' and code points that match
11905 * differently depending on UTF8ness of the target string
11910 else if (maybe_exactfu) {
11914 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11917 RExC_parse = p - 1;
11918 Set_Node_Cur_Length(ret, parse_start);
11919 nextchar(pRExC_state);
11921 /* len is STRLEN which is unsigned, need to copy to signed */
11924 vFAIL("Internal disaster");
11927 } /* End of label 'defchar:' */
11929 } /* End of giant switch on input character */
11935 S_regwhite( RExC_state_t *pRExC_state, char *p )
11937 const char *e = RExC_end;
11939 PERL_ARGS_ASSERT_REGWHITE;
11944 else if (*p == '#') {
11947 if (*p++ == '\n') {
11953 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11962 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11964 /* Returns the next non-pattern-white space, non-comment character (the
11965 * latter only if 'recognize_comment is true) in the string p, which is
11966 * ended by RExC_end. If there is no line break ending a comment,
11967 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11968 const char *e = RExC_end;
11970 PERL_ARGS_ASSERT_REGPATWS;
11974 if ((len = is_PATWS_safe(p, e, UTF))) {
11977 else if (recognize_comment && *p == '#') {
11981 if (is_LNBREAK_safe(p, e, UTF)) {
11987 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11996 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
11998 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
11999 * sets up the bitmap and any flags, removing those code points from the
12000 * inversion list, setting it to NULL should it become completely empty */
12002 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12003 assert(PL_regkind[OP(node)] == ANYOF);
12005 ANYOF_BITMAP_ZERO(node);
12006 if (*invlist_ptr) {
12008 /* This gets set if we actually need to modify things */
12009 bool change_invlist = FALSE;
12013 /* Start looking through *invlist_ptr */
12014 invlist_iterinit(*invlist_ptr);
12015 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12019 if (end == UV_MAX && start <= 256) {
12020 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12023 /* Quit if are above what we should change */
12028 change_invlist = TRUE;
12030 /* Set all the bits in the range, up to the max that we are doing */
12031 high = (end < 255) ? end : 255;
12032 for (i = start; i <= (int) high; i++) {
12033 if (! ANYOF_BITMAP_TEST(node, i)) {
12034 ANYOF_BITMAP_SET(node, i);
12038 invlist_iterfinish(*invlist_ptr);
12040 /* Done with loop; remove any code points that are in the bitmap from
12041 * *invlist_ptr; similarly for code points above latin1 if we have a flag
12042 * to match all of them anyways */
12043 if (change_invlist) {
12044 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12046 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12047 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12050 /* If have completely emptied it, remove it completely */
12051 if (_invlist_len(*invlist_ptr) == 0) {
12052 SvREFCNT_dec_NN(*invlist_ptr);
12053 *invlist_ptr = NULL;
12058 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12059 Character classes ([:foo:]) can also be negated ([:^foo:]).
12060 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12061 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12062 but trigger failures because they are currently unimplemented. */
12064 #define POSIXCC_DONE(c) ((c) == ':')
12065 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12066 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12068 PERL_STATIC_INLINE I32
12069 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12072 I32 namedclass = OOB_NAMEDCLASS;
12074 PERL_ARGS_ASSERT_REGPPOSIXCC;
12076 if (value == '[' && RExC_parse + 1 < RExC_end &&
12077 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12078 POSIXCC(UCHARAT(RExC_parse)))
12080 const char c = UCHARAT(RExC_parse);
12081 char* const s = RExC_parse++;
12083 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12085 if (RExC_parse == RExC_end) {
12088 /* Try to give a better location for the error (than the end of
12089 * the string) by looking for the matching ']' */
12091 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12094 vFAIL2("Unmatched '%c' in POSIX class", c);
12096 /* Grandfather lone [:, [=, [. */
12100 const char* const t = RExC_parse++; /* skip over the c */
12103 if (UCHARAT(RExC_parse) == ']') {
12104 const char *posixcc = s + 1;
12105 RExC_parse++; /* skip over the ending ] */
12108 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12109 const I32 skip = t - posixcc;
12111 /* Initially switch on the length of the name. */
12114 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12115 this is the Perl \w
12117 namedclass = ANYOF_WORDCHAR;
12120 /* Names all of length 5. */
12121 /* alnum alpha ascii blank cntrl digit graph lower
12122 print punct space upper */
12123 /* Offset 4 gives the best switch position. */
12124 switch (posixcc[4]) {
12126 if (memEQ(posixcc, "alph", 4)) /* alpha */
12127 namedclass = ANYOF_ALPHA;
12130 if (memEQ(posixcc, "spac", 4)) /* space */
12131 namedclass = ANYOF_PSXSPC;
12134 if (memEQ(posixcc, "grap", 4)) /* graph */
12135 namedclass = ANYOF_GRAPH;
12138 if (memEQ(posixcc, "asci", 4)) /* ascii */
12139 namedclass = ANYOF_ASCII;
12142 if (memEQ(posixcc, "blan", 4)) /* blank */
12143 namedclass = ANYOF_BLANK;
12146 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12147 namedclass = ANYOF_CNTRL;
12150 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12151 namedclass = ANYOF_ALPHANUMERIC;
12154 if (memEQ(posixcc, "lowe", 4)) /* lower */
12155 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12156 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12157 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12160 if (memEQ(posixcc, "digi", 4)) /* digit */
12161 namedclass = ANYOF_DIGIT;
12162 else if (memEQ(posixcc, "prin", 4)) /* print */
12163 namedclass = ANYOF_PRINT;
12164 else if (memEQ(posixcc, "punc", 4)) /* punct */
12165 namedclass = ANYOF_PUNCT;
12170 if (memEQ(posixcc, "xdigit", 6))
12171 namedclass = ANYOF_XDIGIT;
12175 if (namedclass == OOB_NAMEDCLASS)
12177 "POSIX class [:%"UTF8f":] unknown",
12178 UTF8fARG(UTF, t - s - 1, s + 1));
12180 /* The #defines are structured so each complement is +1 to
12181 * the normal one */
12185 assert (posixcc[skip] == ':');
12186 assert (posixcc[skip+1] == ']');
12187 } else if (!SIZE_ONLY) {
12188 /* [[=foo=]] and [[.foo.]] are still future. */
12190 /* adjust RExC_parse so the warning shows after
12191 the class closes */
12192 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12194 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12197 /* Maternal grandfather:
12198 * "[:" ending in ":" but not in ":]" */
12200 vFAIL("Unmatched '[' in POSIX class");
12203 /* Grandfather lone [:, [=, [. */
12213 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12215 /* This applies some heuristics at the current parse position (which should
12216 * be at a '[') to see if what follows might be intended to be a [:posix:]
12217 * class. It returns true if it really is a posix class, of course, but it
12218 * also can return true if it thinks that what was intended was a posix
12219 * class that didn't quite make it.
12221 * It will return true for
12223 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12224 * ')' indicating the end of the (?[
12225 * [:any garbage including %^&$ punctuation:]
12227 * This is designed to be called only from S_handle_regex_sets; it could be
12228 * easily adapted to be called from the spot at the beginning of regclass()
12229 * that checks to see in a normal bracketed class if the surrounding []
12230 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12231 * change long-standing behavior, so I (khw) didn't do that */
12232 char* p = RExC_parse + 1;
12233 char first_char = *p;
12235 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12237 assert(*(p - 1) == '[');
12239 if (! POSIXCC(first_char)) {
12244 while (p < RExC_end && isWORDCHAR(*p)) p++;
12246 if (p >= RExC_end) {
12250 if (p - RExC_parse > 2 /* Got at least 1 word character */
12251 && (*p == first_char
12252 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12257 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12260 && p - RExC_parse > 2 /* [:] evaluates to colon;
12261 [::] is a bad posix class. */
12262 && first_char == *(p - 1));
12266 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12267 char * const oregcomp_parse)
12269 /* Handle the (?[...]) construct to do set operations */
12272 UV start, end; /* End points of code point ranges */
12274 char *save_end, *save_parse;
12279 const bool save_fold = FOLD;
12281 GET_RE_DEBUG_FLAGS_DECL;
12283 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12286 vFAIL("(?[...]) not valid in locale");
12288 RExC_uni_semantics = 1;
12290 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12291 * (such as EXACT). Thus we can skip most everything if just sizing. We
12292 * call regclass to handle '[]' so as to not have to reinvent its parsing
12293 * rules here (throwing away the size it computes each time). And, we exit
12294 * upon an unescaped ']' that isn't one ending a regclass. To do both
12295 * these things, we need to realize that something preceded by a backslash
12296 * is escaped, so we have to keep track of backslashes */
12298 UV depth = 0; /* how many nested (?[...]) constructs */
12300 Perl_ck_warner_d(aTHX_
12301 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12302 "The regex_sets feature is experimental" REPORT_LOCATION,
12303 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12304 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12306 while (RExC_parse < RExC_end) {
12307 SV* current = NULL;
12308 RExC_parse = regpatws(pRExC_state, RExC_parse,
12309 TRUE); /* means recognize comments */
12310 switch (*RExC_parse) {
12312 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12317 /* Skip the next byte (which could cause us to end up in
12318 * the middle of a UTF-8 character, but since none of those
12319 * are confusable with anything we currently handle in this
12320 * switch (invariants all), it's safe. We'll just hit the
12321 * default: case next time and keep on incrementing until
12322 * we find one of the invariants we do handle. */
12327 /* If this looks like it is a [:posix:] class, leave the
12328 * parse pointer at the '[' to fool regclass() into
12329 * thinking it is part of a '[[:posix:]]'. That function
12330 * will use strict checking to force a syntax error if it
12331 * doesn't work out to a legitimate class */
12332 bool is_posix_class
12333 = could_it_be_a_POSIX_class(pRExC_state);
12334 if (! is_posix_class) {
12338 /* regclass() can only return RESTART_UTF8 if multi-char
12339 folds are allowed. */
12340 if (!regclass(pRExC_state, flagp,depth+1,
12341 is_posix_class, /* parse the whole char
12342 class only if not a
12344 FALSE, /* don't allow multi-char folds */
12345 TRUE, /* silence non-portable warnings. */
12347 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12350 /* function call leaves parse pointing to the ']', except
12351 * if we faked it */
12352 if (is_posix_class) {
12356 SvREFCNT_dec(current); /* In case it returned something */
12361 if (depth--) break;
12363 if (RExC_parse < RExC_end
12364 && *RExC_parse == ')')
12366 node = reganode(pRExC_state, ANYOF, 0);
12367 RExC_size += ANYOF_SKIP;
12368 nextchar(pRExC_state);
12369 Set_Node_Length(node,
12370 RExC_parse - oregcomp_parse + 1); /* MJD */
12379 FAIL("Syntax error in (?[...])");
12382 /* Pass 2 only after this. Everything in this construct is a
12383 * metacharacter. Operands begin with either a '\' (for an escape
12384 * sequence), or a '[' for a bracketed character class. Any other
12385 * character should be an operator, or parenthesis for grouping. Both
12386 * types of operands are handled by calling regclass() to parse them. It
12387 * is called with a parameter to indicate to return the computed inversion
12388 * list. The parsing here is implemented via a stack. Each entry on the
12389 * stack is a single character representing one of the operators, or the
12390 * '('; or else a pointer to an operand inversion list. */
12392 #define IS_OPERAND(a) (! SvIOK(a))
12394 /* The stack starts empty. It is a syntax error if the first thing parsed
12395 * is a binary operator; everything else is pushed on the stack. When an
12396 * operand is parsed, the top of the stack is examined. If it is a binary
12397 * operator, the item before it should be an operand, and both are replaced
12398 * by the result of doing that operation on the new operand and the one on
12399 * the stack. Thus a sequence of binary operands is reduced to a single
12400 * one before the next one is parsed.
12402 * A unary operator may immediately follow a binary in the input, for
12405 * When an operand is parsed and the top of the stack is a unary operator,
12406 * the operation is performed, and then the stack is rechecked to see if
12407 * this new operand is part of a binary operation; if so, it is handled as
12410 * A '(' is simply pushed on the stack; it is valid only if the stack is
12411 * empty, or the top element of the stack is an operator or another '('
12412 * (for which the parenthesized expression will become an operand). By the
12413 * time the corresponding ')' is parsed everything in between should have
12414 * been parsed and evaluated to a single operand (or else is a syntax
12415 * error), and is handled as a regular operand */
12417 sv_2mortal((SV *)(stack = newAV()));
12419 while (RExC_parse < RExC_end) {
12420 I32 top_index = av_tindex(stack);
12422 SV* current = NULL;
12424 /* Skip white space */
12425 RExC_parse = regpatws(pRExC_state, RExC_parse,
12426 TRUE); /* means recognize comments */
12427 if (RExC_parse >= RExC_end) {
12428 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12430 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12437 if (av_tindex(stack) >= 0 /* This makes sure that we can
12438 safely subtract 1 from
12439 RExC_parse in the next clause.
12440 If we have something on the
12441 stack, we have parsed something
12443 && UCHARAT(RExC_parse - 1) == '('
12444 && RExC_parse < RExC_end)
12446 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12447 * This happens when we have some thing like
12449 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12451 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12453 * Here we would be handling the interpolated
12454 * '$thai_or_lao'. We handle this by a recursive call to
12455 * ourselves which returns the inversion list the
12456 * interpolated expression evaluates to. We use the flags
12457 * from the interpolated pattern. */
12458 U32 save_flags = RExC_flags;
12459 const char * const save_parse = ++RExC_parse;
12461 parse_lparen_question_flags(pRExC_state);
12463 if (RExC_parse == save_parse /* Makes sure there was at
12464 least one flag (or this
12465 embedding wasn't compiled)
12467 || RExC_parse >= RExC_end - 4
12468 || UCHARAT(RExC_parse) != ':'
12469 || UCHARAT(++RExC_parse) != '('
12470 || UCHARAT(++RExC_parse) != '?'
12471 || UCHARAT(++RExC_parse) != '[')
12474 /* In combination with the above, this moves the
12475 * pointer to the point just after the first erroneous
12476 * character (or if there are no flags, to where they
12477 * should have been) */
12478 if (RExC_parse >= RExC_end - 4) {
12479 RExC_parse = RExC_end;
12481 else if (RExC_parse != save_parse) {
12482 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12484 vFAIL("Expecting '(?flags:(?[...'");
12487 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12488 depth+1, oregcomp_parse);
12490 /* Here, 'current' contains the embedded expression's
12491 * inversion list, and RExC_parse points to the trailing
12492 * ']'; the next character should be the ')' which will be
12493 * paired with the '(' that has been put on the stack, so
12494 * the whole embedded expression reduces to '(operand)' */
12497 RExC_flags = save_flags;
12498 goto handle_operand;
12503 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12504 vFAIL("Unexpected character");
12507 /* regclass() can only return RESTART_UTF8 if multi-char
12508 folds are allowed. */
12509 if (!regclass(pRExC_state, flagp,depth+1,
12510 TRUE, /* means parse just the next thing */
12511 FALSE, /* don't allow multi-char folds */
12512 FALSE, /* don't silence non-portable warnings. */
12514 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12516 /* regclass() will return with parsing just the \ sequence,
12517 * leaving the parse pointer at the next thing to parse */
12519 goto handle_operand;
12521 case '[': /* Is a bracketed character class */
12523 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12525 if (! is_posix_class) {
12529 /* regclass() can only return RESTART_UTF8 if multi-char
12530 folds are allowed. */
12531 if(!regclass(pRExC_state, flagp,depth+1,
12532 is_posix_class, /* parse the whole char class
12533 only if not a posix class */
12534 FALSE, /* don't allow multi-char folds */
12535 FALSE, /* don't silence non-portable warnings. */
12537 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12539 /* function call leaves parse pointing to the ']', except if we
12541 if (is_posix_class) {
12545 goto handle_operand;
12554 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12555 || ! IS_OPERAND(*top_ptr))
12558 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12560 av_push(stack, newSVuv(curchar));
12564 av_push(stack, newSVuv(curchar));
12568 if (top_index >= 0) {
12569 top_ptr = av_fetch(stack, top_index, FALSE);
12571 if (IS_OPERAND(*top_ptr)) {
12573 vFAIL("Unexpected '(' with no preceding operator");
12576 av_push(stack, newSVuv(curchar));
12583 || ! (current = av_pop(stack))
12584 || ! IS_OPERAND(current)
12585 || ! (lparen = av_pop(stack))
12586 || IS_OPERAND(lparen)
12587 || SvUV(lparen) != '(')
12589 SvREFCNT_dec(current);
12591 vFAIL("Unexpected ')'");
12594 SvREFCNT_dec_NN(lparen);
12601 /* Here, we have an operand to process, in 'current' */
12603 if (top_index < 0) { /* Just push if stack is empty */
12604 av_push(stack, current);
12607 SV* top = av_pop(stack);
12609 char current_operator;
12611 if (IS_OPERAND(top)) {
12612 SvREFCNT_dec_NN(top);
12613 SvREFCNT_dec_NN(current);
12614 vFAIL("Operand with no preceding operator");
12616 current_operator = (char) SvUV(top);
12617 switch (current_operator) {
12618 case '(': /* Push the '(' back on followed by the new
12620 av_push(stack, top);
12621 av_push(stack, current);
12622 SvREFCNT_inc(top); /* Counters the '_dec' done
12623 just after the 'break', so
12624 it doesn't get wrongly freed
12629 _invlist_invert(current);
12631 /* Unlike binary operators, the top of the stack,
12632 * now that this unary one has been popped off, may
12633 * legally be an operator, and we now have operand
12636 SvREFCNT_dec_NN(top);
12637 goto handle_operand;
12640 prev = av_pop(stack);
12641 _invlist_intersection(prev,
12644 av_push(stack, current);
12649 prev = av_pop(stack);
12650 _invlist_union(prev, current, ¤t);
12651 av_push(stack, current);
12655 prev = av_pop(stack);;
12656 _invlist_subtract(prev, current, ¤t);
12657 av_push(stack, current);
12660 case '^': /* The union minus the intersection */
12666 prev = av_pop(stack);
12667 _invlist_union(prev, current, &u);
12668 _invlist_intersection(prev, current, &i);
12669 /* _invlist_subtract will overwrite current
12670 without freeing what it already contains */
12672 _invlist_subtract(u, i, ¤t);
12673 av_push(stack, current);
12674 SvREFCNT_dec_NN(i);
12675 SvREFCNT_dec_NN(u);
12676 SvREFCNT_dec_NN(element);
12681 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12683 SvREFCNT_dec_NN(top);
12684 SvREFCNT_dec(prev);
12688 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12691 if (av_tindex(stack) < 0 /* Was empty */
12692 || ((final = av_pop(stack)) == NULL)
12693 || ! IS_OPERAND(final)
12694 || av_tindex(stack) >= 0) /* More left on stack */
12696 vFAIL("Incomplete expression within '(?[ ])'");
12699 /* Here, 'final' is the resultant inversion list from evaluating the
12700 * expression. Return it if so requested */
12701 if (return_invlist) {
12702 *return_invlist = final;
12706 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12707 * expecting a string of ranges and individual code points */
12708 invlist_iterinit(final);
12709 result_string = newSVpvs("");
12710 while (invlist_iternext(final, &start, &end)) {
12711 if (start == end) {
12712 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12715 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12720 save_parse = RExC_parse;
12721 RExC_parse = SvPV(result_string, len);
12722 save_end = RExC_end;
12723 RExC_end = RExC_parse + len;
12725 /* We turn off folding around the call, as the class we have constructed
12726 * already has all folding taken into consideration, and we don't want
12727 * regclass() to add to that */
12728 RExC_flags &= ~RXf_PMf_FOLD;
12729 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12731 node = regclass(pRExC_state, flagp,depth+1,
12732 FALSE, /* means parse the whole char class */
12733 FALSE, /* don't allow multi-char folds */
12734 TRUE, /* silence non-portable warnings. The above may very
12735 well have generated non-portable code points, but
12736 they're valid on this machine */
12739 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12742 RExC_flags |= RXf_PMf_FOLD;
12744 RExC_parse = save_parse + 1;
12745 RExC_end = save_end;
12746 SvREFCNT_dec_NN(final);
12747 SvREFCNT_dec_NN(result_string);
12749 nextchar(pRExC_state);
12750 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12755 /* The names of properties whose definitions are not known at compile time are
12756 * stored in this SV, after a constant heading. So if the length has been
12757 * changed since initialization, then there is a run-time definition. */
12758 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12761 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12762 const bool stop_at_1, /* Just parse the next thing, don't
12763 look for a full character class */
12764 bool allow_multi_folds,
12765 const bool silence_non_portable, /* Don't output warnings
12768 SV** ret_invlist) /* Return an inversion list, not a node */
12770 /* parse a bracketed class specification. Most of these will produce an
12771 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12772 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12773 * under /i with multi-character folds: it will be rewritten following the
12774 * paradigm of this example, where the <multi-fold>s are characters which
12775 * fold to multiple character sequences:
12776 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12777 * gets effectively rewritten as:
12778 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12779 * reg() gets called (recursively) on the rewritten version, and this
12780 * function will return what it constructs. (Actually the <multi-fold>s
12781 * aren't physically removed from the [abcdefghi], it's just that they are
12782 * ignored in the recursion by means of a flag:
12783 * <RExC_in_multi_char_class>.)
12785 * ANYOF nodes contain a bit map for the first 256 characters, with the
12786 * corresponding bit set if that character is in the list. For characters
12787 * above 255, a range list or swash is used. There are extra bits for \w,
12788 * etc. in locale ANYOFs, as what these match is not determinable at
12791 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12792 * to be restarted. This can only happen if ret_invlist is non-NULL.
12796 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12798 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12801 IV namedclass = OOB_NAMEDCLASS;
12802 char *rangebegin = NULL;
12803 bool need_class = 0;
12805 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12806 than just initialized. */
12807 SV* properties = NULL; /* Code points that match \p{} \P{} */
12808 SV* posixes = NULL; /* Code points that match classes like, [:word:],
12809 extended beyond the Latin1 range */
12810 UV element_count = 0; /* Number of distinct elements in the class.
12811 Optimizations may be possible if this is tiny */
12812 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12813 character; used under /i */
12815 char * stop_ptr = RExC_end; /* where to stop parsing */
12816 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12818 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12820 /* Unicode properties are stored in a swash; this holds the current one
12821 * being parsed. If this swash is the only above-latin1 component of the
12822 * character class, an optimization is to pass it directly on to the
12823 * execution engine. Otherwise, it is set to NULL to indicate that there
12824 * are other things in the class that have to be dealt with at execution
12826 SV* swash = NULL; /* Code points that match \p{} \P{} */
12828 /* Set if a component of this character class is user-defined; just passed
12829 * on to the engine */
12830 bool has_user_defined_property = FALSE;
12832 /* inversion list of code points this node matches only when the target
12833 * string is in UTF-8. (Because is under /d) */
12834 SV* depends_list = NULL;
12836 /* inversion list of code points this node matches. For much of the
12837 * function, it includes only those that match regardless of the utf8ness
12838 * of the target string */
12839 SV* cp_list = NULL;
12842 /* In a range, counts how many 0-2 of the ends of it came from literals,
12843 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12844 UV literal_endpoint = 0;
12846 bool invert = FALSE; /* Is this class to be complemented */
12848 /* Is there any thing like \W or [:^digit:] that matches above the legal
12849 * Unicode range? */
12850 bool runtime_posix_matches_above_Unicode = FALSE;
12852 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12853 case we need to change the emitted regop to an EXACT. */
12854 const char * orig_parse = RExC_parse;
12855 const SSize_t orig_size = RExC_size;
12856 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12857 GET_RE_DEBUG_FLAGS_DECL;
12859 PERL_ARGS_ASSERT_REGCLASS;
12861 PERL_UNUSED_ARG(depth);
12864 DEBUG_PARSE("clas");
12866 /* Assume we are going to generate an ANYOF node. */
12867 ret = reganode(pRExC_state, ANYOF, 0);
12870 RExC_size += ANYOF_SKIP;
12871 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12874 ANYOF_FLAGS(ret) = 0;
12876 RExC_emit += ANYOF_SKIP;
12878 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12880 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12881 initial_listsv_len = SvCUR(listsv);
12882 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12886 RExC_parse = regpatws(pRExC_state, RExC_parse,
12887 FALSE /* means don't recognize comments */);
12890 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12893 allow_multi_folds = FALSE;
12896 RExC_parse = regpatws(pRExC_state, RExC_parse,
12897 FALSE /* means don't recognize comments */);
12901 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12902 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12903 const char *s = RExC_parse;
12904 const char c = *s++;
12906 while (isWORDCHAR(*s))
12908 if (*s && c == *s && s[1] == ']') {
12909 SAVEFREESV(RExC_rx_sv);
12911 "POSIX syntax [%c %c] belongs inside character classes",
12913 (void)ReREFCNT_inc(RExC_rx_sv);
12917 /* If the caller wants us to just parse a single element, accomplish this
12918 * by faking the loop ending condition */
12919 if (stop_at_1 && RExC_end > RExC_parse) {
12920 stop_ptr = RExC_parse + 1;
12923 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12924 if (UCHARAT(RExC_parse) == ']')
12925 goto charclassloop;
12929 if (RExC_parse >= stop_ptr) {
12934 RExC_parse = regpatws(pRExC_state, RExC_parse,
12935 FALSE /* means don't recognize comments */);
12938 if (UCHARAT(RExC_parse) == ']') {
12944 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12945 save_value = value;
12946 save_prevvalue = prevvalue;
12949 rangebegin = RExC_parse;
12953 value = utf8n_to_uvchr((U8*)RExC_parse,
12954 RExC_end - RExC_parse,
12955 &numlen, UTF8_ALLOW_DEFAULT);
12956 RExC_parse += numlen;
12959 value = UCHARAT(RExC_parse++);
12962 && RExC_parse < RExC_end
12963 && POSIXCC(UCHARAT(RExC_parse)))
12965 namedclass = regpposixcc(pRExC_state, value, strict);
12967 else if (value == '\\') {
12969 value = utf8n_to_uvchr((U8*)RExC_parse,
12970 RExC_end - RExC_parse,
12971 &numlen, UTF8_ALLOW_DEFAULT);
12972 RExC_parse += numlen;
12975 value = UCHARAT(RExC_parse++);
12977 /* Some compilers cannot handle switching on 64-bit integer
12978 * values, therefore value cannot be an UV. Yes, this will
12979 * be a problem later if we want switch on Unicode.
12980 * A similar issue a little bit later when switching on
12981 * namedclass. --jhi */
12983 /* If the \ is escaping white space when white space is being
12984 * skipped, it means that that white space is wanted literally, and
12985 * is already in 'value'. Otherwise, need to translate the escape
12986 * into what it signifies. */
12987 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12989 case 'w': namedclass = ANYOF_WORDCHAR; break;
12990 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12991 case 's': namedclass = ANYOF_SPACE; break;
12992 case 'S': namedclass = ANYOF_NSPACE; break;
12993 case 'd': namedclass = ANYOF_DIGIT; break;
12994 case 'D': namedclass = ANYOF_NDIGIT; break;
12995 case 'v': namedclass = ANYOF_VERTWS; break;
12996 case 'V': namedclass = ANYOF_NVERTWS; break;
12997 case 'h': namedclass = ANYOF_HORIZWS; break;
12998 case 'H': namedclass = ANYOF_NHORIZWS; break;
12999 case 'N': /* Handle \N{NAME} in class */
13001 /* We only pay attention to the first char of
13002 multichar strings being returned. I kinda wonder
13003 if this makes sense as it does change the behaviour
13004 from earlier versions, OTOH that behaviour was broken
13006 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13007 TRUE, /* => charclass */
13010 if (*flagp & RESTART_UTF8)
13011 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13021 /* We will handle any undefined properties ourselves */
13022 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13023 /* And we actually would prefer to get
13024 * the straight inversion list of the
13025 * swash, since we will be accessing it
13026 * anyway, to save a little time */
13027 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13029 if (RExC_parse >= RExC_end)
13030 vFAIL2("Empty \\%c{}", (U8)value);
13031 if (*RExC_parse == '{') {
13032 const U8 c = (U8)value;
13033 e = strchr(RExC_parse++, '}');
13035 vFAIL2("Missing right brace on \\%c{}", c);
13036 while (isSPACE(UCHARAT(RExC_parse)))
13038 if (e == RExC_parse)
13039 vFAIL2("Empty \\%c{}", c);
13040 n = e - RExC_parse;
13041 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13053 if (UCHARAT(RExC_parse) == '^') {
13056 /* toggle. (The rhs xor gets the single bit that
13057 * differs between P and p; the other xor inverts just
13059 value ^= 'P' ^ 'p';
13061 while (isSPACE(UCHARAT(RExC_parse))) {
13066 /* Try to get the definition of the property into
13067 * <invlist>. If /i is in effect, the effective property
13068 * will have its name be <__NAME_i>. The design is
13069 * discussed in commit
13070 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13071 formatted = Perl_form(aTHX_
13073 (FOLD) ? "__" : "",
13078 name = savepvn(formatted, strlen(formatted));
13080 /* Look up the property name, and get its swash and
13081 * inversion list, if the property is found */
13083 SvREFCNT_dec_NN(swash);
13085 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13088 NULL, /* No inversion list */
13091 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13093 SvREFCNT_dec_NN(swash);
13097 /* Here didn't find it. It could be a user-defined
13098 * property that will be available at run-time. If we
13099 * accept only compile-time properties, is an error;
13100 * otherwise add it to the list for run-time look up */
13102 RExC_parse = e + 1;
13104 "Property '%"UTF8f"' is unknown",
13105 UTF8fARG(UTF, n, name));
13107 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13108 (value == 'p' ? '+' : '!'),
13109 UTF8fARG(UTF, n, name));
13110 has_user_defined_property = TRUE;
13112 /* We don't know yet, so have to assume that the
13113 * property could match something in the Latin1 range,
13114 * hence something that isn't utf8. Note that this
13115 * would cause things in <depends_list> to match
13116 * inappropriately, except that any \p{}, including
13117 * this one forces Unicode semantics, which means there
13118 * is <no depends_list> */
13119 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13123 /* Here, did get the swash and its inversion list. If
13124 * the swash is from a user-defined property, then this
13125 * whole character class should be regarded as such */
13126 has_user_defined_property =
13128 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
13130 /* Invert if asking for the complement */
13131 if (value == 'P') {
13132 _invlist_union_complement_2nd(properties,
13136 /* The swash can't be used as-is, because we've
13137 * inverted things; delay removing it to here after
13138 * have copied its invlist above */
13139 SvREFCNT_dec_NN(swash);
13143 _invlist_union(properties, invlist, &properties);
13148 RExC_parse = e + 1;
13149 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13152 /* \p means they want Unicode semantics */
13153 RExC_uni_semantics = 1;
13156 case 'n': value = '\n'; break;
13157 case 'r': value = '\r'; break;
13158 case 't': value = '\t'; break;
13159 case 'f': value = '\f'; break;
13160 case 'b': value = '\b'; break;
13161 case 'e': value = ASCII_TO_NATIVE('\033');break;
13162 case 'a': value = '\a'; break;
13164 RExC_parse--; /* function expects to be pointed at the 'o' */
13166 const char* error_msg;
13167 bool valid = grok_bslash_o(&RExC_parse,
13170 SIZE_ONLY, /* warnings in pass
13173 silence_non_portable,
13179 if (PL_encoding && value < 0x100) {
13180 goto recode_encoding;
13184 RExC_parse--; /* function expects to be pointed at the 'x' */
13186 const char* error_msg;
13187 bool valid = grok_bslash_x(&RExC_parse,
13190 TRUE, /* Output warnings */
13192 silence_non_portable,
13198 if (PL_encoding && value < 0x100)
13199 goto recode_encoding;
13202 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13204 case '0': case '1': case '2': case '3': case '4':
13205 case '5': case '6': case '7':
13207 /* Take 1-3 octal digits */
13208 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13209 numlen = (strict) ? 4 : 3;
13210 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13211 RExC_parse += numlen;
13214 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13215 vFAIL("Need exactly 3 octal digits");
13217 else if (! SIZE_ONLY /* like \08, \178 */
13219 && RExC_parse < RExC_end
13220 && isDIGIT(*RExC_parse)
13221 && ckWARN(WARN_REGEXP))
13223 SAVEFREESV(RExC_rx_sv);
13224 reg_warn_non_literal_string(
13226 form_short_octal_warning(RExC_parse, numlen));
13227 (void)ReREFCNT_inc(RExC_rx_sv);
13230 if (PL_encoding && value < 0x100)
13231 goto recode_encoding;
13235 if (! RExC_override_recoding) {
13236 SV* enc = PL_encoding;
13237 value = reg_recode((const char)(U8)value, &enc);
13240 vFAIL("Invalid escape in the specified encoding");
13242 else if (SIZE_ONLY) {
13243 ckWARNreg(RExC_parse,
13244 "Invalid escape in the specified encoding");
13250 /* Allow \_ to not give an error */
13251 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13253 vFAIL2("Unrecognized escape \\%c in character class",
13257 SAVEFREESV(RExC_rx_sv);
13258 ckWARN2reg(RExC_parse,
13259 "Unrecognized escape \\%c in character class passed through",
13261 (void)ReREFCNT_inc(RExC_rx_sv);
13265 } /* End of switch on char following backslash */
13266 } /* end of handling backslash escape sequences */
13269 literal_endpoint++;
13272 /* Here, we have the current token in 'value' */
13274 /* What matches in a locale is not known until runtime. This includes
13275 * what the Posix classes (like \w, [:space:]) match. Room must be
13276 * reserved (one time per outer bracketed class) to store such classes,
13277 * either if Perl is compiled so that locale nodes always should have
13278 * this space, or if there is such posix class info to be stored. The
13279 * space will contain a bit for each named class that is to be matched
13280 * against. This isn't needed for \p{} and pseudo-classes, as they are
13281 * not affected by locale, and hence are dealt with separately */
13284 && (ANYOF_LOCALE == ANYOF_POSIXL
13285 || (namedclass > OOB_NAMEDCLASS
13286 && namedclass < ANYOF_POSIXL_MAX)))
13290 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13293 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13295 ANYOF_POSIXL_ZERO(ret);
13296 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13299 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13302 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13303 * literal, as is the character that began the false range, i.e.
13304 * the 'a' in the examples */
13307 const int w = (RExC_parse >= rangebegin)
13308 ? RExC_parse - rangebegin
13312 "False [] range \"%"UTF8f"\"",
13313 UTF8fARG(UTF, w, rangebegin));
13316 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13317 ckWARN2reg(RExC_parse,
13318 "False [] range \"%"UTF8f"\"",
13319 UTF8fARG(UTF, w, rangebegin));
13320 (void)ReREFCNT_inc(RExC_rx_sv);
13321 cp_list = add_cp_to_invlist(cp_list, '-');
13322 cp_list = add_cp_to_invlist(cp_list, prevvalue);
13326 range = 0; /* this was not a true range */
13327 element_count += 2; /* So counts for three values */
13330 classnum = namedclass_to_classnum(namedclass);
13332 if (LOC && namedclass < ANYOF_POSIXL_MAX
13333 #ifndef HAS_ISASCII
13334 && classnum != _CC_ASCII
13336 #ifndef HAS_ISBLANK
13337 && classnum != _CC_BLANK
13340 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13341 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13345 posixl_matches_all = TRUE;
13348 ANYOF_POSIXL_SET(ret, namedclass);
13350 /* XXX After have made all the posix classes known at compile time
13351 * we can move the LOC handling below to above */
13354 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13355 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13357 /* Here, should be \h, \H, \v, or \V. Neither /d nor
13358 * /l make a difference in what these match. There
13359 * would be problems if these characters had folds
13360 * other than themselves, as cp_list is subject to
13362 if (classnum != _CC_VERTSPACE) {
13363 assert( namedclass == ANYOF_HORIZWS
13364 || namedclass == ANYOF_NHORIZWS);
13366 /* It turns out that \h is just a synonym for
13368 classnum = _CC_BLANK;
13371 _invlist_union_maybe_complement_2nd(
13373 PL_XPosix_ptrs[classnum],
13374 cBOOL(namedclass % 2), /* Complement if odd
13375 (NHORIZWS, NVERTWS)
13380 else if (classnum == _CC_ASCII) {
13383 ANYOF_POSIXL_SET(ret, namedclass);
13386 #endif /* Not isascii(); just use the hard-coded definition for it */
13387 _invlist_union_maybe_complement_2nd(
13389 PL_Posix_ptrs[_CC_ASCII],
13390 cBOOL(namedclass % 2), /* Complement if odd
13394 else { /* Garden variety class */
13396 /* The ascii range inversion list */
13397 SV* ascii_source = PL_Posix_ptrs[classnum];
13399 /* The full Latin1 range inversion list */
13400 SV* l1_source = PL_L1Posix_ptrs[classnum];
13402 /* This code is structured into two major clauses. The
13403 * first is for classes whose complete definitions may not
13404 * already be known. If not, the Latin1 definition
13405 * (guaranteed to already known) is used plus code is
13406 * generated to load the rest at run-time (only if needed).
13407 * If the complete definition is known, it drops down to
13408 * the second clause, where the complete definition is
13411 if (classnum < _FIRST_NON_SWASH_CC) {
13413 /* Here, the class has a swash, which may or not
13414 * already be loaded */
13416 /* The name of the property to use to match the full
13417 * eXtended Unicode range swash for this character
13419 const char *Xname = swash_property_names[classnum];
13421 /* If returning the inversion list, we can't defer
13422 * getting this until runtime */
13423 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
13424 PL_utf8_swash_ptrs[classnum] =
13425 _core_swash_init("utf8", Xname, &PL_sv_undef,
13428 NULL, /* No inversion list */
13429 NULL /* No flags */
13431 assert(PL_utf8_swash_ptrs[classnum]);
13433 if ( ! PL_utf8_swash_ptrs[classnum]) {
13434 if (namedclass % 2 == 0) { /* A non-complemented
13436 /* If not /a matching, there are code points we
13437 * don't know at compile time. Arrange for the
13438 * unknown matches to be loaded at run-time, if
13440 if (! AT_LEAST_ASCII_RESTRICTED) {
13441 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13444 if (LOC) { /* Under locale, set run-time
13446 ANYOF_POSIXL_SET(ret, namedclass);
13449 /* Add the current class's code points to
13450 * the running total */
13451 _invlist_union(posixes,
13452 (AT_LEAST_ASCII_RESTRICTED)
13458 else { /* A complemented class */
13459 if (AT_LEAST_ASCII_RESTRICTED) {
13460 /* Under /a should match everything above
13461 * ASCII, plus the complement of the set's
13463 _invlist_union_complement_2nd(posixes,
13468 /* Arrange for the unknown matches to be
13469 * loaded at run-time, if needed */
13470 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13472 runtime_posix_matches_above_Unicode = TRUE;
13474 ANYOF_POSIXL_SET(ret, namedclass);
13478 /* We want to match everything in
13479 * Latin1, except those things that
13480 * l1_source matches */
13481 SV* scratch_list = NULL;
13482 _invlist_subtract(PL_Latin1, l1_source,
13485 /* Add the list from this class to the
13488 posixes = scratch_list;
13491 _invlist_union(posixes,
13494 SvREFCNT_dec_NN(scratch_list);
13496 if (DEPENDS_SEMANTICS) {
13498 |= ANYOF_NON_UTF8_LATIN1_ALL;
13503 goto namedclass_done;
13506 /* Here, there is a swash loaded for the class. If no
13507 * inversion list for it yet, get it */
13508 if (! PL_XPosix_ptrs[classnum]) {
13509 PL_XPosix_ptrs[classnum]
13510 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13514 /* Here there is an inversion list already loaded for the
13517 if (namedclass % 2 == 0) { /* A non-complemented class,
13518 like ANYOF_PUNCT */
13520 /* For non-locale, just add it to any existing list
13522 _invlist_union(posixes,
13523 (AT_LEAST_ASCII_RESTRICTED)
13525 : PL_XPosix_ptrs[classnum],
13528 else { /* Locale */
13529 SV* scratch_list = NULL;
13531 /* For above Latin1 code points, we use the full
13533 _invlist_intersection(PL_AboveLatin1,
13534 PL_XPosix_ptrs[classnum],
13536 /* And set the output to it, adding instead if
13537 * there already is an output. Checking if
13538 * 'posixes' is NULL first saves an extra clone.
13539 * Its reference count will be decremented at the
13540 * next union, etc, or if this is the only
13541 * instance, at the end of the routine */
13543 posixes = scratch_list;
13546 _invlist_union(posixes, scratch_list, &posixes);
13547 SvREFCNT_dec_NN(scratch_list);
13550 #ifndef HAS_ISBLANK
13551 if (namedclass != ANYOF_BLANK) {
13553 /* Set this class in the node for runtime
13555 ANYOF_POSIXL_SET(ret, namedclass);
13556 #ifndef HAS_ISBLANK
13559 /* No isblank(), use the hard-coded ASCII-range
13560 * blanks, adding them to the running total. */
13562 _invlist_union(posixes, ascii_source, &posixes);
13567 else { /* A complemented class, like ANYOF_NPUNCT */
13569 _invlist_union_complement_2nd(
13571 (AT_LEAST_ASCII_RESTRICTED)
13573 : PL_XPosix_ptrs[classnum],
13575 /* Under /d, everything in the upper half of the
13576 * Latin1 range matches this complement */
13577 if (DEPENDS_SEMANTICS) {
13578 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13581 else { /* Locale */
13582 SV* scratch_list = NULL;
13583 _invlist_subtract(PL_AboveLatin1,
13584 PL_XPosix_ptrs[classnum],
13587 posixes = scratch_list;
13590 _invlist_union(posixes, scratch_list, &posixes);
13591 SvREFCNT_dec_NN(scratch_list);
13593 #ifndef HAS_ISBLANK
13594 if (namedclass != ANYOF_NBLANK) {
13596 ANYOF_POSIXL_SET(ret, namedclass);
13597 #ifndef HAS_ISBLANK
13600 /* Get the list of all code points in Latin1
13601 * that are not ASCII blanks, and add them to
13602 * the running total */
13603 _invlist_subtract(PL_Latin1, ascii_source,
13605 _invlist_union(posixes, scratch_list, &posixes);
13606 SvREFCNT_dec_NN(scratch_list);
13613 continue; /* Go get next character */
13615 } /* end of namedclass \blah */
13617 /* Here, we have a single value. If 'range' is set, it is the ending
13618 * of a range--check its validity. Later, we will handle each
13619 * individual code point in the range. If 'range' isn't set, this
13620 * could be the beginning of a range, so check for that by looking
13621 * ahead to see if the next real character to be processed is the range
13622 * indicator--the minus sign */
13625 RExC_parse = regpatws(pRExC_state, RExC_parse,
13626 FALSE /* means don't recognize comments */);
13630 if (prevvalue > value) /* b-a */ {
13631 const int w = RExC_parse - rangebegin;
13633 "Invalid [] range \"%"UTF8f"\"",
13634 UTF8fARG(UTF, w, rangebegin));
13635 range = 0; /* not a valid range */
13639 prevvalue = value; /* save the beginning of the potential range */
13640 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13641 && *RExC_parse == '-')
13643 char* next_char_ptr = RExC_parse + 1;
13644 if (skip_white) { /* Get the next real char after the '-' */
13645 next_char_ptr = regpatws(pRExC_state,
13647 FALSE); /* means don't recognize
13651 /* If the '-' is at the end of the class (just before the ']',
13652 * it is a literal minus; otherwise it is a range */
13653 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13654 RExC_parse = next_char_ptr;
13656 /* a bad range like \w-, [:word:]- ? */
13657 if (namedclass > OOB_NAMEDCLASS) {
13658 if (strict || ckWARN(WARN_REGEXP)) {
13660 RExC_parse >= rangebegin ?
13661 RExC_parse - rangebegin : 0;
13663 vFAIL4("False [] range \"%*.*s\"",
13668 "False [] range \"%*.*s\"",
13673 cp_list = add_cp_to_invlist(cp_list, '-');
13677 range = 1; /* yeah, it's a range! */
13678 continue; /* but do it the next time */
13683 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13686 /* non-Latin1 code point implies unicode semantics. Must be set in
13687 * pass1 so is there for the whole of pass 2 */
13689 RExC_uni_semantics = 1;
13692 /* Ready to process either the single value, or the completed range.
13693 * For single-valued non-inverted ranges, we consider the possibility
13694 * of multi-char folds. (We made a conscious decision to not do this
13695 * for the other cases because it can often lead to non-intuitive
13696 * results. For example, you have the peculiar case that:
13697 * "s s" =~ /^[^\xDF]+$/i => Y
13698 * "ss" =~ /^[^\xDF]+$/i => N
13700 * See [perl #89750] */
13701 if (FOLD && allow_multi_folds && value == prevvalue) {
13702 if (value == LATIN_SMALL_LETTER_SHARP_S
13703 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13706 /* Here <value> is indeed a multi-char fold. Get what it is */
13708 U8 foldbuf[UTF8_MAXBYTES_CASE];
13711 UV folded = _to_uni_fold_flags(
13716 | ((LOC) ? FOLD_FLAGS_LOCALE
13717 : (ASCII_FOLD_RESTRICTED)
13718 ? FOLD_FLAGS_NOMIX_ASCII
13722 /* Here, <folded> should be the first character of the
13723 * multi-char fold of <value>, with <foldbuf> containing the
13724 * whole thing. But, if this fold is not allowed (because of
13725 * the flags), <fold> will be the same as <value>, and should
13726 * be processed like any other character, so skip the special
13728 if (folded != value) {
13730 /* Skip if we are recursed, currently parsing the class
13731 * again. Otherwise add this character to the list of
13732 * multi-char folds. */
13733 if (! RExC_in_multi_char_class) {
13734 AV** this_array_ptr;
13736 STRLEN cp_count = utf8_length(foldbuf,
13737 foldbuf + foldlen);
13738 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13740 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13743 if (! multi_char_matches) {
13744 multi_char_matches = newAV();
13747 /* <multi_char_matches> is actually an array of arrays.
13748 * There will be one or two top-level elements: [2],
13749 * and/or [3]. The [2] element is an array, each
13750 * element thereof is a character which folds to TWO
13751 * characters; [3] is for folds to THREE characters.
13752 * (Unicode guarantees a maximum of 3 characters in any
13753 * fold.) When we rewrite the character class below,
13754 * we will do so such that the longest folds are
13755 * written first, so that it prefers the longest
13756 * matching strings first. This is done even if it
13757 * turns out that any quantifier is non-greedy, out of
13758 * programmer laziness. Tom Christiansen has agreed
13759 * that this is ok. This makes the test for the
13760 * ligature 'ffi' come before the test for 'ff' */
13761 if (av_exists(multi_char_matches, cp_count)) {
13762 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13764 this_array = *this_array_ptr;
13767 this_array = newAV();
13768 av_store(multi_char_matches, cp_count,
13771 av_push(this_array, multi_fold);
13774 /* This element should not be processed further in this
13777 value = save_value;
13778 prevvalue = save_prevvalue;
13784 /* Deal with this element of the class */
13787 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13789 SV* this_range = _new_invlist(1);
13790 _append_range_to_invlist(this_range, prevvalue, value);
13792 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13793 * If this range was specified using something like 'i-j', we want
13794 * to include only the 'i' and the 'j', and not anything in
13795 * between, so exclude non-ASCII, non-alphabetics from it.
13796 * However, if the range was specified with something like
13797 * [\x89-\x91] or [\x89-j], all code points within it should be
13798 * included. literal_endpoint==2 means both ends of the range used
13799 * a literal character, not \x{foo} */
13800 if (literal_endpoint == 2
13801 && ((prevvalue >= 'a' && value <= 'z')
13802 || (prevvalue >= 'A' && value <= 'Z')))
13804 _invlist_intersection(this_range, PL_ASCII,
13806 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13809 _invlist_union(cp_list, this_range, &cp_list);
13810 literal_endpoint = 0;
13814 range = 0; /* this range (if it was one) is done now */
13815 } /* End of loop through all the text within the brackets */
13817 /* If anything in the class expands to more than one character, we have to
13818 * deal with them by building up a substitute parse string, and recursively
13819 * calling reg() on it, instead of proceeding */
13820 if (multi_char_matches) {
13821 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13824 char *save_end = RExC_end;
13825 char *save_parse = RExC_parse;
13826 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13831 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13832 because too confusing */
13834 sv_catpv(substitute_parse, "(?:");
13838 /* Look at the longest folds first */
13839 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13841 if (av_exists(multi_char_matches, cp_count)) {
13842 AV** this_array_ptr;
13845 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13847 while ((this_sequence = av_pop(*this_array_ptr)) !=
13850 if (! first_time) {
13851 sv_catpv(substitute_parse, "|");
13853 first_time = FALSE;
13855 sv_catpv(substitute_parse, SvPVX(this_sequence));
13860 /* If the character class contains anything else besides these
13861 * multi-character folds, have to include it in recursive parsing */
13862 if (element_count) {
13863 sv_catpv(substitute_parse, "|[");
13864 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13865 sv_catpv(substitute_parse, "]");
13868 sv_catpv(substitute_parse, ")");
13871 /* This is a way to get the parse to skip forward a whole named
13872 * sequence instead of matching the 2nd character when it fails the
13874 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13878 RExC_parse = SvPV(substitute_parse, len);
13879 RExC_end = RExC_parse + len;
13880 RExC_in_multi_char_class = 1;
13881 RExC_emit = (regnode *)orig_emit;
13883 ret = reg(pRExC_state, 1, ®_flags, depth+1);
13885 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13887 RExC_parse = save_parse;
13888 RExC_end = save_end;
13889 RExC_in_multi_char_class = 0;
13890 SvREFCNT_dec_NN(multi_char_matches);
13894 /* If the character class contains only a single element, it may be
13895 * optimizable into another node type which is smaller and runs faster.
13896 * Check if this is the case for this class */
13897 if ((element_count == 1 && ! ret_invlist)
13898 || UNLIKELY(posixl_matches_all))
13903 if (UNLIKELY(posixl_matches_all)) {
13906 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13907 \w or [:digit:] or \p{foo}
13910 /* All named classes are mapped into POSIXish nodes, with its FLAG
13911 * argument giving which class it is */
13912 switch ((I32)namedclass) {
13913 case ANYOF_UNIPROP:
13916 /* These don't depend on the charset modifiers. They always
13917 * match under /u rules */
13918 case ANYOF_NHORIZWS:
13919 case ANYOF_HORIZWS:
13920 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13923 case ANYOF_NVERTWS:
13928 /* The actual POSIXish node for all the rest depends on the
13929 * charset modifier. The ones in the first set depend only on
13930 * ASCII or, if available on this platform, locale */
13934 op = (LOC) ? POSIXL : POSIXA;
13945 /* under /a could be alpha */
13947 if (ASCII_RESTRICTED) {
13948 namedclass = ANYOF_ALPHA + (namedclass % 2);
13956 /* The rest have more possibilities depending on the charset.
13957 * We take advantage of the enum ordering of the charset
13958 * modifiers to get the exact node type, */
13960 op = POSIXD + get_regex_charset(RExC_flags);
13961 if (op > POSIXA) { /* /aa is same as /a */
13964 #ifndef HAS_ISBLANK
13966 && (namedclass == ANYOF_BLANK
13967 || namedclass == ANYOF_NBLANK))
13974 /* The odd numbered ones are the complements of the
13975 * next-lower even number one */
13976 if (namedclass % 2 == 1) {
13980 arg = namedclass_to_classnum(namedclass);
13984 else if (value == prevvalue) {
13986 /* Here, the class consists of just a single code point */
13989 if (! LOC && value == '\n') {
13990 op = REG_ANY; /* Optimize [^\n] */
13991 *flagp |= HASWIDTH|SIMPLE;
13995 else if (value < 256 || UTF) {
13997 /* Optimize a single value into an EXACTish node, but not if it
13998 * would require converting the pattern to UTF-8. */
13999 op = compute_EXACTish(pRExC_state);
14001 } /* Otherwise is a range */
14002 else if (! LOC) { /* locale could vary these */
14003 if (prevvalue == '0') {
14004 if (value == '9') {
14011 /* Here, we have changed <op> away from its initial value iff we found
14012 * an optimization */
14015 /* Throw away this ANYOF regnode, and emit the calculated one,
14016 * which should correspond to the beginning, not current, state of
14018 const char * cur_parse = RExC_parse;
14019 RExC_parse = (char *)orig_parse;
14023 /* To get locale nodes to not use the full ANYOF size would
14024 * require moving the code above that writes the portions
14025 * of it that aren't in other nodes to after this point.
14026 * e.g. ANYOF_POSIXL_SET */
14027 RExC_size = orig_size;
14031 RExC_emit = (regnode *)orig_emit;
14032 if (PL_regkind[op] == POSIXD) {
14034 op += NPOSIXD - POSIXD;
14039 ret = reg_node(pRExC_state, op);
14041 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14045 *flagp |= HASWIDTH|SIMPLE;
14047 else if (PL_regkind[op] == EXACT) {
14048 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14051 RExC_parse = (char *) cur_parse;
14053 SvREFCNT_dec(posixes);
14054 SvREFCNT_dec(cp_list);
14061 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14063 /* If folding, we calculate all characters that could fold to or from the
14064 * ones already on the list */
14065 if (FOLD && cp_list) {
14066 UV start, end; /* End points of code point ranges */
14068 SV* fold_intersection = NULL;
14070 /* If the highest code point is within Latin1, we can use the
14071 * compiled-in Alphas list, and not have to go out to disk. This
14072 * yields two false positives, the masculine and feminine ordinal
14073 * indicators, which are weeded out below using the
14074 * IS_IN_SOME_FOLD_L1() macro */
14075 if (invlist_highest(cp_list) < 256) {
14076 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
14077 &fold_intersection);
14081 /* Here, there are non-Latin1 code points, so we will have to go
14082 * fetch the list of all the characters that participate in folds
14084 if (! PL_utf8_foldable) {
14085 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14086 &PL_sv_undef, 1, 0);
14087 PL_utf8_foldable = _get_swash_invlist(swash);
14088 SvREFCNT_dec_NN(swash);
14091 /* This is a hash that for a particular fold gives all characters
14092 * that are involved in it */
14093 if (! PL_utf8_foldclosures) {
14095 /* If we were unable to find any folds, then we likely won't be
14096 * able to find the closures. So just create an empty list.
14097 * Folding will effectively be restricted to the non-Unicode
14098 * rules hard-coded into Perl. (This case happens legitimately
14099 * during compilation of Perl itself before the Unicode tables
14100 * are generated) */
14101 if (_invlist_len(PL_utf8_foldable) == 0) {
14102 PL_utf8_foldclosures = newHV();
14105 /* If the folds haven't been read in, call a fold function
14107 if (! PL_utf8_tofold) {
14108 U8 dummy[UTF8_MAXBYTES_CASE+1];
14110 /* This string is just a short named one above \xff */
14111 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14112 assert(PL_utf8_tofold); /* Verify that worked */
14114 PL_utf8_foldclosures =
14115 _swash_inversion_hash(PL_utf8_tofold);
14119 /* Only the characters in this class that participate in folds need
14120 * be checked. Get the intersection of this class and all the
14121 * possible characters that are foldable. This can quickly narrow
14122 * down a large class */
14123 _invlist_intersection(PL_utf8_foldable, cp_list,
14124 &fold_intersection);
14127 /* Now look at the foldable characters in this class individually */
14128 invlist_iterinit(fold_intersection);
14129 while (invlist_iternext(fold_intersection, &start, &end)) {
14132 /* Locale folding for Latin1 characters is deferred until runtime */
14133 if (LOC && start < 256) {
14137 /* Look at every character in the range */
14138 for (j = start; j <= end; j++) {
14140 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14146 /* We have the latin1 folding rules hard-coded here so that
14147 * an innocent-looking character class, like /[ks]/i won't
14148 * have to go out to disk to find the possible matches.
14149 * XXX It would be better to generate these via regen, in
14150 * case a new version of the Unicode standard adds new
14151 * mappings, though that is not really likely, and may be
14152 * caught by the default: case of the switch below. */
14154 if (IS_IN_SOME_FOLD_L1(j)) {
14156 /* ASCII is always matched; non-ASCII is matched only
14157 * under Unicode rules */
14158 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14160 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14164 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14168 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14169 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14171 /* Certain Latin1 characters have matches outside
14172 * Latin1. To get here, <j> is one of those
14173 * characters. None of these matches is valid for
14174 * ASCII characters under /aa, which is why the 'if'
14175 * just above excludes those. These matches only
14176 * happen when the target string is utf8. The code
14177 * below adds the single fold closures for <j> to the
14178 * inversion list. */
14183 add_cp_to_invlist(cp_list, KELVIN_SIGN);
14187 cp_list = add_cp_to_invlist(cp_list,
14188 LATIN_SMALL_LETTER_LONG_S);
14191 cp_list = add_cp_to_invlist(cp_list,
14192 GREEK_CAPITAL_LETTER_MU);
14193 cp_list = add_cp_to_invlist(cp_list,
14194 GREEK_SMALL_LETTER_MU);
14196 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14197 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14199 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14201 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14202 cp_list = add_cp_to_invlist(cp_list,
14203 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14205 case LATIN_SMALL_LETTER_SHARP_S:
14206 cp_list = add_cp_to_invlist(cp_list,
14207 LATIN_CAPITAL_LETTER_SHARP_S);
14209 case 'F': case 'f':
14210 case 'I': case 'i':
14211 case 'L': case 'l':
14212 case 'T': case 't':
14213 case 'A': case 'a':
14214 case 'H': case 'h':
14215 case 'J': case 'j':
14216 case 'N': case 'n':
14217 case 'W': case 'w':
14218 case 'Y': case 'y':
14219 /* These all are targets of multi-character
14220 * folds from code points that require UTF8 to
14221 * express, so they can't match unless the
14222 * target string is in UTF-8, so no action here
14223 * is necessary, as regexec.c properly handles
14224 * the general case for UTF-8 matching and
14225 * multi-char folds */
14228 /* Use deprecated warning to increase the
14229 * chances of this being output */
14230 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14237 /* Here is an above Latin1 character. We don't have the rules
14238 * hard-coded for it. First, get its fold. This is the simple
14239 * fold, as the multi-character folds have been handled earlier
14240 * and separated out */
14241 _to_uni_fold_flags(j, foldbuf, &foldlen,
14243 ? FOLD_FLAGS_LOCALE
14244 : (ASCII_FOLD_RESTRICTED)
14245 ? FOLD_FLAGS_NOMIX_ASCII
14248 /* Single character fold of above Latin1. Add everything in
14249 * its fold closure to the list that this node should match.
14250 * The fold closures data structure is a hash with the keys
14251 * being the UTF-8 of every character that is folded to, like
14252 * 'k', and the values each an array of all code points that
14253 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14254 * Multi-character folds are not included */
14255 if ((listp = hv_fetch(PL_utf8_foldclosures,
14256 (char *) foldbuf, foldlen, FALSE)))
14258 AV* list = (AV*) *listp;
14260 for (k = 0; k <= av_len(list); k++) {
14261 SV** c_p = av_fetch(list, k, FALSE);
14264 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14268 /* /aa doesn't allow folds between ASCII and non-; /l
14269 * doesn't allow them between above and below 256 */
14270 if ((ASCII_FOLD_RESTRICTED
14271 && (isASCII(c) != isASCII(j)))
14272 || (LOC && c < 256)) {
14276 /* Folds involving non-ascii Latin1 characters
14277 * under /d are added to a separate list */
14278 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14280 cp_list = add_cp_to_invlist(cp_list, c);
14283 depends_list = add_cp_to_invlist(depends_list, c);
14289 SvREFCNT_dec_NN(fold_intersection);
14292 /* And combine the result (if any) with any inversion list from posix
14293 * classes. The lists are kept separate up to now because we don't want to
14294 * fold the classes (folding of those is automatically handled by the swash
14295 * fetching code) */
14297 if (! DEPENDS_SEMANTICS) {
14299 _invlist_union(cp_list, posixes, &cp_list);
14300 SvREFCNT_dec_NN(posixes);
14307 /* Under /d, we put into a separate list the Latin1 things that
14308 * match only when the target string is utf8 */
14309 SV* nonascii_but_latin1_properties = NULL;
14310 _invlist_intersection(posixes, PL_UpperLatin1,
14311 &nonascii_but_latin1_properties);
14312 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14315 _invlist_union(cp_list, posixes, &cp_list);
14316 SvREFCNT_dec_NN(posixes);
14322 if (depends_list) {
14323 _invlist_union(depends_list, nonascii_but_latin1_properties,
14325 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14328 depends_list = nonascii_but_latin1_properties;
14333 /* And combine the result (if any) with any inversion list from properties.
14334 * The lists are kept separate up to now so that we can distinguish the two
14335 * in regards to matching above-Unicode. A run-time warning is generated
14336 * if a Unicode property is matched against a non-Unicode code point. But,
14337 * we allow user-defined properties to match anything, without any warning,
14338 * and we also suppress the warning if there is a portion of the character
14339 * class that isn't a Unicode property, and which matches above Unicode, \W
14340 * or [\x{110000}] for example.
14341 * (Note that in this case, unlike the Posix one above, there is no
14342 * <depends_list>, because having a Unicode property forces Unicode
14345 bool warn_super = ! has_user_defined_property;
14348 /* If it matters to the final outcome, see if a non-property
14349 * component of the class matches above Unicode. If so, the
14350 * warning gets suppressed. This is true even if just a single
14351 * such code point is specified, as though not strictly correct if
14352 * another such code point is matched against, the fact that they
14353 * are using above-Unicode code points indicates they should know
14354 * the issues involved */
14356 bool non_prop_matches_above_Unicode =
14357 runtime_posix_matches_above_Unicode
14358 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14360 non_prop_matches_above_Unicode =
14361 ! non_prop_matches_above_Unicode;
14363 warn_super = ! non_prop_matches_above_Unicode;
14366 _invlist_union(properties, cp_list, &cp_list);
14367 SvREFCNT_dec_NN(properties);
14370 cp_list = properties;
14374 OP(ret) = ANYOF_WARN_SUPER;
14378 /* Here, we have calculated what code points should be in the character
14381 * Now we can see about various optimizations. Fold calculation (which we
14382 * did above) needs to take place before inversion. Otherwise /[^k]/i
14383 * would invert to include K, which under /i would match k, which it
14384 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14385 * folded until runtime */
14387 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14388 * at compile time. Besides not inverting folded locale now, we can't
14389 * invert if there are things such as \w, which aren't known until runtime
14392 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14394 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14396 _invlist_invert(cp_list);
14398 /* Any swash can't be used as-is, because we've inverted things */
14400 SvREFCNT_dec_NN(swash);
14404 /* Clear the invert flag since have just done it here */
14409 *ret_invlist = cp_list;
14410 SvREFCNT_dec(swash);
14412 /* Discard the generated node */
14414 RExC_size = orig_size;
14417 RExC_emit = orig_emit;
14422 /* If we didn't do folding, it's because some information isn't available
14423 * until runtime; set the run-time fold flag for these. (We don't have to
14424 * worry about properties folding, as that is taken care of by the swash
14428 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14431 /* Some character classes are equivalent to other nodes. Such nodes take
14432 * up less room and generally fewer operations to execute than ANYOF nodes.
14433 * Above, we checked for and optimized into some such equivalents for
14434 * certain common classes that are easy to test. Getting to this point in
14435 * the code means that the class didn't get optimized there. Since this
14436 * code is only executed in Pass 2, it is too late to save space--it has
14437 * been allocated in Pass 1, and currently isn't given back. But turning
14438 * things into an EXACTish node can allow the optimizer to join it to any
14439 * adjacent such nodes. And if the class is equivalent to things like /./,
14440 * expensive run-time swashes can be avoided. Now that we have more
14441 * complete information, we can find things necessarily missed by the
14442 * earlier code. I (khw) am not sure how much to look for here. It would
14443 * be easy, but perhaps too slow, to check any candidates against all the
14444 * node types they could possibly match using _invlistEQ(). */
14449 && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14450 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14453 U8 op = END; /* The optimzation node-type */
14454 const char * cur_parse= RExC_parse;
14456 invlist_iterinit(cp_list);
14457 if (! invlist_iternext(cp_list, &start, &end)) {
14459 /* Here, the list is empty. This happens, for example, when a
14460 * Unicode property is the only thing in the character class, and
14461 * it doesn't match anything. (perluniprops.pod notes such
14464 *flagp |= HASWIDTH|SIMPLE;
14466 else if (start == end) { /* The range is a single code point */
14467 if (! invlist_iternext(cp_list, &start, &end)
14469 /* Don't do this optimization if it would require changing
14470 * the pattern to UTF-8 */
14471 && (start < 256 || UTF))
14473 /* Here, the list contains a single code point. Can optimize
14474 * into an EXACT node */
14483 /* A locale node under folding with one code point can be
14484 * an EXACTFL, as its fold won't be calculated until
14490 /* Here, we are generally folding, but there is only one
14491 * code point to match. If we have to, we use an EXACT
14492 * node, but it would be better for joining with adjacent
14493 * nodes in the optimization pass if we used the same
14494 * EXACTFish node that any such are likely to be. We can
14495 * do this iff the code point doesn't participate in any
14496 * folds. For example, an EXACTF of a colon is the same as
14497 * an EXACT one, since nothing folds to or from a colon. */
14499 if (IS_IN_SOME_FOLD_L1(value)) {
14504 if (! PL_utf8_foldable) {
14505 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14506 &PL_sv_undef, 1, 0);
14507 PL_utf8_foldable = _get_swash_invlist(swash);
14508 SvREFCNT_dec_NN(swash);
14510 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14515 /* If we haven't found the node type, above, it means we
14516 * can use the prevailing one */
14518 op = compute_EXACTish(pRExC_state);
14523 else if (start == 0) {
14524 if (end == UV_MAX) {
14526 *flagp |= HASWIDTH|SIMPLE;
14529 else if (end == '\n' - 1
14530 && invlist_iternext(cp_list, &start, &end)
14531 && start == '\n' + 1 && end == UV_MAX)
14534 *flagp |= HASWIDTH|SIMPLE;
14538 invlist_iterfinish(cp_list);
14541 RExC_parse = (char *)orig_parse;
14542 RExC_emit = (regnode *)orig_emit;
14544 ret = reg_node(pRExC_state, op);
14546 RExC_parse = (char *)cur_parse;
14548 if (PL_regkind[op] == EXACT) {
14549 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14552 SvREFCNT_dec_NN(cp_list);
14557 /* Here, <cp_list> contains all the code points we can determine at
14558 * compile time that match under all conditions. Go through it, and
14559 * for things that belong in the bitmap, put them there, and delete from
14560 * <cp_list>. While we are at it, see if everything above 255 is in the
14561 * list, and if so, set a flag to speed up execution */
14563 populate_ANYOF_from_invlist(ret, &cp_list);
14566 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14569 /* Here, the bitmap has been populated with all the Latin1 code points that
14570 * always match. Can now add to the overall list those that match only
14571 * when the target string is UTF-8 (<depends_list>). */
14572 if (depends_list) {
14574 _invlist_union(cp_list, depends_list, &cp_list);
14575 SvREFCNT_dec_NN(depends_list);
14578 cp_list = depends_list;
14582 /* If there is a swash and more than one element, we can't use the swash in
14583 * the optimization below. */
14584 if (swash && element_count > 1) {
14585 SvREFCNT_dec_NN(swash);
14589 set_ANYOF_arg(pRExC_state, ret, cp_list,
14590 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14592 swash, has_user_defined_property);
14594 *flagp |= HASWIDTH|SIMPLE;
14598 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14601 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14602 regnode* const node,
14604 SV* const runtime_defns,
14606 const bool has_user_defined_property)
14608 /* Sets the arg field of an ANYOF-type node 'node', using information about
14609 * the node passed-in. If there is nothing outside the node's bitmap, the
14610 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14611 * the count returned by add_data(), having allocated and stored an array,
14612 * av, that that count references, as follows:
14613 * av[0] stores the character class description in its textual form.
14614 * This is used later (regexec.c:Perl_regclass_swash()) to
14615 * initialize the appropriate swash, and is also useful for dumping
14616 * the regnode. This is set to &PL_sv_undef if the textual
14617 * description is not needed at run-time (as happens if the other
14618 * elements completely define the class)
14619 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14620 * computed from av[0]. But if no further computation need be done,
14621 * the swash is stored here now (and av[0] is &PL_sv_undef).
14622 * av[2] stores the cp_list inversion list for use in addition or instead
14623 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14624 * (Otherwise everything needed is already in av[0] and av[1])
14625 * av[3] is set if any component of the class is from a user-defined
14626 * property; used only if av[2] exists */
14630 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14632 if (! cp_list && ! runtime_defns) {
14633 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14636 AV * const av = newAV();
14639 av_store(av, 0, (runtime_defns)
14640 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14642 av_store(av, 1, swash);
14643 SvREFCNT_dec_NN(cp_list);
14646 av_store(av, 1, &PL_sv_undef);
14648 av_store(av, 2, cp_list);
14649 av_store(av, 3, newSVuv(has_user_defined_property));
14653 rv = newRV_noinc(MUTABLE_SV(av));
14654 n = add_data(pRExC_state, STR_WITH_LEN("s"));
14655 RExC_rxi->data->data[n] = (void*)rv;
14661 /* reg_skipcomment()
14663 Absorbs an /x style # comments from the input stream.
14664 Returns true if there is more text remaining in the stream.
14665 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14666 terminates the pattern without including a newline.
14668 Note its the callers responsibility to ensure that we are
14669 actually in /x mode
14674 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14678 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14680 while (RExC_parse < RExC_end)
14681 if (*RExC_parse++ == '\n') {
14686 /* we ran off the end of the pattern without ending
14687 the comment, so we have to add an \n when wrapping */
14688 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14696 Advances the parse position, and optionally absorbs
14697 "whitespace" from the inputstream.
14699 Without /x "whitespace" means (?#...) style comments only,
14700 with /x this means (?#...) and # comments and whitespace proper.
14702 Returns the RExC_parse point from BEFORE the scan occurs.
14704 This is the /x friendly way of saying RExC_parse++.
14708 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14710 char* const retval = RExC_parse++;
14712 PERL_ARGS_ASSERT_NEXTCHAR;
14715 if (RExC_end - RExC_parse >= 3
14716 && *RExC_parse == '('
14717 && RExC_parse[1] == '?'
14718 && RExC_parse[2] == '#')
14720 while (*RExC_parse != ')') {
14721 if (RExC_parse == RExC_end)
14722 FAIL("Sequence (?#... not terminated");
14728 if (RExC_flags & RXf_PMf_EXTENDED) {
14729 if (isSPACE(*RExC_parse)) {
14733 else if (*RExC_parse == '#') {
14734 if ( reg_skipcomment( pRExC_state ) )
14743 - reg_node - emit a node
14745 STATIC regnode * /* Location. */
14746 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14750 regnode * const ret = RExC_emit;
14751 GET_RE_DEBUG_FLAGS_DECL;
14753 PERL_ARGS_ASSERT_REG_NODE;
14756 SIZE_ALIGN(RExC_size);
14760 if (RExC_emit >= RExC_emit_bound)
14761 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14762 op, RExC_emit, RExC_emit_bound);
14764 NODE_ALIGN_FILL(ret);
14766 FILL_ADVANCE_NODE(ptr, op);
14767 #ifdef RE_TRACK_PATTERN_OFFSETS
14768 if (RExC_offsets) { /* MJD */
14769 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14770 "reg_node", __LINE__,
14772 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14773 ? "Overwriting end of array!\n" : "OK",
14774 (UV)(RExC_emit - RExC_emit_start),
14775 (UV)(RExC_parse - RExC_start),
14776 (UV)RExC_offsets[0]));
14777 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14785 - reganode - emit a node with an argument
14787 STATIC regnode * /* Location. */
14788 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14792 regnode * const ret = RExC_emit;
14793 GET_RE_DEBUG_FLAGS_DECL;
14795 PERL_ARGS_ASSERT_REGANODE;
14798 SIZE_ALIGN(RExC_size);
14803 assert(2==regarglen[op]+1);
14805 Anything larger than this has to allocate the extra amount.
14806 If we changed this to be:
14808 RExC_size += (1 + regarglen[op]);
14810 then it wouldn't matter. Its not clear what side effect
14811 might come from that so its not done so far.
14816 if (RExC_emit >= RExC_emit_bound)
14817 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14818 op, RExC_emit, RExC_emit_bound);
14820 NODE_ALIGN_FILL(ret);
14822 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14823 #ifdef RE_TRACK_PATTERN_OFFSETS
14824 if (RExC_offsets) { /* MJD */
14825 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14829 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14830 "Overwriting end of array!\n" : "OK",
14831 (UV)(RExC_emit - RExC_emit_start),
14832 (UV)(RExC_parse - RExC_start),
14833 (UV)RExC_offsets[0]));
14834 Set_Cur_Node_Offset;
14842 - reguni - emit (if appropriate) a Unicode character
14845 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14849 PERL_ARGS_ASSERT_REGUNI;
14851 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14855 - reginsert - insert an operator in front of already-emitted operand
14857 * Means relocating the operand.
14860 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14866 const int offset = regarglen[(U8)op];
14867 const int size = NODE_STEP_REGNODE + offset;
14868 GET_RE_DEBUG_FLAGS_DECL;
14870 PERL_ARGS_ASSERT_REGINSERT;
14871 PERL_UNUSED_ARG(depth);
14872 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14873 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14882 if (RExC_open_parens) {
14884 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14885 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14886 if ( RExC_open_parens[paren] >= opnd ) {
14887 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14888 RExC_open_parens[paren] += size;
14890 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14892 if ( RExC_close_parens[paren] >= opnd ) {
14893 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14894 RExC_close_parens[paren] += size;
14896 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14901 while (src > opnd) {
14902 StructCopy(--src, --dst, regnode);
14903 #ifdef RE_TRACK_PATTERN_OFFSETS
14904 if (RExC_offsets) { /* MJD 20010112 */
14905 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14909 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14910 ? "Overwriting end of array!\n" : "OK",
14911 (UV)(src - RExC_emit_start),
14912 (UV)(dst - RExC_emit_start),
14913 (UV)RExC_offsets[0]));
14914 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14915 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14921 place = opnd; /* Op node, where operand used to be. */
14922 #ifdef RE_TRACK_PATTERN_OFFSETS
14923 if (RExC_offsets) { /* MJD */
14924 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14928 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14929 ? "Overwriting end of array!\n" : "OK",
14930 (UV)(place - RExC_emit_start),
14931 (UV)(RExC_parse - RExC_start),
14932 (UV)RExC_offsets[0]));
14933 Set_Node_Offset(place, RExC_parse);
14934 Set_Node_Length(place, 1);
14937 src = NEXTOPER(place);
14938 FILL_ADVANCE_NODE(place, op);
14939 Zero(src, offset, regnode);
14943 - regtail - set the next-pointer at the end of a node chain of p to val.
14944 - SEE ALSO: regtail_study
14946 /* TODO: All three parms should be const */
14948 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14952 GET_RE_DEBUG_FLAGS_DECL;
14954 PERL_ARGS_ASSERT_REGTAIL;
14956 PERL_UNUSED_ARG(depth);
14962 /* Find last node. */
14965 regnode * const temp = regnext(scan);
14967 SV * const mysv=sv_newmortal();
14968 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14969 regprop(RExC_rx, mysv, scan);
14970 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14971 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14972 (temp == NULL ? "->" : ""),
14973 (temp == NULL ? PL_reg_name[OP(val)] : "")
14981 if (reg_off_by_arg[OP(scan)]) {
14982 ARG_SET(scan, val - scan);
14985 NEXT_OFF(scan) = val - scan;
14991 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14992 - Look for optimizable sequences at the same time.
14993 - currently only looks for EXACT chains.
14995 This is experimental code. The idea is to use this routine to perform
14996 in place optimizations on branches and groups as they are constructed,
14997 with the long term intention of removing optimization from study_chunk so
14998 that it is purely analytical.
15000 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15001 to control which is which.
15004 /* TODO: All four parms should be const */
15007 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15012 #ifdef EXPERIMENTAL_INPLACESCAN
15015 GET_RE_DEBUG_FLAGS_DECL;
15017 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15023 /* Find last node. */
15027 regnode * const temp = regnext(scan);
15028 #ifdef EXPERIMENTAL_INPLACESCAN
15029 if (PL_regkind[OP(scan)] == EXACT) {
15030 bool has_exactf_sharp_s; /* Unexamined in this routine */
15031 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
15036 switch (OP(scan)) {
15039 case EXACTFA_NO_TRIE:
15044 if( exact == PSEUDO )
15046 else if ( exact != OP(scan) )
15055 SV * const mysv=sv_newmortal();
15056 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15057 regprop(RExC_rx, mysv, scan);
15058 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15059 SvPV_nolen_const(mysv),
15060 REG_NODE_NUM(scan),
15061 PL_reg_name[exact]);
15068 SV * const mysv_val=sv_newmortal();
15069 DEBUG_PARSE_MSG("");
15070 regprop(RExC_rx, mysv_val, val);
15071 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15072 SvPV_nolen_const(mysv_val),
15073 (IV)REG_NODE_NUM(val),
15077 if (reg_off_by_arg[OP(scan)]) {
15078 ARG_SET(scan, val - scan);
15081 NEXT_OFF(scan) = val - scan;
15089 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15094 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15099 for (bit=0; bit<32; bit++) {
15100 if (flags & (1<<bit)) {
15101 if (!set++ && lead)
15102 PerlIO_printf(Perl_debug_log, "%s",lead);
15103 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15108 PerlIO_printf(Perl_debug_log, "\n");
15110 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15115 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15121 for (bit=0; bit<32; bit++) {
15122 if (flags & (1<<bit)) {
15123 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15126 if (!set++ && lead)
15127 PerlIO_printf(Perl_debug_log, "%s",lead);
15128 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15131 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15132 if (!set++ && lead) {
15133 PerlIO_printf(Perl_debug_log, "%s",lead);
15136 case REGEX_UNICODE_CHARSET:
15137 PerlIO_printf(Perl_debug_log, "UNICODE");
15139 case REGEX_LOCALE_CHARSET:
15140 PerlIO_printf(Perl_debug_log, "LOCALE");
15142 case REGEX_ASCII_RESTRICTED_CHARSET:
15143 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15145 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15146 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15149 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15155 PerlIO_printf(Perl_debug_log, "\n");
15157 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15163 Perl_regdump(pTHX_ const regexp *r)
15167 SV * const sv = sv_newmortal();
15168 SV *dsv= sv_newmortal();
15169 RXi_GET_DECL(r,ri);
15170 GET_RE_DEBUG_FLAGS_DECL;
15172 PERL_ARGS_ASSERT_REGDUMP;
15174 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15176 /* Header fields of interest. */
15177 if (r->anchored_substr) {
15178 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15179 RE_SV_DUMPLEN(r->anchored_substr), 30);
15180 PerlIO_printf(Perl_debug_log,
15181 "anchored %s%s at %"IVdf" ",
15182 s, RE_SV_TAIL(r->anchored_substr),
15183 (IV)r->anchored_offset);
15184 } else if (r->anchored_utf8) {
15185 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15186 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15187 PerlIO_printf(Perl_debug_log,
15188 "anchored utf8 %s%s at %"IVdf" ",
15189 s, RE_SV_TAIL(r->anchored_utf8),
15190 (IV)r->anchored_offset);
15192 if (r->float_substr) {
15193 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15194 RE_SV_DUMPLEN(r->float_substr), 30);
15195 PerlIO_printf(Perl_debug_log,
15196 "floating %s%s at %"IVdf"..%"UVuf" ",
15197 s, RE_SV_TAIL(r->float_substr),
15198 (IV)r->float_min_offset, (UV)r->float_max_offset);
15199 } else if (r->float_utf8) {
15200 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15201 RE_SV_DUMPLEN(r->float_utf8), 30);
15202 PerlIO_printf(Perl_debug_log,
15203 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15204 s, RE_SV_TAIL(r->float_utf8),
15205 (IV)r->float_min_offset, (UV)r->float_max_offset);
15207 if (r->check_substr || r->check_utf8)
15208 PerlIO_printf(Perl_debug_log,
15210 (r->check_substr == r->float_substr
15211 && r->check_utf8 == r->float_utf8
15212 ? "(checking floating" : "(checking anchored"));
15213 if (r->extflags & RXf_NOSCAN)
15214 PerlIO_printf(Perl_debug_log, " noscan");
15215 if (r->extflags & RXf_CHECK_ALL)
15216 PerlIO_printf(Perl_debug_log, " isall");
15217 if (r->check_substr || r->check_utf8)
15218 PerlIO_printf(Perl_debug_log, ") ");
15220 if (ri->regstclass) {
15221 regprop(r, sv, ri->regstclass);
15222 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15224 if (r->extflags & RXf_ANCH) {
15225 PerlIO_printf(Perl_debug_log, "anchored");
15226 if (r->extflags & RXf_ANCH_BOL)
15227 PerlIO_printf(Perl_debug_log, "(BOL)");
15228 if (r->extflags & RXf_ANCH_MBOL)
15229 PerlIO_printf(Perl_debug_log, "(MBOL)");
15230 if (r->extflags & RXf_ANCH_SBOL)
15231 PerlIO_printf(Perl_debug_log, "(SBOL)");
15232 if (r->extflags & RXf_ANCH_GPOS)
15233 PerlIO_printf(Perl_debug_log, "(GPOS)");
15234 PerlIO_putc(Perl_debug_log, ' ');
15236 if (r->extflags & RXf_GPOS_SEEN)
15237 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15238 if (r->intflags & PREGf_SKIP)
15239 PerlIO_printf(Perl_debug_log, "plus ");
15240 if (r->intflags & PREGf_IMPLICIT)
15241 PerlIO_printf(Perl_debug_log, "implicit ");
15242 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15243 if (r->extflags & RXf_EVAL_SEEN)
15244 PerlIO_printf(Perl_debug_log, "with eval ");
15245 PerlIO_printf(Perl_debug_log, "\n");
15247 regdump_extflags("r->extflags: ",r->extflags);
15248 regdump_intflags("r->intflags: ",r->intflags);
15251 PERL_ARGS_ASSERT_REGDUMP;
15252 PERL_UNUSED_CONTEXT;
15253 PERL_UNUSED_ARG(r);
15254 #endif /* DEBUGGING */
15258 - regprop - printable representation of opcode
15262 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15268 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15269 static const char * const anyofs[] = {
15270 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15271 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15272 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15273 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15274 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15275 || _CC_VERTSPACE != 16
15276 #error Need to adjust order of anyofs[]
15313 RXi_GET_DECL(prog,progi);
15314 GET_RE_DEBUG_FLAGS_DECL;
15316 PERL_ARGS_ASSERT_REGPROP;
15320 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15321 /* It would be nice to FAIL() here, but this may be called from
15322 regexec.c, and it would be hard to supply pRExC_state. */
15323 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15324 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15326 k = PL_regkind[OP(o)];
15329 sv_catpvs(sv, " ");
15330 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15331 * is a crude hack but it may be the best for now since
15332 * we have no flag "this EXACTish node was UTF-8"
15334 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15335 PERL_PV_ESCAPE_UNI_DETECT |
15336 PERL_PV_ESCAPE_NONASCII |
15337 PERL_PV_PRETTY_ELLIPSES |
15338 PERL_PV_PRETTY_LTGT |
15339 PERL_PV_PRETTY_NOCLEAR
15341 } else if (k == TRIE) {
15342 /* print the details of the trie in dumpuntil instead, as
15343 * progi->data isn't available here */
15344 const char op = OP(o);
15345 const U32 n = ARG(o);
15346 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15347 (reg_ac_data *)progi->data->data[n] :
15349 const reg_trie_data * const trie
15350 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15352 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15353 DEBUG_TRIE_COMPILE_r(
15354 Perl_sv_catpvf(aTHX_ sv,
15355 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15356 (UV)trie->startstate,
15357 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15358 (UV)trie->wordcount,
15361 (UV)TRIE_CHARCOUNT(trie),
15362 (UV)trie->uniquecharcount
15365 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15366 sv_catpvs(sv, "[");
15367 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15369 : TRIE_BITMAP(trie));
15370 sv_catpvs(sv, "]");
15373 } else if (k == CURLY) {
15374 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15375 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15376 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15378 else if (k == WHILEM && o->flags) /* Ordinal/of */
15379 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15380 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15381 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15382 if ( RXp_PAREN_NAMES(prog) ) {
15383 if ( k != REF || (OP(o) < NREF)) {
15384 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15385 SV **name= av_fetch(list, ARG(o), 0 );
15387 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15390 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15391 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15392 I32 *nums=(I32*)SvPVX(sv_dat);
15393 SV **name= av_fetch(list, nums[0], 0 );
15396 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15397 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15398 (n ? "," : ""), (IV)nums[n]);
15400 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15404 } else if (k == GOSUB)
15405 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15406 else if (k == VERB) {
15408 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15409 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15410 } else if (k == LOGICAL)
15411 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
15412 else if (k == ANYOF) {
15413 const U8 flags = ANYOF_FLAGS(o);
15417 if (flags & ANYOF_LOCALE)
15418 sv_catpvs(sv, "{loc}");
15419 if (flags & ANYOF_LOC_FOLD)
15420 sv_catpvs(sv, "{i}");
15421 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15422 if (flags & ANYOF_INVERT)
15423 sv_catpvs(sv, "^");
15425 /* output what the standard cp 0-255 bitmap matches */
15426 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15428 /* output any special charclass tests (used entirely under use
15430 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15432 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15433 if (ANYOF_POSIXL_TEST(o,i)) {
15434 sv_catpv(sv, anyofs[i]);
15440 if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15441 || ANYOF_NONBITMAP(o))
15444 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15445 if (flags & ANYOF_INVERT)
15446 /*make sure the invert info is in each */
15447 sv_catpvs(sv, "^");
15450 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15451 sv_catpvs(sv, "{non-utf8-latin1-all}");
15454 /* output information about the unicode matching */
15455 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15456 sv_catpvs(sv, "{unicode_all}");
15457 else if (ANYOF_NONBITMAP(o)) {
15458 SV *lv; /* Set if there is something outside the bit map. */
15459 bool byte_output = FALSE; /* If something in the bitmap has been
15462 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15463 sv_catpvs(sv, "{outside bitmap}");
15466 sv_catpvs(sv, "{utf8}");
15469 /* Get the stuff that wasn't in the bitmap */
15470 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15471 if (lv && lv != &PL_sv_undef) {
15472 char *s = savesvpv(lv);
15473 char * const origs = s;
15475 while (*s && *s != '\n')
15479 const char * const t = ++s;
15482 sv_catpvs(sv, " ");
15488 /* Truncate very long output */
15489 if (s - origs > 256) {
15490 Perl_sv_catpvf(aTHX_ sv,
15492 (int) (s - origs - 1),
15498 else if (*s == '\t') {
15512 SvREFCNT_dec_NN(lv);
15517 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15519 else if (k == POSIXD || k == NPOSIXD) {
15520 U8 index = FLAGS(o) * 2;
15521 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15522 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15525 if (*anyofs[index] != '[') {
15528 sv_catpv(sv, anyofs[index]);
15529 if (*anyofs[index] != '[') {
15534 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15535 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15537 PERL_UNUSED_CONTEXT;
15538 PERL_UNUSED_ARG(sv);
15539 PERL_UNUSED_ARG(o);
15540 PERL_UNUSED_ARG(prog);
15541 #endif /* DEBUGGING */
15545 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15546 { /* Assume that RE_INTUIT is set */
15548 struct regexp *const prog = ReANY(r);
15549 GET_RE_DEBUG_FLAGS_DECL;
15551 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15552 PERL_UNUSED_CONTEXT;
15556 const char * const s = SvPV_nolen_const(prog->check_substr
15557 ? prog->check_substr : prog->check_utf8);
15559 if (!PL_colorset) reginitcolors();
15560 PerlIO_printf(Perl_debug_log,
15561 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15563 prog->check_substr ? "" : "utf8 ",
15564 PL_colors[5],PL_colors[0],
15567 (strlen(s) > 60 ? "..." : ""));
15570 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15576 handles refcounting and freeing the perl core regexp structure. When
15577 it is necessary to actually free the structure the first thing it
15578 does is call the 'free' method of the regexp_engine associated to
15579 the regexp, allowing the handling of the void *pprivate; member
15580 first. (This routine is not overridable by extensions, which is why
15581 the extensions free is called first.)
15583 See regdupe and regdupe_internal if you change anything here.
15585 #ifndef PERL_IN_XSUB_RE
15587 Perl_pregfree(pTHX_ REGEXP *r)
15593 Perl_pregfree2(pTHX_ REGEXP *rx)
15596 struct regexp *const r = ReANY(rx);
15597 GET_RE_DEBUG_FLAGS_DECL;
15599 PERL_ARGS_ASSERT_PREGFREE2;
15601 if (r->mother_re) {
15602 ReREFCNT_dec(r->mother_re);
15604 CALLREGFREE_PVT(rx); /* free the private data */
15605 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15606 Safefree(r->xpv_len_u.xpvlenu_pv);
15609 SvREFCNT_dec(r->anchored_substr);
15610 SvREFCNT_dec(r->anchored_utf8);
15611 SvREFCNT_dec(r->float_substr);
15612 SvREFCNT_dec(r->float_utf8);
15613 Safefree(r->substrs);
15615 RX_MATCH_COPY_FREE(rx);
15616 #ifdef PERL_ANY_COW
15617 SvREFCNT_dec(r->saved_copy);
15620 SvREFCNT_dec(r->qr_anoncv);
15621 rx->sv_u.svu_rx = 0;
15626 This is a hacky workaround to the structural issue of match results
15627 being stored in the regexp structure which is in turn stored in
15628 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15629 could be PL_curpm in multiple contexts, and could require multiple
15630 result sets being associated with the pattern simultaneously, such
15631 as when doing a recursive match with (??{$qr})
15633 The solution is to make a lightweight copy of the regexp structure
15634 when a qr// is returned from the code executed by (??{$qr}) this
15635 lightweight copy doesn't actually own any of its data except for
15636 the starp/end and the actual regexp structure itself.
15642 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15644 struct regexp *ret;
15645 struct regexp *const r = ReANY(rx);
15646 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15648 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15651 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15653 SvOK_off((SV *)ret_x);
15655 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15656 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15657 made both spots point to the same regexp body.) */
15658 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15659 assert(!SvPVX(ret_x));
15660 ret_x->sv_u.svu_rx = temp->sv_any;
15661 temp->sv_any = NULL;
15662 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15663 SvREFCNT_dec_NN(temp);
15664 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15665 ing below will not set it. */
15666 SvCUR_set(ret_x, SvCUR(rx));
15669 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15670 sv_force_normal(sv) is called. */
15672 ret = ReANY(ret_x);
15674 SvFLAGS(ret_x) |= SvUTF8(rx);
15675 /* We share the same string buffer as the original regexp, on which we
15676 hold a reference count, incremented when mother_re is set below.
15677 The string pointer is copied here, being part of the regexp struct.
15679 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15680 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15682 const I32 npar = r->nparens+1;
15683 Newx(ret->offs, npar, regexp_paren_pair);
15684 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15687 Newx(ret->substrs, 1, struct reg_substr_data);
15688 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15690 SvREFCNT_inc_void(ret->anchored_substr);
15691 SvREFCNT_inc_void(ret->anchored_utf8);
15692 SvREFCNT_inc_void(ret->float_substr);
15693 SvREFCNT_inc_void(ret->float_utf8);
15695 /* check_substr and check_utf8, if non-NULL, point to either their
15696 anchored or float namesakes, and don't hold a second reference. */
15698 RX_MATCH_COPIED_off(ret_x);
15699 #ifdef PERL_ANY_COW
15700 ret->saved_copy = NULL;
15702 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15703 SvREFCNT_inc_void(ret->qr_anoncv);
15709 /* regfree_internal()
15711 Free the private data in a regexp. This is overloadable by
15712 extensions. Perl takes care of the regexp structure in pregfree(),
15713 this covers the *pprivate pointer which technically perl doesn't
15714 know about, however of course we have to handle the
15715 regexp_internal structure when no extension is in use.
15717 Note this is called before freeing anything in the regexp
15722 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15725 struct regexp *const r = ReANY(rx);
15726 RXi_GET_DECL(r,ri);
15727 GET_RE_DEBUG_FLAGS_DECL;
15729 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15735 SV *dsv= sv_newmortal();
15736 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15737 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15738 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15739 PL_colors[4],PL_colors[5],s);
15742 #ifdef RE_TRACK_PATTERN_OFFSETS
15744 Safefree(ri->u.offsets); /* 20010421 MJD */
15746 if (ri->code_blocks) {
15748 for (n = 0; n < ri->num_code_blocks; n++)
15749 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15750 Safefree(ri->code_blocks);
15754 int n = ri->data->count;
15757 /* If you add a ->what type here, update the comment in regcomp.h */
15758 switch (ri->data->what[n]) {
15764 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15767 Safefree(ri->data->data[n]);
15773 { /* Aho Corasick add-on structure for a trie node.
15774 Used in stclass optimization only */
15776 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15778 refcount = --aho->refcount;
15781 PerlMemShared_free(aho->states);
15782 PerlMemShared_free(aho->fail);
15783 /* do this last!!!! */
15784 PerlMemShared_free(ri->data->data[n]);
15785 PerlMemShared_free(ri->regstclass);
15791 /* trie structure. */
15793 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15795 refcount = --trie->refcount;
15798 PerlMemShared_free(trie->charmap);
15799 PerlMemShared_free(trie->states);
15800 PerlMemShared_free(trie->trans);
15802 PerlMemShared_free(trie->bitmap);
15804 PerlMemShared_free(trie->jump);
15805 PerlMemShared_free(trie->wordinfo);
15806 /* do this last!!!! */
15807 PerlMemShared_free(ri->data->data[n]);
15812 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15815 Safefree(ri->data->what);
15816 Safefree(ri->data);
15822 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15823 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15824 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15827 re_dup - duplicate a regexp.
15829 This routine is expected to clone a given regexp structure. It is only
15830 compiled under USE_ITHREADS.
15832 After all of the core data stored in struct regexp is duplicated
15833 the regexp_engine.dupe method is used to copy any private data
15834 stored in the *pprivate pointer. This allows extensions to handle
15835 any duplication it needs to do.
15837 See pregfree() and regfree_internal() if you change anything here.
15839 #if defined(USE_ITHREADS)
15840 #ifndef PERL_IN_XSUB_RE
15842 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15846 const struct regexp *r = ReANY(sstr);
15847 struct regexp *ret = ReANY(dstr);
15849 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15851 npar = r->nparens+1;
15852 Newx(ret->offs, npar, regexp_paren_pair);
15853 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15855 if (ret->substrs) {
15856 /* Do it this way to avoid reading from *r after the StructCopy().
15857 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15858 cache, it doesn't matter. */
15859 const bool anchored = r->check_substr
15860 ? r->check_substr == r->anchored_substr
15861 : r->check_utf8 == r->anchored_utf8;
15862 Newx(ret->substrs, 1, struct reg_substr_data);
15863 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15865 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15866 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15867 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15868 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15870 /* check_substr and check_utf8, if non-NULL, point to either their
15871 anchored or float namesakes, and don't hold a second reference. */
15873 if (ret->check_substr) {
15875 assert(r->check_utf8 == r->anchored_utf8);
15876 ret->check_substr = ret->anchored_substr;
15877 ret->check_utf8 = ret->anchored_utf8;
15879 assert(r->check_substr == r->float_substr);
15880 assert(r->check_utf8 == r->float_utf8);
15881 ret->check_substr = ret->float_substr;
15882 ret->check_utf8 = ret->float_utf8;
15884 } else if (ret->check_utf8) {
15886 ret->check_utf8 = ret->anchored_utf8;
15888 ret->check_utf8 = ret->float_utf8;
15893 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15894 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15897 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15899 if (RX_MATCH_COPIED(dstr))
15900 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15902 ret->subbeg = NULL;
15903 #ifdef PERL_ANY_COW
15904 ret->saved_copy = NULL;
15907 /* Whether mother_re be set or no, we need to copy the string. We
15908 cannot refrain from copying it when the storage points directly to
15909 our mother regexp, because that's
15910 1: a buffer in a different thread
15911 2: something we no longer hold a reference on
15912 so we need to copy it locally. */
15913 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15914 ret->mother_re = NULL;
15916 #endif /* PERL_IN_XSUB_RE */
15921 This is the internal complement to regdupe() which is used to copy
15922 the structure pointed to by the *pprivate pointer in the regexp.
15923 This is the core version of the extension overridable cloning hook.
15924 The regexp structure being duplicated will be copied by perl prior
15925 to this and will be provided as the regexp *r argument, however
15926 with the /old/ structures pprivate pointer value. Thus this routine
15927 may override any copying normally done by perl.
15929 It returns a pointer to the new regexp_internal structure.
15933 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15936 struct regexp *const r = ReANY(rx);
15937 regexp_internal *reti;
15939 RXi_GET_DECL(r,ri);
15941 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15945 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15946 Copy(ri->program, reti->program, len+1, regnode);
15948 reti->num_code_blocks = ri->num_code_blocks;
15949 if (ri->code_blocks) {
15951 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15952 struct reg_code_block);
15953 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15954 struct reg_code_block);
15955 for (n = 0; n < ri->num_code_blocks; n++)
15956 reti->code_blocks[n].src_regex = (REGEXP*)
15957 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15960 reti->code_blocks = NULL;
15962 reti->regstclass = NULL;
15965 struct reg_data *d;
15966 const int count = ri->data->count;
15969 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15970 char, struct reg_data);
15971 Newx(d->what, count, U8);
15974 for (i = 0; i < count; i++) {
15975 d->what[i] = ri->data->what[i];
15976 switch (d->what[i]) {
15977 /* see also regcomp.h and regfree_internal() */
15978 case 'a': /* actually an AV, but the dup function is identical. */
15982 case 'u': /* actually an HV, but the dup function is identical. */
15983 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15986 /* This is cheating. */
15987 Newx(d->data[i], 1, regnode_ssc);
15988 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
15989 reti->regstclass = (regnode*)d->data[i];
15992 /* Trie stclasses are readonly and can thus be shared
15993 * without duplication. We free the stclass in pregfree
15994 * when the corresponding reg_ac_data struct is freed.
15996 reti->regstclass= ri->regstclass;
16000 ((reg_trie_data*)ri->data->data[i])->refcount++;
16005 d->data[i] = ri->data->data[i];
16008 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
16017 reti->name_list_idx = ri->name_list_idx;
16019 #ifdef RE_TRACK_PATTERN_OFFSETS
16020 if (ri->u.offsets) {
16021 Newx(reti->u.offsets, 2*len+1, U32);
16022 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16025 SetProgLen(reti,len);
16028 return (void*)reti;
16031 #endif /* USE_ITHREADS */
16033 #ifndef PERL_IN_XSUB_RE
16036 - regnext - dig the "next" pointer out of a node
16039 Perl_regnext(pTHX_ regnode *p)
16047 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16048 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
16051 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16060 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16063 STRLEN l1 = strlen(pat1);
16064 STRLEN l2 = strlen(pat2);
16067 const char *message;
16069 PERL_ARGS_ASSERT_RE_CROAK2;
16075 Copy(pat1, buf, l1 , char);
16076 Copy(pat2, buf + l1, l2 , char);
16077 buf[l1 + l2] = '\n';
16078 buf[l1 + l2 + 1] = '\0';
16079 va_start(args, pat2);
16080 msv = vmess(buf, &args);
16082 message = SvPV_const(msv,l1);
16085 Copy(message, buf, l1 , char);
16086 /* l1-1 to avoid \n */
16087 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16090 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16092 #ifndef PERL_IN_XSUB_RE
16094 Perl_save_re_context(pTHX)
16098 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16100 const REGEXP * const rx = PM_GETRE(PL_curpm);
16103 for (i = 1; i <= RX_NPARENS(rx); i++) {
16104 char digits[TYPE_CHARS(long)];
16105 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
16106 GV *const *const gvp
16107 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16110 GV * const gv = *gvp;
16111 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16123 S_put_byte(pTHX_ SV *sv, int c)
16125 PERL_ARGS_ASSERT_PUT_BYTE;
16127 /* Our definition of isPRINT() ignores locales, so only bytes that are
16128 not part of UTF-8 are considered printable. I assume that the same
16129 holds for UTF-EBCDIC.
16130 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
16131 which Wikipedia says:
16133 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
16134 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
16135 identical, to the ASCII delete (DEL) or rubout control character. ...
16136 it is typically mapped to hexadecimal code 9F, in order to provide a
16137 unique character mapping in both directions)
16139 So the old condition can be simplified to !isPRINT(c) */
16142 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16143 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16144 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16145 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16146 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16149 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16154 const char string = c;
16155 if (c == '-' || c == ']' || c == '\\' || c == '^')
16156 sv_catpvs(sv, "\\");
16157 sv_catpvn(sv, &string, 1);
16162 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16164 /* Appends to 'sv' a displayable version of the innards of the bracketed
16165 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16166 * output anything */
16169 int rangestart = -1;
16170 bool has_output_anything = FALSE;
16172 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16174 for (i = 0; i <= 256; i++) {
16175 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16176 if (rangestart == -1)
16178 } else if (rangestart != -1) {
16180 if (i <= rangestart + 3) { /* Individual chars in short ranges */
16181 for (; rangestart < i; rangestart++)
16182 put_byte(sv, rangestart);
16185 || ! isALPHANUMERIC(rangestart)
16186 || ! isALPHANUMERIC(j)
16187 || isDIGIT(rangestart) != isDIGIT(j)
16188 || isUPPER(rangestart) != isUPPER(j)
16189 || isLOWER(rangestart) != isLOWER(j)
16191 /* This final test should get optimized out except
16192 * on EBCDIC platforms, where it causes ranges that
16193 * cross discontinuities like i/j to be shown as hex
16194 * instead of the misleading, e.g. H-K (since that
16195 * range includes more than H, I, J, K). */
16196 || (j - rangestart)
16197 != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16199 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16201 (j < 256) ? j : 255);
16203 else { /* Here, the ends of the range are both digits, or both
16204 uppercase, or both lowercase; and there's no
16205 discontinuity in the range (which could happen on EBCDIC
16207 put_byte(sv, rangestart);
16208 sv_catpvs(sv, "-");
16212 has_output_anything = TRUE;
16216 return has_output_anything;
16219 #define CLEAR_OPTSTART \
16220 if (optstart) STMT_START { \
16221 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16225 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16227 STATIC const regnode *
16228 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16229 const regnode *last, const regnode *plast,
16230 SV* sv, I32 indent, U32 depth)
16233 U8 op = PSEUDO; /* Arbitrary non-END op. */
16234 const regnode *next;
16235 const regnode *optstart= NULL;
16237 RXi_GET_DECL(r,ri);
16238 GET_RE_DEBUG_FLAGS_DECL;
16240 PERL_ARGS_ASSERT_DUMPUNTIL;
16242 #ifdef DEBUG_DUMPUNTIL
16243 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16244 last ? last-start : 0,plast ? plast-start : 0);
16247 if (plast && plast < last)
16250 while (PL_regkind[op] != END && (!last || node < last)) {
16251 /* While that wasn't END last time... */
16254 if (op == CLOSE || op == WHILEM)
16256 next = regnext((regnode *)node);
16259 if (OP(node) == OPTIMIZED) {
16260 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16267 regprop(r, sv, node);
16268 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16269 (int)(2*indent + 1), "", SvPVX_const(sv));
16271 if (OP(node) != OPTIMIZED) {
16272 if (next == NULL) /* Next ptr. */
16273 PerlIO_printf(Perl_debug_log, " (0)");
16274 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16275 PerlIO_printf(Perl_debug_log, " (FAIL)");
16277 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16278 (void)PerlIO_putc(Perl_debug_log, '\n');
16282 if (PL_regkind[(U8)op] == BRANCHJ) {
16285 const regnode *nnode = (OP(next) == LONGJMP
16286 ? regnext((regnode *)next)
16288 if (last && nnode > last)
16290 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16293 else if (PL_regkind[(U8)op] == BRANCH) {
16295 DUMPUNTIL(NEXTOPER(node), next);
16297 else if ( PL_regkind[(U8)op] == TRIE ) {
16298 const regnode *this_trie = node;
16299 const char op = OP(node);
16300 const U32 n = ARG(node);
16301 const reg_ac_data * const ac = op>=AHOCORASICK ?
16302 (reg_ac_data *)ri->data->data[n] :
16304 const reg_trie_data * const trie =
16305 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16307 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16309 const regnode *nextbranch= NULL;
16312 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16313 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16315 PerlIO_printf(Perl_debug_log, "%*s%s ",
16316 (int)(2*(indent+3)), "",
16317 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16318 PL_colors[0], PL_colors[1],
16319 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16320 PERL_PV_PRETTY_ELLIPSES |
16321 PERL_PV_PRETTY_LTGT
16326 U16 dist= trie->jump[word_idx+1];
16327 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16328 (UV)((dist ? this_trie + dist : next) - start));
16331 nextbranch= this_trie + trie->jump[0];
16332 DUMPUNTIL(this_trie + dist, nextbranch);
16334 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16335 nextbranch= regnext((regnode *)nextbranch);
16337 PerlIO_printf(Perl_debug_log, "\n");
16340 if (last && next > last)
16345 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16346 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16347 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16349 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16351 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16353 else if ( op == PLUS || op == STAR) {
16354 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16356 else if (PL_regkind[(U8)op] == ANYOF) {
16357 /* arglen 1 + class block */
16358 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16359 ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16360 node = NEXTOPER(node);
16362 else if (PL_regkind[(U8)op] == EXACT) {
16363 /* Literal string, where present. */
16364 node += NODE_SZ_STR(node) - 1;
16365 node = NEXTOPER(node);
16368 node = NEXTOPER(node);
16369 node += regarglen[(U8)op];
16371 if (op == CURLYX || op == OPEN)
16375 #ifdef DEBUG_DUMPUNTIL
16376 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16381 #endif /* DEBUGGING */
16385 * c-indentation-style: bsd
16386 * c-basic-offset: 4
16387 * indent-tabs-mode: nil
16390 * ex: set ts=8 sts=4 sw=4 et: