5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
100 #define STATIC static
104 struct RExC_state_t {
105 U32 flags; /* RXf_* are we folding, multilining? */
106 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object
112 char *start; /* Start of input for compile */
113 char *end; /* End of input for compile */
114 char *parse; /* Input-scan pointer. */
115 SSize_t whilem_seen; /* number of WHILEM in this expr */
116 regnode *emit_start; /* Start of emitted-code area */
117 regnode *emit_bound; /* First regnode outside of the
119 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
120 implies compiling, so don't emit */
121 regnode_ssc emit_dummy; /* placeholder for emit to point to;
122 large enough for the largest
123 non-EXACTish node, so can use it as
125 I32 naughty; /* How bad is this pattern? */
126 I32 sawback; /* Did we see \1, ...? */
128 SSize_t size; /* Code size. */
129 I32 npar; /* Capture buffer count, (OPEN) plus
130 one. ("par" 0 is the whole
132 I32 nestroot; /* root parens we are in - used by
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
150 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
152 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
156 I32 override_recoding;
157 I32 in_multi_char_class;
158 struct reg_code_block *code_blocks; /* positions of literal (?{})
160 int num_code_blocks; /* size of code_blocks[] */
161 int code_index; /* next code_blocks[] slot */
163 char *starttry; /* -Dr: where regtry was called. */
164 #define RExC_starttry (pRExC_state->starttry)
166 SV *runtime_code_qr; /* qr with the runtime code blocks */
168 const char *lastparse;
170 AV *paren_name_list; /* idx -> name */
171 #define RExC_lastparse (pRExC_state->lastparse)
172 #define RExC_lastnum (pRExC_state->lastnum)
173 #define RExC_paren_name_list (pRExC_state->paren_name_list)
177 #define RExC_flags (pRExC_state->flags)
178 #define RExC_pm_flags (pRExC_state->pm_flags)
179 #define RExC_precomp (pRExC_state->precomp)
180 #define RExC_rx_sv (pRExC_state->rx_sv)
181 #define RExC_rx (pRExC_state->rx)
182 #define RExC_rxi (pRExC_state->rxi)
183 #define RExC_start (pRExC_state->start)
184 #define RExC_end (pRExC_state->end)
185 #define RExC_parse (pRExC_state->parse)
186 #define RExC_whilem_seen (pRExC_state->whilem_seen)
187 #ifdef RE_TRACK_PATTERN_OFFSETS
188 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
191 #define RExC_emit (pRExC_state->emit)
192 #define RExC_emit_dummy (pRExC_state->emit_dummy)
193 #define RExC_emit_start (pRExC_state->emit_start)
194 #define RExC_emit_bound (pRExC_state->emit_bound)
195 #define RExC_naughty (pRExC_state->naughty)
196 #define RExC_sawback (pRExC_state->sawback)
197 #define RExC_seen (pRExC_state->seen)
198 #define RExC_size (pRExC_state->size)
199 #define RExC_npar (pRExC_state->npar)
200 #define RExC_nestroot (pRExC_state->nestroot)
201 #define RExC_extralen (pRExC_state->extralen)
202 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
203 #define RExC_utf8 (pRExC_state->utf8)
204 #define RExC_uni_semantics (pRExC_state->uni_semantics)
205 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
206 #define RExC_open_parens (pRExC_state->open_parens)
207 #define RExC_close_parens (pRExC_state->close_parens)
208 #define RExC_opend (pRExC_state->opend)
209 #define RExC_paren_names (pRExC_state->paren_names)
210 #define RExC_recurse (pRExC_state->recurse)
211 #define RExC_recurse_count (pRExC_state->recurse_count)
212 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
213 #define RExC_study_chunk_recursed_bytes \
214 (pRExC_state->study_chunk_recursed_bytes)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_contains_i (pRExC_state->contains_i)
218 #define RExC_override_recoding (pRExC_state->override_recoding)
219 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
222 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
223 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
224 ((*s) == '{' && regcurly(s, FALSE)))
227 * Flags to be passed up and down.
229 #define WORST 0 /* Worst case. */
230 #define HASWIDTH 0x01 /* Known to match non-null strings. */
232 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
233 * character. (There needs to be a case: in the switch statement in regexec.c
234 * for any node marked SIMPLE.) Note that this is not the same thing as
237 #define SPSTART 0x04 /* Starts with * or + */
238 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
239 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
240 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
242 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
244 /* whether trie related optimizations are enabled */
245 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
246 #define TRIE_STUDY_OPT
247 #define FULL_TRIE_STUDY
253 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
254 #define PBITVAL(paren) (1 << ((paren) & 7))
255 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
256 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
257 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
259 #define REQUIRE_UTF8 STMT_START { \
261 *flagp = RESTART_UTF8; \
266 /* This converts the named class defined in regcomp.h to its equivalent class
267 * number defined in handy.h. */
268 #define namedclass_to_classnum(class) ((int) ((class) / 2))
269 #define classnum_to_namedclass(classnum) ((classnum) * 2)
271 #define _invlist_union_complement_2nd(a, b, output) \
272 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
273 #define _invlist_intersection_complement_2nd(a, b, output) \
274 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
276 /* About scan_data_t.
278 During optimisation we recurse through the regexp program performing
279 various inplace (keyhole style) optimisations. In addition study_chunk
280 and scan_commit populate this data structure with information about
281 what strings MUST appear in the pattern. We look for the longest
282 string that must appear at a fixed location, and we look for the
283 longest string that may appear at a floating location. So for instance
288 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
289 strings (because they follow a .* construct). study_chunk will identify
290 both FOO and BAR as being the longest fixed and floating strings respectively.
292 The strings can be composites, for instance
296 will result in a composite fixed substring 'foo'.
298 For each string some basic information is maintained:
300 - offset or min_offset
301 This is the position the string must appear at, or not before.
302 It also implicitly (when combined with minlenp) tells us how many
303 characters must match before the string we are searching for.
304 Likewise when combined with minlenp and the length of the string it
305 tells us how many characters must appear after the string we have
309 Only used for floating strings. This is the rightmost point that
310 the string can appear at. If set to SSize_t_MAX it indicates that the
311 string can occur infinitely far to the right.
314 A pointer to the minimum number of characters of the pattern that the
315 string was found inside. This is important as in the case of positive
316 lookahead or positive lookbehind we can have multiple patterns
321 The minimum length of the pattern overall is 3, the minimum length
322 of the lookahead part is 3, but the minimum length of the part that
323 will actually match is 1. So 'FOO's minimum length is 3, but the
324 minimum length for the F is 1. This is important as the minimum length
325 is used to determine offsets in front of and behind the string being
326 looked for. Since strings can be composites this is the length of the
327 pattern at the time it was committed with a scan_commit. Note that
328 the length is calculated by study_chunk, so that the minimum lengths
329 are not known until the full pattern has been compiled, thus the
330 pointer to the value.
334 In the case of lookbehind the string being searched for can be
335 offset past the start point of the final matching string.
336 If this value was just blithely removed from the min_offset it would
337 invalidate some of the calculations for how many chars must match
338 before or after (as they are derived from min_offset and minlen and
339 the length of the string being searched for).
340 When the final pattern is compiled and the data is moved from the
341 scan_data_t structure into the regexp structure the information
342 about lookbehind is factored in, with the information that would
343 have been lost precalculated in the end_shift field for the
346 The fields pos_min and pos_delta are used to store the minimum offset
347 and the delta to the maximum offset at the current point in the pattern.
351 typedef struct scan_data_t {
352 /*I32 len_min; unused */
353 /*I32 len_delta; unused */
357 SSize_t last_end; /* min value, <0 unless valid. */
358 SSize_t last_start_min;
359 SSize_t last_start_max;
360 SV **longest; /* Either &l_fixed, or &l_float. */
361 SV *longest_fixed; /* longest fixed string found in pattern */
362 SSize_t offset_fixed; /* offset where it starts */
363 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
364 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
365 SV *longest_float; /* longest floating string found in pattern */
366 SSize_t offset_float_min; /* earliest point in string it can appear */
367 SSize_t offset_float_max; /* latest point in string it can appear */
368 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
369 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
372 SSize_t *last_closep;
373 regnode_ssc *start_class;
376 /* The below is perhaps overboard, but this allows us to save a test at the
377 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
378 * and 'a' differ by a single bit; the same with the upper and lower case of
379 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
380 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
381 * then inverts it to form a mask, with just a single 0, in the bit position
382 * where the upper- and lowercase differ. XXX There are about 40 other
383 * instances in the Perl core where this micro-optimization could be used.
384 * Should decide if maintenance cost is worse, before changing those
386 * Returns a boolean as to whether or not 'v' is either a lowercase or
387 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
388 * compile-time constant, the generated code is better than some optimizing
389 * compilers figure out, amounting to a mask and test. The results are
390 * meaningless if 'c' is not one of [A-Za-z] */
391 #define isARG2_lower_or_UPPER_ARG1(c, v) \
392 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
395 * Forward declarations for pregcomp()'s friends.
398 static const scan_data_t zero_scan_data =
399 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
401 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
402 #define SF_BEFORE_SEOL 0x0001
403 #define SF_BEFORE_MEOL 0x0002
404 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
405 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
407 #define SF_FIX_SHIFT_EOL (+2)
408 #define SF_FL_SHIFT_EOL (+4)
410 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
411 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
413 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
414 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
415 #define SF_IS_INF 0x0040
416 #define SF_HAS_PAR 0x0080
417 #define SF_IN_PAR 0x0100
418 #define SF_HAS_EVAL 0x0200
419 #define SCF_DO_SUBSTR 0x0400
420 #define SCF_DO_STCLASS_AND 0x0800
421 #define SCF_DO_STCLASS_OR 0x1000
422 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
423 #define SCF_WHILEM_VISITED_POS 0x2000
425 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
426 #define SCF_SEEN_ACCEPT 0x8000
427 #define SCF_TRIE_DOING_RESTUDY 0x10000
429 #define UTF cBOOL(RExC_utf8)
431 /* The enums for all these are ordered so things work out correctly */
432 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
433 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
434 == REGEX_DEPENDS_CHARSET)
435 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
436 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
437 >= REGEX_UNICODE_CHARSET)
438 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
439 == REGEX_ASCII_RESTRICTED_CHARSET)
440 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
441 >= REGEX_ASCII_RESTRICTED_CHARSET)
442 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
443 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
445 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
447 /* For programs that want to be strictly Unicode compatible by dying if any
448 * attempt is made to match a non-Unicode code point against a Unicode
450 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
452 #define OOB_NAMEDCLASS -1
454 /* There is no code point that is out-of-bounds, so this is problematic. But
455 * its only current use is to initialize a variable that is always set before
457 #define OOB_UNICODE 0xDEADBEEF
459 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
460 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
463 /* length of regex to show in messages that don't mark a position within */
464 #define RegexLengthToShowInErrorMessages 127
467 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
468 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
469 * op/pragma/warn/regcomp.
471 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
472 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
474 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
475 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
477 #define REPORT_LOCATION_ARGS(offset) \
478 UTF8fARG(UTF, offset, RExC_precomp), \
479 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
482 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
483 * arg. Show regex, up to a maximum length. If it's too long, chop and add
486 #define _FAIL(code) STMT_START { \
487 const char *ellipses = ""; \
488 IV len = RExC_end - RExC_precomp; \
491 SAVEFREESV(RExC_rx_sv); \
492 if (len > RegexLengthToShowInErrorMessages) { \
493 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
494 len = RegexLengthToShowInErrorMessages - 10; \
500 #define FAIL(msg) _FAIL( \
501 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
502 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
504 #define FAIL2(msg,arg) _FAIL( \
505 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
506 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
509 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
511 #define Simple_vFAIL(m) STMT_START { \
512 const IV offset = RExC_parse - RExC_precomp; \
513 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
514 m, REPORT_LOCATION_ARGS(offset)); \
518 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
520 #define vFAIL(m) STMT_START { \
522 SAVEFREESV(RExC_rx_sv); \
527 * Like Simple_vFAIL(), but accepts two arguments.
529 #define Simple_vFAIL2(m,a1) STMT_START { \
530 const IV offset = RExC_parse - RExC_precomp; \
531 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
532 REPORT_LOCATION_ARGS(offset)); \
536 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
538 #define vFAIL2(m,a1) STMT_START { \
540 SAVEFREESV(RExC_rx_sv); \
541 Simple_vFAIL2(m, a1); \
546 * Like Simple_vFAIL(), but accepts three arguments.
548 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
549 const IV offset = RExC_parse - RExC_precomp; \
550 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
551 REPORT_LOCATION_ARGS(offset)); \
555 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
557 #define vFAIL3(m,a1,a2) STMT_START { \
559 SAVEFREESV(RExC_rx_sv); \
560 Simple_vFAIL3(m, a1, a2); \
564 * Like Simple_vFAIL(), but accepts four arguments.
566 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
567 const IV offset = RExC_parse - RExC_precomp; \
568 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
569 REPORT_LOCATION_ARGS(offset)); \
572 #define vFAIL4(m,a1,a2,a3) STMT_START { \
574 SAVEFREESV(RExC_rx_sv); \
575 Simple_vFAIL4(m, a1, a2, a3); \
578 /* A specialized version of vFAIL2 that works with UTF8f */
579 #define vFAIL2utf8f(m, a1) STMT_START { \
580 const IV offset = RExC_parse - RExC_precomp; \
582 SAVEFREESV(RExC_rx_sv); \
583 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
584 REPORT_LOCATION_ARGS(offset)); \
588 /* m is not necessarily a "literal string", in this macro */
589 #define reg_warn_non_literal_string(loc, m) STMT_START { \
590 const IV offset = loc - RExC_precomp; \
591 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
592 m, REPORT_LOCATION_ARGS(offset)); \
595 #define ckWARNreg(loc,m) STMT_START { \
596 const IV offset = loc - RExC_precomp; \
597 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
598 REPORT_LOCATION_ARGS(offset)); \
601 #define vWARN_dep(loc, m) STMT_START { \
602 const IV offset = loc - RExC_precomp; \
603 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
604 REPORT_LOCATION_ARGS(offset)); \
607 #define ckWARNdep(loc,m) STMT_START { \
608 const IV offset = loc - RExC_precomp; \
609 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
611 REPORT_LOCATION_ARGS(offset)); \
614 #define ckWARNregdep(loc,m) STMT_START { \
615 const IV offset = loc - RExC_precomp; \
616 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
618 REPORT_LOCATION_ARGS(offset)); \
621 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
622 const IV offset = loc - RExC_precomp; \
623 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
625 a1, REPORT_LOCATION_ARGS(offset)); \
628 #define ckWARN2reg(loc, m, a1) STMT_START { \
629 const IV offset = loc - RExC_precomp; \
630 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
631 a1, REPORT_LOCATION_ARGS(offset)); \
634 #define vWARN3(loc, m, a1, a2) STMT_START { \
635 const IV offset = loc - RExC_precomp; \
636 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
637 a1, a2, REPORT_LOCATION_ARGS(offset)); \
640 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
641 const IV offset = loc - RExC_precomp; \
642 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
643 a1, a2, REPORT_LOCATION_ARGS(offset)); \
646 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
647 const IV offset = loc - RExC_precomp; \
648 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
649 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
652 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
653 const IV offset = loc - RExC_precomp; \
654 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
655 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
658 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
659 const IV offset = loc - RExC_precomp; \
660 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
661 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
665 /* Allow for side effects in s */
666 #define REGC(c,s) STMT_START { \
667 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
670 /* Macros for recording node offsets. 20001227 mjd@plover.com
671 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
672 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
673 * Element 0 holds the number n.
674 * Position is 1 indexed.
676 #ifndef RE_TRACK_PATTERN_OFFSETS
677 #define Set_Node_Offset_To_R(node,byte)
678 #define Set_Node_Offset(node,byte)
679 #define Set_Cur_Node_Offset
680 #define Set_Node_Length_To_R(node,len)
681 #define Set_Node_Length(node,len)
682 #define Set_Node_Cur_Length(node,start)
683 #define Node_Offset(n)
684 #define Node_Length(n)
685 #define Set_Node_Offset_Length(node,offset,len)
686 #define ProgLen(ri) ri->u.proglen
687 #define SetProgLen(ri,x) ri->u.proglen = x
689 #define ProgLen(ri) ri->u.offsets[0]
690 #define SetProgLen(ri,x) ri->u.offsets[0] = x
691 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
693 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
694 __LINE__, (int)(node), (int)(byte))); \
696 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
699 RExC_offsets[2*(node)-1] = (byte); \
704 #define Set_Node_Offset(node,byte) \
705 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
706 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
708 #define Set_Node_Length_To_R(node,len) STMT_START { \
710 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
711 __LINE__, (int)(node), (int)(len))); \
713 Perl_croak(aTHX_ "value of node is %d in Length macro", \
716 RExC_offsets[2*(node)] = (len); \
721 #define Set_Node_Length(node,len) \
722 Set_Node_Length_To_R((node)-RExC_emit_start, len)
723 #define Set_Node_Cur_Length(node, start) \
724 Set_Node_Length(node, RExC_parse - start)
726 /* Get offsets and lengths */
727 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
728 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
730 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
731 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
732 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
736 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
737 #define EXPERIMENTAL_INPLACESCAN
738 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
740 #define DEBUG_RExC_seen() \
741 DEBUG_OPTIMISE_MORE_r({ \
742 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
744 if (RExC_seen & REG_ZERO_LEN_SEEN) \
745 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
747 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
748 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
750 if (RExC_seen & REG_GPOS_SEEN) \
751 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
753 if (RExC_seen & REG_CANY_SEEN) \
754 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
756 if (RExC_seen & REG_RECURSE_SEEN) \
757 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
759 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
760 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
762 if (RExC_seen & REG_VERBARG_SEEN) \
763 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
765 if (RExC_seen & REG_CUTGROUP_SEEN) \
766 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
768 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
769 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
771 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
772 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
774 if (RExC_seen & REG_GOSTART_SEEN) \
775 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
777 PerlIO_printf(Perl_debug_log,"\n"); \
780 #define DEBUG_STUDYDATA(str,data,depth) \
781 DEBUG_OPTIMISE_MORE_r(if(data){ \
782 PerlIO_printf(Perl_debug_log, \
783 "%*s" str "Pos:%"IVdf"/%"IVdf \
784 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
785 (int)(depth)*2, "", \
786 (IV)((data)->pos_min), \
787 (IV)((data)->pos_delta), \
788 (UV)((data)->flags), \
789 (IV)((data)->whilem_c), \
790 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
791 is_inf ? "INF " : "" \
793 if ((data)->last_found) \
794 PerlIO_printf(Perl_debug_log, \
795 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
796 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
797 SvPVX_const((data)->last_found), \
798 (IV)((data)->last_end), \
799 (IV)((data)->last_start_min), \
800 (IV)((data)->last_start_max), \
801 ((data)->longest && \
802 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
803 SvPVX_const((data)->longest_fixed), \
804 (IV)((data)->offset_fixed), \
805 ((data)->longest && \
806 (data)->longest==&((data)->longest_float)) ? "*" : "", \
807 SvPVX_const((data)->longest_float), \
808 (IV)((data)->offset_float_min), \
809 (IV)((data)->offset_float_max) \
811 PerlIO_printf(Perl_debug_log,"\n"); \
814 /* Mark that we cannot extend a found fixed substring at this point.
815 Update the longest found anchored substring and the longest found
816 floating substrings if needed. */
819 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
820 SSize_t *minlenp, int is_inf)
822 const STRLEN l = CHR_SVLEN(data->last_found);
823 const STRLEN old_l = CHR_SVLEN(*data->longest);
824 GET_RE_DEBUG_FLAGS_DECL;
826 PERL_ARGS_ASSERT_SCAN_COMMIT;
828 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
829 SvSetMagicSV(*data->longest, data->last_found);
830 if (*data->longest == data->longest_fixed) {
831 data->offset_fixed = l ? data->last_start_min : data->pos_min;
832 if (data->flags & SF_BEFORE_EOL)
834 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
836 data->flags &= ~SF_FIX_BEFORE_EOL;
837 data->minlen_fixed=minlenp;
838 data->lookbehind_fixed=0;
840 else { /* *data->longest == data->longest_float */
841 data->offset_float_min = l ? data->last_start_min : data->pos_min;
842 data->offset_float_max = (l
843 ? data->last_start_max
844 : (data->pos_delta == SSize_t_MAX
846 : data->pos_min + data->pos_delta));
848 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
849 data->offset_float_max = SSize_t_MAX;
850 if (data->flags & SF_BEFORE_EOL)
852 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
854 data->flags &= ~SF_FL_BEFORE_EOL;
855 data->minlen_float=minlenp;
856 data->lookbehind_float=0;
859 SvCUR_set(data->last_found, 0);
861 SV * const sv = data->last_found;
862 if (SvUTF8(sv) && SvMAGICAL(sv)) {
863 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
869 data->flags &= ~SF_BEFORE_EOL;
870 DEBUG_STUDYDATA("commit: ",data,0);
873 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
874 * list that describes which code points it matches */
877 S_ssc_anything(pTHX_ regnode_ssc *ssc)
879 /* Set the SSC 'ssc' to match an empty string or any code point */
881 PERL_ARGS_ASSERT_SSC_ANYTHING;
883 assert(is_ANYOF_SYNTHETIC(ssc));
885 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
886 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
887 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
891 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
893 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
894 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
895 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
896 * in any way, so there's no point in using it */
901 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
903 assert(is_ANYOF_SYNTHETIC(ssc));
905 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
909 /* See if the list consists solely of the range 0 - Infinity */
910 invlist_iterinit(ssc->invlist);
911 ret = invlist_iternext(ssc->invlist, &start, &end)
915 invlist_iterfinish(ssc->invlist);
921 /* If e.g., both \w and \W are set, matches everything */
922 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
924 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
925 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
935 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
937 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
938 * string, any code point, or any posix class under locale */
940 PERL_ARGS_ASSERT_SSC_INIT;
942 Zero(ssc, 1, regnode_ssc);
943 set_ANYOF_SYNTHETIC(ssc);
944 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
947 /* If any portion of the regex is to operate under locale rules,
948 * initialization includes it. The reason this isn't done for all regexes
949 * is that the optimizer was written under the assumption that locale was
950 * all-or-nothing. Given the complexity and lack of documentation in the
951 * optimizer, and that there are inadequate test cases for locale, many
952 * parts of it may not work properly, it is safest to avoid locale unless
954 if (RExC_contains_locale) {
955 ANYOF_POSIXL_SETALL(ssc);
956 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
959 ANYOF_POSIXL_ZERO(ssc);
964 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
965 const regnode_ssc *ssc)
967 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
968 * to the list of code points matched, and locale posix classes; hence does
969 * not check its flags) */
974 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
976 assert(is_ANYOF_SYNTHETIC(ssc));
978 invlist_iterinit(ssc->invlist);
979 ret = invlist_iternext(ssc->invlist, &start, &end)
983 invlist_iterfinish(ssc->invlist);
989 if (RExC_contains_locale
990 && ! ((ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
991 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
992 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc)))
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002 const regnode_charclass_posixl_fold* const node)
1004 /* Returns a mortal inversion list defining which code points are matched
1005 * by 'node', which is of type ANYOF. Handles complementing the result if
1006 * appropriate. If some code points aren't knowable at this time, the
1007 * returned list must, and will, contain every code point that is a
1010 SV* invlist = sv_2mortal(_new_invlist(0));
1012 const U32 n = ARG(node);
1013 bool new_node_has_latin1 = FALSE;
1015 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017 /* Look at the data structure created by S_set_ANYOF_arg() */
1018 if (n != ANYOF_NONBITMAP_EMPTY) {
1019 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1020 AV * const av = MUTABLE_AV(SvRV(rv));
1021 SV **const ary = AvARRAY(av);
1022 assert(RExC_rxi->data->what[n] == 's');
1024 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1025 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027 else if (ary[0] && ary[0] != &PL_sv_undef) {
1029 /* Here, no compile-time swash, and there are things that won't be
1030 * known until runtime -- we have to assume it could be anything */
1031 return _add_range_to_invlist(invlist, 0, UV_MAX);
1035 /* Here no compile-time swash, and no run-time only data. Use the
1036 * node's inversion list */
1037 invlist = sv_2mortal(invlist_clone(ary[2]));
1041 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1042 * inversion list for the others, but if there are code points that should
1043 * match only conditionally on the target string being UTF-8, those are
1044 * placed in the inversion list, and not the bitmap. Since there are
1045 * circumstances under which they could match, they are included in the
1046 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1047 * here, so that when we invert below, the end result actually does include
1048 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1049 * before we add the unconditionally matched code points */
1050 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1051 _invlist_intersection_complement_2nd(invlist,
1056 /* Add in the points from the bit map */
1057 for (i = 0; i < 256; i++) {
1058 if (ANYOF_BITMAP_TEST(node, i)) {
1059 invlist = add_cp_to_invlist(invlist, i);
1060 new_node_has_latin1 = TRUE;
1064 /* If this can match all upper Latin1 code points, have to add them
1066 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1067 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1070 /* Similarly for these */
1071 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1072 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1075 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1076 _invlist_invert(invlist);
1078 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1080 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1081 * locale. We can skip this if there are no 0-255 at all. */
1082 _invlist_union(invlist, PL_Latin1, &invlist);
1085 /* Similarly add the UTF-8 locale possible matches */
1086 if (ANYOF_FLAGS(node) & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(node))
1088 _invlist_union_maybe_complement_2nd(invlist,
1089 ANYOF_UTF8_LOCALE_INVLIST(node),
1090 ANYOF_FLAGS(node) & ANYOF_INVERT,
1097 /* These two functions currently do the exact same thing */
1098 #define ssc_init_zero ssc_init
1100 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1101 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1104 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1106 /* Take the flags 'and_with' and accumulate them anded into the flags for
1107 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1108 * The flags 'and_with' should not come from another SSC (otherwise the
1109 * EMPTY_STRING flag won't work) */
1111 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS;
1113 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1115 /* Use just the SSC-related flags from 'and_with' */
1116 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS);
1117 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1120 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1121 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1122 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1125 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1126 const regnode_ssc *and_with)
1128 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1129 * another SSC or a regular ANYOF class. Can create false positives. */
1134 PERL_ARGS_ASSERT_SSC_AND;
1136 assert(is_ANYOF_SYNTHETIC(ssc));
1138 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1139 * the code point inversion list and just the relevant flags */
1140 if (is_ANYOF_SYNTHETIC(and_with)) {
1141 anded_cp_list = and_with->invlist;
1142 anded_flags = ANYOF_FLAGS(and_with);
1144 /* XXX This is a kludge around what appears to be deficiencies in the
1145 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1146 * there are paths through the optimizer where it doesn't get weeded
1147 * out when it should. And if we don't make some extra provision for
1148 * it like the code just below, it doesn't get added when it should.
1149 * This solution is to add it only when AND'ing, which is here, and
1150 * only when what is being AND'ed is the pristine, original node
1151 * matching anything. Thus it is like adding it to ssc_anything() but
1152 * only when the result is to be AND'ed. Probably the same solution
1153 * could be adopted for the same problem we have with /l matching,
1154 * which is solved differently in S_ssc_init(), and that would lead to
1155 * fewer false positives than that solution has. But if this solution
1156 * creates bugs, the consequences are only that a warning isn't raised
1157 * that should be; while the consequences for having /l bugs is
1158 * incorrect matches */
1159 if (ssc_is_anything(and_with)) {
1160 anded_flags |= ANYOF_WARN_SUPER;
1164 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1165 (regnode_charclass_posixl_fold*) and_with);
1166 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1169 ANYOF_FLAGS(ssc) &= anded_flags;
1171 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1172 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1173 * 'and_with' may be inverted. When not inverted, we have the situation of
1175 * (C1 | P1) & (C2 | P2)
1176 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1177 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1178 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1179 * <= ((C1 & C2) | P1 | P2)
1180 * Alternatively, the last few steps could be:
1181 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1182 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1183 * <= (C1 | C2 | (P1 & P2))
1184 * We favor the second approach if either P1 or P2 is non-empty. This is
1185 * because these components are a barrier to doing optimizations, as what
1186 * they match cannot be known until the moment of matching as they are
1187 * dependent on the current locale, 'AND"ing them likely will reduce or
1189 * But we can do better if we know that C1,P1 are in their initial state (a
1190 * frequent occurrence), each matching everything:
1191 * (<everything>) & (C2 | P2) = C2 | P2
1192 * Similarly, if C2,P2 are in their initial state (again a frequent
1193 * occurrence), the result is a no-op
1194 * (C1 | P1) & (<everything>) = C1 | P1
1197 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1198 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1199 * <= (C1 & ~C2) | (P1 & ~P2)
1202 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1203 && ! is_ANYOF_SYNTHETIC(and_with))
1207 ssc_intersection(ssc,
1209 FALSE /* Has already been inverted */
1212 /* If either P1 or P2 is empty, the intersection will be also; can skip
1214 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1215 ANYOF_POSIXL_ZERO(ssc);
1217 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1219 /* Note that the Posix class component P from 'and_with' actually
1221 * P = Pa | Pb | ... | Pn
1222 * where each component is one posix class, such as in [\w\s].
1224 * ~P = ~(Pa | Pb | ... | Pn)
1225 * = ~Pa & ~Pb & ... & ~Pn
1226 * <= ~Pa | ~Pb | ... | ~Pn
1227 * The last is something we can easily calculate, but unfortunately
1228 * is likely to have many false positives. We could do better
1229 * in some (but certainly not all) instances if two classes in
1230 * P have known relationships. For example
1231 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1233 * :lower: & :print: = :lower:
1234 * And similarly for classes that must be disjoint. For example,
1235 * since \s and \w can have no elements in common based on rules in
1236 * the POSIX standard,
1237 * \w & ^\S = nothing
1238 * Unfortunately, some vendor locales do not meet the Posix
1239 * standard, in particular almost everything by Microsoft.
1240 * The loop below just changes e.g., \w into \W and vice versa */
1242 regnode_charclass_posixl_fold temp;
1243 int add = 1; /* To calculate the index of the complement */
1245 ANYOF_POSIXL_ZERO(&temp);
1246 for (i = 0; i < ANYOF_MAX; i++) {
1248 || ! ANYOF_POSIXL_TEST(and_with, i)
1249 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1251 if (ANYOF_POSIXL_TEST(and_with, i)) {
1252 ANYOF_POSIXL_SET(&temp, i + add);
1254 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1256 ANYOF_POSIXL_AND(&temp, ssc);
1258 } /* else ssc already has no posixes */
1259 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1260 in its initial state */
1261 else if (! is_ANYOF_SYNTHETIC(and_with)
1262 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1264 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1265 * copy it over 'ssc' */
1266 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1267 if (is_ANYOF_SYNTHETIC(and_with)) {
1268 StructCopy(and_with, ssc, regnode_ssc);
1271 ssc->invlist = anded_cp_list;
1272 ANYOF_POSIXL_ZERO(ssc);
1273 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274 ANYOF_POSIXL_OR(and_with, ssc);
1278 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1279 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1281 /* One or the other of P1, P2 is non-empty. */
1282 ANYOF_POSIXL_AND(and_with, ssc);
1283 ssc_union(ssc, anded_cp_list, FALSE);
1285 else { /* P1 = P2 = empty */
1286 ssc_intersection(ssc, anded_cp_list, FALSE);
1292 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1293 const regnode_ssc *or_with)
1295 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1296 * another SSC or a regular ANYOF class. Can create false positives if
1297 * 'or_with' is to be inverted. */
1302 PERL_ARGS_ASSERT_SSC_OR;
1304 assert(is_ANYOF_SYNTHETIC(ssc));
1306 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1307 * the code point inversion list and just the relevant flags */
1308 if (is_ANYOF_SYNTHETIC(or_with)) {
1309 ored_cp_list = or_with->invlist;
1310 ored_flags = ANYOF_FLAGS(or_with);
1313 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1314 (regnode_charclass_posixl_fold*) or_with);
1315 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1318 ANYOF_FLAGS(ssc) |= ored_flags;
1320 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1321 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1322 * 'or_with' may be inverted. When not inverted, we have the simple
1323 * situation of computing:
1324 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1325 * If P1|P2 yields a situation with both a class and its complement are
1326 * set, like having both \w and \W, this matches all code points, and we
1327 * can delete these from the P component of the ssc going forward. XXX We
1328 * might be able to delete all the P components, but I (khw) am not certain
1329 * about this, and it is better to be safe.
1332 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1333 * <= (C1 | P1) | ~C2
1334 * <= (C1 | ~C2) | P1
1335 * (which results in actually simpler code than the non-inverted case)
1338 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1339 && ! is_ANYOF_SYNTHETIC(or_with))
1341 /* We ignore P2, leaving P1 going forward */
1343 else { /* Not inverted */
1344 ANYOF_POSIXL_OR(or_with, ssc);
1345 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1347 for (i = 0; i < ANYOF_MAX; i += 2) {
1348 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1350 ssc_match_all_cp(ssc);
1351 ANYOF_POSIXL_CLEAR(ssc, i);
1352 ANYOF_POSIXL_CLEAR(ssc, i+1);
1353 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1354 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1363 FALSE /* Already has been inverted */
1367 PERL_STATIC_INLINE void
1368 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1370 PERL_ARGS_ASSERT_SSC_UNION;
1372 assert(is_ANYOF_SYNTHETIC(ssc));
1374 _invlist_union_maybe_complement_2nd(ssc->invlist,
1380 PERL_STATIC_INLINE void
1381 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1383 const bool invert2nd)
1385 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1387 assert(is_ANYOF_SYNTHETIC(ssc));
1389 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1395 PERL_STATIC_INLINE void
1396 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1398 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1400 assert(is_ANYOF_SYNTHETIC(ssc));
1402 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1405 PERL_STATIC_INLINE void
1406 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1408 /* AND just the single code point 'cp' into the SSC 'ssc' */
1410 SV* cp_list = _new_invlist(2);
1412 PERL_ARGS_ASSERT_SSC_CP_AND;
1414 assert(is_ANYOF_SYNTHETIC(ssc));
1416 cp_list = add_cp_to_invlist(cp_list, cp);
1417 ssc_intersection(ssc, cp_list,
1418 FALSE /* Not inverted */
1420 SvREFCNT_dec_NN(cp_list);
1423 PERL_STATIC_INLINE void
1424 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1426 /* Set the SSC 'ssc' to not match any locale things */
1428 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1430 assert(is_ANYOF_SYNTHETIC(ssc));
1432 ANYOF_POSIXL_ZERO(ssc);
1433 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1437 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1439 /* The inversion list in the SSC is marked mortal; now we need a more
1440 * permanent copy, which is stored the same way that is done in a regular
1441 * ANYOF node, with the first 256 code points in a bit map */
1443 SV* invlist = invlist_clone(ssc->invlist);
1445 PERL_ARGS_ASSERT_SSC_FINALIZE;
1447 assert(is_ANYOF_SYNTHETIC(ssc));
1449 /* The code in this file assumes that all but these flags aren't relevant
1450 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1451 * time we reach here */
1452 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1454 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1456 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1458 /* The code points that could match under /li are already incorporated into
1459 * the inversion list and bit map */
1460 ANYOF_FLAGS(ssc) &= ~ANYOF_LOC_FOLD;
1462 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1465 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1466 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1467 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1468 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1469 ? (TRIE_LIST_CUR( idx ) - 1) \
1475 dump_trie(trie,widecharmap,revcharmap)
1476 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1477 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1479 These routines dump out a trie in a somewhat readable format.
1480 The _interim_ variants are used for debugging the interim
1481 tables that are used to generate the final compressed
1482 representation which is what dump_trie expects.
1484 Part of the reason for their existence is to provide a form
1485 of documentation as to how the different representations function.
1490 Dumps the final compressed table form of the trie to Perl_debug_log.
1491 Used for debugging make_trie().
1495 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1496 AV *revcharmap, U32 depth)
1499 SV *sv=sv_newmortal();
1500 int colwidth= widecharmap ? 6 : 4;
1502 GET_RE_DEBUG_FLAGS_DECL;
1504 PERL_ARGS_ASSERT_DUMP_TRIE;
1506 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1507 (int)depth * 2 + 2,"",
1508 "Match","Base","Ofs" );
1510 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1511 SV ** const tmp = av_fetch( revcharmap, state, 0);
1513 PerlIO_printf( Perl_debug_log, "%*s",
1515 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1516 PL_colors[0], PL_colors[1],
1517 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1518 PERL_PV_ESCAPE_FIRSTCHAR
1523 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1524 (int)depth * 2 + 2,"");
1526 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1527 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1528 PerlIO_printf( Perl_debug_log, "\n");
1530 for( state = 1 ; state < trie->statecount ; state++ ) {
1531 const U32 base = trie->states[ state ].trans.base;
1533 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1534 (int)depth * 2 + 2,"", (UV)state);
1536 if ( trie->states[ state ].wordnum ) {
1537 PerlIO_printf( Perl_debug_log, " W%4X",
1538 trie->states[ state ].wordnum );
1540 PerlIO_printf( Perl_debug_log, "%6s", "" );
1543 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1548 while( ( base + ofs < trie->uniquecharcount ) ||
1549 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1550 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1554 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1556 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1557 if ( ( base + ofs >= trie->uniquecharcount )
1558 && ( base + ofs - trie->uniquecharcount
1560 && trie->trans[ base + ofs
1561 - trie->uniquecharcount ].check == state )
1563 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1565 (UV)trie->trans[ base + ofs
1566 - trie->uniquecharcount ].next );
1568 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1572 PerlIO_printf( Perl_debug_log, "]");
1575 PerlIO_printf( Perl_debug_log, "\n" );
1577 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1579 for (word=1; word <= trie->wordcount; word++) {
1580 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1581 (int)word, (int)(trie->wordinfo[word].prev),
1582 (int)(trie->wordinfo[word].len));
1584 PerlIO_printf(Perl_debug_log, "\n" );
1587 Dumps a fully constructed but uncompressed trie in list form.
1588 List tries normally only are used for construction when the number of
1589 possible chars (trie->uniquecharcount) is very high.
1590 Used for debugging make_trie().
1593 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1594 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1598 SV *sv=sv_newmortal();
1599 int colwidth= widecharmap ? 6 : 4;
1600 GET_RE_DEBUG_FLAGS_DECL;
1602 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1604 /* print out the table precompression. */
1605 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1606 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1607 "------:-----+-----------------\n" );
1609 for( state=1 ; state < next_alloc ; state ++ ) {
1612 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1613 (int)depth * 2 + 2,"", (UV)state );
1614 if ( ! trie->states[ state ].wordnum ) {
1615 PerlIO_printf( Perl_debug_log, "%5s| ","");
1617 PerlIO_printf( Perl_debug_log, "W%4x| ",
1618 trie->states[ state ].wordnum
1621 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1622 SV ** const tmp = av_fetch( revcharmap,
1623 TRIE_LIST_ITEM(state,charid).forid, 0);
1625 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1627 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1629 PL_colors[0], PL_colors[1],
1630 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1631 | PERL_PV_ESCAPE_FIRSTCHAR
1633 TRIE_LIST_ITEM(state,charid).forid,
1634 (UV)TRIE_LIST_ITEM(state,charid).newstate
1637 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1638 (int)((depth * 2) + 14), "");
1641 PerlIO_printf( Perl_debug_log, "\n");
1646 Dumps a fully constructed but uncompressed trie in table form.
1647 This is the normal DFA style state transition table, with a few
1648 twists to facilitate compression later.
1649 Used for debugging make_trie().
1652 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1653 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1658 SV *sv=sv_newmortal();
1659 int colwidth= widecharmap ? 6 : 4;
1660 GET_RE_DEBUG_FLAGS_DECL;
1662 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1665 print out the table precompression so that we can do a visual check
1666 that they are identical.
1669 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1671 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1672 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1674 PerlIO_printf( Perl_debug_log, "%*s",
1676 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1677 PL_colors[0], PL_colors[1],
1678 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1679 PERL_PV_ESCAPE_FIRSTCHAR
1685 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1687 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1688 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1691 PerlIO_printf( Perl_debug_log, "\n" );
1693 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1695 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1696 (int)depth * 2 + 2,"",
1697 (UV)TRIE_NODENUM( state ) );
1699 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1700 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1702 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1704 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1706 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1707 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1708 (UV)trie->trans[ state ].check );
1710 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1711 (UV)trie->trans[ state ].check,
1712 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1720 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1721 startbranch: the first branch in the whole branch sequence
1722 first : start branch of sequence of branch-exact nodes.
1723 May be the same as startbranch
1724 last : Thing following the last branch.
1725 May be the same as tail.
1726 tail : item following the branch sequence
1727 count : words in the sequence
1728 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1729 depth : indent depth
1731 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1733 A trie is an N'ary tree where the branches are determined by digital
1734 decomposition of the key. IE, at the root node you look up the 1st character and
1735 follow that branch repeat until you find the end of the branches. Nodes can be
1736 marked as "accepting" meaning they represent a complete word. Eg:
1740 would convert into the following structure. Numbers represent states, letters
1741 following numbers represent valid transitions on the letter from that state, if
1742 the number is in square brackets it represents an accepting state, otherwise it
1743 will be in parenthesis.
1745 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1749 (1) +-i->(6)-+-s->[7]
1751 +-s->(3)-+-h->(4)-+-e->[5]
1753 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1755 This shows that when matching against the string 'hers' we will begin at state 1
1756 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1757 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1758 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1759 single traverse. We store a mapping from accepting to state to which word was
1760 matched, and then when we have multiple possibilities we try to complete the
1761 rest of the regex in the order in which they occured in the alternation.
1763 The only prior NFA like behaviour that would be changed by the TRIE support is
1764 the silent ignoring of duplicate alternations which are of the form:
1766 / (DUPE|DUPE) X? (?{ ... }) Y /x
1768 Thus EVAL blocks following a trie may be called a different number of times with
1769 and without the optimisation. With the optimisations dupes will be silently
1770 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1771 the following demonstrates:
1773 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1775 which prints out 'word' three times, but
1777 'words'=~/(word|word|word)(?{ print $1 })S/
1779 which doesnt print it out at all. This is due to other optimisations kicking in.
1781 Example of what happens on a structural level:
1783 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1785 1: CURLYM[1] {1,32767}(18)
1796 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1797 and should turn into:
1799 1: CURLYM[1] {1,32767}(18)
1801 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1809 Cases where tail != last would be like /(?foo|bar)baz/:
1819 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1820 and would end up looking like:
1823 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1830 d = uvchr_to_utf8_flags(d, uv, 0);
1832 is the recommended Unicode-aware way of saying
1837 #define TRIE_STORE_REVCHAR(val) \
1840 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1841 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1842 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1843 SvCUR_set(zlopp, kapow - flrbbbbb); \
1846 av_push(revcharmap, zlopp); \
1848 char ooooff = (char)val; \
1849 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1853 /* This gets the next character from the input, folding it if not already
1855 #define TRIE_READ_CHAR STMT_START { \
1858 /* if it is UTF then it is either already folded, or does not need \
1860 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1862 else if (folder == PL_fold_latin1) { \
1863 /* This folder implies Unicode rules, which in the range expressible \
1864 * by not UTF is the lower case, with the two exceptions, one of \
1865 * which should have been taken care of before calling this */ \
1866 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1867 uvc = toLOWER_L1(*uc); \
1868 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1871 /* raw data, will be folded later if needed */ \
1879 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1880 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1881 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1882 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1884 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1885 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1886 TRIE_LIST_CUR( state )++; \
1889 #define TRIE_LIST_NEW(state) STMT_START { \
1890 Newxz( trie->states[ state ].trans.list, \
1891 4, reg_trie_trans_le ); \
1892 TRIE_LIST_CUR( state ) = 1; \
1893 TRIE_LIST_LEN( state ) = 4; \
1896 #define TRIE_HANDLE_WORD(state) STMT_START { \
1897 U16 dupe= trie->states[ state ].wordnum; \
1898 regnode * const noper_next = regnext( noper ); \
1901 /* store the word for dumping */ \
1903 if (OP(noper) != NOTHING) \
1904 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1906 tmp = newSVpvn_utf8( "", 0, UTF ); \
1907 av_push( trie_words, tmp ); \
1911 trie->wordinfo[curword].prev = 0; \
1912 trie->wordinfo[curword].len = wordlen; \
1913 trie->wordinfo[curword].accept = state; \
1915 if ( noper_next < tail ) { \
1917 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1919 trie->jump[curword] = (U16)(noper_next - convert); \
1921 jumper = noper_next; \
1923 nextbranch= regnext(cur); \
1927 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1928 /* chain, so that when the bits of chain are later */\
1929 /* linked together, the dups appear in the chain */\
1930 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1931 trie->wordinfo[dupe].prev = curword; \
1933 /* we haven't inserted this word yet. */ \
1934 trie->states[ state ].wordnum = curword; \
1939 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1940 ( ( base + charid >= ucharcount \
1941 && base + charid < ubound \
1942 && state == trie->trans[ base - ucharcount + charid ].check \
1943 && trie->trans[ base - ucharcount + charid ].next ) \
1944 ? trie->trans[ base - ucharcount + charid ].next \
1945 : ( state==1 ? special : 0 ) \
1949 #define MADE_JUMP_TRIE 2
1950 #define MADE_EXACT_TRIE 4
1953 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1954 regnode *first, regnode *last, regnode *tail,
1955 U32 word_count, U32 flags, U32 depth)
1958 /* first pass, loop through and scan words */
1959 reg_trie_data *trie;
1960 HV *widecharmap = NULL;
1961 AV *revcharmap = newAV();
1967 regnode *jumper = NULL;
1968 regnode *nextbranch = NULL;
1969 regnode *convert = NULL;
1970 U32 *prev_states; /* temp array mapping each state to previous one */
1971 /* we just use folder as a flag in utf8 */
1972 const U8 * folder = NULL;
1975 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1976 AV *trie_words = NULL;
1977 /* along with revcharmap, this only used during construction but both are
1978 * useful during debugging so we store them in the struct when debugging.
1981 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1982 STRLEN trie_charcount=0;
1984 SV *re_trie_maxbuff;
1985 GET_RE_DEBUG_FLAGS_DECL;
1987 PERL_ARGS_ASSERT_MAKE_TRIE;
1989 PERL_UNUSED_ARG(depth);
1996 case EXACTFU: folder = PL_fold_latin1; break;
1997 case EXACTF: folder = PL_fold; break;
1998 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2001 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2003 trie->startstate = 1;
2004 trie->wordcount = word_count;
2005 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2006 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2008 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2009 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2010 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2013 trie_words = newAV();
2016 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2017 if (!SvIOK(re_trie_maxbuff)) {
2018 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2020 DEBUG_TRIE_COMPILE_r({
2021 PerlIO_printf( Perl_debug_log,
2022 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2023 (int)depth * 2 + 2, "",
2024 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2025 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2028 /* Find the node we are going to overwrite */
2029 if ( first == startbranch && OP( last ) != BRANCH ) {
2030 /* whole branch chain */
2033 /* branch sub-chain */
2034 convert = NEXTOPER( first );
2037 /* -- First loop and Setup --
2039 We first traverse the branches and scan each word to determine if it
2040 contains widechars, and how many unique chars there are, this is
2041 important as we have to build a table with at least as many columns as we
2044 We use an array of integers to represent the character codes 0..255
2045 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2046 the native representation of the character value as the key and IV's for
2049 *TODO* If we keep track of how many times each character is used we can
2050 remap the columns so that the table compression later on is more
2051 efficient in terms of memory by ensuring the most common value is in the
2052 middle and the least common are on the outside. IMO this would be better
2053 than a most to least common mapping as theres a decent chance the most
2054 common letter will share a node with the least common, meaning the node
2055 will not be compressible. With a middle is most common approach the worst
2056 case is when we have the least common nodes twice.
2060 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2061 regnode *noper = NEXTOPER( cur );
2062 const U8 *uc = (U8*)STRING( noper );
2063 const U8 *e = uc + STR_LEN( noper );
2065 U32 wordlen = 0; /* required init */
2066 STRLEN minbytes = 0;
2067 STRLEN maxbytes = 0;
2068 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2071 if (OP(noper) == NOTHING) {
2072 regnode *noper_next= regnext(noper);
2073 if (noper_next != tail && OP(noper_next) == flags) {
2075 uc= (U8*)STRING(noper);
2076 e= uc + STR_LEN(noper);
2077 trie->minlen= STR_LEN(noper);
2084 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2085 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2086 regardless of encoding */
2087 if (OP( noper ) == EXACTFU_SS) {
2088 /* false positives are ok, so just set this */
2089 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2092 for ( ; uc < e ; uc += len ) {
2093 TRIE_CHARCOUNT(trie)++;
2096 /* Acummulate to the current values, the range in the number of
2097 * bytes that this character could match. The max is presumed to
2098 * be the same as the folded input (which TRIE_READ_CHAR returns),
2099 * except that when this is not in UTF-8, it could be matched
2100 * against a string which is UTF-8, and the variant characters
2101 * could be 2 bytes instead of the 1 here. Likewise, for the
2102 * minimum number of bytes when not folded. When folding, the min
2103 * is assumed to be 1 byte could fold to match the single character
2104 * here, or in the case of a multi-char fold, 1 byte can fold to
2105 * the whole sequence. 'foldlen' is used to denote whether we are
2106 * in such a sequence, skipping the min setting if so. XXX TODO
2107 * Use the exact list of what folds to each character, from
2108 * PL_utf8_foldclosures */
2110 maxbytes += UTF8SKIP(uc);
2112 /* A non-UTF-8 string could be 1 byte to match our 2 */
2113 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2119 foldlen -= UTF8SKIP(uc);
2122 foldlen = is_MULTI_CHAR_FOLD_utf8(uc);
2128 maxbytes += (UNI_IS_INVARIANT(*uc))
2139 foldlen = is_MULTI_CHAR_FOLD_latin1(uc);
2146 U8 folded= folder[ (U8) uvc ];
2147 if ( !trie->charmap[ folded ] ) {
2148 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2149 TRIE_STORE_REVCHAR( folded );
2152 if ( !trie->charmap[ uvc ] ) {
2153 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2154 TRIE_STORE_REVCHAR( uvc );
2157 /* store the codepoint in the bitmap, and its folded
2159 TRIE_BITMAP_SET(trie, uvc);
2161 /* store the folded codepoint */
2162 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2165 /* store first byte of utf8 representation of
2166 variant codepoints */
2167 if (! UVCHR_IS_INVARIANT(uvc)) {
2168 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2171 set_bit = 0; /* We've done our bit :-) */
2176 widecharmap = newHV();
2178 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2181 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2183 if ( !SvTRUE( *svpp ) ) {
2184 sv_setiv( *svpp, ++trie->uniquecharcount );
2185 TRIE_STORE_REVCHAR(uvc);
2189 if( cur == first ) {
2190 trie->minlen = minbytes;
2191 trie->maxlen = maxbytes;
2192 } else if (minbytes < trie->minlen) {
2193 trie->minlen = minbytes;
2194 } else if (maxbytes > trie->maxlen) {
2195 trie->maxlen = maxbytes;
2197 } /* end first pass */
2198 DEBUG_TRIE_COMPILE_r(
2199 PerlIO_printf( Perl_debug_log,
2200 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2201 (int)depth * 2 + 2,"",
2202 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2203 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2204 (int)trie->minlen, (int)trie->maxlen )
2208 We now know what we are dealing with in terms of unique chars and
2209 string sizes so we can calculate how much memory a naive
2210 representation using a flat table will take. If it's over a reasonable
2211 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2212 conservative but potentially much slower representation using an array
2215 At the end we convert both representations into the same compressed
2216 form that will be used in regexec.c for matching with. The latter
2217 is a form that cannot be used to construct with but has memory
2218 properties similar to the list form and access properties similar
2219 to the table form making it both suitable for fast searches and
2220 small enough that its feasable to store for the duration of a program.
2222 See the comment in the code where the compressed table is produced
2223 inplace from the flat tabe representation for an explanation of how
2224 the compression works.
2229 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2232 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2233 > SvIV(re_trie_maxbuff) )
2236 Second Pass -- Array Of Lists Representation
2238 Each state will be represented by a list of charid:state records
2239 (reg_trie_trans_le) the first such element holds the CUR and LEN
2240 points of the allocated array. (See defines above).
2242 We build the initial structure using the lists, and then convert
2243 it into the compressed table form which allows faster lookups
2244 (but cant be modified once converted).
2247 STRLEN transcount = 1;
2249 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2250 "%*sCompiling trie using list compiler\n",
2251 (int)depth * 2 + 2, ""));
2253 trie->states = (reg_trie_state *)
2254 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2255 sizeof(reg_trie_state) );
2259 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2261 regnode *noper = NEXTOPER( cur );
2262 U8 *uc = (U8*)STRING( noper );
2263 const U8 *e = uc + STR_LEN( noper );
2264 U32 state = 1; /* required init */
2265 U16 charid = 0; /* sanity init */
2266 U32 wordlen = 0; /* required init */
2268 if (OP(noper) == NOTHING) {
2269 regnode *noper_next= regnext(noper);
2270 if (noper_next != tail && OP(noper_next) == flags) {
2272 uc= (U8*)STRING(noper);
2273 e= uc + STR_LEN(noper);
2277 if (OP(noper) != NOTHING) {
2278 for ( ; uc < e ; uc += len ) {
2283 charid = trie->charmap[ uvc ];
2285 SV** const svpp = hv_fetch( widecharmap,
2292 charid=(U16)SvIV( *svpp );
2295 /* charid is now 0 if we dont know the char read, or
2296 * nonzero if we do */
2303 if ( !trie->states[ state ].trans.list ) {
2304 TRIE_LIST_NEW( state );
2307 check <= TRIE_LIST_USED( state );
2310 if ( TRIE_LIST_ITEM( state, check ).forid
2313 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2318 newstate = next_alloc++;
2319 prev_states[newstate] = state;
2320 TRIE_LIST_PUSH( state, charid, newstate );
2325 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2329 TRIE_HANDLE_WORD(state);
2331 } /* end second pass */
2333 /* next alloc is the NEXT state to be allocated */
2334 trie->statecount = next_alloc;
2335 trie->states = (reg_trie_state *)
2336 PerlMemShared_realloc( trie->states,
2338 * sizeof(reg_trie_state) );
2340 /* and now dump it out before we compress it */
2341 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2342 revcharmap, next_alloc,
2346 trie->trans = (reg_trie_trans *)
2347 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2354 for( state=1 ; state < next_alloc ; state ++ ) {
2358 DEBUG_TRIE_COMPILE_MORE_r(
2359 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2363 if (trie->states[state].trans.list) {
2364 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2368 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2369 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2370 if ( forid < minid ) {
2372 } else if ( forid > maxid ) {
2376 if ( transcount < tp + maxid - minid + 1) {
2378 trie->trans = (reg_trie_trans *)
2379 PerlMemShared_realloc( trie->trans,
2381 * sizeof(reg_trie_trans) );
2382 Zero( trie->trans + (transcount / 2),
2386 base = trie->uniquecharcount + tp - minid;
2387 if ( maxid == minid ) {
2389 for ( ; zp < tp ; zp++ ) {
2390 if ( ! trie->trans[ zp ].next ) {
2391 base = trie->uniquecharcount + zp - minid;
2392 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2394 trie->trans[ zp ].check = state;
2400 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2402 trie->trans[ tp ].check = state;
2407 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2408 const U32 tid = base
2409 - trie->uniquecharcount
2410 + TRIE_LIST_ITEM( state, idx ).forid;
2411 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2413 trie->trans[ tid ].check = state;
2415 tp += ( maxid - minid + 1 );
2417 Safefree(trie->states[ state ].trans.list);
2420 DEBUG_TRIE_COMPILE_MORE_r(
2421 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2424 trie->states[ state ].trans.base=base;
2426 trie->lasttrans = tp + 1;
2430 Second Pass -- Flat Table Representation.
2432 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2433 each. We know that we will need Charcount+1 trans at most to store
2434 the data (one row per char at worst case) So we preallocate both
2435 structures assuming worst case.
2437 We then construct the trie using only the .next slots of the entry
2440 We use the .check field of the first entry of the node temporarily
2441 to make compression both faster and easier by keeping track of how
2442 many non zero fields are in the node.
2444 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2447 There are two terms at use here: state as a TRIE_NODEIDX() which is
2448 a number representing the first entry of the node, and state as a
2449 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2450 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2451 if there are 2 entrys per node. eg:
2459 The table is internally in the right hand, idx form. However as we
2460 also have to deal with the states array which is indexed by nodenum
2461 we have to use TRIE_NODENUM() to convert.
2464 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2465 "%*sCompiling trie using table compiler\n",
2466 (int)depth * 2 + 2, ""));
2468 trie->trans = (reg_trie_trans *)
2469 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2470 * trie->uniquecharcount + 1,
2471 sizeof(reg_trie_trans) );
2472 trie->states = (reg_trie_state *)
2473 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2474 sizeof(reg_trie_state) );
2475 next_alloc = trie->uniquecharcount + 1;
2478 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2480 regnode *noper = NEXTOPER( cur );
2481 const U8 *uc = (U8*)STRING( noper );
2482 const U8 *e = uc + STR_LEN( noper );
2484 U32 state = 1; /* required init */
2486 U16 charid = 0; /* sanity init */
2487 U32 accept_state = 0; /* sanity init */
2489 U32 wordlen = 0; /* required init */
2491 if (OP(noper) == NOTHING) {
2492 regnode *noper_next= regnext(noper);
2493 if (noper_next != tail && OP(noper_next) == flags) {
2495 uc= (U8*)STRING(noper);
2496 e= uc + STR_LEN(noper);
2500 if ( OP(noper) != NOTHING ) {
2501 for ( ; uc < e ; uc += len ) {
2506 charid = trie->charmap[ uvc ];
2508 SV* const * const svpp = hv_fetch( widecharmap,
2512 charid = svpp ? (U16)SvIV(*svpp) : 0;
2516 if ( !trie->trans[ state + charid ].next ) {
2517 trie->trans[ state + charid ].next = next_alloc;
2518 trie->trans[ state ].check++;
2519 prev_states[TRIE_NODENUM(next_alloc)]
2520 = TRIE_NODENUM(state);
2521 next_alloc += trie->uniquecharcount;
2523 state = trie->trans[ state + charid ].next;
2525 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2527 /* charid is now 0 if we dont know the char read, or
2528 * nonzero if we do */
2531 accept_state = TRIE_NODENUM( state );
2532 TRIE_HANDLE_WORD(accept_state);
2534 } /* end second pass */
2536 /* and now dump it out before we compress it */
2537 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2539 next_alloc, depth+1));
2543 * Inplace compress the table.*
2545 For sparse data sets the table constructed by the trie algorithm will
2546 be mostly 0/FAIL transitions or to put it another way mostly empty.
2547 (Note that leaf nodes will not contain any transitions.)
2549 This algorithm compresses the tables by eliminating most such
2550 transitions, at the cost of a modest bit of extra work during lookup:
2552 - Each states[] entry contains a .base field which indicates the
2553 index in the state[] array wheres its transition data is stored.
2555 - If .base is 0 there are no valid transitions from that node.
2557 - If .base is nonzero then charid is added to it to find an entry in
2560 -If trans[states[state].base+charid].check!=state then the
2561 transition is taken to be a 0/Fail transition. Thus if there are fail
2562 transitions at the front of the node then the .base offset will point
2563 somewhere inside the previous nodes data (or maybe even into a node
2564 even earlier), but the .check field determines if the transition is
2568 The following process inplace converts the table to the compressed
2569 table: We first do not compress the root node 1,and mark all its
2570 .check pointers as 1 and set its .base pointer as 1 as well. This
2571 allows us to do a DFA construction from the compressed table later,
2572 and ensures that any .base pointers we calculate later are greater
2575 - We set 'pos' to indicate the first entry of the second node.
2577 - We then iterate over the columns of the node, finding the first and
2578 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2579 and set the .check pointers accordingly, and advance pos
2580 appropriately and repreat for the next node. Note that when we copy
2581 the next pointers we have to convert them from the original
2582 NODEIDX form to NODENUM form as the former is not valid post
2585 - If a node has no transitions used we mark its base as 0 and do not
2586 advance the pos pointer.
2588 - If a node only has one transition we use a second pointer into the
2589 structure to fill in allocated fail transitions from other states.
2590 This pointer is independent of the main pointer and scans forward
2591 looking for null transitions that are allocated to a state. When it
2592 finds one it writes the single transition into the "hole". If the
2593 pointer doesnt find one the single transition is appended as normal.
2595 - Once compressed we can Renew/realloc the structures to release the
2598 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2599 specifically Fig 3.47 and the associated pseudocode.
2603 const U32 laststate = TRIE_NODENUM( next_alloc );
2606 trie->statecount = laststate;
2608 for ( state = 1 ; state < laststate ; state++ ) {
2610 const U32 stateidx = TRIE_NODEIDX( state );
2611 const U32 o_used = trie->trans[ stateidx ].check;
2612 U32 used = trie->trans[ stateidx ].check;
2613 trie->trans[ stateidx ].check = 0;
2616 used && charid < trie->uniquecharcount;
2619 if ( flag || trie->trans[ stateidx + charid ].next ) {
2620 if ( trie->trans[ stateidx + charid ].next ) {
2622 for ( ; zp < pos ; zp++ ) {
2623 if ( ! trie->trans[ zp ].next ) {
2627 trie->states[ state ].trans.base
2629 + trie->uniquecharcount
2631 trie->trans[ zp ].next
2632 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2634 trie->trans[ zp ].check = state;
2635 if ( ++zp > pos ) pos = zp;
2642 trie->states[ state ].trans.base
2643 = pos + trie->uniquecharcount - charid ;
2645 trie->trans[ pos ].next
2646 = SAFE_TRIE_NODENUM(
2647 trie->trans[ stateidx + charid ].next );
2648 trie->trans[ pos ].check = state;
2653 trie->lasttrans = pos + 1;
2654 trie->states = (reg_trie_state *)
2655 PerlMemShared_realloc( trie->states, laststate
2656 * sizeof(reg_trie_state) );
2657 DEBUG_TRIE_COMPILE_MORE_r(
2658 PerlIO_printf( Perl_debug_log,
2659 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2660 (int)depth * 2 + 2,"",
2661 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2665 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2668 } /* end table compress */
2670 DEBUG_TRIE_COMPILE_MORE_r(
2671 PerlIO_printf(Perl_debug_log,
2672 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2673 (int)depth * 2 + 2, "",
2674 (UV)trie->statecount,
2675 (UV)trie->lasttrans)
2677 /* resize the trans array to remove unused space */
2678 trie->trans = (reg_trie_trans *)
2679 PerlMemShared_realloc( trie->trans, trie->lasttrans
2680 * sizeof(reg_trie_trans) );
2682 { /* Modify the program and insert the new TRIE node */
2683 U8 nodetype =(U8)(flags & 0xFF);
2687 regnode *optimize = NULL;
2688 #ifdef RE_TRACK_PATTERN_OFFSETS
2691 U32 mjd_nodelen = 0;
2692 #endif /* RE_TRACK_PATTERN_OFFSETS */
2693 #endif /* DEBUGGING */
2695 This means we convert either the first branch or the first Exact,
2696 depending on whether the thing following (in 'last') is a branch
2697 or not and whther first is the startbranch (ie is it a sub part of
2698 the alternation or is it the whole thing.)
2699 Assuming its a sub part we convert the EXACT otherwise we convert
2700 the whole branch sequence, including the first.
2702 /* Find the node we are going to overwrite */
2703 if ( first != startbranch || OP( last ) == BRANCH ) {
2704 /* branch sub-chain */
2705 NEXT_OFF( first ) = (U16)(last - first);
2706 #ifdef RE_TRACK_PATTERN_OFFSETS
2708 mjd_offset= Node_Offset((convert));
2709 mjd_nodelen= Node_Length((convert));
2712 /* whole branch chain */
2714 #ifdef RE_TRACK_PATTERN_OFFSETS
2717 const regnode *nop = NEXTOPER( convert );
2718 mjd_offset= Node_Offset((nop));
2719 mjd_nodelen= Node_Length((nop));
2723 PerlIO_printf(Perl_debug_log,
2724 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2725 (int)depth * 2 + 2, "",
2726 (UV)mjd_offset, (UV)mjd_nodelen)
2729 /* But first we check to see if there is a common prefix we can
2730 split out as an EXACT and put in front of the TRIE node. */
2731 trie->startstate= 1;
2732 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2734 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2738 const U32 base = trie->states[ state ].trans.base;
2740 if ( trie->states[state].wordnum )
2743 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2744 if ( ( base + ofs >= trie->uniquecharcount ) &&
2745 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2746 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2748 if ( ++count > 1 ) {
2749 SV **tmp = av_fetch( revcharmap, ofs, 0);
2750 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2751 if ( state == 1 ) break;
2753 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2755 PerlIO_printf(Perl_debug_log,
2756 "%*sNew Start State=%"UVuf" Class: [",
2757 (int)depth * 2 + 2, "",
2760 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2761 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2763 TRIE_BITMAP_SET(trie,*ch);
2765 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2767 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2771 TRIE_BITMAP_SET(trie,*ch);
2773 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2774 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2780 SV **tmp = av_fetch( revcharmap, idx, 0);
2782 char *ch = SvPV( *tmp, len );
2784 SV *sv=sv_newmortal();
2785 PerlIO_printf( Perl_debug_log,
2786 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2787 (int)depth * 2 + 2, "",
2789 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2790 PL_colors[0], PL_colors[1],
2791 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2792 PERL_PV_ESCAPE_FIRSTCHAR
2797 OP( convert ) = nodetype;
2798 str=STRING(convert);
2801 STR_LEN(convert) += len;
2807 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2812 trie->prefixlen = (state-1);
2814 regnode *n = convert+NODE_SZ_STR(convert);
2815 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2816 trie->startstate = state;
2817 trie->minlen -= (state - 1);
2818 trie->maxlen -= (state - 1);
2820 /* At least the UNICOS C compiler choked on this
2821 * being argument to DEBUG_r(), so let's just have
2824 #ifdef PERL_EXT_RE_BUILD
2830 regnode *fix = convert;
2831 U32 word = trie->wordcount;
2833 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2834 while( ++fix < n ) {
2835 Set_Node_Offset_Length(fix, 0, 0);
2838 SV ** const tmp = av_fetch( trie_words, word, 0 );
2840 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2841 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2843 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2851 NEXT_OFF(convert) = (U16)(tail - convert);
2852 DEBUG_r(optimize= n);
2858 if ( trie->maxlen ) {
2859 NEXT_OFF( convert ) = (U16)(tail - convert);
2860 ARG_SET( convert, data_slot );
2861 /* Store the offset to the first unabsorbed branch in
2862 jump[0], which is otherwise unused by the jump logic.
2863 We use this when dumping a trie and during optimisation. */
2865 trie->jump[0] = (U16)(nextbranch - convert);
2867 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2868 * and there is a bitmap
2869 * and the first "jump target" node we found leaves enough room
2870 * then convert the TRIE node into a TRIEC node, with the bitmap
2871 * embedded inline in the opcode - this is hypothetically faster.
2873 if ( !trie->states[trie->startstate].wordnum
2875 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2877 OP( convert ) = TRIEC;
2878 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2879 PerlMemShared_free(trie->bitmap);
2882 OP( convert ) = TRIE;
2884 /* store the type in the flags */
2885 convert->flags = nodetype;
2889 + regarglen[ OP( convert ) ];
2891 /* XXX We really should free up the resource in trie now,
2892 as we won't use them - (which resources?) dmq */
2894 /* needed for dumping*/
2895 DEBUG_r(if (optimize) {
2896 regnode *opt = convert;
2898 while ( ++opt < optimize) {
2899 Set_Node_Offset_Length(opt,0,0);
2902 Try to clean up some of the debris left after the
2905 while( optimize < jumper ) {
2906 mjd_nodelen += Node_Length((optimize));
2907 OP( optimize ) = OPTIMIZED;
2908 Set_Node_Offset_Length(optimize,0,0);
2911 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2913 } /* end node insert */
2915 /* Finish populating the prev field of the wordinfo array. Walk back
2916 * from each accept state until we find another accept state, and if
2917 * so, point the first word's .prev field at the second word. If the
2918 * second already has a .prev field set, stop now. This will be the
2919 * case either if we've already processed that word's accept state,
2920 * or that state had multiple words, and the overspill words were
2921 * already linked up earlier.
2928 for (word=1; word <= trie->wordcount; word++) {
2930 if (trie->wordinfo[word].prev)
2932 state = trie->wordinfo[word].accept;
2934 state = prev_states[state];
2937 prev = trie->states[state].wordnum;
2941 trie->wordinfo[word].prev = prev;
2943 Safefree(prev_states);
2947 /* and now dump out the compressed format */
2948 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2950 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2952 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2953 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2955 SvREFCNT_dec_NN(revcharmap);
2959 : trie->startstate>1
2965 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2967 /* The Trie is constructed and compressed now so we can build a fail array if
2970 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2972 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2976 We find the fail state for each state in the trie, this state is the longest
2977 proper suffix of the current state's 'word' that is also a proper prefix of
2978 another word in our trie. State 1 represents the word '' and is thus the
2979 default fail state. This allows the DFA not to have to restart after its
2980 tried and failed a word at a given point, it simply continues as though it
2981 had been matching the other word in the first place.
2983 'abcdgu'=~/abcdefg|cdgu/
2984 When we get to 'd' we are still matching the first word, we would encounter
2985 'g' which would fail, which would bring us to the state representing 'd' in
2986 the second word where we would try 'g' and succeed, proceeding to match
2989 /* add a fail transition */
2990 const U32 trie_offset = ARG(source);
2991 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2993 const U32 ucharcount = trie->uniquecharcount;
2994 const U32 numstates = trie->statecount;
2995 const U32 ubound = trie->lasttrans + ucharcount;
2999 U32 base = trie->states[ 1 ].trans.base;
3002 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3003 GET_RE_DEBUG_FLAGS_DECL;
3005 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3007 PERL_UNUSED_ARG(depth);
3011 ARG_SET( stclass, data_slot );
3012 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3013 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3014 aho->trie=trie_offset;
3015 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3016 Copy( trie->states, aho->states, numstates, reg_trie_state );
3017 Newxz( q, numstates, U32);
3018 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3021 /* initialize fail[0..1] to be 1 so that we always have
3022 a valid final fail state */
3023 fail[ 0 ] = fail[ 1 ] = 1;
3025 for ( charid = 0; charid < ucharcount ; charid++ ) {
3026 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3028 q[ q_write ] = newstate;
3029 /* set to point at the root */
3030 fail[ q[ q_write++ ] ]=1;
3033 while ( q_read < q_write) {
3034 const U32 cur = q[ q_read++ % numstates ];
3035 base = trie->states[ cur ].trans.base;
3037 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3038 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3040 U32 fail_state = cur;
3043 fail_state = fail[ fail_state ];
3044 fail_base = aho->states[ fail_state ].trans.base;
3045 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3047 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3048 fail[ ch_state ] = fail_state;
3049 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3051 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3053 q[ q_write++ % numstates] = ch_state;
3057 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3058 when we fail in state 1, this allows us to use the
3059 charclass scan to find a valid start char. This is based on the principle
3060 that theres a good chance the string being searched contains lots of stuff
3061 that cant be a start char.
3063 fail[ 0 ] = fail[ 1 ] = 0;
3064 DEBUG_TRIE_COMPILE_r({
3065 PerlIO_printf(Perl_debug_log,
3066 "%*sStclass Failtable (%"UVuf" states): 0",
3067 (int)(depth * 2), "", (UV)numstates
3069 for( q_read=1; q_read<numstates; q_read++ ) {
3070 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3072 PerlIO_printf(Perl_debug_log, "\n");
3075 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3079 #define DEBUG_PEEP(str,scan,depth) \
3080 DEBUG_OPTIMISE_r({if (scan){ \
3081 SV * const mysv=sv_newmortal(); \
3082 regnode *Next = regnext(scan); \
3083 regprop(RExC_rx, mysv, scan); \
3084 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3085 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3086 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3090 /* The below joins as many adjacent EXACTish nodes as possible into a single
3091 * one. The regop may be changed if the node(s) contain certain sequences that
3092 * require special handling. The joining is only done if:
3093 * 1) there is room in the current conglomerated node to entirely contain the
3095 * 2) they are the exact same node type
3097 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3098 * these get optimized out
3100 * If a node is to match under /i (folded), the number of characters it matches
3101 * can be different than its character length if it contains a multi-character
3102 * fold. *min_subtract is set to the total delta number of characters of the
3105 * And *unfolded_multi_char is set to indicate whether or not the node contains
3106 * an unfolded multi-char fold. This happens when whether the fold is valid or
3107 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3108 * SMALL LETTER SHARP S, as only if the target string being matched against
3109 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3110 * folding rules depend on the locale in force at runtime. (Multi-char folds
3111 * whose components are all above the Latin1 range are not run-time locale
3112 * dependent, and have already been folded by the time this function is
3115 * This is as good a place as any to discuss the design of handling these
3116 * multi-character fold sequences. It's been wrong in Perl for a very long
3117 * time. There are three code points in Unicode whose multi-character folds
3118 * were long ago discovered to mess things up. The previous designs for
3119 * dealing with these involved assigning a special node for them. This
3120 * approach doesn't always work, as evidenced by this example:
3121 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3122 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3123 * would match just the \xDF, it won't be able to handle the case where a
3124 * successful match would have to cross the node's boundary. The new approach
3125 * that hopefully generally solves the problem generates an EXACTFU_SS node
3126 * that is "sss" in this case.
3128 * It turns out that there are problems with all multi-character folds, and not
3129 * just these three. Now the code is general, for all such cases. The
3130 * approach taken is:
3131 * 1) This routine examines each EXACTFish node that could contain multi-
3132 * character folded sequences. Since a single character can fold into
3133 * such a sequence, the minimum match length for this node is less than
3134 * the number of characters in the node. This routine returns in
3135 * *min_subtract how many characters to subtract from the the actual
3136 * length of the string to get a real minimum match length; it is 0 if
3137 * there are no multi-char foldeds. This delta is used by the caller to
3138 * adjust the min length of the match, and the delta between min and max,
3139 * so that the optimizer doesn't reject these possibilities based on size
3141 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3142 * is used for an EXACTFU node that contains at least one "ss" sequence in
3143 * it. For non-UTF-8 patterns and strings, this is the only case where
3144 * there is a possible fold length change. That means that a regular
3145 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3146 * with length changes, and so can be processed faster. regexec.c takes
3147 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3148 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3149 * known until runtime). This saves effort in regex matching. However,
3150 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3151 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3152 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3153 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3154 * possibilities for the non-UTF8 patterns are quite simple, except for
3155 * the sharp s. All the ones that don't involve a UTF-8 target string are
3156 * members of a fold-pair, and arrays are set up for all of them so that
3157 * the other member of the pair can be found quickly. Code elsewhere in
3158 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3159 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3160 * described in the next item.
3161 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3162 * validity of the fold won't be known until runtime, and so must remain
3163 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3164 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3165 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3166 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3167 * The reason this is a problem is that the optimizer part of regexec.c
3168 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3169 * that a character in the pattern corresponds to at most a single
3170 * character in the target string. (And I do mean character, and not byte
3171 * here, unlike other parts of the documentation that have never been
3172 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3173 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3174 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3175 * nodes, violate the assumption, and they are the only instances where it
3176 * is violated. I'm reluctant to try to change the assumption, as the
3177 * code involved is impenetrable to me (khw), so instead the code here
3178 * punts. This routine examines EXACTFL nodes, and (when the pattern
3179 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3180 * boolean indicating whether or not the node contains such a fold. When
3181 * it is true, the caller sets a flag that later causes the optimizer in
3182 * this file to not set values for the floating and fixed string lengths,
3183 * and thus avoids the optimizer code in regexec.c that makes the invalid
3184 * assumption. Thus, there is no optimization based on string lengths for
3185 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3186 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3187 * assumption is wrong only in these cases is that all other non-UTF-8
3188 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3189 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3190 * EXACTF nodes because we don't know at compile time if it actually
3191 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3192 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3193 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3194 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3195 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3196 * string would require the pattern to be forced into UTF-8, the overhead
3197 * of which we want to avoid. Similarly the unfolded multi-char folds in
3198 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3201 * Similarly, the code that generates tries doesn't currently handle
3202 * not-already-folded multi-char folds, and it looks like a pain to change
3203 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3204 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3205 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3206 * using /iaa matching will be doing so almost entirely with ASCII
3207 * strings, so this should rarely be encountered in practice */
3209 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3210 if (PL_regkind[OP(scan)] == EXACT) \
3211 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3214 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3215 UV *min_subtract, bool *unfolded_multi_char,
3216 U32 flags,regnode *val, U32 depth)
3218 /* Merge several consecutive EXACTish nodes into one. */
3219 regnode *n = regnext(scan);
3221 regnode *next = scan + NODE_SZ_STR(scan);
3225 regnode *stop = scan;
3226 GET_RE_DEBUG_FLAGS_DECL;
3228 PERL_UNUSED_ARG(depth);
3231 PERL_ARGS_ASSERT_JOIN_EXACT;
3232 #ifndef EXPERIMENTAL_INPLACESCAN
3233 PERL_UNUSED_ARG(flags);
3234 PERL_UNUSED_ARG(val);
3236 DEBUG_PEEP("join",scan,depth);
3238 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3239 * EXACT ones that are mergeable to the current one. */
3241 && (PL_regkind[OP(n)] == NOTHING
3242 || (stringok && OP(n) == OP(scan)))
3244 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3247 if (OP(n) == TAIL || n > next)
3249 if (PL_regkind[OP(n)] == NOTHING) {
3250 DEBUG_PEEP("skip:",n,depth);
3251 NEXT_OFF(scan) += NEXT_OFF(n);
3252 next = n + NODE_STEP_REGNODE;
3259 else if (stringok) {
3260 const unsigned int oldl = STR_LEN(scan);
3261 regnode * const nnext = regnext(n);
3263 /* XXX I (khw) kind of doubt that this works on platforms (should
3264 * Perl ever run on one) where U8_MAX is above 255 because of lots
3265 * of other assumptions */
3266 /* Don't join if the sum can't fit into a single node */
3267 if (oldl + STR_LEN(n) > U8_MAX)
3270 DEBUG_PEEP("merg",n,depth);
3273 NEXT_OFF(scan) += NEXT_OFF(n);
3274 STR_LEN(scan) += STR_LEN(n);
3275 next = n + NODE_SZ_STR(n);
3276 /* Now we can overwrite *n : */
3277 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3285 #ifdef EXPERIMENTAL_INPLACESCAN
3286 if (flags && !NEXT_OFF(n)) {
3287 DEBUG_PEEP("atch", val, depth);
3288 if (reg_off_by_arg[OP(n)]) {
3289 ARG_SET(n, val - n);
3292 NEXT_OFF(n) = val - n;
3300 *unfolded_multi_char = FALSE;
3302 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3303 * can now analyze for sequences of problematic code points. (Prior to
3304 * this final joining, sequences could have been split over boundaries, and
3305 * hence missed). The sequences only happen in folding, hence for any
3306 * non-EXACT EXACTish node */
3307 if (OP(scan) != EXACT) {
3308 U8* s0 = (U8*) STRING(scan);
3310 U8* s_end = s0 + STR_LEN(scan);
3312 int total_count_delta = 0; /* Total delta number of characters that
3313 multi-char folds expand to */
3315 /* One pass is made over the node's string looking for all the
3316 * possibilities. To avoid some tests in the loop, there are two main
3317 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3322 if (OP(scan) == EXACTFL) {
3325 /* An EXACTFL node would already have been changed to another
3326 * node type unless there is at least one character in it that
3327 * is problematic; likely a character whose fold definition
3328 * won't be known until runtime, and so has yet to be folded.
3329 * For all but the UTF-8 locale, folds are 1-1 in length, but
3330 * to handle the UTF-8 case, we need to create a temporary
3331 * folded copy using UTF-8 locale rules in order to analyze it.
3332 * This is because our macros that look to see if a sequence is
3333 * a multi-char fold assume everything is folded (otherwise the
3334 * tests in those macros would be too complicated and slow).
3335 * Note that here, the non-problematic folds will have already
3336 * been done, so we can just copy such characters. We actually
3337 * don't completely fold the EXACTFL string. We skip the
3338 * unfolded multi-char folds, as that would just create work
3339 * below to figure out the size they already are */
3341 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3344 STRLEN s_len = UTF8SKIP(s);
3345 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3346 Copy(s, d, s_len, U8);
3349 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3350 *unfolded_multi_char = TRUE;
3351 Copy(s, d, s_len, U8);
3354 else if (isASCII(*s)) {
3355 *(d++) = toFOLD(*s);
3359 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3365 /* Point the remainder of the routine to look at our temporary
3369 } /* End of creating folded copy of EXACTFL string */
3371 /* Examine the string for a multi-character fold sequence. UTF-8
3372 * patterns have all characters pre-folded by the time this code is
3374 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3375 length sequence we are looking for is 2 */
3377 int count = 0; /* How many characters in a multi-char fold */
3378 int len = is_MULTI_CHAR_FOLD_utf8(s);
3379 if (! len) { /* Not a multi-char fold: get next char */
3384 /* Nodes with 'ss' require special handling, except for
3385 * EXACTFA-ish for which there is no multi-char fold to this */
3386 if (len == 2 && *s == 's' && *(s+1) == 's'
3387 && OP(scan) != EXACTFA
3388 && OP(scan) != EXACTFA_NO_TRIE)
3391 if (OP(scan) != EXACTFL) {
3392 OP(scan) = EXACTFU_SS;
3396 else { /* Here is a generic multi-char fold. */
3397 U8* multi_end = s + len;
3399 /* Count how many characters in it. In the case of /aa, no
3400 * folds which contain ASCII code points are allowed, so
3401 * check for those, and skip if found. */
3402 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3403 count = utf8_length(s, multi_end);
3407 while (s < multi_end) {
3410 goto next_iteration;
3420 /* The delta is how long the sequence is minus 1 (1 is how long
3421 * the character that folds to the sequence is) */
3422 total_count_delta += count - 1;
3426 /* We created a temporary folded copy of the string in EXACTFL
3427 * nodes. Therefore we need to be sure it doesn't go below zero,
3428 * as the real string could be shorter */
3429 if (OP(scan) == EXACTFL) {
3430 int total_chars = utf8_length((U8*) STRING(scan),
3431 (U8*) STRING(scan) + STR_LEN(scan));
3432 if (total_count_delta > total_chars) {
3433 total_count_delta = total_chars;
3437 *min_subtract += total_count_delta;
3440 else if (OP(scan) == EXACTFA) {
3442 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3443 * fold to the ASCII range (and there are no existing ones in the
3444 * upper latin1 range). But, as outlined in the comments preceding
3445 * this function, we need to flag any occurrences of the sharp s.
3446 * This character forbids trie formation (because of added
3449 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3450 OP(scan) = EXACTFA_NO_TRIE;
3451 *unfolded_multi_char = TRUE;
3460 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3461 * folds that are all Latin1. As explained in the comments
3462 * preceding this function, we look also for the sharp s in EXACTF
3463 * and EXACTFL nodes; it can be in the final position. Otherwise
3464 * we can stop looking 1 byte earlier because have to find at least
3465 * two characters for a multi-fold */
3466 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3471 int len = is_MULTI_CHAR_FOLD_latin1(s);
3472 if (! len) { /* Not a multi-char fold. */
3473 if (*s == LATIN_SMALL_LETTER_SHARP_S
3474 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3476 *unfolded_multi_char = TRUE;
3483 && isARG2_lower_or_UPPER_ARG1('s', *s)
3484 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3487 /* EXACTF nodes need to know that the minimum length
3488 * changed so that a sharp s in the string can match this
3489 * ss in the pattern, but they remain EXACTF nodes, as they
3490 * won't match this unless the target string is is UTF-8,
3491 * which we don't know until runtime. EXACTFL nodes can't
3492 * transform into EXACTFU nodes */
3493 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3494 OP(scan) = EXACTFU_SS;
3498 *min_subtract += len - 1;
3505 /* Allow dumping but overwriting the collection of skipped
3506 * ops and/or strings with fake optimized ops */
3507 n = scan + NODE_SZ_STR(scan);
3515 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3519 /* REx optimizer. Converts nodes into quicker variants "in place".
3520 Finds fixed substrings. */
3522 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3523 to the position after last scanned or to NULL. */
3525 #define INIT_AND_WITHP \
3526 assert(!and_withp); \
3527 Newx(and_withp,1, regnode_ssc); \
3528 SAVEFREEPV(and_withp)
3530 /* this is a chain of data about sub patterns we are processing that
3531 need to be handled separately/specially in study_chunk. Its so
3532 we can simulate recursion without losing state. */
3534 typedef struct scan_frame {
3535 regnode *last; /* last node to process in this frame */
3536 regnode *next; /* next node to process when last is reached */
3537 struct scan_frame *prev; /*previous frame*/
3538 U32 prev_recursed_depth;
3539 I32 stop; /* what stopparen do we use */
3543 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3546 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3547 SSize_t *minlenp, SSize_t *deltap,
3552 regnode_ssc *and_withp,
3553 U32 flags, U32 depth)
3554 /* scanp: Start here (read-write). */
3555 /* deltap: Write maxlen-minlen here. */
3556 /* last: Stop before this one. */
3557 /* data: string data about the pattern */
3558 /* stopparen: treat close N as END */
3559 /* recursed: which subroutines have we recursed into */
3560 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3563 /* There must be at least this number of characters to match */
3566 regnode *scan = *scanp, *next;
3568 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3569 int is_inf_internal = 0; /* The studied chunk is infinite */
3570 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3571 scan_data_t data_fake;
3572 SV *re_trie_maxbuff = NULL;
3573 regnode *first_non_open = scan;
3574 SSize_t stopmin = SSize_t_MAX;
3575 scan_frame *frame = NULL;
3576 GET_RE_DEBUG_FLAGS_DECL;
3578 PERL_ARGS_ASSERT_STUDY_CHUNK;
3581 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3584 while (first_non_open && OP(first_non_open) == OPEN)
3585 first_non_open=regnext(first_non_open);
3590 while ( scan && OP(scan) != END && scan < last ){
3591 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3592 node length to get a real minimum (because
3593 the folded version may be shorter) */
3594 bool unfolded_multi_char = FALSE;
3595 /* Peephole optimizer: */
3596 DEBUG_OPTIMISE_MORE_r(
3598 PerlIO_printf(Perl_debug_log,
3599 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3600 ((int) depth*2), "", (long)stopparen,
3601 (unsigned long)depth, (unsigned long)recursed_depth);
3602 if (recursed_depth) {
3605 for ( j = 0 ; j < recursed_depth ; j++ ) {
3606 PerlIO_printf(Perl_debug_log,"[");
3607 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3608 PerlIO_printf(Perl_debug_log,"%d",
3609 PAREN_TEST(RExC_study_chunk_recursed +
3610 (j * RExC_study_chunk_recursed_bytes), i)
3613 PerlIO_printf(Perl_debug_log,"]");
3616 PerlIO_printf(Perl_debug_log,"\n");
3619 DEBUG_STUDYDATA("Peep:", data, depth);
3620 DEBUG_PEEP("Peep", scan, depth);
3623 /* Its not clear to khw or hv why this is done here, and not in the
3624 * clauses that deal with EXACT nodes. khw's guess is that it's
3625 * because of a previous design */
3626 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3628 /* Follow the next-chain of the current node and optimize
3629 away all the NOTHINGs from it. */
3630 if (OP(scan) != CURLYX) {
3631 const int max = (reg_off_by_arg[OP(scan)]
3633 /* I32 may be smaller than U16 on CRAYs! */
3634 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3635 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3639 /* Skip NOTHING and LONGJMP. */
3640 while ((n = regnext(n))
3641 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3642 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3643 && off + noff < max)
3645 if (reg_off_by_arg[OP(scan)])
3648 NEXT_OFF(scan) = off;
3653 /* The principal pseudo-switch. Cannot be a switch, since we
3654 look into several different things. */
3655 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3656 || OP(scan) == IFTHEN) {
3657 next = regnext(scan);
3659 /* demq: the op(next)==code check is to see if we have
3660 * "branch-branch" AFAICT */
3662 if (OP(next) == code || code == IFTHEN) {
3663 /* NOTE - There is similar code to this block below for
3664 * handling TRIE nodes on a re-study. If you change stuff here
3665 * check there too. */
3666 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3668 regnode * const startbranch=scan;
3670 if (flags & SCF_DO_SUBSTR)
3671 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge
3674 if (flags & SCF_DO_STCLASS)
3675 ssc_init_zero(pRExC_state, &accum);
3677 while (OP(scan) == code) {
3678 SSize_t deltanext, minnext, fake;
3680 regnode_ssc this_class;
3683 data_fake.flags = 0;
3685 data_fake.whilem_c = data->whilem_c;
3686 data_fake.last_closep = data->last_closep;
3689 data_fake.last_closep = &fake;
3691 data_fake.pos_delta = delta;
3692 next = regnext(scan);
3693 scan = NEXTOPER(scan);
3695 scan = NEXTOPER(scan);
3696 if (flags & SCF_DO_STCLASS) {
3697 ssc_init(pRExC_state, &this_class);
3698 data_fake.start_class = &this_class;
3699 f = SCF_DO_STCLASS_AND;
3701 if (flags & SCF_WHILEM_VISITED_POS)
3702 f |= SCF_WHILEM_VISITED_POS;
3704 /* we suppose the run is continuous, last=next...*/
3705 minnext = study_chunk(pRExC_state, &scan, minlenp,
3706 &deltanext, next, &data_fake, stopparen,
3707 recursed_depth, NULL, f,depth+1);
3710 if (deltanext == SSize_t_MAX) {
3711 is_inf = is_inf_internal = 1;
3713 } else if (max1 < minnext + deltanext)
3714 max1 = minnext + deltanext;
3716 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3718 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3719 if ( stopmin > minnext)
3720 stopmin = min + min1;
3721 flags &= ~SCF_DO_SUBSTR;
3723 data->flags |= SCF_SEEN_ACCEPT;
3726 if (data_fake.flags & SF_HAS_EVAL)
3727 data->flags |= SF_HAS_EVAL;
3728 data->whilem_c = data_fake.whilem_c;
3730 if (flags & SCF_DO_STCLASS)
3731 ssc_or(pRExC_state, &accum, &this_class);
3733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3735 if (flags & SCF_DO_SUBSTR) {
3736 data->pos_min += min1;
3737 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3738 data->pos_delta = SSize_t_MAX;
3740 data->pos_delta += max1 - min1;
3741 if (max1 != min1 || is_inf)
3742 data->longest = &(data->longest_float);
3745 if (delta == SSize_t_MAX
3746 || SSize_t_MAX - delta - (max1 - min1) < 0)
3747 delta = SSize_t_MAX;
3749 delta += max1 - min1;
3750 if (flags & SCF_DO_STCLASS_OR) {
3751 ssc_or(pRExC_state, data->start_class, &accum);
3753 ssc_and(pRExC_state, data->start_class, and_withp);
3754 flags &= ~SCF_DO_STCLASS;
3757 else if (flags & SCF_DO_STCLASS_AND) {
3759 ssc_and(pRExC_state, data->start_class, &accum);
3760 flags &= ~SCF_DO_STCLASS;
3763 /* Switch to OR mode: cache the old value of
3764 * data->start_class */
3766 StructCopy(data->start_class, and_withp, regnode_ssc);
3767 flags &= ~SCF_DO_STCLASS_AND;
3768 StructCopy(&accum, data->start_class, regnode_ssc);
3769 flags |= SCF_DO_STCLASS_OR;
3773 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch )
3778 Assuming this was/is a branch we are dealing with: 'scan'
3779 now points at the item that follows the branch sequence,
3780 whatever it is. We now start at the beginning of the
3781 sequence and look for subsequences of
3787 which would be constructed from a pattern like
3790 If we can find such a subsequence we need to turn the first
3791 element into a trie and then add the subsequent branch exact
3792 strings to the trie.
3796 1. patterns where the whole set of branches can be
3799 2. patterns where only a subset can be converted.
3801 In case 1 we can replace the whole set with a single regop
3802 for the trie. In case 2 we need to keep the start and end
3805 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3806 becomes BRANCH TRIE; BRANCH X;
3808 There is an additional case, that being where there is a
3809 common prefix, which gets split out into an EXACT like node
3810 preceding the TRIE node.
3812 If x(1..n)==tail then we can do a simple trie, if not we make
3813 a "jump" trie, such that when we match the appropriate word
3814 we "jump" to the appropriate tail node. Essentially we turn
3815 a nested if into a case structure of sorts.
3820 if (!re_trie_maxbuff) {
3821 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3822 if (!SvIOK(re_trie_maxbuff))
3823 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3825 if ( SvIV(re_trie_maxbuff)>=0 ) {
3827 regnode *first = (regnode *)NULL;
3828 regnode *last = (regnode *)NULL;
3829 regnode *tail = scan;
3834 SV * const mysv = sv_newmortal(); /* for dumping */
3836 /* var tail is used because there may be a TAIL
3837 regop in the way. Ie, the exacts will point to the
3838 thing following the TAIL, but the last branch will
3839 point at the TAIL. So we advance tail. If we
3840 have nested (?:) we may have to move through several
3844 while ( OP( tail ) == TAIL ) {
3845 /* this is the TAIL generated by (?:) */
3846 tail = regnext( tail );
3850 DEBUG_TRIE_COMPILE_r({
3851 regprop(RExC_rx, mysv, tail );
3852 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3853 (int)depth * 2 + 2, "",
3854 "Looking for TRIE'able sequences. Tail node is: ",
3855 SvPV_nolen_const( mysv )
3861 Step through the branches
3862 cur represents each branch,
3863 noper is the first thing to be matched as part
3865 noper_next is the regnext() of that node.
3867 We normally handle a case like this
3868 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3869 support building with NOJUMPTRIE, which restricts
3870 the trie logic to structures like /FOO|BAR/.
3872 If noper is a trieable nodetype then the branch is
3873 a possible optimization target. If we are building
3874 under NOJUMPTRIE then we require that noper_next is
3875 the same as scan (our current position in the regex
3878 Once we have two or more consecutive such branches
3879 we can create a trie of the EXACT's contents and
3880 stitch it in place into the program.
3882 If the sequence represents all of the branches in
3883 the alternation we replace the entire thing with a
3886 Otherwise when it is a subsequence we need to
3887 stitch it in place and replace only the relevant
3888 branches. This means the first branch has to remain
3889 as it is used by the alternation logic, and its
3890 next pointer, and needs to be repointed at the item
3891 on the branch chain following the last branch we
3892 have optimized away.
3894 This could be either a BRANCH, in which case the
3895 subsequence is internal, or it could be the item
3896 following the branch sequence in which case the
3897 subsequence is at the end (which does not
3898 necessarily mean the first node is the start of the
3901 TRIE_TYPE(X) is a define which maps the optype to a
3905 ----------------+-----------
3909 EXACTFU_SS | EXACTFU
3914 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3915 ( EXACT == (X) ) ? EXACT : \
3916 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3917 ( EXACTFA == (X) ) ? EXACTFA : \
3920 /* dont use tail as the end marker for this traverse */
3921 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3922 regnode * const noper = NEXTOPER( cur );
3923 U8 noper_type = OP( noper );