5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *paren_names; /* Paren names */
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
142 char *starttry; /* -Dr: where regtry was called. */
143 #define RExC_starttry (pRExC_state->starttry)
146 const char *lastparse;
148 AV *paren_name_list; /* idx -> name */
149 #define RExC_lastparse (pRExC_state->lastparse)
150 #define RExC_lastnum (pRExC_state->lastnum)
151 #define RExC_paren_name_list (pRExC_state->paren_name_list)
155 #define RExC_flags (pRExC_state->flags)
156 #define RExC_precomp (pRExC_state->precomp)
157 #define RExC_rx_sv (pRExC_state->rx_sv)
158 #define RExC_rx (pRExC_state->rx)
159 #define RExC_rxi (pRExC_state->rxi)
160 #define RExC_start (pRExC_state->start)
161 #define RExC_end (pRExC_state->end)
162 #define RExC_parse (pRExC_state->parse)
163 #define RExC_whilem_seen (pRExC_state->whilem_seen)
164 #ifdef RE_TRACK_PATTERN_OFFSETS
165 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
167 #define RExC_emit (pRExC_state->emit)
168 #define RExC_emit_start (pRExC_state->emit_start)
169 #define RExC_emit_bound (pRExC_state->emit_bound)
170 #define RExC_naughty (pRExC_state->naughty)
171 #define RExC_sawback (pRExC_state->sawback)
172 #define RExC_seen (pRExC_state->seen)
173 #define RExC_size (pRExC_state->size)
174 #define RExC_npar (pRExC_state->npar)
175 #define RExC_nestroot (pRExC_state->nestroot)
176 #define RExC_extralen (pRExC_state->extralen)
177 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178 #define RExC_seen_evals (pRExC_state->seen_evals)
179 #define RExC_utf8 (pRExC_state->utf8)
180 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
205 #define SPSTART 0x04 /* Starts with * or +. */
206 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
207 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
211 /* whether trie related optimizations are enabled */
212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213 #define TRIE_STUDY_OPT
214 #define FULL_TRIE_STUDY
220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221 #define PBITVAL(paren) (1 << ((paren) & 7))
222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
226 /* If not already in utf8, do a longjmp back to the beginning */
227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228 #define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
232 /* About scan_data_t.
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
248 The strings can be composites, for instance
252 will result in a composite fixed substring 'foo'.
254 For each string some basic information is maintained:
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
307 typedef struct scan_data_t {
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
313 I32 last_end; /* min value, <0 unless valid. */
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
329 struct regnode_charclass_class *start_class;
333 * Forward declarations for pregcomp()'s friends.
336 static const scan_data_t zero_scan_data =
337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
339 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
340 #define SF_BEFORE_SEOL 0x0001
341 #define SF_BEFORE_MEOL 0x0002
342 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
346 # define SF_FIX_SHIFT_EOL (0+2)
347 # define SF_FL_SHIFT_EOL (0+4)
349 # define SF_FIX_SHIFT_EOL (+2)
350 # define SF_FL_SHIFT_EOL (+4)
353 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
356 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
358 #define SF_IS_INF 0x0040
359 #define SF_HAS_PAR 0x0080
360 #define SF_IN_PAR 0x0100
361 #define SF_HAS_EVAL 0x0200
362 #define SCF_DO_SUBSTR 0x0400
363 #define SCF_DO_STCLASS_AND 0x0800
364 #define SCF_DO_STCLASS_OR 0x1000
365 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
366 #define SCF_WHILEM_VISITED_POS 0x2000
368 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
369 #define SCF_SEEN_ACCEPT 0x8000
371 #define UTF (RExC_utf8 != 0)
372 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
373 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
375 #define OOB_UNICODE 12345678
376 #define OOB_NAMEDCLASS -1
378 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
379 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
382 /* length of regex to show in messages that don't mark a position within */
383 #define RegexLengthToShowInErrorMessages 127
386 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
387 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
388 * op/pragma/warn/regcomp.
390 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
391 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
393 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
396 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
397 * arg. Show regex, up to a maximum length. If it's too long, chop and add
400 #define _FAIL(code) STMT_START { \
401 const char *ellipses = ""; \
402 IV len = RExC_end - RExC_precomp; \
405 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
406 if (len > RegexLengthToShowInErrorMessages) { \
407 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
408 len = RegexLengthToShowInErrorMessages - 10; \
414 #define FAIL(msg) _FAIL( \
415 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
416 msg, (int)len, RExC_precomp, ellipses))
418 #define FAIL2(msg,arg) _FAIL( \
419 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
420 arg, (int)len, RExC_precomp, ellipses))
423 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 #define Simple_vFAIL(m) STMT_START { \
426 const IV offset = RExC_parse - RExC_precomp; \
427 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
428 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
432 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 #define vFAIL(m) STMT_START { \
436 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
441 * Like Simple_vFAIL(), but accepts two arguments.
443 #define Simple_vFAIL2(m,a1) STMT_START { \
444 const IV offset = RExC_parse - RExC_precomp; \
445 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
446 (int)offset, RExC_precomp, RExC_precomp + offset); \
450 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 #define vFAIL2(m,a1) STMT_START { \
454 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
455 Simple_vFAIL2(m, a1); \
460 * Like Simple_vFAIL(), but accepts three arguments.
462 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
463 const IV offset = RExC_parse - RExC_precomp; \
464 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
465 (int)offset, RExC_precomp, RExC_precomp + offset); \
469 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 #define vFAIL3(m,a1,a2) STMT_START { \
473 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
474 Simple_vFAIL3(m, a1, a2); \
478 * Like Simple_vFAIL(), but accepts four arguments.
480 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
481 const IV offset = RExC_parse - RExC_precomp; \
482 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
483 (int)offset, RExC_precomp, RExC_precomp + offset); \
486 #define ckWARNreg(loc,m) STMT_START { \
487 const IV offset = loc - RExC_precomp; \
488 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 (int)offset, RExC_precomp, RExC_precomp + offset); \
492 #define ckWARNregdep(loc,m) STMT_START { \
493 const IV offset = loc - RExC_precomp; \
494 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
499 #define ckWARN2reg(loc, m, a1) STMT_START { \
500 const IV offset = loc - RExC_precomp; \
501 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
505 #define vWARN3(loc, m, a1, a2) STMT_START { \
506 const IV offset = loc - RExC_precomp; \
507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
508 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
511 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
512 const IV offset = loc - RExC_precomp; \
513 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
514 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
517 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
518 const IV offset = loc - RExC_precomp; \
519 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
520 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
523 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
524 const IV offset = loc - RExC_precomp; \
525 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
526 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
529 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
530 const IV offset = loc - RExC_precomp; \
531 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
532 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
536 /* Allow for side effects in s */
537 #define REGC(c,s) STMT_START { \
538 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
541 /* Macros for recording node offsets. 20001227 mjd@plover.com
542 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
543 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
544 * Element 0 holds the number n.
545 * Position is 1 indexed.
547 #ifndef RE_TRACK_PATTERN_OFFSETS
548 #define Set_Node_Offset_To_R(node,byte)
549 #define Set_Node_Offset(node,byte)
550 #define Set_Cur_Node_Offset
551 #define Set_Node_Length_To_R(node,len)
552 #define Set_Node_Length(node,len)
553 #define Set_Node_Cur_Length(node)
554 #define Node_Offset(n)
555 #define Node_Length(n)
556 #define Set_Node_Offset_Length(node,offset,len)
557 #define ProgLen(ri) ri->u.proglen
558 #define SetProgLen(ri,x) ri->u.proglen = x
560 #define ProgLen(ri) ri->u.offsets[0]
561 #define SetProgLen(ri,x) ri->u.offsets[0] = x
562 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
565 __LINE__, (int)(node), (int)(byte))); \
567 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
569 RExC_offsets[2*(node)-1] = (byte); \
574 #define Set_Node_Offset(node,byte) \
575 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
576 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578 #define Set_Node_Length_To_R(node,len) STMT_START { \
580 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
581 __LINE__, (int)(node), (int)(len))); \
583 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
585 RExC_offsets[2*(node)] = (len); \
590 #define Set_Node_Length(node,len) \
591 Set_Node_Length_To_R((node)-RExC_emit_start, len)
592 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
593 #define Set_Node_Cur_Length(node) \
594 Set_Node_Length(node, RExC_parse - parse_start)
596 /* Get offsets and lengths */
597 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
598 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
601 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
602 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
606 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
607 #define EXPERIMENTAL_INPLACESCAN
608 #endif /*RE_TRACK_PATTERN_OFFSETS*/
610 #define DEBUG_STUDYDATA(str,data,depth) \
611 DEBUG_OPTIMISE_MORE_r(if(data){ \
612 PerlIO_printf(Perl_debug_log, \
613 "%*s" str "Pos:%"IVdf"/%"IVdf \
614 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
615 (int)(depth)*2, "", \
616 (IV)((data)->pos_min), \
617 (IV)((data)->pos_delta), \
618 (UV)((data)->flags), \
619 (IV)((data)->whilem_c), \
620 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
621 is_inf ? "INF " : "" \
623 if ((data)->last_found) \
624 PerlIO_printf(Perl_debug_log, \
625 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
626 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
627 SvPVX_const((data)->last_found), \
628 (IV)((data)->last_end), \
629 (IV)((data)->last_start_min), \
630 (IV)((data)->last_start_max), \
631 ((data)->longest && \
632 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
633 SvPVX_const((data)->longest_fixed), \
634 (IV)((data)->offset_fixed), \
635 ((data)->longest && \
636 (data)->longest==&((data)->longest_float)) ? "*" : "", \
637 SvPVX_const((data)->longest_float), \
638 (IV)((data)->offset_float_min), \
639 (IV)((data)->offset_float_max) \
641 PerlIO_printf(Perl_debug_log,"\n"); \
644 static void clear_re(pTHX_ void *r);
646 /* Mark that we cannot extend a found fixed substring at this point.
647 Update the longest found anchored substring and the longest found
648 floating substrings if needed. */
651 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
653 const STRLEN l = CHR_SVLEN(data->last_found);
654 const STRLEN old_l = CHR_SVLEN(*data->longest);
655 GET_RE_DEBUG_FLAGS_DECL;
657 PERL_ARGS_ASSERT_SCAN_COMMIT;
659 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
660 SvSetMagicSV(*data->longest, data->last_found);
661 if (*data->longest == data->longest_fixed) {
662 data->offset_fixed = l ? data->last_start_min : data->pos_min;
663 if (data->flags & SF_BEFORE_EOL)
665 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 data->flags &= ~SF_FIX_BEFORE_EOL;
668 data->minlen_fixed=minlenp;
669 data->lookbehind_fixed=0;
671 else { /* *data->longest == data->longest_float */
672 data->offset_float_min = l ? data->last_start_min : data->pos_min;
673 data->offset_float_max = (l
674 ? data->last_start_max
675 : data->pos_min + data->pos_delta);
676 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
677 data->offset_float_max = I32_MAX;
678 if (data->flags & SF_BEFORE_EOL)
680 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 data->flags &= ~SF_FL_BEFORE_EOL;
683 data->minlen_float=minlenp;
684 data->lookbehind_float=0;
687 SvCUR_set(data->last_found, 0);
689 SV * const sv = data->last_found;
690 if (SvUTF8(sv) && SvMAGICAL(sv)) {
691 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
697 data->flags &= ~SF_BEFORE_EOL;
698 DEBUG_STUDYDATA("commit: ",data,0);
701 /* Can match anything (initialization) */
703 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
705 PERL_ARGS_ASSERT_CL_ANYTHING;
707 ANYOF_CLASS_ZERO(cl);
708 ANYOF_BITMAP_SETALL(cl);
709 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_LOCALE;
714 /* Can match anything (initialization) */
716 S_cl_is_anything(const struct regnode_charclass_class *cl)
720 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
722 for (value = 0; value <= ANYOF_MAX; value += 2)
723 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
725 if (!(cl->flags & ANYOF_UNICODE_ALL))
727 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
732 /* Can match anything (initialization) */
734 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
736 PERL_ARGS_ASSERT_CL_INIT;
738 Zero(cl, 1, struct regnode_charclass_class);
740 cl_anything(pRExC_state, cl);
744 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
746 PERL_ARGS_ASSERT_CL_INIT_ZERO;
748 Zero(cl, 1, struct regnode_charclass_class);
750 cl_anything(pRExC_state, cl);
752 cl->flags |= ANYOF_LOCALE;
755 /* 'And' a given class with another one. Can create false positives */
756 /* We assume that cl is not inverted */
758 S_cl_and(struct regnode_charclass_class *cl,
759 const struct regnode_charclass_class *and_with)
761 PERL_ARGS_ASSERT_CL_AND;
763 assert(and_with->type == ANYOF);
764 if (!(and_with->flags & ANYOF_CLASS)
765 && !(cl->flags & ANYOF_CLASS)
766 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
767 && !(and_with->flags & ANYOF_FOLD)
768 && !(cl->flags & ANYOF_FOLD)) {
771 if (and_with->flags & ANYOF_INVERT)
772 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
773 cl->bitmap[i] &= ~and_with->bitmap[i];
775 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
776 cl->bitmap[i] &= and_with->bitmap[i];
777 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
778 if (!(and_with->flags & ANYOF_EOS))
779 cl->flags &= ~ANYOF_EOS;
781 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
782 !(and_with->flags & ANYOF_INVERT)) {
783 cl->flags &= ~ANYOF_UNICODE_ALL;
784 cl->flags |= ANYOF_UNICODE;
785 ARG_SET(cl, ARG(and_with));
787 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
788 !(and_with->flags & ANYOF_INVERT))
789 cl->flags &= ~ANYOF_UNICODE_ALL;
790 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
791 !(and_with->flags & ANYOF_INVERT))
792 cl->flags &= ~ANYOF_UNICODE;
795 /* 'OR' a given class with another one. Can create false positives */
796 /* We assume that cl is not inverted */
798 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
800 PERL_ARGS_ASSERT_CL_OR;
802 if (or_with->flags & ANYOF_INVERT) {
804 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
805 * <= (B1 | !B2) | (CL1 | !CL2)
806 * which is wasteful if CL2 is small, but we ignore CL2:
807 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
808 * XXXX Can we handle case-fold? Unclear:
809 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
810 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
812 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
813 && !(or_with->flags & ANYOF_FOLD)
814 && !(cl->flags & ANYOF_FOLD) ) {
817 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
818 cl->bitmap[i] |= ~or_with->bitmap[i];
819 } /* XXXX: logic is complicated otherwise */
821 cl_anything(pRExC_state, cl);
824 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
825 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
826 && (!(or_with->flags & ANYOF_FOLD)
827 || (cl->flags & ANYOF_FOLD)) ) {
830 /* OR char bitmap and class bitmap separately */
831 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
832 cl->bitmap[i] |= or_with->bitmap[i];
833 if (or_with->flags & ANYOF_CLASS) {
834 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
835 cl->classflags[i] |= or_with->classflags[i];
836 cl->flags |= ANYOF_CLASS;
839 else { /* XXXX: logic is complicated, leave it along for a moment. */
840 cl_anything(pRExC_state, cl);
843 if (or_with->flags & ANYOF_EOS)
844 cl->flags |= ANYOF_EOS;
846 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
847 ARG(cl) != ARG(or_with)) {
848 cl->flags |= ANYOF_UNICODE_ALL;
849 cl->flags &= ~ANYOF_UNICODE;
851 if (or_with->flags & ANYOF_UNICODE_ALL) {
852 cl->flags |= ANYOF_UNICODE_ALL;
853 cl->flags &= ~ANYOF_UNICODE;
857 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
858 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
859 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
860 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
865 dump_trie(trie,widecharmap,revcharmap)
866 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
867 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
869 These routines dump out a trie in a somewhat readable format.
870 The _interim_ variants are used for debugging the interim
871 tables that are used to generate the final compressed
872 representation which is what dump_trie expects.
874 Part of the reason for their existance is to provide a form
875 of documentation as to how the different representations function.
880 Dumps the final compressed table form of the trie to Perl_debug_log.
881 Used for debugging make_trie().
885 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
886 AV *revcharmap, U32 depth)
889 SV *sv=sv_newmortal();
890 int colwidth= widecharmap ? 6 : 4;
892 GET_RE_DEBUG_FLAGS_DECL;
894 PERL_ARGS_ASSERT_DUMP_TRIE;
896 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
897 (int)depth * 2 + 2,"",
898 "Match","Base","Ofs" );
900 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
901 SV ** const tmp = av_fetch( revcharmap, state, 0);
903 PerlIO_printf( Perl_debug_log, "%*s",
905 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
906 PL_colors[0], PL_colors[1],
907 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
908 PERL_PV_ESCAPE_FIRSTCHAR
913 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
914 (int)depth * 2 + 2,"");
916 for( state = 0 ; state < trie->uniquecharcount ; state++ )
917 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
918 PerlIO_printf( Perl_debug_log, "\n");
920 for( state = 1 ; state < trie->statecount ; state++ ) {
921 const U32 base = trie->states[ state ].trans.base;
923 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
925 if ( trie->states[ state ].wordnum ) {
926 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
928 PerlIO_printf( Perl_debug_log, "%6s", "" );
931 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
936 while( ( base + ofs < trie->uniquecharcount ) ||
937 ( base + ofs - trie->uniquecharcount < trie->lasttrans
938 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
941 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
943 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
944 if ( ( base + ofs >= trie->uniquecharcount ) &&
945 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
946 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
948 PerlIO_printf( Perl_debug_log, "%*"UVXf,
950 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
952 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
956 PerlIO_printf( Perl_debug_log, "]");
959 PerlIO_printf( Perl_debug_log, "\n" );
961 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
962 for (word=1; word <= trie->wordcount; word++) {
963 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
964 (int)word, (int)(trie->wordinfo[word].prev),
965 (int)(trie->wordinfo[word].len));
967 PerlIO_printf(Perl_debug_log, "\n" );
970 Dumps a fully constructed but uncompressed trie in list form.
971 List tries normally only are used for construction when the number of
972 possible chars (trie->uniquecharcount) is very high.
973 Used for debugging make_trie().
976 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
977 HV *widecharmap, AV *revcharmap, U32 next_alloc,
981 SV *sv=sv_newmortal();
982 int colwidth= widecharmap ? 6 : 4;
983 GET_RE_DEBUG_FLAGS_DECL;
985 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
987 /* print out the table precompression. */
988 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
989 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
990 "------:-----+-----------------\n" );
992 for( state=1 ; state < next_alloc ; state ++ ) {
995 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
996 (int)depth * 2 + 2,"", (UV)state );
997 if ( ! trie->states[ state ].wordnum ) {
998 PerlIO_printf( Perl_debug_log, "%5s| ","");
1000 PerlIO_printf( Perl_debug_log, "W%4x| ",
1001 trie->states[ state ].wordnum
1004 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1005 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1007 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1009 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1010 PL_colors[0], PL_colors[1],
1011 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1012 PERL_PV_ESCAPE_FIRSTCHAR
1014 TRIE_LIST_ITEM(state,charid).forid,
1015 (UV)TRIE_LIST_ITEM(state,charid).newstate
1018 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1019 (int)((depth * 2) + 14), "");
1022 PerlIO_printf( Perl_debug_log, "\n");
1027 Dumps a fully constructed but uncompressed trie in table form.
1028 This is the normal DFA style state transition table, with a few
1029 twists to facilitate compression later.
1030 Used for debugging make_trie().
1033 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1034 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1039 SV *sv=sv_newmortal();
1040 int colwidth= widecharmap ? 6 : 4;
1041 GET_RE_DEBUG_FLAGS_DECL;
1043 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1046 print out the table precompression so that we can do a visual check
1047 that they are identical.
1050 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1052 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1053 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1055 PerlIO_printf( Perl_debug_log, "%*s",
1057 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1058 PL_colors[0], PL_colors[1],
1059 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1060 PERL_PV_ESCAPE_FIRSTCHAR
1066 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1068 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1069 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1072 PerlIO_printf( Perl_debug_log, "\n" );
1074 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1076 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1077 (int)depth * 2 + 2,"",
1078 (UV)TRIE_NODENUM( state ) );
1080 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1081 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1083 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1085 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1087 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1088 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1090 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1091 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1099 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1100 startbranch: the first branch in the whole branch sequence
1101 first : start branch of sequence of branch-exact nodes.
1102 May be the same as startbranch
1103 last : Thing following the last branch.
1104 May be the same as tail.
1105 tail : item following the branch sequence
1106 count : words in the sequence
1107 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1108 depth : indent depth
1110 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1112 A trie is an N'ary tree where the branches are determined by digital
1113 decomposition of the key. IE, at the root node you look up the 1st character and
1114 follow that branch repeat until you find the end of the branches. Nodes can be
1115 marked as "accepting" meaning they represent a complete word. Eg:
1119 would convert into the following structure. Numbers represent states, letters
1120 following numbers represent valid transitions on the letter from that state, if
1121 the number is in square brackets it represents an accepting state, otherwise it
1122 will be in parenthesis.
1124 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1128 (1) +-i->(6)-+-s->[7]
1130 +-s->(3)-+-h->(4)-+-e->[5]
1132 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1134 This shows that when matching against the string 'hers' we will begin at state 1
1135 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1136 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1137 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1138 single traverse. We store a mapping from accepting to state to which word was
1139 matched, and then when we have multiple possibilities we try to complete the
1140 rest of the regex in the order in which they occured in the alternation.
1142 The only prior NFA like behaviour that would be changed by the TRIE support is
1143 the silent ignoring of duplicate alternations which are of the form:
1145 / (DUPE|DUPE) X? (?{ ... }) Y /x
1147 Thus EVAL blocks follwing a trie may be called a different number of times with
1148 and without the optimisation. With the optimisations dupes will be silently
1149 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1150 the following demonstrates:
1152 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1154 which prints out 'word' three times, but
1156 'words'=~/(word|word|word)(?{ print $1 })S/
1158 which doesnt print it out at all. This is due to other optimisations kicking in.
1160 Example of what happens on a structural level:
1162 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1164 1: CURLYM[1] {1,32767}(18)
1175 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1176 and should turn into:
1178 1: CURLYM[1] {1,32767}(18)
1180 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1188 Cases where tail != last would be like /(?foo|bar)baz/:
1198 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1199 and would end up looking like:
1202 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1209 d = uvuni_to_utf8_flags(d, uv, 0);
1211 is the recommended Unicode-aware way of saying
1216 #define TRIE_STORE_REVCHAR \
1219 SV *zlopp = newSV(2); \
1220 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1221 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1222 SvCUR_set(zlopp, kapow - flrbbbbb); \
1225 av_push(revcharmap, zlopp); \
1227 char ooooff = (char)uvc; \
1228 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1232 #define TRIE_READ_CHAR STMT_START { \
1236 if ( foldlen > 0 ) { \
1237 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1242 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1243 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1244 foldlen -= UNISKIP( uvc ); \
1245 scan = foldbuf + UNISKIP( uvc ); \
1248 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1258 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1259 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1260 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1261 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1263 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1265 TRIE_LIST_CUR( state )++; \
1268 #define TRIE_LIST_NEW(state) STMT_START { \
1269 Newxz( trie->states[ state ].trans.list, \
1270 4, reg_trie_trans_le ); \
1271 TRIE_LIST_CUR( state ) = 1; \
1272 TRIE_LIST_LEN( state ) = 4; \
1275 #define TRIE_HANDLE_WORD(state) STMT_START { \
1276 U16 dupe= trie->states[ state ].wordnum; \
1277 regnode * const noper_next = regnext( noper ); \
1280 /* store the word for dumping */ \
1282 if (OP(noper) != NOTHING) \
1283 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1285 tmp = newSVpvn_utf8( "", 0, UTF ); \
1286 av_push( trie_words, tmp ); \
1290 trie->wordinfo[curword].prev = 0; \
1291 trie->wordinfo[curword].len = wordlen; \
1292 trie->wordinfo[curword].accept = state; \
1294 if ( noper_next < tail ) { \
1296 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1297 trie->jump[curword] = (U16)(noper_next - convert); \
1299 jumper = noper_next; \
1301 nextbranch= regnext(cur); \
1305 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1306 /* chain, so that when the bits of chain are later */\
1307 /* linked together, the dups appear in the chain */\
1308 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1309 trie->wordinfo[dupe].prev = curword; \
1311 /* we haven't inserted this word yet. */ \
1312 trie->states[ state ].wordnum = curword; \
1317 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1318 ( ( base + charid >= ucharcount \
1319 && base + charid < ubound \
1320 && state == trie->trans[ base - ucharcount + charid ].check \
1321 && trie->trans[ base - ucharcount + charid ].next ) \
1322 ? trie->trans[ base - ucharcount + charid ].next \
1323 : ( state==1 ? special : 0 ) \
1327 #define MADE_JUMP_TRIE 2
1328 #define MADE_EXACT_TRIE 4
1331 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1334 /* first pass, loop through and scan words */
1335 reg_trie_data *trie;
1336 HV *widecharmap = NULL;
1337 AV *revcharmap = newAV();
1339 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1344 regnode *jumper = NULL;
1345 regnode *nextbranch = NULL;
1346 regnode *convert = NULL;
1347 U32 *prev_states; /* temp array mapping each state to previous one */
1348 /* we just use folder as a flag in utf8 */
1349 const U8 * const folder = ( flags == EXACTF
1351 : ( flags == EXACTFL
1358 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1359 AV *trie_words = NULL;
1360 /* along with revcharmap, this only used during construction but both are
1361 * useful during debugging so we store them in the struct when debugging.
1364 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1365 STRLEN trie_charcount=0;
1367 SV *re_trie_maxbuff;
1368 GET_RE_DEBUG_FLAGS_DECL;
1370 PERL_ARGS_ASSERT_MAKE_TRIE;
1372 PERL_UNUSED_ARG(depth);
1375 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1377 trie->startstate = 1;
1378 trie->wordcount = word_count;
1379 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1380 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1381 if (!(UTF && folder))
1382 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1383 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1384 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1387 trie_words = newAV();
1390 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1391 if (!SvIOK(re_trie_maxbuff)) {
1392 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1395 PerlIO_printf( Perl_debug_log,
1396 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1397 (int)depth * 2 + 2, "",
1398 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1399 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1403 /* Find the node we are going to overwrite */
1404 if ( first == startbranch && OP( last ) != BRANCH ) {
1405 /* whole branch chain */
1408 /* branch sub-chain */
1409 convert = NEXTOPER( first );
1412 /* -- First loop and Setup --
1414 We first traverse the branches and scan each word to determine if it
1415 contains widechars, and how many unique chars there are, this is
1416 important as we have to build a table with at least as many columns as we
1419 We use an array of integers to represent the character codes 0..255
1420 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1421 native representation of the character value as the key and IV's for the
1424 *TODO* If we keep track of how many times each character is used we can
1425 remap the columns so that the table compression later on is more
1426 efficient in terms of memory by ensuring most common value is in the
1427 middle and the least common are on the outside. IMO this would be better
1428 than a most to least common mapping as theres a decent chance the most
1429 common letter will share a node with the least common, meaning the node
1430 will not be compressable. With a middle is most common approach the worst
1431 case is when we have the least common nodes twice.
1435 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1436 regnode * const noper = NEXTOPER( cur );
1437 const U8 *uc = (U8*)STRING( noper );
1438 const U8 * const e = uc + STR_LEN( noper );
1440 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1441 const U8 *scan = (U8*)NULL;
1442 U32 wordlen = 0; /* required init */
1444 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1446 if (OP(noper) == NOTHING) {
1450 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1451 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1452 regardless of encoding */
1454 for ( ; uc < e ; uc += len ) {
1455 TRIE_CHARCOUNT(trie)++;
1459 if ( !trie->charmap[ uvc ] ) {
1460 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1462 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1466 /* store the codepoint in the bitmap, and if its ascii
1467 also store its folded equivelent. */
1468 TRIE_BITMAP_SET(trie,uvc);
1470 /* store the folded codepoint */
1471 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1474 /* store first byte of utf8 representation of
1475 codepoints in the 127 < uvc < 256 range */
1476 if (127 < uvc && uvc < 192) {
1477 TRIE_BITMAP_SET(trie,194);
1478 } else if (191 < uvc ) {
1479 TRIE_BITMAP_SET(trie,195);
1480 /* && uvc < 256 -- we know uvc is < 256 already */
1483 set_bit = 0; /* We've done our bit :-) */
1488 widecharmap = newHV();
1490 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1493 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1495 if ( !SvTRUE( *svpp ) ) {
1496 sv_setiv( *svpp, ++trie->uniquecharcount );
1501 if( cur == first ) {
1504 } else if (chars < trie->minlen) {
1506 } else if (chars > trie->maxlen) {
1510 } /* end first pass */
1511 DEBUG_TRIE_COMPILE_r(
1512 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1513 (int)depth * 2 + 2,"",
1514 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1515 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1516 (int)trie->minlen, (int)trie->maxlen )
1520 We now know what we are dealing with in terms of unique chars and
1521 string sizes so we can calculate how much memory a naive
1522 representation using a flat table will take. If it's over a reasonable
1523 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1524 conservative but potentially much slower representation using an array
1527 At the end we convert both representations into the same compressed
1528 form that will be used in regexec.c for matching with. The latter
1529 is a form that cannot be used to construct with but has memory
1530 properties similar to the list form and access properties similar
1531 to the table form making it both suitable for fast searches and
1532 small enough that its feasable to store for the duration of a program.
1534 See the comment in the code where the compressed table is produced
1535 inplace from the flat tabe representation for an explanation of how
1536 the compression works.
1541 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1544 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1546 Second Pass -- Array Of Lists Representation
1548 Each state will be represented by a list of charid:state records
1549 (reg_trie_trans_le) the first such element holds the CUR and LEN
1550 points of the allocated array. (See defines above).
1552 We build the initial structure using the lists, and then convert
1553 it into the compressed table form which allows faster lookups
1554 (but cant be modified once converted).
1557 STRLEN transcount = 1;
1559 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1560 "%*sCompiling trie using list compiler\n",
1561 (int)depth * 2 + 2, ""));
1563 trie->states = (reg_trie_state *)
1564 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1565 sizeof(reg_trie_state) );
1569 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1571 regnode * const noper = NEXTOPER( cur );
1572 U8 *uc = (U8*)STRING( noper );
1573 const U8 * const e = uc + STR_LEN( noper );
1574 U32 state = 1; /* required init */
1575 U16 charid = 0; /* sanity init */
1576 U8 *scan = (U8*)NULL; /* sanity init */
1577 STRLEN foldlen = 0; /* required init */
1578 U32 wordlen = 0; /* required init */
1579 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1581 if (OP(noper) != NOTHING) {
1582 for ( ; uc < e ; uc += len ) {
1587 charid = trie->charmap[ uvc ];
1589 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1593 charid=(U16)SvIV( *svpp );
1596 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1603 if ( !trie->states[ state ].trans.list ) {
1604 TRIE_LIST_NEW( state );
1606 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1607 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1608 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1613 newstate = next_alloc++;
1614 prev_states[newstate] = state;
1615 TRIE_LIST_PUSH( state, charid, newstate );
1620 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1624 TRIE_HANDLE_WORD(state);
1626 } /* end second pass */
1628 /* next alloc is the NEXT state to be allocated */
1629 trie->statecount = next_alloc;
1630 trie->states = (reg_trie_state *)
1631 PerlMemShared_realloc( trie->states,
1633 * sizeof(reg_trie_state) );
1635 /* and now dump it out before we compress it */
1636 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1637 revcharmap, next_alloc,
1641 trie->trans = (reg_trie_trans *)
1642 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1649 for( state=1 ; state < next_alloc ; state ++ ) {
1653 DEBUG_TRIE_COMPILE_MORE_r(
1654 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1658 if (trie->states[state].trans.list) {
1659 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1663 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1664 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1665 if ( forid < minid ) {
1667 } else if ( forid > maxid ) {
1671 if ( transcount < tp + maxid - minid + 1) {
1673 trie->trans = (reg_trie_trans *)
1674 PerlMemShared_realloc( trie->trans,
1676 * sizeof(reg_trie_trans) );
1677 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1679 base = trie->uniquecharcount + tp - minid;
1680 if ( maxid == minid ) {
1682 for ( ; zp < tp ; zp++ ) {
1683 if ( ! trie->trans[ zp ].next ) {
1684 base = trie->uniquecharcount + zp - minid;
1685 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1686 trie->trans[ zp ].check = state;
1692 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1693 trie->trans[ tp ].check = state;
1698 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1699 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1700 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1701 trie->trans[ tid ].check = state;
1703 tp += ( maxid - minid + 1 );
1705 Safefree(trie->states[ state ].trans.list);
1708 DEBUG_TRIE_COMPILE_MORE_r(
1709 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1712 trie->states[ state ].trans.base=base;
1714 trie->lasttrans = tp + 1;
1718 Second Pass -- Flat Table Representation.
1720 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1721 We know that we will need Charcount+1 trans at most to store the data
1722 (one row per char at worst case) So we preallocate both structures
1723 assuming worst case.
1725 We then construct the trie using only the .next slots of the entry
1728 We use the .check field of the first entry of the node temporarily to
1729 make compression both faster and easier by keeping track of how many non
1730 zero fields are in the node.
1732 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1735 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1736 number representing the first entry of the node, and state as a
1737 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1738 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1739 are 2 entrys per node. eg:
1747 The table is internally in the right hand, idx form. However as we also
1748 have to deal with the states array which is indexed by nodenum we have to
1749 use TRIE_NODENUM() to convert.
1752 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1753 "%*sCompiling trie using table compiler\n",
1754 (int)depth * 2 + 2, ""));
1756 trie->trans = (reg_trie_trans *)
1757 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1758 * trie->uniquecharcount + 1,
1759 sizeof(reg_trie_trans) );
1760 trie->states = (reg_trie_state *)
1761 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1762 sizeof(reg_trie_state) );
1763 next_alloc = trie->uniquecharcount + 1;
1766 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768 regnode * const noper = NEXTOPER( cur );
1769 const U8 *uc = (U8*)STRING( noper );
1770 const U8 * const e = uc + STR_LEN( noper );
1772 U32 state = 1; /* required init */
1774 U16 charid = 0; /* sanity init */
1775 U32 accept_state = 0; /* sanity init */
1776 U8 *scan = (U8*)NULL; /* sanity init */
1778 STRLEN foldlen = 0; /* required init */
1779 U32 wordlen = 0; /* required init */
1780 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1782 if ( OP(noper) != NOTHING ) {
1783 for ( ; uc < e ; uc += len ) {
1788 charid = trie->charmap[ uvc ];
1790 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1791 charid = svpp ? (U16)SvIV(*svpp) : 0;
1795 if ( !trie->trans[ state + charid ].next ) {
1796 trie->trans[ state + charid ].next = next_alloc;
1797 trie->trans[ state ].check++;
1798 prev_states[TRIE_NODENUM(next_alloc)]
1799 = TRIE_NODENUM(state);
1800 next_alloc += trie->uniquecharcount;
1802 state = trie->trans[ state + charid ].next;
1804 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1806 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1809 accept_state = TRIE_NODENUM( state );
1810 TRIE_HANDLE_WORD(accept_state);
1812 } /* end second pass */
1814 /* and now dump it out before we compress it */
1815 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1817 next_alloc, depth+1));
1821 * Inplace compress the table.*
1823 For sparse data sets the table constructed by the trie algorithm will
1824 be mostly 0/FAIL transitions or to put it another way mostly empty.
1825 (Note that leaf nodes will not contain any transitions.)
1827 This algorithm compresses the tables by eliminating most such
1828 transitions, at the cost of a modest bit of extra work during lookup:
1830 - Each states[] entry contains a .base field which indicates the
1831 index in the state[] array wheres its transition data is stored.
1833 - If .base is 0 there are no valid transitions from that node.
1835 - If .base is nonzero then charid is added to it to find an entry in
1838 -If trans[states[state].base+charid].check!=state then the
1839 transition is taken to be a 0/Fail transition. Thus if there are fail
1840 transitions at the front of the node then the .base offset will point
1841 somewhere inside the previous nodes data (or maybe even into a node
1842 even earlier), but the .check field determines if the transition is
1846 The following process inplace converts the table to the compressed
1847 table: We first do not compress the root node 1,and mark its all its
1848 .check pointers as 1 and set its .base pointer as 1 as well. This
1849 allows to do a DFA construction from the compressed table later, and
1850 ensures that any .base pointers we calculate later are greater than
1853 - We set 'pos' to indicate the first entry of the second node.
1855 - We then iterate over the columns of the node, finding the first and
1856 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1857 and set the .check pointers accordingly, and advance pos
1858 appropriately and repreat for the next node. Note that when we copy
1859 the next pointers we have to convert them from the original
1860 NODEIDX form to NODENUM form as the former is not valid post
1863 - If a node has no transitions used we mark its base as 0 and do not
1864 advance the pos pointer.
1866 - If a node only has one transition we use a second pointer into the
1867 structure to fill in allocated fail transitions from other states.
1868 This pointer is independent of the main pointer and scans forward
1869 looking for null transitions that are allocated to a state. When it
1870 finds one it writes the single transition into the "hole". If the
1871 pointer doesnt find one the single transition is appended as normal.
1873 - Once compressed we can Renew/realloc the structures to release the
1876 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1877 specifically Fig 3.47 and the associated pseudocode.
1881 const U32 laststate = TRIE_NODENUM( next_alloc );
1884 trie->statecount = laststate;
1886 for ( state = 1 ; state < laststate ; state++ ) {
1888 const U32 stateidx = TRIE_NODEIDX( state );
1889 const U32 o_used = trie->trans[ stateidx ].check;
1890 U32 used = trie->trans[ stateidx ].check;
1891 trie->trans[ stateidx ].check = 0;
1893 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1894 if ( flag || trie->trans[ stateidx + charid ].next ) {
1895 if ( trie->trans[ stateidx + charid ].next ) {
1897 for ( ; zp < pos ; zp++ ) {
1898 if ( ! trie->trans[ zp ].next ) {
1902 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1903 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1904 trie->trans[ zp ].check = state;
1905 if ( ++zp > pos ) pos = zp;
1912 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1914 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1915 trie->trans[ pos ].check = state;
1920 trie->lasttrans = pos + 1;
1921 trie->states = (reg_trie_state *)
1922 PerlMemShared_realloc( trie->states, laststate
1923 * sizeof(reg_trie_state) );
1924 DEBUG_TRIE_COMPILE_MORE_r(
1925 PerlIO_printf( Perl_debug_log,
1926 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1927 (int)depth * 2 + 2,"",
1928 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1931 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1934 } /* end table compress */
1936 DEBUG_TRIE_COMPILE_MORE_r(
1937 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1938 (int)depth * 2 + 2, "",
1939 (UV)trie->statecount,
1940 (UV)trie->lasttrans)
1942 /* resize the trans array to remove unused space */
1943 trie->trans = (reg_trie_trans *)
1944 PerlMemShared_realloc( trie->trans, trie->lasttrans
1945 * sizeof(reg_trie_trans) );
1947 { /* Modify the program and insert the new TRIE node*/
1948 U8 nodetype =(U8)(flags & 0xFF);
1952 regnode *optimize = NULL;
1953 #ifdef RE_TRACK_PATTERN_OFFSETS
1956 U32 mjd_nodelen = 0;
1957 #endif /* RE_TRACK_PATTERN_OFFSETS */
1958 #endif /* DEBUGGING */
1960 This means we convert either the first branch or the first Exact,
1961 depending on whether the thing following (in 'last') is a branch
1962 or not and whther first is the startbranch (ie is it a sub part of
1963 the alternation or is it the whole thing.)
1964 Assuming its a sub part we conver the EXACT otherwise we convert
1965 the whole branch sequence, including the first.
1967 /* Find the node we are going to overwrite */
1968 if ( first != startbranch || OP( last ) == BRANCH ) {
1969 /* branch sub-chain */
1970 NEXT_OFF( first ) = (U16)(last - first);
1971 #ifdef RE_TRACK_PATTERN_OFFSETS
1973 mjd_offset= Node_Offset((convert));
1974 mjd_nodelen= Node_Length((convert));
1977 /* whole branch chain */
1979 #ifdef RE_TRACK_PATTERN_OFFSETS
1982 const regnode *nop = NEXTOPER( convert );
1983 mjd_offset= Node_Offset((nop));
1984 mjd_nodelen= Node_Length((nop));
1988 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1989 (int)depth * 2 + 2, "",
1990 (UV)mjd_offset, (UV)mjd_nodelen)
1993 /* But first we check to see if there is a common prefix we can
1994 split out as an EXACT and put in front of the TRIE node. */
1995 trie->startstate= 1;
1996 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1998 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2002 const U32 base = trie->states[ state ].trans.base;
2004 if ( trie->states[state].wordnum )
2007 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2008 if ( ( base + ofs >= trie->uniquecharcount ) &&
2009 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2010 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2012 if ( ++count > 1 ) {
2013 SV **tmp = av_fetch( revcharmap, ofs, 0);
2014 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2015 if ( state == 1 ) break;
2017 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2019 PerlIO_printf(Perl_debug_log,
2020 "%*sNew Start State=%"UVuf" Class: [",
2021 (int)depth * 2 + 2, "",
2024 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2025 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2027 TRIE_BITMAP_SET(trie,*ch);
2029 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2031 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2035 TRIE_BITMAP_SET(trie,*ch);
2037 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2038 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2044 SV **tmp = av_fetch( revcharmap, idx, 0);
2046 char *ch = SvPV( *tmp, len );
2048 SV *sv=sv_newmortal();
2049 PerlIO_printf( Perl_debug_log,
2050 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2051 (int)depth * 2 + 2, "",
2053 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2054 PL_colors[0], PL_colors[1],
2055 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2056 PERL_PV_ESCAPE_FIRSTCHAR
2061 OP( convert ) = nodetype;
2062 str=STRING(convert);
2065 STR_LEN(convert) += len;
2071 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2076 trie->prefixlen = (state-1);
2078 regnode *n = convert+NODE_SZ_STR(convert);
2079 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2080 trie->startstate = state;
2081 trie->minlen -= (state - 1);
2082 trie->maxlen -= (state - 1);
2084 /* At least the UNICOS C compiler choked on this
2085 * being argument to DEBUG_r(), so let's just have
2088 #ifdef PERL_EXT_RE_BUILD
2094 regnode *fix = convert;
2095 U32 word = trie->wordcount;
2097 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2098 while( ++fix < n ) {
2099 Set_Node_Offset_Length(fix, 0, 0);
2102 SV ** const tmp = av_fetch( trie_words, word, 0 );
2104 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2105 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2107 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2115 NEXT_OFF(convert) = (U16)(tail - convert);
2116 DEBUG_r(optimize= n);
2122 if ( trie->maxlen ) {
2123 NEXT_OFF( convert ) = (U16)(tail - convert);
2124 ARG_SET( convert, data_slot );
2125 /* Store the offset to the first unabsorbed branch in
2126 jump[0], which is otherwise unused by the jump logic.
2127 We use this when dumping a trie and during optimisation. */
2129 trie->jump[0] = (U16)(nextbranch - convert);
2132 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2133 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2135 OP( convert ) = TRIEC;
2136 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2137 PerlMemShared_free(trie->bitmap);
2140 OP( convert ) = TRIE;
2142 /* store the type in the flags */
2143 convert->flags = nodetype;
2147 + regarglen[ OP( convert ) ];
2149 /* XXX We really should free up the resource in trie now,
2150 as we won't use them - (which resources?) dmq */
2152 /* needed for dumping*/
2153 DEBUG_r(if (optimize) {
2154 regnode *opt = convert;
2156 while ( ++opt < optimize) {
2157 Set_Node_Offset_Length(opt,0,0);
2160 Try to clean up some of the debris left after the
2163 while( optimize < jumper ) {
2164 mjd_nodelen += Node_Length((optimize));
2165 OP( optimize ) = OPTIMIZED;
2166 Set_Node_Offset_Length(optimize,0,0);
2169 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2171 } /* end node insert */
2173 /* Finish populating the prev field of the wordinfo array. Walk back
2174 * from each accept state until we find another accept state, and if
2175 * so, point the first word's .prev field at the second word. If the
2176 * second already has a .prev field set, stop now. This will be the
2177 * case either if we've already processed that word's accept state,
2178 * or that that state had multiple words, and the overspill words
2179 * were already linked up earlier.
2186 for (word=1; word <= trie->wordcount; word++) {
2188 if (trie->wordinfo[word].prev)
2190 state = trie->wordinfo[word].accept;
2192 state = prev_states[state];
2195 prev = trie->states[state].wordnum;
2199 trie->wordinfo[word].prev = prev;
2201 Safefree(prev_states);
2205 /* and now dump out the compressed format */
2206 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2208 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2210 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2211 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2213 SvREFCNT_dec(revcharmap);
2217 : trie->startstate>1
2223 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2225 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2227 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2228 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2231 We find the fail state for each state in the trie, this state is the longest proper
2232 suffix of the current states 'word' that is also a proper prefix of another word in our
2233 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2234 the DFA not to have to restart after its tried and failed a word at a given point, it
2235 simply continues as though it had been matching the other word in the first place.
2237 'abcdgu'=~/abcdefg|cdgu/
2238 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2239 fail, which would bring use to the state representing 'd' in the second word where we would
2240 try 'g' and succeed, prodceding to match 'cdgu'.
2242 /* add a fail transition */
2243 const U32 trie_offset = ARG(source);
2244 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2246 const U32 ucharcount = trie->uniquecharcount;
2247 const U32 numstates = trie->statecount;
2248 const U32 ubound = trie->lasttrans + ucharcount;
2252 U32 base = trie->states[ 1 ].trans.base;
2255 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2256 GET_RE_DEBUG_FLAGS_DECL;
2258 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2260 PERL_UNUSED_ARG(depth);
2264 ARG_SET( stclass, data_slot );
2265 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2266 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2267 aho->trie=trie_offset;
2268 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2269 Copy( trie->states, aho->states, numstates, reg_trie_state );
2270 Newxz( q, numstates, U32);
2271 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2274 /* initialize fail[0..1] to be 1 so that we always have
2275 a valid final fail state */
2276 fail[ 0 ] = fail[ 1 ] = 1;
2278 for ( charid = 0; charid < ucharcount ; charid++ ) {
2279 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2281 q[ q_write ] = newstate;
2282 /* set to point at the root */
2283 fail[ q[ q_write++ ] ]=1;
2286 while ( q_read < q_write) {
2287 const U32 cur = q[ q_read++ % numstates ];
2288 base = trie->states[ cur ].trans.base;
2290 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2291 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2293 U32 fail_state = cur;
2296 fail_state = fail[ fail_state ];
2297 fail_base = aho->states[ fail_state ].trans.base;
2298 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2300 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2301 fail[ ch_state ] = fail_state;
2302 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2304 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2306 q[ q_write++ % numstates] = ch_state;
2310 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2311 when we fail in state 1, this allows us to use the
2312 charclass scan to find a valid start char. This is based on the principle
2313 that theres a good chance the string being searched contains lots of stuff
2314 that cant be a start char.
2316 fail[ 0 ] = fail[ 1 ] = 0;
2317 DEBUG_TRIE_COMPILE_r({
2318 PerlIO_printf(Perl_debug_log,
2319 "%*sStclass Failtable (%"UVuf" states): 0",
2320 (int)(depth * 2), "", (UV)numstates
2322 for( q_read=1; q_read<numstates; q_read++ ) {
2323 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2325 PerlIO_printf(Perl_debug_log, "\n");
2328 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2333 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2334 * These need to be revisited when a newer toolchain becomes available.
2336 #if defined(__sparc64__) && defined(__GNUC__)
2337 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2338 # undef SPARC64_GCC_WORKAROUND
2339 # define SPARC64_GCC_WORKAROUND 1
2343 #define DEBUG_PEEP(str,scan,depth) \
2344 DEBUG_OPTIMISE_r({if (scan){ \
2345 SV * const mysv=sv_newmortal(); \
2346 regnode *Next = regnext(scan); \
2347 regprop(RExC_rx, mysv, scan); \
2348 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2349 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2350 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2357 #define JOIN_EXACT(scan,min,flags) \
2358 if (PL_regkind[OP(scan)] == EXACT) \
2359 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2362 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2363 /* Merge several consecutive EXACTish nodes into one. */
2364 regnode *n = regnext(scan);
2366 regnode *next = scan + NODE_SZ_STR(scan);
2370 regnode *stop = scan;
2371 GET_RE_DEBUG_FLAGS_DECL;
2373 PERL_UNUSED_ARG(depth);
2376 PERL_ARGS_ASSERT_JOIN_EXACT;
2377 #ifndef EXPERIMENTAL_INPLACESCAN
2378 PERL_UNUSED_ARG(flags);
2379 PERL_UNUSED_ARG(val);
2381 DEBUG_PEEP("join",scan,depth);
2383 /* Skip NOTHING, merge EXACT*. */
2385 ( PL_regkind[OP(n)] == NOTHING ||
2386 (stringok && (OP(n) == OP(scan))))
2388 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2390 if (OP(n) == TAIL || n > next)
2392 if (PL_regkind[OP(n)] == NOTHING) {
2393 DEBUG_PEEP("skip:",n,depth);
2394 NEXT_OFF(scan) += NEXT_OFF(n);
2395 next = n + NODE_STEP_REGNODE;
2402 else if (stringok) {
2403 const unsigned int oldl = STR_LEN(scan);
2404 regnode * const nnext = regnext(n);
2406 DEBUG_PEEP("merg",n,depth);
2409 if (oldl + STR_LEN(n) > U8_MAX)
2411 NEXT_OFF(scan) += NEXT_OFF(n);
2412 STR_LEN(scan) += STR_LEN(n);
2413 next = n + NODE_SZ_STR(n);
2414 /* Now we can overwrite *n : */
2415 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2423 #ifdef EXPERIMENTAL_INPLACESCAN
2424 if (flags && !NEXT_OFF(n)) {
2425 DEBUG_PEEP("atch", val, depth);
2426 if (reg_off_by_arg[OP(n)]) {
2427 ARG_SET(n, val - n);
2430 NEXT_OFF(n) = val - n;
2437 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2439 Two problematic code points in Unicode casefolding of EXACT nodes:
2441 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2442 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2448 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2449 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2451 This means that in case-insensitive matching (or "loose matching",
2452 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2453 length of the above casefolded versions) can match a target string
2454 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2455 This would rather mess up the minimum length computation.
2457 What we'll do is to look for the tail four bytes, and then peek
2458 at the preceding two bytes to see whether we need to decrease
2459 the minimum length by four (six minus two).
2461 Thanks to the design of UTF-8, there cannot be false matches:
2462 A sequence of valid UTF-8 bytes cannot be a subsequence of
2463 another valid sequence of UTF-8 bytes.
2466 char * const s0 = STRING(scan), *s, *t;
2467 char * const s1 = s0 + STR_LEN(scan) - 1;
2468 char * const s2 = s1 - 4;
2469 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2470 const char t0[] = "\xaf\x49\xaf\x42";
2472 const char t0[] = "\xcc\x88\xcc\x81";
2474 const char * const t1 = t0 + 3;
2477 s < s2 && (t = ninstr(s, s1, t0, t1));
2480 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2481 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2483 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2484 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2492 n = scan + NODE_SZ_STR(scan);
2494 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2501 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2505 /* REx optimizer. Converts nodes into quickier variants "in place".
2506 Finds fixed substrings. */
2508 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2509 to the position after last scanned or to NULL. */
2511 #define INIT_AND_WITHP \
2512 assert(!and_withp); \
2513 Newx(and_withp,1,struct regnode_charclass_class); \
2514 SAVEFREEPV(and_withp)
2516 /* this is a chain of data about sub patterns we are processing that
2517 need to be handled seperately/specially in study_chunk. Its so
2518 we can simulate recursion without losing state. */
2520 typedef struct scan_frame {
2521 regnode *last; /* last node to process in this frame */
2522 regnode *next; /* next node to process when last is reached */
2523 struct scan_frame *prev; /*previous frame*/
2524 I32 stop; /* what stopparen do we use */
2528 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2530 #define CASE_SYNST_FNC(nAmE) \
2532 if (flags & SCF_DO_STCLASS_AND) { \
2533 for (value = 0; value < 256; value++) \
2534 if (!is_ ## nAmE ## _cp(value)) \
2535 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2538 for (value = 0; value < 256; value++) \
2539 if (is_ ## nAmE ## _cp(value)) \
2540 ANYOF_BITMAP_SET(data->start_class, value); \
2544 if (flags & SCF_DO_STCLASS_AND) { \
2545 for (value = 0; value < 256; value++) \
2546 if (is_ ## nAmE ## _cp(value)) \
2547 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2550 for (value = 0; value < 256; value++) \
2551 if (!is_ ## nAmE ## _cp(value)) \
2552 ANYOF_BITMAP_SET(data->start_class, value); \
2559 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2560 I32 *minlenp, I32 *deltap,
2565 struct regnode_charclass_class *and_withp,
2566 U32 flags, U32 depth)
2567 /* scanp: Start here (read-write). */
2568 /* deltap: Write maxlen-minlen here. */
2569 /* last: Stop before this one. */
2570 /* data: string data about the pattern */
2571 /* stopparen: treat close N as END */
2572 /* recursed: which subroutines have we recursed into */
2573 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2576 I32 min = 0, pars = 0, code;
2577 regnode *scan = *scanp, *next;
2579 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2580 int is_inf_internal = 0; /* The studied chunk is infinite */
2581 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2582 scan_data_t data_fake;
2583 SV *re_trie_maxbuff = NULL;
2584 regnode *first_non_open = scan;
2585 I32 stopmin = I32_MAX;
2586 scan_frame *frame = NULL;
2587 GET_RE_DEBUG_FLAGS_DECL;
2589 PERL_ARGS_ASSERT_STUDY_CHUNK;
2592 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2596 while (first_non_open && OP(first_non_open) == OPEN)
2597 first_non_open=regnext(first_non_open);
2602 while ( scan && OP(scan) != END && scan < last ){
2603 /* Peephole optimizer: */
2604 DEBUG_STUDYDATA("Peep:", data,depth);
2605 DEBUG_PEEP("Peep",scan,depth);
2606 JOIN_EXACT(scan,&min,0);
2608 /* Follow the next-chain of the current node and optimize
2609 away all the NOTHINGs from it. */
2610 if (OP(scan) != CURLYX) {
2611 const int max = (reg_off_by_arg[OP(scan)]
2613 /* I32 may be smaller than U16 on CRAYs! */
2614 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2615 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2619 /* Skip NOTHING and LONGJMP. */
2620 while ((n = regnext(n))
2621 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2622 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2623 && off + noff < max)
2625 if (reg_off_by_arg[OP(scan)])
2628 NEXT_OFF(scan) = off;
2633 /* The principal pseudo-switch. Cannot be a switch, since we
2634 look into several different things. */
2635 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2636 || OP(scan) == IFTHEN) {
2637 next = regnext(scan);
2639 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2641 if (OP(next) == code || code == IFTHEN) {
2642 /* NOTE - There is similar code to this block below for handling
2643 TRIE nodes on a re-study. If you change stuff here check there
2645 I32 max1 = 0, min1 = I32_MAX, num = 0;
2646 struct regnode_charclass_class accum;
2647 regnode * const startbranch=scan;
2649 if (flags & SCF_DO_SUBSTR)
2650 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2651 if (flags & SCF_DO_STCLASS)
2652 cl_init_zero(pRExC_state, &accum);
2654 while (OP(scan) == code) {
2655 I32 deltanext, minnext, f = 0, fake;
2656 struct regnode_charclass_class this_class;
2659 data_fake.flags = 0;
2661 data_fake.whilem_c = data->whilem_c;
2662 data_fake.last_closep = data->last_closep;
2665 data_fake.last_closep = &fake;
2667 data_fake.pos_delta = delta;
2668 next = regnext(scan);
2669 scan = NEXTOPER(scan);
2671 scan = NEXTOPER(scan);
2672 if (flags & SCF_DO_STCLASS) {
2673 cl_init(pRExC_state, &this_class);
2674 data_fake.start_class = &this_class;
2675 f = SCF_DO_STCLASS_AND;
2677 if (flags & SCF_WHILEM_VISITED_POS)
2678 f |= SCF_WHILEM_VISITED_POS;
2680 /* we suppose the run is continuous, last=next...*/
2681 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2683 stopparen, recursed, NULL, f,depth+1);
2686 if (max1 < minnext + deltanext)
2687 max1 = minnext + deltanext;
2688 if (deltanext == I32_MAX)
2689 is_inf = is_inf_internal = 1;
2691 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2693 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2694 if ( stopmin > minnext)
2695 stopmin = min + min1;
2696 flags &= ~SCF_DO_SUBSTR;
2698 data->flags |= SCF_SEEN_ACCEPT;
2701 if (data_fake.flags & SF_HAS_EVAL)
2702 data->flags |= SF_HAS_EVAL;
2703 data->whilem_c = data_fake.whilem_c;
2705 if (flags & SCF_DO_STCLASS)
2706 cl_or(pRExC_state, &accum, &this_class);
2708 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2710 if (flags & SCF_DO_SUBSTR) {
2711 data->pos_min += min1;
2712 data->pos_delta += max1 - min1;
2713 if (max1 != min1 || is_inf)
2714 data->longest = &(data->longest_float);
2717 delta += max1 - min1;
2718 if (flags & SCF_DO_STCLASS_OR) {
2719 cl_or(pRExC_state, data->start_class, &accum);
2721 cl_and(data->start_class, and_withp);
2722 flags &= ~SCF_DO_STCLASS;
2725 else if (flags & SCF_DO_STCLASS_AND) {
2727 cl_and(data->start_class, &accum);
2728 flags &= ~SCF_DO_STCLASS;
2731 /* Switch to OR mode: cache the old value of
2732 * data->start_class */
2734 StructCopy(data->start_class, and_withp,
2735 struct regnode_charclass_class);
2736 flags &= ~SCF_DO_STCLASS_AND;
2737 StructCopy(&accum, data->start_class,
2738 struct regnode_charclass_class);
2739 flags |= SCF_DO_STCLASS_OR;
2740 data->start_class->flags |= ANYOF_EOS;
2744 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2747 Assuming this was/is a branch we are dealing with: 'scan' now
2748 points at the item that follows the branch sequence, whatever
2749 it is. We now start at the beginning of the sequence and look
2756 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2758 If we can find such a subseqence we need to turn the first
2759 element into a trie and then add the subsequent branch exact
2760 strings to the trie.
2764 1. patterns where the whole set of branch can be converted.
2766 2. patterns where only a subset can be converted.
2768 In case 1 we can replace the whole set with a single regop
2769 for the trie. In case 2 we need to keep the start and end
2772 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2773 becomes BRANCH TRIE; BRANCH X;
2775 There is an additional case, that being where there is a
2776 common prefix, which gets split out into an EXACT like node
2777 preceding the TRIE node.
2779 If x(1..n)==tail then we can do a simple trie, if not we make
2780 a "jump" trie, such that when we match the appropriate word
2781 we "jump" to the appopriate tail node. Essentailly we turn
2782 a nested if into a case structure of sorts.
2787 if (!re_trie_maxbuff) {
2788 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2789 if (!SvIOK(re_trie_maxbuff))
2790 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2792 if ( SvIV(re_trie_maxbuff)>=0 ) {
2794 regnode *first = (regnode *)NULL;
2795 regnode *last = (regnode *)NULL;
2796 regnode *tail = scan;
2801 SV * const mysv = sv_newmortal(); /* for dumping */
2803 /* var tail is used because there may be a TAIL
2804 regop in the way. Ie, the exacts will point to the
2805 thing following the TAIL, but the last branch will
2806 point at the TAIL. So we advance tail. If we
2807 have nested (?:) we may have to move through several
2811 while ( OP( tail ) == TAIL ) {
2812 /* this is the TAIL generated by (?:) */
2813 tail = regnext( tail );
2818 regprop(RExC_rx, mysv, tail );
2819 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2820 (int)depth * 2 + 2, "",
2821 "Looking for TRIE'able sequences. Tail node is: ",
2822 SvPV_nolen_const( mysv )
2828 step through the branches, cur represents each
2829 branch, noper is the first thing to be matched
2830 as part of that branch and noper_next is the
2831 regnext() of that node. if noper is an EXACT
2832 and noper_next is the same as scan (our current
2833 position in the regex) then the EXACT branch is
2834 a possible optimization target. Once we have
2835 two or more consequetive such branches we can
2836 create a trie of the EXACT's contents and stich
2837 it in place. If the sequence represents all of
2838 the branches we eliminate the whole thing and
2839 replace it with a single TRIE. If it is a
2840 subsequence then we need to stitch it in. This
2841 means the first branch has to remain, and needs
2842 to be repointed at the item on the branch chain
2843 following the last branch optimized. This could
2844 be either a BRANCH, in which case the
2845 subsequence is internal, or it could be the
2846 item following the branch sequence in which
2847 case the subsequence is at the end.
2851 /* dont use tail as the end marker for this traverse */
2852 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2853 regnode * const noper = NEXTOPER( cur );
2854 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2855 regnode * const noper_next = regnext( noper );
2859 regprop(RExC_rx, mysv, cur);
2860 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2861 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2863 regprop(RExC_rx, mysv, noper);
2864 PerlIO_printf( Perl_debug_log, " -> %s",
2865 SvPV_nolen_const(mysv));
2868 regprop(RExC_rx, mysv, noper_next );
2869 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2870 SvPV_nolen_const(mysv));
2872 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2873 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2875 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2876 : PL_regkind[ OP( noper ) ] == EXACT )
2877 || OP(noper) == NOTHING )
2879 && noper_next == tail
2884 if ( !first || optype == NOTHING ) {
2885 if (!first) first = cur;
2886 optype = OP( noper );
2892 Currently we do not believe that the trie logic can
2893 handle case insensitive matching properly when the
2894 pattern is not unicode (thus forcing unicode semantics).
2896 If/when this is fixed the following define can be swapped
2897 in below to fully enable trie logic.
2899 #define TRIE_TYPE_IS_SAFE 1
2902 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2904 if ( last && TRIE_TYPE_IS_SAFE ) {
2905 make_trie( pRExC_state,
2906 startbranch, first, cur, tail, count,
2909 if ( PL_regkind[ OP( noper ) ] == EXACT
2911 && noper_next == tail
2916 optype = OP( noper );
2926 regprop(RExC_rx, mysv, cur);
2927 PerlIO_printf( Perl_debug_log,
2928 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2929 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2933 if ( last && TRIE_TYPE_IS_SAFE ) {
2934 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2935 #ifdef TRIE_STUDY_OPT
2936 if ( ((made == MADE_EXACT_TRIE &&
2937 startbranch == first)
2938 || ( first_non_open == first )) &&
2940 flags |= SCF_TRIE_RESTUDY;
2941 if ( startbranch == first
2944 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2954 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2955 scan = NEXTOPER(NEXTOPER(scan));
2956 } else /* single branch is optimized. */
2957 scan = NEXTOPER(scan);
2959 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2960 scan_frame *newframe = NULL;
2965 if (OP(scan) != SUSPEND) {
2966 /* set the pointer */
2967 if (OP(scan) == GOSUB) {
2969 RExC_recurse[ARG2L(scan)] = scan;
2970 start = RExC_open_parens[paren-1];
2971 end = RExC_close_parens[paren-1];
2974 start = RExC_rxi->program + 1;
2978 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2979 SAVEFREEPV(recursed);
2981 if (!PAREN_TEST(recursed,paren+1)) {
2982 PAREN_SET(recursed,paren+1);
2983 Newx(newframe,1,scan_frame);
2985 if (flags & SCF_DO_SUBSTR) {
2986 SCAN_COMMIT(pRExC_state,data,minlenp);
2987 data->longest = &(data->longest_float);
2989 is_inf = is_inf_internal = 1;
2990 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2991 cl_anything(pRExC_state, data->start_class);
2992 flags &= ~SCF_DO_STCLASS;
2995 Newx(newframe,1,scan_frame);
2998 end = regnext(scan);
3003 SAVEFREEPV(newframe);
3004 newframe->next = regnext(scan);
3005 newframe->last = last;
3006 newframe->stop = stopparen;
3007 newframe->prev = frame;
3017 else if (OP(scan) == EXACT) {
3018 I32 l = STR_LEN(scan);
3021 const U8 * const s = (U8*)STRING(scan);
3022 l = utf8_length(s, s + l);
3023 uc = utf8_to_uvchr(s, NULL);
3025 uc = *((U8*)STRING(scan));
3028 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3029 /* The code below prefers earlier match for fixed
3030 offset, later match for variable offset. */
3031 if (data->last_end == -1) { /* Update the start info. */
3032 data->last_start_min = data->pos_min;
3033 data->last_start_max = is_inf
3034 ? I32_MAX : data->pos_min + data->pos_delta;
3036 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3038 SvUTF8_on(data->last_found);
3040 SV * const sv = data->last_found;
3041 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3042 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3043 if (mg && mg->mg_len >= 0)
3044 mg->mg_len += utf8_length((U8*)STRING(scan),
3045 (U8*)STRING(scan)+STR_LEN(scan));
3047 data->last_end = data->pos_min + l;
3048 data->pos_min += l; /* As in the first entry. */
3049 data->flags &= ~SF_BEFORE_EOL;
3051 if (flags & SCF_DO_STCLASS_AND) {
3052 /* Check whether it is compatible with what we know already! */
3056 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3057 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3058 && (!(data->start_class->flags & ANYOF_FOLD)
3059 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3062 ANYOF_CLASS_ZERO(data->start_class);
3063 ANYOF_BITMAP_ZERO(data->start_class);
3065 ANYOF_BITMAP_SET(data->start_class, uc);
3066 data->start_class->flags &= ~ANYOF_EOS;
3068 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3070 else if (flags & SCF_DO_STCLASS_OR) {
3071 /* false positive possible if the class is case-folded */
3073 ANYOF_BITMAP_SET(data->start_class, uc);
3075 data->start_class->flags |= ANYOF_UNICODE_ALL;
3076 data->start_class->flags &= ~ANYOF_EOS;
3077 cl_and(data->start_class, and_withp);
3079 flags &= ~SCF_DO_STCLASS;
3081 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3082 I32 l = STR_LEN(scan);
3083 UV uc = *((U8*)STRING(scan));
3085 /* Search for fixed substrings supports EXACT only. */
3086 if (flags & SCF_DO_SUBSTR) {
3088 SCAN_COMMIT(pRExC_state, data, minlenp);
3091 const U8 * const s = (U8 *)STRING(scan);
3092 l = utf8_length(s, s + l);
3093 uc = utf8_to_uvchr(s, NULL);
3096 if (flags & SCF_DO_SUBSTR)
3098 if (flags & SCF_DO_STCLASS_AND) {
3099 /* Check whether it is compatible with what we know already! */
3103 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3104 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3105 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3107 ANYOF_CLASS_ZERO(data->start_class);
3108 ANYOF_BITMAP_ZERO(data->start_class);
3110 ANYOF_BITMAP_SET(data->start_class, uc);
3111 data->start_class->flags &= ~ANYOF_EOS;
3112 data->start_class->flags |= ANYOF_FOLD;
3113 if (OP(scan) == EXACTFL)
3114 data->start_class->flags |= ANYOF_LOCALE;
3117 else if (flags & SCF_DO_STCLASS_OR) {
3118 if (data->start_class->flags & ANYOF_FOLD) {
3119 /* false positive possible if the class is case-folded.
3120 Assume that the locale settings are the same... */
3122 ANYOF_BITMAP_SET(data->start_class, uc);
3123 data->start_class->flags &= ~ANYOF_EOS;
3125 cl_and(data->start_class, and_withp);
3127 flags &= ~SCF_DO_STCLASS;
3129 else if (REGNODE_VARIES(OP(scan))) {
3130 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3131 I32 f = flags, pos_before = 0;
3132 regnode * const oscan = scan;
3133 struct regnode_charclass_class this_class;
3134 struct regnode_charclass_class *oclass = NULL;
3135 I32 next_is_eval = 0;
3137 switch (PL_regkind[OP(scan)]) {
3138 case WHILEM: /* End of (?:...)* . */
3139 scan = NEXTOPER(scan);
3142 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3143 next = NEXTOPER(scan);
3144 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3146 maxcount = REG_INFTY;
3147 next = regnext(scan);
3148 scan = NEXTOPER(scan);
3152 if (flags & SCF_DO_SUBSTR)
3157 if (flags & SCF_DO_STCLASS) {
3159 maxcount = REG_INFTY;
3160 next = regnext(scan);
3161 scan = NEXTOPER(scan);
3164 is_inf = is_inf_internal = 1;
3165 scan = regnext(scan);
3166 if (flags & SCF_DO_SUBSTR) {
3167 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3168 data->longest = &(data->longest_float);
3170 goto optimize_curly_tail;
3172 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3173 && (scan->flags == stopparen))
3178 mincount = ARG1(scan);
3179 maxcount = ARG2(scan);
3181 next = regnext(scan);
3182 if (OP(scan) == CURLYX) {
3183 I32 lp = (data ? *(data->last_closep) : 0);
3184 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3186 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3187 next_is_eval = (OP(scan) == EVAL);
3189 if (flags & SCF_DO_SUBSTR) {
3190 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3191 pos_before = data->pos_min;
3195 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3197 data->flags |= SF_IS_INF;
3199 if (flags & SCF_DO_STCLASS) {
3200 cl_init(pRExC_state, &this_class);
3201 oclass = data->start_class;
3202 data->start_class = &this_class;
3203 f |= SCF_DO_STCLASS_AND;
3204 f &= ~SCF_DO_STCLASS_OR;
3206 /* These are the cases when once a subexpression
3207 fails at a particular position, it cannot succeed
3208 even after backtracking at the enclosing scope.
3210 XXXX what if minimal match and we are at the
3211 initial run of {n,m}? */
3212 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3213 f &= ~SCF_WHILEM_VISITED_POS;
3215 /* This will finish on WHILEM, setting scan, or on NULL: */
3216 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3217 last, data, stopparen, recursed, NULL,
3219 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3221 if (flags & SCF_DO_STCLASS)
3222 data->start_class = oclass;
3223 if (mincount == 0 || minnext == 0) {
3224 if (flags & SCF_DO_STCLASS_OR) {
3225 cl_or(pRExC_state, data->start_class, &this_class);
3227 else if (flags & SCF_DO_STCLASS_AND) {
3228 /* Switch to OR mode: cache the old value of
3229 * data->start_class */
3231 StructCopy(data->start_class, and_withp,
3232 struct regnode_charclass_class);
3233 flags &= ~SCF_DO_STCLASS_AND;
3234 StructCopy(&this_class, data->start_class,
3235 struct regnode_charclass_class);
3236 flags |= SCF_DO_STCLASS_OR;
3237 data->start_class->flags |= ANYOF_EOS;
3239 } else { /* Non-zero len */
3240 if (flags & SCF_DO_STCLASS_OR) {
3241 cl_or(pRExC_state, data->start_class, &this_class);
3242 cl_and(data->start_class, and_withp);
3244 else if (flags & SCF_DO_STCLASS_AND)
3245 cl_and(data->start_class, &this_class);
3246 flags &= ~SCF_DO_STCLASS;
3248 if (!scan) /* It was not CURLYX, but CURLY. */
3250 if ( /* ? quantifier ok, except for (?{ ... }) */
3251 (next_is_eval || !(mincount == 0 && maxcount == 1))
3252 && (minnext == 0) && (deltanext == 0)
3253 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3254 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3256 ckWARNreg(RExC_parse,
3257 "Quantifier unexpected on zero-length expression");
3260 min += minnext * mincount;
3261 is_inf_internal |= ((maxcount == REG_INFTY
3262 && (minnext + deltanext) > 0)
3263 || deltanext == I32_MAX);
3264 is_inf |= is_inf_internal;
3265 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3267 /* Try powerful optimization CURLYX => CURLYN. */
3268 if ( OP(oscan) == CURLYX && data
3269 && data->flags & SF_IN_PAR
3270 && !(data->flags & SF_HAS_EVAL)
3271 && !deltanext && minnext == 1 ) {
3272 /* Try to optimize to CURLYN. */
3273 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3274 regnode * const nxt1 = nxt;
3281 if (!REGNODE_SIMPLE(OP(nxt))
3282 && !(PL_regkind[OP(nxt)] == EXACT
3283 && STR_LEN(nxt) == 1))
3289 if (OP(nxt) != CLOSE)
3291 if (RExC_open_parens) {
3292 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3293 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3295 /* Now we know that nxt2 is the only contents: */
3296 oscan->flags = (U8)ARG(nxt);
3298 OP(nxt1) = NOTHING; /* was OPEN. */
3301 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3302 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3303 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3304 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3305 OP(nxt + 1) = OPTIMIZED; /* was count. */
3306 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3311 /* Try optimization CURLYX => CURLYM. */
3312 if ( OP(oscan) == CURLYX && data
3313 && !(data->flags & SF_HAS_PAR)
3314 && !(data->flags & SF_HAS_EVAL)
3315 && !deltanext /* atom is fixed width */
3316 && minnext != 0 /* CURLYM can't handle zero width */
3318 /* XXXX How to optimize if data == 0? */
3319 /* Optimize to a simpler form. */
3320 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3324 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3325 && (OP(nxt2) != WHILEM))
3327 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3328 /* Need to optimize away parenths. */
3329 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3330 /* Set the parenth number. */
3331 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3333 oscan->flags = (U8)ARG(nxt);
3334 if (RExC_open_parens) {
3335 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3336 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3338 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3339 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3342 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3343 OP(nxt + 1) = OPTIMIZED; /* was count. */
3344 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3345 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3348 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3349 regnode *nnxt = regnext(nxt1);
3352 if (reg_off_by_arg[OP(nxt1)])
3353 ARG_SET(nxt1, nxt2 - nxt1);
3354 else if (nxt2 - nxt1 < U16_MAX)
3355 NEXT_OFF(nxt1) = nxt2 - nxt1;
3357 OP(nxt) = NOTHING; /* Cannot beautify */
3362 /* Optimize again: */
3363 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3364 NULL, stopparen, recursed, NULL, 0,depth+1);
3369 else if ((OP(oscan) == CURLYX)
3370 && (flags & SCF_WHILEM_VISITED_POS)
3371 /* See the comment on a similar expression above.
3372 However, this time it not a subexpression
3373 we care about, but the expression itself. */
3374 && (maxcount == REG_INFTY)
3375 && data && ++data->whilem_c < 16) {
3376 /* This stays as CURLYX, we can put the count/of pair. */
3377 /* Find WHILEM (as in regexec.c) */
3378 regnode *nxt = oscan + NEXT_OFF(oscan);
3380 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3382 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3383 | (RExC_whilem_seen << 4)); /* On WHILEM */
3385 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3387 if (flags & SCF_DO_SUBSTR) {
3388 SV *last_str = NULL;
3389 int counted = mincount != 0;
3391 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3392 #if defined(SPARC64_GCC_WORKAROUND)
3395 const char *s = NULL;
3398 if (pos_before >= data->last_start_min)
3401 b = data->last_start_min;
3404 s = SvPV_const(data->last_found, l);
3405 old = b - data->last_start_min;
3408 I32 b = pos_before >= data->last_start_min
3409 ? pos_before : data->last_start_min;
3411 const char * const s = SvPV_const(data->last_found, l);
3412 I32 old = b - data->last_start_min;
3416 old = utf8_hop((U8*)s, old) - (U8*)s;
3419 /* Get the added string: */
3420 last_str = newSVpvn_utf8(s + old, l, UTF);
3421 if (deltanext == 0 && pos_before == b) {
3422 /* What was added is a constant string */
3424 SvGROW(last_str, (mincount * l) + 1);
3425 repeatcpy(SvPVX(last_str) + l,
3426 SvPVX_const(last_str), l, mincount - 1);
3427 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3428 /* Add additional parts. */
3429 SvCUR_set(data->last_found,
3430 SvCUR(data->last_found) - l);
3431 sv_catsv(data->last_found, last_str);
3433 SV * sv = data->last_found;
3435 SvUTF8(sv) && SvMAGICAL(sv) ?
3436 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3437 if (mg && mg->mg_len >= 0)
3438 mg->mg_len += CHR_SVLEN(last_str) - l;
3440 data->last_end += l * (mincount - 1);
3443 /* start offset must point into the last copy */
3444 data->last_start_min += minnext * (mincount - 1);
3445 data->last_start_max += is_inf ? I32_MAX
3446 : (maxcount - 1) * (minnext + data->pos_delta);
3449 /* It is counted once already... */
3450 data->pos_min += minnext * (mincount - counted);
3451 data->pos_delta += - counted * deltanext +
3452 (minnext + deltanext) * maxcount - minnext * mincount;
3453 if (mincount != maxcount) {
3454 /* Cannot extend fixed substrings found inside
3456 SCAN_COMMIT(pRExC_state,data,minlenp);
3457 if (mincount && last_str) {
3458 SV * const sv = data->last_found;
3459 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3460 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3464 sv_setsv(sv, last_str);
3465 data->last_end = data->pos_min;
3466 data->last_start_min =
3467 data->pos_min - CHR_SVLEN(last_str);
3468 data->last_start_max = is_inf
3470 : data->pos_min + data->pos_delta
3471 - CHR_SVLEN(last_str);
3473 data->longest = &(data->longest_float);
3475 SvREFCNT_dec(last_str);
3477 if (data && (fl & SF_HAS_EVAL))
3478 data->flags |= SF_HAS_EVAL;
3479 optimize_curly_tail:
3480 if (OP(oscan) != CURLYX) {
3481 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3483 NEXT_OFF(oscan) += NEXT_OFF(next);
3486 default: /* REF and CLUMP only? */
3487 if (flags & SCF_DO_SUBSTR) {
3488 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3489 data->longest = &(data->longest_float);
3491 is_inf = is_inf_internal = 1;
3492 if (flags & SCF_DO_STCLASS_OR)
3493 cl_anything(pRExC_state, data->start_class);
3494 flags &= ~SCF_DO_STCLASS;
3498 else if (OP(scan) == LNBREAK) {
3499 if (flags & SCF_DO_STCLASS) {
3501 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3502 if (flags & SCF_DO_STCLASS_AND) {
3503 for (value = 0; value < 256; value++)
3504 if (!is_VERTWS_cp(value))
3505 ANYOF_BITMAP_CLEAR(data->start_class, value);
3508 for (value = 0; value < 256; value++)
3509 if (is_VERTWS_cp(value))
3510 ANYOF_BITMAP_SET(data->start_class, value);
3512 if (flags & SCF_DO_STCLASS_OR)
3513 cl_and(data->start_class, and_withp);
3514 flags &= ~SCF_DO_STCLASS;
3518 if (flags & SCF_DO_SUBSTR) {
3519 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3521 data->pos_delta += 1;
3522 data->longest = &(data->longest_float);
3526 else if (OP(scan) == FOLDCHAR) {
3527 int d = ARG(scan)==0xDF ? 1 : 2;
3528 flags &= ~SCF_DO_STCLASS;
3531 if (flags & SCF_DO_SUBSTR) {
3532 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3534 data->pos_delta += d;
3535 data->longest = &(data->longest_float);
3538 else if (REGNODE_SIMPLE(OP(scan))) {
3541 if (flags & SCF_DO_SUBSTR) {
3542 SCAN_COMMIT(pRExC_state,data,minlenp);
3546 if (flags & SCF_DO_STCLASS) {
3547 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3549 /* Some of the logic below assumes that switching
3550 locale on will only add false positives. */
3551 switch (PL_regkind[OP(scan)]) {
3555 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3556 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3557 cl_anything(pRExC_state, data->start_class);
3560 if (OP(scan) == SANY)
3562 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3563 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3564 || (data->start_class->flags & ANYOF_CLASS));
3565 cl_anything(pRExC_state, data->start_class);
3567 if (flags & SCF_DO_STCLASS_AND || !value)
3568 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3571 if (flags & SCF_DO_STCLASS_AND)
3572 cl_and(data->start_class,
3573 (struct regnode_charclass_class*)scan);
3575 cl_or(pRExC_state, data->start_class,
3576 (struct regnode_charclass_class*)scan);
3579 if (flags & SCF_DO_STCLASS_AND) {
3580 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3581 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3582 for (value = 0; value < 256; value++)
3583 if (!isALNUM(value))
3584 ANYOF_BITMAP_CLEAR(data->start_class, value);
3588 if (data->start_class->flags & ANYOF_LOCALE)
3589 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3591 for (value = 0; value < 256; value++)
3593 ANYOF_BITMAP_SET(data->start_class, value);
3598 if (flags & SCF_DO_STCLASS_AND) {
3599 if (data->start_class->flags & ANYOF_LOCALE)
3600 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3603 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3604 data->start_class->flags |= ANYOF_LOCALE;
3608 if (flags & SCF_DO_STCLASS_AND) {
3609 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3610 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3611 for (value = 0; value < 256; value++)
3613 ANYOF_BITMAP_CLEAR(data->start_class, value);
3617 if (data->start_class->flags & ANYOF_LOCALE)
3618 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3620 for (value = 0; value < 256; value++)
3621 if (!isALNUM(value))
3622 ANYOF_BITMAP_SET(data->start_class, value);
3627 if (flags & SCF_DO_STCLASS_AND) {
3628 if (data->start_class->flags & ANYOF_LOCALE)
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3632 data->start_class->flags |= ANYOF_LOCALE;
3633 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3637 if (flags & SCF_DO_STCLASS_AND) {
3638 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3639 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3640 for (value = 0; value < 256; value++)
3641 if (!isSPACE(value))
3642 ANYOF_BITMAP_CLEAR(data->start_class, value);
3646 if (data->start_class->flags & ANYOF_LOCALE)
3647 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3649 for (value = 0; value < 256; value++)
3651 ANYOF_BITMAP_SET(data->start_class, value);
3656 if (flags & SCF_DO_STCLASS_AND) {
3657 if (data->start_class->flags & ANYOF_LOCALE)
3658 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3661 data->start_class->flags |= ANYOF_LOCALE;
3662 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3666 if (flags & SCF_DO_STCLASS_AND) {
3667 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3668 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3669 for (value = 0; value < 256; value++)
3671 ANYOF_BITMAP_CLEAR(data->start_class, value);
3675 if (data->start_class->flags & ANYOF_LOCALE)
3676 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3678 for (value = 0; value < 256; value++)
3679 if (!isSPACE(value))
3680 ANYOF_BITMAP_SET(data->start_class, value);
3685 if (flags & SCF_DO_STCLASS_AND) {
3686 if (data->start_class->flags & ANYOF_LOCALE) {
3687 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3688 for (value = 0; value < 256; value++)
3689 if (!isSPACE(value))
3690 ANYOF_BITMAP_CLEAR(data->start_class, value);
3694 data->start_class->flags |= ANYOF_LOCALE;
3695 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3699 if (flags & SCF_DO_STCLASS_AND) {
3700 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3701 for (value = 0; value < 256; value++)
3702 if (!isDIGIT(value))
3703 ANYOF_BITMAP_CLEAR(data->start_class, value);
3706 if (data->start_class->flags & ANYOF_LOCALE)
3707 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3709 for (value = 0; value < 256; value++)
3711 ANYOF_BITMAP_SET(data->start_class, value);
3716 if (flags & SCF_DO_STCLASS_AND) {
3717 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3718 for (value = 0; value < 256; value++)
3720 ANYOF_BITMAP_CLEAR(data->start_class, value);
3723 if (data->start_class->flags & ANYOF_LOCALE)
3724 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3726 for (value = 0; value < 256; value++)
3727 if (!isDIGIT(value))
3728 ANYOF_BITMAP_SET(data->start_class, value);
3732 CASE_SYNST_FNC(VERTWS);
3733 CASE_SYNST_FNC(HORIZWS);
3736 if (flags & SCF_DO_STCLASS_OR)
3737 cl_and(data->start_class, and_withp);
3738 flags &= ~SCF_DO_STCLASS;
3741 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3742 data->flags |= (OP(scan) == MEOL
3746 else if ( PL_regkind[OP(scan)] == BRANCHJ
3747 /* Lookbehind, or need to calculate parens/evals/stclass: */
3748 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3749 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3750 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3751 || OP(scan) == UNLESSM )
3753 /* Negative Lookahead/lookbehind
3754 In this case we can't do fixed string optimisation.
3757 I32 deltanext, minnext, fake = 0;
3759 struct regnode_charclass_class intrnl;
3762 data_fake.flags = 0;
3764 data_fake.whilem_c = data->whilem_c;
3765 data_fake.last_closep = data->last_closep;
3768 data_fake.last_closep = &fake;
3769 data_fake.pos_delta = delta;
3770 if ( flags & SCF_DO_STCLASS && !scan->flags
3771 && OP(scan) == IFMATCH ) { /* Lookahead */
3772 cl_init(pRExC_state, &intrnl);
3773 data_fake.start_class = &intrnl;
3774 f |= SCF_DO_STCLASS_AND;
3776 if (flags & SCF_WHILEM_VISITED_POS)
3777 f |= SCF_WHILEM_VISITED_POS;
3778 next = regnext(scan);
3779 nscan = NEXTOPER(NEXTOPER(scan));
3780 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3781 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3784 FAIL("Variable length lookbehind not implemented");
3786 else if (minnext > (I32)U8_MAX) {
3787 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3789 scan->flags = (U8)minnext;
3792 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3794 if (data_fake.flags & SF_HAS_EVAL)
3795 data->flags |= SF_HAS_EVAL;
3796 data->whilem_c = data_fake.whilem_c;
3798 if (f & SCF_DO_STCLASS_AND) {
3799 if (flags & SCF_DO_STCLASS_OR) {
3800 /* OR before, AND after: ideally we would recurse with
3801 * data_fake to get the AND applied by study of the
3802 * remainder of the pattern, and then derecurse;
3803 * *** HACK *** for now just treat as "no information".
3804 * See [perl #56690].
3806 cl_init(pRExC_state, data->start_class);
3808 /* AND before and after: combine and continue */
3809 const int was = (data->start_class->flags & ANYOF_EOS);
3811 cl_and(data->start_class, &intrnl);
3813 data->start_class->flags |= ANYOF_EOS;
3817 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3819 /* Positive Lookahead/lookbehind
3820 In this case we can do fixed string optimisation,
3821 but we must be careful about it. Note in the case of
3822 lookbehind the positions will be offset by the minimum
3823 length of the pattern, something we won't know about
3824 until after the recurse.