5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
116 I32 size; /* Code size. */
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
119 I32 nestroot; /* root parens we are in - used by accept */
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
127 HV *charnames; /* cache of named sequences */
128 HV *paren_names; /* Paren names */
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
133 char *starttry; /* -Dr: where regtry was called. */
134 #define RExC_starttry (pRExC_state->starttry)
137 const char *lastparse;
139 AV *paren_name_list; /* idx -> name */
140 #define RExC_lastparse (pRExC_state->lastparse)
141 #define RExC_lastnum (pRExC_state->lastnum)
142 #define RExC_paren_name_list (pRExC_state->paren_name_list)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_rxi (pRExC_state->rxi)
150 #define RExC_start (pRExC_state->start)
151 #define RExC_end (pRExC_state->end)
152 #define RExC_parse (pRExC_state->parse)
153 #define RExC_whilem_seen (pRExC_state->whilem_seen)
154 #define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
155 #define RExC_emit (pRExC_state->emit)
156 #define RExC_emit_start (pRExC_state->emit_start)
157 #define RExC_naughty (pRExC_state->naughty)
158 #define RExC_sawback (pRExC_state->sawback)
159 #define RExC_seen (pRExC_state->seen)
160 #define RExC_size (pRExC_state->size)
161 #define RExC_npar (pRExC_state->npar)
162 #define RExC_nestroot (pRExC_state->nestroot)
163 #define RExC_extralen (pRExC_state->extralen)
164 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
165 #define RExC_seen_evals (pRExC_state->seen_evals)
166 #define RExC_utf8 (pRExC_state->utf8)
167 #define RExC_charnames (pRExC_state->charnames)
168 #define RExC_open_parens (pRExC_state->open_parens)
169 #define RExC_close_parens (pRExC_state->close_parens)
170 #define RExC_opend (pRExC_state->opend)
171 #define RExC_paren_names (pRExC_state->paren_names)
172 #define RExC_recurse (pRExC_state->recurse)
173 #define RExC_recurse_count (pRExC_state->recurse_count)
176 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
177 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
178 ((*s) == '{' && regcurly(s)))
181 #undef SPSTART /* dratted cpp namespace... */
184 * Flags to be passed up and down.
186 #define WORST 0 /* Worst case. */
187 #define HASWIDTH 0x1 /* Known to match non-null strings. */
188 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
189 #define SPSTART 0x4 /* Starts with * or +. */
190 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
192 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
194 /* whether trie related optimizations are enabled */
195 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
196 #define TRIE_STUDY_OPT
197 #define FULL_TRIE_STUDY
203 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
204 #define PBITVAL(paren) (1 << ((paren) & 7))
205 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
206 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
207 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
210 /* About scan_data_t.
212 During optimisation we recurse through the regexp program performing
213 various inplace (keyhole style) optimisations. In addition study_chunk
214 and scan_commit populate this data structure with information about
215 what strings MUST appear in the pattern. We look for the longest
216 string that must appear for at a fixed location, and we look for the
217 longest string that may appear at a floating location. So for instance
222 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
223 strings (because they follow a .* construct). study_chunk will identify
224 both FOO and BAR as being the longest fixed and floating strings respectively.
226 The strings can be composites, for instance
230 will result in a composite fixed substring 'foo'.
232 For each string some basic information is maintained:
234 - offset or min_offset
235 This is the position the string must appear at, or not before.
236 It also implicitly (when combined with minlenp) tells us how many
237 character must match before the string we are searching.
238 Likewise when combined with minlenp and the length of the string
239 tells us how many characters must appear after the string we have
243 Only used for floating strings. This is the rightmost point that
244 the string can appear at. Ifset to I32 max it indicates that the
245 string can occur infinitely far to the right.
248 A pointer to the minimum length of the pattern that the string
249 was found inside. This is important as in the case of positive
250 lookahead or positive lookbehind we can have multiple patterns
255 The minimum length of the pattern overall is 3, the minimum length
256 of the lookahead part is 3, but the minimum length of the part that
257 will actually match is 1. So 'FOO's minimum length is 3, but the
258 minimum length for the F is 1. This is important as the minimum length
259 is used to determine offsets in front of and behind the string being
260 looked for. Since strings can be composites this is the length of the
261 pattern at the time it was commited with a scan_commit. Note that
262 the length is calculated by study_chunk, so that the minimum lengths
263 are not known until the full pattern has been compiled, thus the
264 pointer to the value.
268 In the case of lookbehind the string being searched for can be
269 offset past the start point of the final matching string.
270 If this value was just blithely removed from the min_offset it would
271 invalidate some of the calculations for how many chars must match
272 before or after (as they are derived from min_offset and minlen and
273 the length of the string being searched for).
274 When the final pattern is compiled and the data is moved from the
275 scan_data_t structure into the regexp structure the information
276 about lookbehind is factored in, with the information that would
277 have been lost precalculated in the end_shift field for the
280 The fields pos_min and pos_delta are used to store the minimum offset
281 and the delta to the maximum offset at the current point in the pattern.
285 typedef struct scan_data_t {
286 /*I32 len_min; unused */
287 /*I32 len_delta; unused */
291 I32 last_end; /* min value, <0 unless valid. */
294 SV **longest; /* Either &l_fixed, or &l_float. */
295 SV *longest_fixed; /* longest fixed string found in pattern */
296 I32 offset_fixed; /* offset where it starts */
297 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
299 SV *longest_float; /* longest floating string found in pattern */
300 I32 offset_float_min; /* earliest point in string it can appear */
301 I32 offset_float_max; /* latest point in string it can appear */
302 I32 *minlen_float; /* pointer to the minlen relevent to the string */
303 I32 lookbehind_float; /* is the position of the string modified by LB */
307 struct regnode_charclass_class *start_class;
311 * Forward declarations for pregcomp()'s friends.
314 static const scan_data_t zero_scan_data =
315 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
317 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
318 #define SF_BEFORE_SEOL 0x0001
319 #define SF_BEFORE_MEOL 0x0002
320 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
321 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
324 # define SF_FIX_SHIFT_EOL (0+2)
325 # define SF_FL_SHIFT_EOL (0+4)
327 # define SF_FIX_SHIFT_EOL (+2)
328 # define SF_FL_SHIFT_EOL (+4)
331 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
332 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
334 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
335 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
336 #define SF_IS_INF 0x0040
337 #define SF_HAS_PAR 0x0080
338 #define SF_IN_PAR 0x0100
339 #define SF_HAS_EVAL 0x0200
340 #define SCF_DO_SUBSTR 0x0400
341 #define SCF_DO_STCLASS_AND 0x0800
342 #define SCF_DO_STCLASS_OR 0x1000
343 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
344 #define SCF_WHILEM_VISITED_POS 0x2000
346 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
347 #define SCF_SEEN_ACCEPT 0x8000
349 #define UTF (RExC_utf8 != 0)
350 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
351 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
353 #define OOB_UNICODE 12345678
354 #define OOB_NAMEDCLASS -1
356 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
357 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
360 /* length of regex to show in messages that don't mark a position within */
361 #define RegexLengthToShowInErrorMessages 127
364 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
365 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
366 * op/pragma/warn/regcomp.
368 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
369 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
371 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
374 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
375 * arg. Show regex, up to a maximum length. If it's too long, chop and add
378 #define _FAIL(code) STMT_START { \
379 const char *ellipses = ""; \
380 IV len = RExC_end - RExC_precomp; \
383 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
384 if (len > RegexLengthToShowInErrorMessages) { \
385 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
386 len = RegexLengthToShowInErrorMessages - 10; \
392 #define FAIL(msg) _FAIL( \
393 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
394 msg, (int)len, RExC_precomp, ellipses))
396 #define FAIL2(msg,arg) _FAIL( \
397 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
398 arg, (int)len, RExC_precomp, ellipses))
401 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
403 #define Simple_vFAIL(m) STMT_START { \
404 const IV offset = RExC_parse - RExC_precomp; \
405 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
406 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
410 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
412 #define vFAIL(m) STMT_START { \
414 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 * Like Simple_vFAIL(), but accepts two arguments.
421 #define Simple_vFAIL2(m,a1) STMT_START { \
422 const IV offset = RExC_parse - RExC_precomp; \
423 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
424 (int)offset, RExC_precomp, RExC_precomp + offset); \
428 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
430 #define vFAIL2(m,a1) STMT_START { \
432 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
433 Simple_vFAIL2(m, a1); \
438 * Like Simple_vFAIL(), but accepts three arguments.
440 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
441 const IV offset = RExC_parse - RExC_precomp; \
442 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
443 (int)offset, RExC_precomp, RExC_precomp + offset); \
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
449 #define vFAIL3(m,a1,a2) STMT_START { \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
452 Simple_vFAIL3(m, a1, a2); \
456 * Like Simple_vFAIL(), but accepts four arguments.
458 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
459 const IV offset = RExC_parse - RExC_precomp; \
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
464 #define vWARN(loc,m) STMT_START { \
465 const IV offset = loc - RExC_precomp; \
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
467 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470 #define vWARNdep(loc,m) STMT_START { \
471 const IV offset = loc - RExC_precomp; \
472 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
473 "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define vWARN2(loc, m, a1) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define vWARN3(loc, m, a1, a2) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
490 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
491 const IV offset = loc - RExC_precomp; \
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
496 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
497 const IV offset = loc - RExC_precomp; \
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 /* Allow for side effects in s */
504 #define REGC(c,s) STMT_START { \
505 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
508 /* Macros for recording node offsets. 20001227 mjd@plover.com
509 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
510 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
511 * Element 0 holds the number n.
512 * Position is 1 indexed.
515 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
517 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
518 __LINE__, (int)(node), (int)(byte))); \
520 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
522 RExC_offsets[2*(node)-1] = (byte); \
527 #define Set_Node_Offset(node,byte) \
528 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
529 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
531 #define Set_Node_Length_To_R(node,len) STMT_START { \
533 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
534 __LINE__, (int)(node), (int)(len))); \
536 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
538 RExC_offsets[2*(node)] = (len); \
543 #define Set_Node_Length(node,len) \
544 Set_Node_Length_To_R((node)-RExC_emit_start, len)
545 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
546 #define Set_Node_Cur_Length(node) \
547 Set_Node_Length(node, RExC_parse - parse_start)
549 /* Get offsets and lengths */
550 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
551 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
553 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
555 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
559 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
560 #define EXPERIMENTAL_INPLACESCAN
563 #define DEBUG_STUDYDATA(str,data,depth) \
564 DEBUG_OPTIMISE_MORE_r(if(data){ \
565 PerlIO_printf(Perl_debug_log, \
566 "%*s" str "Pos:%"IVdf"/%"IVdf \
567 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
568 (int)(depth)*2, "", \
569 (IV)((data)->pos_min), \
570 (IV)((data)->pos_delta), \
571 (UV)((data)->flags), \
572 (IV)((data)->whilem_c), \
573 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
574 is_inf ? "INF " : "" \
576 if ((data)->last_found) \
577 PerlIO_printf(Perl_debug_log, \
578 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
579 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
580 SvPVX_const((data)->last_found), \
581 (IV)((data)->last_end), \
582 (IV)((data)->last_start_min), \
583 (IV)((data)->last_start_max), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
586 SvPVX_const((data)->longest_fixed), \
587 (IV)((data)->offset_fixed), \
588 ((data)->longest && \
589 (data)->longest==&((data)->longest_float)) ? "*" : "", \
590 SvPVX_const((data)->longest_float), \
591 (IV)((data)->offset_float_min), \
592 (IV)((data)->offset_float_max) \
594 PerlIO_printf(Perl_debug_log,"\n"); \
597 static void clear_re(pTHX_ void *r);
599 /* Mark that we cannot extend a found fixed substring at this point.
600 Update the longest found anchored substring and the longest found
601 floating substrings if needed. */
604 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
606 const STRLEN l = CHR_SVLEN(data->last_found);
607 const STRLEN old_l = CHR_SVLEN(*data->longest);
608 GET_RE_DEBUG_FLAGS_DECL;
610 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
611 SvSetMagicSV(*data->longest, data->last_found);
612 if (*data->longest == data->longest_fixed) {
613 data->offset_fixed = l ? data->last_start_min : data->pos_min;
614 if (data->flags & SF_BEFORE_EOL)
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
618 data->flags &= ~SF_FIX_BEFORE_EOL;
619 data->minlen_fixed=minlenp;
620 data->lookbehind_fixed=0;
622 else { /* *data->longest == data->longest_float */
623 data->offset_float_min = l ? data->last_start_min : data->pos_min;
624 data->offset_float_max = (l
625 ? data->last_start_max
626 : data->pos_min + data->pos_delta);
627 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
628 data->offset_float_max = I32_MAX;
629 if (data->flags & SF_BEFORE_EOL)
631 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
633 data->flags &= ~SF_FL_BEFORE_EOL;
634 data->minlen_float=minlenp;
635 data->lookbehind_float=0;
638 SvCUR_set(data->last_found, 0);
640 SV * const sv = data->last_found;
641 if (SvUTF8(sv) && SvMAGICAL(sv)) {
642 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
648 data->flags &= ~SF_BEFORE_EOL;
649 DEBUG_STUDYDATA("cl_anything: ",data,0);
652 /* Can match anything (initialization) */
654 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
656 ANYOF_CLASS_ZERO(cl);
657 ANYOF_BITMAP_SETALL(cl);
658 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
660 cl->flags |= ANYOF_LOCALE;
663 /* Can match anything (initialization) */
665 S_cl_is_anything(const struct regnode_charclass_class *cl)
669 for (value = 0; value <= ANYOF_MAX; value += 2)
670 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
672 if (!(cl->flags & ANYOF_UNICODE_ALL))
674 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
679 /* Can match anything (initialization) */
681 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
683 Zero(cl, 1, struct regnode_charclass_class);
685 cl_anything(pRExC_state, cl);
689 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
691 Zero(cl, 1, struct regnode_charclass_class);
693 cl_anything(pRExC_state, cl);
695 cl->flags |= ANYOF_LOCALE;
698 /* 'And' a given class with another one. Can create false positives */
699 /* We assume that cl is not inverted */
701 S_cl_and(struct regnode_charclass_class *cl,
702 const struct regnode_charclass_class *and_with)
705 assert(and_with->type == ANYOF);
706 if (!(and_with->flags & ANYOF_CLASS)
707 && !(cl->flags & ANYOF_CLASS)
708 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
709 && !(and_with->flags & ANYOF_FOLD)
710 && !(cl->flags & ANYOF_FOLD)) {
713 if (and_with->flags & ANYOF_INVERT)
714 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
715 cl->bitmap[i] &= ~and_with->bitmap[i];
717 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
718 cl->bitmap[i] &= and_with->bitmap[i];
719 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
720 if (!(and_with->flags & ANYOF_EOS))
721 cl->flags &= ~ANYOF_EOS;
723 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
724 !(and_with->flags & ANYOF_INVERT)) {
725 cl->flags &= ~ANYOF_UNICODE_ALL;
726 cl->flags |= ANYOF_UNICODE;
727 ARG_SET(cl, ARG(and_with));
729 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
730 !(and_with->flags & ANYOF_INVERT))
731 cl->flags &= ~ANYOF_UNICODE_ALL;
732 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
733 !(and_with->flags & ANYOF_INVERT))
734 cl->flags &= ~ANYOF_UNICODE;
737 /* 'OR' a given class with another one. Can create false positives */
738 /* We assume that cl is not inverted */
740 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
742 if (or_with->flags & ANYOF_INVERT) {
744 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
745 * <= (B1 | !B2) | (CL1 | !CL2)
746 * which is wasteful if CL2 is small, but we ignore CL2:
747 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
748 * XXXX Can we handle case-fold? Unclear:
749 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
750 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && !(or_with->flags & ANYOF_FOLD)
754 && !(cl->flags & ANYOF_FOLD) ) {
757 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
758 cl->bitmap[i] |= ~or_with->bitmap[i];
759 } /* XXXX: logic is complicated otherwise */
761 cl_anything(pRExC_state, cl);
764 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
765 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
766 && (!(or_with->flags & ANYOF_FOLD)
767 || (cl->flags & ANYOF_FOLD)) ) {
770 /* OR char bitmap and class bitmap separately */
771 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
772 cl->bitmap[i] |= or_with->bitmap[i];
773 if (or_with->flags & ANYOF_CLASS) {
774 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
775 cl->classflags[i] |= or_with->classflags[i];
776 cl->flags |= ANYOF_CLASS;
779 else { /* XXXX: logic is complicated, leave it along for a moment. */
780 cl_anything(pRExC_state, cl);
783 if (or_with->flags & ANYOF_EOS)
784 cl->flags |= ANYOF_EOS;
786 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
787 ARG(cl) != ARG(or_with)) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
791 if (or_with->flags & ANYOF_UNICODE_ALL) {
792 cl->flags |= ANYOF_UNICODE_ALL;
793 cl->flags &= ~ANYOF_UNICODE;
797 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
798 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
799 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
800 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805 dump_trie(trie,widecharmap,revcharmap)
806 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
807 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
809 These routines dump out a trie in a somewhat readable format.
810 The _interim_ variants are used for debugging the interim
811 tables that are used to generate the final compressed
812 representation which is what dump_trie expects.
814 Part of the reason for their existance is to provide a form
815 of documentation as to how the different representations function.
820 Dumps the final compressed table form of the trie to Perl_debug_log.
821 Used for debugging make_trie().
825 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
826 AV *revcharmap, U32 depth)
829 SV *sv=sv_newmortal();
830 int colwidth= widecharmap ? 6 : 4;
831 GET_RE_DEBUG_FLAGS_DECL;
834 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
835 (int)depth * 2 + 2,"",
836 "Match","Base","Ofs" );
838 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
839 SV ** const tmp = av_fetch( revcharmap, state, 0);
841 PerlIO_printf( Perl_debug_log, "%*s",
843 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
844 PL_colors[0], PL_colors[1],
845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
846 PERL_PV_ESCAPE_FIRSTCHAR
851 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
852 (int)depth * 2 + 2,"");
854 for( state = 0 ; state < trie->uniquecharcount ; state++ )
855 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
856 PerlIO_printf( Perl_debug_log, "\n");
858 for( state = 1 ; state < trie->statecount ; state++ ) {
859 const U32 base = trie->states[ state ].trans.base;
861 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
863 if ( trie->states[ state ].wordnum ) {
864 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
866 PerlIO_printf( Perl_debug_log, "%6s", "" );
869 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
874 while( ( base + ofs < trie->uniquecharcount ) ||
875 ( base + ofs - trie->uniquecharcount < trie->lasttrans
876 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
879 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
881 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
882 if ( ( base + ofs >= trie->uniquecharcount ) &&
883 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
884 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
886 PerlIO_printf( Perl_debug_log, "%*"UVXf,
888 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
890 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
894 PerlIO_printf( Perl_debug_log, "]");
897 PerlIO_printf( Perl_debug_log, "\n" );
901 Dumps a fully constructed but uncompressed trie in list form.
902 List tries normally only are used for construction when the number of
903 possible chars (trie->uniquecharcount) is very high.
904 Used for debugging make_trie().
907 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
908 HV *widecharmap, AV *revcharmap, U32 next_alloc,
912 SV *sv=sv_newmortal();
913 int colwidth= widecharmap ? 6 : 4;
914 GET_RE_DEBUG_FLAGS_DECL;
915 /* print out the table precompression. */
916 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
917 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
918 "------:-----+-----------------\n" );
920 for( state=1 ; state < next_alloc ; state ++ ) {
923 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
924 (int)depth * 2 + 2,"", (UV)state );
925 if ( ! trie->states[ state ].wordnum ) {
926 PerlIO_printf( Perl_debug_log, "%5s| ","");
928 PerlIO_printf( Perl_debug_log, "W%4x| ",
929 trie->states[ state ].wordnum
932 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
933 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
935 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
942 TRIE_LIST_ITEM(state,charid).forid,
943 (UV)TRIE_LIST_ITEM(state,charid).newstate
946 PerlIO_printf(Perl_debug_log, "\n%*s| ",
947 (int)((depth * 2) + 14), "");
950 PerlIO_printf( Perl_debug_log, "\n");
955 Dumps a fully constructed but uncompressed trie in table form.
956 This is the normal DFA style state transition table, with a few
957 twists to facilitate compression later.
958 Used for debugging make_trie().
961 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
962 HV *widecharmap, AV *revcharmap, U32 next_alloc,
967 SV *sv=sv_newmortal();
968 int colwidth= widecharmap ? 6 : 4;
969 GET_RE_DEBUG_FLAGS_DECL;
972 print out the table precompression so that we can do a visual check
973 that they are identical.
976 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
978 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
979 SV ** const tmp = av_fetch( revcharmap, charid, 0);
981 PerlIO_printf( Perl_debug_log, "%*s",
983 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
984 PL_colors[0], PL_colors[1],
985 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
986 PERL_PV_ESCAPE_FIRSTCHAR
992 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
994 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
995 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
998 PerlIO_printf( Perl_debug_log, "\n" );
1000 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1002 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1003 (int)depth * 2 + 2,"",
1004 (UV)TRIE_NODENUM( state ) );
1006 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1007 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1009 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1011 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1013 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1014 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1016 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1017 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1024 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1025 startbranch: the first branch in the whole branch sequence
1026 first : start branch of sequence of branch-exact nodes.
1027 May be the same as startbranch
1028 last : Thing following the last branch.
1029 May be the same as tail.
1030 tail : item following the branch sequence
1031 count : words in the sequence
1032 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1033 depth : indent depth
1035 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1037 A trie is an N'ary tree where the branches are determined by digital
1038 decomposition of the key. IE, at the root node you look up the 1st character and
1039 follow that branch repeat until you find the end of the branches. Nodes can be
1040 marked as "accepting" meaning they represent a complete word. Eg:
1044 would convert into the following structure. Numbers represent states, letters
1045 following numbers represent valid transitions on the letter from that state, if
1046 the number is in square brackets it represents an accepting state, otherwise it
1047 will be in parenthesis.
1049 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1053 (1) +-i->(6)-+-s->[7]
1055 +-s->(3)-+-h->(4)-+-e->[5]
1057 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1059 This shows that when matching against the string 'hers' we will begin at state 1
1060 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1061 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1062 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1063 single traverse. We store a mapping from accepting to state to which word was
1064 matched, and then when we have multiple possibilities we try to complete the
1065 rest of the regex in the order in which they occured in the alternation.
1067 The only prior NFA like behaviour that would be changed by the TRIE support is
1068 the silent ignoring of duplicate alternations which are of the form:
1070 / (DUPE|DUPE) X? (?{ ... }) Y /x
1072 Thus EVAL blocks follwing a trie may be called a different number of times with
1073 and without the optimisation. With the optimisations dupes will be silently
1074 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1075 the following demonstrates:
1077 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1079 which prints out 'word' three times, but
1081 'words'=~/(word|word|word)(?{ print $1 })S/
1083 which doesnt print it out at all. This is due to other optimisations kicking in.
1085 Example of what happens on a structural level:
1087 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1089 1: CURLYM[1] {1,32767}(18)
1100 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1101 and should turn into:
1103 1: CURLYM[1] {1,32767}(18)
1105 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1113 Cases where tail != last would be like /(?foo|bar)baz/:
1123 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1124 and would end up looking like:
1127 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1134 d = uvuni_to_utf8_flags(d, uv, 0);
1136 is the recommended Unicode-aware way of saying
1141 #define TRIE_STORE_REVCHAR \
1143 SV *tmp = newSVpvs(""); \
1144 if (UTF) SvUTF8_on(tmp); \
1145 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1146 av_push( revcharmap, tmp ); \
1149 #define TRIE_READ_CHAR STMT_START { \
1153 if ( foldlen > 0 ) { \
1154 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1161 foldlen -= UNISKIP( uvc ); \
1162 scan = foldbuf + UNISKIP( uvc ); \
1165 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1175 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1176 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1177 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1178 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1180 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1181 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1182 TRIE_LIST_CUR( state )++; \
1185 #define TRIE_LIST_NEW(state) STMT_START { \
1186 Newxz( trie->states[ state ].trans.list, \
1187 4, reg_trie_trans_le ); \
1188 TRIE_LIST_CUR( state ) = 1; \
1189 TRIE_LIST_LEN( state ) = 4; \
1192 #define TRIE_HANDLE_WORD(state) STMT_START { \
1193 U16 dupe= trie->states[ state ].wordnum; \
1194 regnode * const noper_next = regnext( noper ); \
1196 if (trie->wordlen) \
1197 trie->wordlen[ curword ] = wordlen; \
1199 /* store the word for dumping */ \
1201 if (OP(noper) != NOTHING) \
1202 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1204 tmp = newSVpvn( "", 0 ); \
1205 if ( UTF ) SvUTF8_on( tmp ); \
1206 av_push( trie_words, tmp ); \
1211 if ( noper_next < tail ) { \
1213 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1214 trie->jump[curword] = (U16)(noper_next - convert); \
1216 jumper = noper_next; \
1218 nextbranch= regnext(cur); \
1222 /* So it's a dupe. This means we need to maintain a */\
1223 /* linked-list from the first to the next. */\
1224 /* we only allocate the nextword buffer when there */\
1225 /* a dupe, so first time we have to do the allocation */\
1226 if (!trie->nextword) \
1227 trie->nextword = (U16 *) \
1228 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1229 while ( trie->nextword[dupe] ) \
1230 dupe= trie->nextword[dupe]; \
1231 trie->nextword[dupe]= curword; \
1233 /* we haven't inserted this word yet. */ \
1234 trie->states[ state ].wordnum = curword; \
1239 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1240 ( ( base + charid >= ucharcount \
1241 && base + charid < ubound \
1242 && state == trie->trans[ base - ucharcount + charid ].check \
1243 && trie->trans[ base - ucharcount + charid ].next ) \
1244 ? trie->trans[ base - ucharcount + charid ].next \
1245 : ( state==1 ? special : 0 ) \
1249 #define MADE_JUMP_TRIE 2
1250 #define MADE_EXACT_TRIE 4
1253 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1256 /* first pass, loop through and scan words */
1257 reg_trie_data *trie;
1258 HV *widecharmap = NULL;
1259 AV *revcharmap = newAV();
1261 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1266 regnode *jumper = NULL;
1267 regnode *nextbranch = NULL;
1268 regnode *convert = NULL;
1269 /* we just use folder as a flag in utf8 */
1270 const U8 * const folder = ( flags == EXACTF
1272 : ( flags == EXACTFL
1279 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1280 AV *trie_words = NULL;
1281 /* along with revcharmap, this only used during construction but both are
1282 * useful during debugging so we store them in the struct when debugging.
1285 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1286 STRLEN trie_charcount=0;
1288 SV *re_trie_maxbuff;
1289 GET_RE_DEBUG_FLAGS_DECL;
1291 PERL_UNUSED_ARG(depth);
1294 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1296 trie->startstate = 1;
1297 trie->wordcount = word_count;
1298 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1299 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1300 if (!(UTF && folder))
1301 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1303 trie_words = newAV();
1306 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1307 if (!SvIOK(re_trie_maxbuff)) {
1308 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1311 PerlIO_printf( Perl_debug_log,
1312 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1313 (int)depth * 2 + 2, "",
1314 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1315 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1319 /* Find the node we are going to overwrite */
1320 if ( first == startbranch && OP( last ) != BRANCH ) {
1321 /* whole branch chain */
1324 /* branch sub-chain */
1325 convert = NEXTOPER( first );
1328 /* -- First loop and Setup --
1330 We first traverse the branches and scan each word to determine if it
1331 contains widechars, and how many unique chars there are, this is
1332 important as we have to build a table with at least as many columns as we
1335 We use an array of integers to represent the character codes 0..255
1336 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1337 native representation of the character value as the key and IV's for the
1340 *TODO* If we keep track of how many times each character is used we can
1341 remap the columns so that the table compression later on is more
1342 efficient in terms of memory by ensuring most common value is in the
1343 middle and the least common are on the outside. IMO this would be better
1344 than a most to least common mapping as theres a decent chance the most
1345 common letter will share a node with the least common, meaning the node
1346 will not be compressable. With a middle is most common approach the worst
1347 case is when we have the least common nodes twice.
1351 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1352 regnode * const noper = NEXTOPER( cur );
1353 const U8 *uc = (U8*)STRING( noper );
1354 const U8 * const e = uc + STR_LEN( noper );
1356 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1357 const U8 *scan = (U8*)NULL;
1358 U32 wordlen = 0; /* required init */
1361 if (OP(noper) == NOTHING) {
1366 TRIE_BITMAP_SET(trie,*uc);
1367 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1369 for ( ; uc < e ; uc += len ) {
1370 TRIE_CHARCOUNT(trie)++;
1374 if ( !trie->charmap[ uvc ] ) {
1375 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1377 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1383 widecharmap = newHV();
1385 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1388 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1390 if ( !SvTRUE( *svpp ) ) {
1391 sv_setiv( *svpp, ++trie->uniquecharcount );
1396 if( cur == first ) {
1399 } else if (chars < trie->minlen) {
1401 } else if (chars > trie->maxlen) {
1405 } /* end first pass */
1406 DEBUG_TRIE_COMPILE_r(
1407 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1408 (int)depth * 2 + 2,"",
1409 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1410 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1411 (int)trie->minlen, (int)trie->maxlen )
1413 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1416 We now know what we are dealing with in terms of unique chars and
1417 string sizes so we can calculate how much memory a naive
1418 representation using a flat table will take. If it's over a reasonable
1419 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1420 conservative but potentially much slower representation using an array
1423 At the end we convert both representations into the same compressed
1424 form that will be used in regexec.c for matching with. The latter
1425 is a form that cannot be used to construct with but has memory
1426 properties similar to the list form and access properties similar
1427 to the table form making it both suitable for fast searches and
1428 small enough that its feasable to store for the duration of a program.
1430 See the comment in the code where the compressed table is produced
1431 inplace from the flat tabe representation for an explanation of how
1432 the compression works.
1437 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1439 Second Pass -- Array Of Lists Representation
1441 Each state will be represented by a list of charid:state records
1442 (reg_trie_trans_le) the first such element holds the CUR and LEN
1443 points of the allocated array. (See defines above).
1445 We build the initial structure using the lists, and then convert
1446 it into the compressed table form which allows faster lookups
1447 (but cant be modified once converted).
1450 STRLEN transcount = 1;
1452 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1453 "%*sCompiling trie using list compiler\n",
1454 (int)depth * 2 + 2, ""));
1456 trie->states = (reg_trie_state *)
1457 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1458 sizeof(reg_trie_state) );
1462 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1464 regnode * const noper = NEXTOPER( cur );
1465 U8 *uc = (U8*)STRING( noper );
1466 const U8 * const e = uc + STR_LEN( noper );
1467 U32 state = 1; /* required init */
1468 U16 charid = 0; /* sanity init */
1469 U8 *scan = (U8*)NULL; /* sanity init */
1470 STRLEN foldlen = 0; /* required init */
1471 U32 wordlen = 0; /* required init */
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1474 if (OP(noper) != NOTHING) {
1475 for ( ; uc < e ; uc += len ) {
1480 charid = trie->charmap[ uvc ];
1482 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1486 charid=(U16)SvIV( *svpp );
1489 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1496 if ( !trie->states[ state ].trans.list ) {
1497 TRIE_LIST_NEW( state );
1499 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1500 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1501 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1506 newstate = next_alloc++;
1507 TRIE_LIST_PUSH( state, charid, newstate );
1512 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1516 TRIE_HANDLE_WORD(state);
1518 } /* end second pass */
1520 /* next alloc is the NEXT state to be allocated */
1521 trie->statecount = next_alloc;
1522 trie->states = (reg_trie_state *)
1523 PerlMemShared_realloc( trie->states,
1525 * sizeof(reg_trie_state) );
1527 /* and now dump it out before we compress it */
1528 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1529 revcharmap, next_alloc,
1533 trie->trans = (reg_trie_trans *)
1534 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1541 for( state=1 ; state < next_alloc ; state ++ ) {
1545 DEBUG_TRIE_COMPILE_MORE_r(
1546 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1550 if (trie->states[state].trans.list) {
1551 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1555 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1556 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1557 if ( forid < minid ) {
1559 } else if ( forid > maxid ) {
1563 if ( transcount < tp + maxid - minid + 1) {
1565 trie->trans = (reg_trie_trans *)
1566 PerlMemShared_realloc( trie->trans,
1568 * sizeof(reg_trie_trans) );
1569 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1571 base = trie->uniquecharcount + tp - minid;
1572 if ( maxid == minid ) {
1574 for ( ; zp < tp ; zp++ ) {
1575 if ( ! trie->trans[ zp ].next ) {
1576 base = trie->uniquecharcount + zp - minid;
1577 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ zp ].check = state;
1584 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1585 trie->trans[ tp ].check = state;
1590 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1591 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1592 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1593 trie->trans[ tid ].check = state;
1595 tp += ( maxid - minid + 1 );
1597 Safefree(trie->states[ state ].trans.list);
1600 DEBUG_TRIE_COMPILE_MORE_r(
1601 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1604 trie->states[ state ].trans.base=base;
1606 trie->lasttrans = tp + 1;
1610 Second Pass -- Flat Table Representation.
1612 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1613 We know that we will need Charcount+1 trans at most to store the data
1614 (one row per char at worst case) So we preallocate both structures
1615 assuming worst case.
1617 We then construct the trie using only the .next slots of the entry
1620 We use the .check field of the first entry of the node temporarily to
1621 make compression both faster and easier by keeping track of how many non
1622 zero fields are in the node.
1624 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1627 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1628 number representing the first entry of the node, and state as a
1629 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1630 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1631 are 2 entrys per node. eg:
1639 The table is internally in the right hand, idx form. However as we also
1640 have to deal with the states array which is indexed by nodenum we have to
1641 use TRIE_NODENUM() to convert.
1644 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1645 "%*sCompiling trie using table compiler\n",
1646 (int)depth * 2 + 2, ""));
1648 trie->trans = (reg_trie_trans *)
1649 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1650 * trie->uniquecharcount + 1,
1651 sizeof(reg_trie_trans) );
1652 trie->states = (reg_trie_state *)
1653 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1654 sizeof(reg_trie_state) );
1655 next_alloc = trie->uniquecharcount + 1;
1658 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1660 regnode * const noper = NEXTOPER( cur );
1661 const U8 *uc = (U8*)STRING( noper );
1662 const U8 * const e = uc + STR_LEN( noper );
1664 U32 state = 1; /* required init */
1666 U16 charid = 0; /* sanity init */
1667 U32 accept_state = 0; /* sanity init */
1668 U8 *scan = (U8*)NULL; /* sanity init */
1670 STRLEN foldlen = 0; /* required init */
1671 U32 wordlen = 0; /* required init */
1672 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1674 if ( OP(noper) != NOTHING ) {
1675 for ( ; uc < e ; uc += len ) {
1680 charid = trie->charmap[ uvc ];
1682 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1683 charid = svpp ? (U16)SvIV(*svpp) : 0;
1687 if ( !trie->trans[ state + charid ].next ) {
1688 trie->trans[ state + charid ].next = next_alloc;
1689 trie->trans[ state ].check++;
1690 next_alloc += trie->uniquecharcount;
1692 state = trie->trans[ state + charid ].next;
1694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1696 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1699 accept_state = TRIE_NODENUM( state );
1700 TRIE_HANDLE_WORD(accept_state);
1702 } /* end second pass */
1704 /* and now dump it out before we compress it */
1705 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1707 next_alloc, depth+1));
1711 * Inplace compress the table.*
1713 For sparse data sets the table constructed by the trie algorithm will
1714 be mostly 0/FAIL transitions or to put it another way mostly empty.
1715 (Note that leaf nodes will not contain any transitions.)
1717 This algorithm compresses the tables by eliminating most such
1718 transitions, at the cost of a modest bit of extra work during lookup:
1720 - Each states[] entry contains a .base field which indicates the
1721 index in the state[] array wheres its transition data is stored.
1723 - If .base is 0 there are no valid transitions from that node.
1725 - If .base is nonzero then charid is added to it to find an entry in
1728 -If trans[states[state].base+charid].check!=state then the
1729 transition is taken to be a 0/Fail transition. Thus if there are fail
1730 transitions at the front of the node then the .base offset will point
1731 somewhere inside the previous nodes data (or maybe even into a node
1732 even earlier), but the .check field determines if the transition is
1736 The following process inplace converts the table to the compressed
1737 table: We first do not compress the root node 1,and mark its all its
1738 .check pointers as 1 and set its .base pointer as 1 as well. This
1739 allows to do a DFA construction from the compressed table later, and
1740 ensures that any .base pointers we calculate later are greater than
1743 - We set 'pos' to indicate the first entry of the second node.
1745 - We then iterate over the columns of the node, finding the first and
1746 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1747 and set the .check pointers accordingly, and advance pos
1748 appropriately and repreat for the next node. Note that when we copy
1749 the next pointers we have to convert them from the original
1750 NODEIDX form to NODENUM form as the former is not valid post
1753 - If a node has no transitions used we mark its base as 0 and do not
1754 advance the pos pointer.
1756 - If a node only has one transition we use a second pointer into the
1757 structure to fill in allocated fail transitions from other states.
1758 This pointer is independent of the main pointer and scans forward
1759 looking for null transitions that are allocated to a state. When it
1760 finds one it writes the single transition into the "hole". If the
1761 pointer doesnt find one the single transition is appended as normal.
1763 - Once compressed we can Renew/realloc the structures to release the
1766 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1767 specifically Fig 3.47 and the associated pseudocode.
1771 const U32 laststate = TRIE_NODENUM( next_alloc );
1774 trie->statecount = laststate;
1776 for ( state = 1 ; state < laststate ; state++ ) {
1778 const U32 stateidx = TRIE_NODEIDX( state );
1779 const U32 o_used = trie->trans[ stateidx ].check;
1780 U32 used = trie->trans[ stateidx ].check;
1781 trie->trans[ stateidx ].check = 0;
1783 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1784 if ( flag || trie->trans[ stateidx + charid ].next ) {
1785 if ( trie->trans[ stateidx + charid ].next ) {
1787 for ( ; zp < pos ; zp++ ) {
1788 if ( ! trie->trans[ zp ].next ) {
1792 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1793 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1794 trie->trans[ zp ].check = state;
1795 if ( ++zp > pos ) pos = zp;
1802 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1804 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1805 trie->trans[ pos ].check = state;
1810 trie->lasttrans = pos + 1;
1811 trie->states = (reg_trie_state *)
1812 PerlMemShared_realloc( trie->states, laststate
1813 * sizeof(reg_trie_state) );
1814 DEBUG_TRIE_COMPILE_MORE_r(
1815 PerlIO_printf( Perl_debug_log,
1816 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1817 (int)depth * 2 + 2,"",
1818 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1821 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1824 } /* end table compress */
1826 DEBUG_TRIE_COMPILE_MORE_r(
1827 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1828 (int)depth * 2 + 2, "",
1829 (UV)trie->statecount,
1830 (UV)trie->lasttrans)
1832 /* resize the trans array to remove unused space */
1833 trie->trans = (reg_trie_trans *)
1834 PerlMemShared_realloc( trie->trans, trie->lasttrans
1835 * sizeof(reg_trie_trans) );
1837 /* and now dump out the compressed format */
1838 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1840 { /* Modify the program and insert the new TRIE node*/
1841 U8 nodetype =(U8)(flags & 0xFF);
1845 regnode *optimize = NULL;
1847 U32 mjd_nodelen = 0;
1850 This means we convert either the first branch or the first Exact,
1851 depending on whether the thing following (in 'last') is a branch
1852 or not and whther first is the startbranch (ie is it a sub part of
1853 the alternation or is it the whole thing.)
1854 Assuming its a sub part we conver the EXACT otherwise we convert
1855 the whole branch sequence, including the first.
1857 /* Find the node we are going to overwrite */
1858 if ( first != startbranch || OP( last ) == BRANCH ) {
1859 /* branch sub-chain */
1860 NEXT_OFF( first ) = (U16)(last - first);
1862 mjd_offset= Node_Offset((convert));
1863 mjd_nodelen= Node_Length((convert));
1865 /* whole branch chain */
1868 const regnode *nop = NEXTOPER( convert );
1869 mjd_offset= Node_Offset((nop));
1870 mjd_nodelen= Node_Length((nop));
1875 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1876 (int)depth * 2 + 2, "",
1877 (UV)mjd_offset, (UV)mjd_nodelen)
1880 /* But first we check to see if there is a common prefix we can
1881 split out as an EXACT and put in front of the TRIE node. */
1882 trie->startstate= 1;
1883 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1885 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1889 const U32 base = trie->states[ state ].trans.base;
1891 if ( trie->states[state].wordnum )
1894 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1895 if ( ( base + ofs >= trie->uniquecharcount ) &&
1896 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1897 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1899 if ( ++count > 1 ) {
1900 SV **tmp = av_fetch( revcharmap, ofs, 0);
1901 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1902 if ( state == 1 ) break;
1904 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1906 PerlIO_printf(Perl_debug_log,
1907 "%*sNew Start State=%"UVuf" Class: [",
1908 (int)depth * 2 + 2, "",
1911 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1912 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1914 TRIE_BITMAP_SET(trie,*ch);
1916 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1918 PerlIO_printf(Perl_debug_log, (char*)ch)
1922 TRIE_BITMAP_SET(trie,*ch);
1924 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1925 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1931 SV **tmp = av_fetch( revcharmap, idx, 0);
1932 char *ch = SvPV_nolen( *tmp );
1934 SV *sv=sv_newmortal();
1935 PerlIO_printf( Perl_debug_log,
1936 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1937 (int)depth * 2 + 2, "",
1939 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1940 PL_colors[0], PL_colors[1],
1941 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1942 PERL_PV_ESCAPE_FIRSTCHAR
1947 OP( convert ) = nodetype;
1948 str=STRING(convert);
1959 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1965 regnode *n = convert+NODE_SZ_STR(convert);
1966 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1967 trie->startstate = state;
1968 trie->minlen -= (state - 1);
1969 trie->maxlen -= (state - 1);
1971 regnode *fix = convert;
1972 U32 word = trie->wordcount;
1974 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1975 while( ++fix < n ) {
1976 Set_Node_Offset_Length(fix, 0, 0);
1979 SV ** const tmp = av_fetch( trie_words, word, 0 );
1981 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1982 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1984 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1991 NEXT_OFF(convert) = (U16)(tail - convert);
1992 DEBUG_r(optimize= n);
1998 if ( trie->maxlen ) {
1999 NEXT_OFF( convert ) = (U16)(tail - convert);
2000 ARG_SET( convert, data_slot );
2001 /* Store the offset to the first unabsorbed branch in
2002 jump[0], which is otherwise unused by the jump logic.
2003 We use this when dumping a trie and during optimisation. */
2005 trie->jump[0] = (U16)(nextbranch - convert);
2008 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2009 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2011 OP( convert ) = TRIEC;
2012 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2013 PerlMemShared_free(trie->bitmap);
2016 OP( convert ) = TRIE;
2018 /* store the type in the flags */
2019 convert->flags = nodetype;
2023 + regarglen[ OP( convert ) ];
2025 /* XXX We really should free up the resource in trie now,
2026 as we won't use them - (which resources?) dmq */
2028 /* needed for dumping*/
2029 DEBUG_r(if (optimize) {
2030 regnode *opt = convert;
2031 while ( ++opt < optimize) {
2032 Set_Node_Offset_Length(opt,0,0);
2035 Try to clean up some of the debris left after the
2038 while( optimize < jumper ) {
2039 mjd_nodelen += Node_Length((optimize));
2040 OP( optimize ) = OPTIMIZED;
2041 Set_Node_Offset_Length(optimize,0,0);
2044 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2046 } /* end node insert */
2047 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2049 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2050 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2052 SvREFCNT_dec(revcharmap);
2056 : trie->startstate>1
2062 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2064 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2066 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2067 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2070 We find the fail state for each state in the trie, this state is the longest proper
2071 suffix of the current states 'word' that is also a proper prefix of another word in our
2072 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2073 the DFA not to have to restart after its tried and failed a word at a given point, it
2074 simply continues as though it had been matching the other word in the first place.
2076 'abcdgu'=~/abcdefg|cdgu/
2077 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2078 fail, which would bring use to the state representing 'd' in the second word where we would
2079 try 'g' and succeed, prodceding to match 'cdgu'.
2081 /* add a fail transition */
2082 const U32 trie_offset = ARG(source);
2083 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2085 const U32 ucharcount = trie->uniquecharcount;
2086 const U32 numstates = trie->statecount;
2087 const U32 ubound = trie->lasttrans + ucharcount;
2091 U32 base = trie->states[ 1 ].trans.base;
2094 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2095 GET_RE_DEBUG_FLAGS_DECL;
2097 PERL_UNUSED_ARG(depth);
2101 ARG_SET( stclass, data_slot );
2102 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2103 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2104 aho->trie=trie_offset;
2105 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2106 Copy( trie->states, aho->states, numstates, reg_trie_state );
2107 Newxz( q, numstates, U32);
2108 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2111 /* initialize fail[0..1] to be 1 so that we always have
2112 a valid final fail state */
2113 fail[ 0 ] = fail[ 1 ] = 1;
2115 for ( charid = 0; charid < ucharcount ; charid++ ) {
2116 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2118 q[ q_write ] = newstate;
2119 /* set to point at the root */
2120 fail[ q[ q_write++ ] ]=1;
2123 while ( q_read < q_write) {
2124 const U32 cur = q[ q_read++ % numstates ];
2125 base = trie->states[ cur ].trans.base;
2127 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2128 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2130 U32 fail_state = cur;
2133 fail_state = fail[ fail_state ];
2134 fail_base = aho->states[ fail_state ].trans.base;
2135 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2137 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2138 fail[ ch_state ] = fail_state;
2139 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2141 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2143 q[ q_write++ % numstates] = ch_state;
2147 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2148 when we fail in state 1, this allows us to use the
2149 charclass scan to find a valid start char. This is based on the principle
2150 that theres a good chance the string being searched contains lots of stuff
2151 that cant be a start char.
2153 fail[ 0 ] = fail[ 1 ] = 0;
2154 DEBUG_TRIE_COMPILE_r({
2155 PerlIO_printf(Perl_debug_log,
2156 "%*sStclass Failtable (%"UVuf" states): 0",
2157 (int)(depth * 2), "", (UV)numstates
2159 for( q_read=1; q_read<numstates; q_read++ ) {
2160 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2162 PerlIO_printf(Perl_debug_log, "\n");
2165 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2170 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2171 * These need to be revisited when a newer toolchain becomes available.
2173 #if defined(__sparc64__) && defined(__GNUC__)
2174 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2175 # undef SPARC64_GCC_WORKAROUND
2176 # define SPARC64_GCC_WORKAROUND 1
2180 #define DEBUG_PEEP(str,scan,depth) \
2181 DEBUG_OPTIMISE_r({if (scan){ \
2182 SV * const mysv=sv_newmortal(); \
2183 regnode *Next = regnext(scan); \
2184 regprop(RExC_rx, mysv, scan); \
2185 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2186 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2187 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2194 #define JOIN_EXACT(scan,min,flags) \
2195 if (PL_regkind[OP(scan)] == EXACT) \
2196 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2199 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2200 /* Merge several consecutive EXACTish nodes into one. */
2201 regnode *n = regnext(scan);
2203 regnode *next = scan + NODE_SZ_STR(scan);
2207 regnode *stop = scan;
2208 GET_RE_DEBUG_FLAGS_DECL;
2210 PERL_UNUSED_ARG(depth);
2212 #ifndef EXPERIMENTAL_INPLACESCAN
2213 PERL_UNUSED_ARG(flags);
2214 PERL_UNUSED_ARG(val);
2216 DEBUG_PEEP("join",scan,depth);
2218 /* Skip NOTHING, merge EXACT*. */
2220 ( PL_regkind[OP(n)] == NOTHING ||
2221 (stringok && (OP(n) == OP(scan))))
2223 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2225 if (OP(n) == TAIL || n > next)
2227 if (PL_regkind[OP(n)] == NOTHING) {
2228 DEBUG_PEEP("skip:",n,depth);
2229 NEXT_OFF(scan) += NEXT_OFF(n);
2230 next = n + NODE_STEP_REGNODE;
2237 else if (stringok) {
2238 const unsigned int oldl = STR_LEN(scan);
2239 regnode * const nnext = regnext(n);
2241 DEBUG_PEEP("merg",n,depth);
2244 if (oldl + STR_LEN(n) > U8_MAX)
2246 NEXT_OFF(scan) += NEXT_OFF(n);
2247 STR_LEN(scan) += STR_LEN(n);
2248 next = n + NODE_SZ_STR(n);
2249 /* Now we can overwrite *n : */
2250 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2258 #ifdef EXPERIMENTAL_INPLACESCAN
2259 if (flags && !NEXT_OFF(n)) {
2260 DEBUG_PEEP("atch", val, depth);
2261 if (reg_off_by_arg[OP(n)]) {
2262 ARG_SET(n, val - n);
2265 NEXT_OFF(n) = val - n;
2272 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2274 Two problematic code points in Unicode casefolding of EXACT nodes:
2276 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2277 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2283 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2284 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2286 This means that in case-insensitive matching (or "loose matching",
2287 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2288 length of the above casefolded versions) can match a target string
2289 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2290 This would rather mess up the minimum length computation.
2292 What we'll do is to look for the tail four bytes, and then peek
2293 at the preceding two bytes to see whether we need to decrease
2294 the minimum length by four (six minus two).
2296 Thanks to the design of UTF-8, there cannot be false matches:
2297 A sequence of valid UTF-8 bytes cannot be a subsequence of
2298 another valid sequence of UTF-8 bytes.
2301 char * const s0 = STRING(scan), *s, *t;
2302 char * const s1 = s0 + STR_LEN(scan) - 1;
2303 char * const s2 = s1 - 4;
2304 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2305 const char t0[] = "\xaf\x49\xaf\x42";
2307 const char t0[] = "\xcc\x88\xcc\x81";
2309 const char * const t1 = t0 + 3;
2312 s < s2 && (t = ninstr(s, s1, t0, t1));
2315 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2316 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2318 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2319 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2327 n = scan + NODE_SZ_STR(scan);
2329 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2336 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2340 /* REx optimizer. Converts nodes into quickier variants "in place".
2341 Finds fixed substrings. */
2343 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2344 to the position after last scanned or to NULL. */
2346 #define INIT_AND_WITHP \
2347 assert(!and_withp); \
2348 Newx(and_withp,1,struct regnode_charclass_class); \
2349 SAVEFREEPV(and_withp)
2351 /* this is a chain of data about sub patterns we are processing that
2352 need to be handled seperately/specially in study_chunk. Its so
2353 we can simulate recursion without losing state. */
2355 typedef struct scan_frame {
2356 regnode *last; /* last node to process in this frame */
2357 regnode *next; /* next node to process when last is reached */
2358 struct scan_frame *prev; /*previous frame*/
2359 I32 stop; /* what stopparen do we use */
2363 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2366 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2367 I32 *minlenp, I32 *deltap,
2372 struct regnode_charclass_class *and_withp,
2373 U32 flags, U32 depth)
2374 /* scanp: Start here (read-write). */
2375 /* deltap: Write maxlen-minlen here. */
2376 /* last: Stop before this one. */
2377 /* data: string data about the pattern */
2378 /* stopparen: treat close N as END */
2379 /* recursed: which subroutines have we recursed into */
2380 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2383 I32 min = 0, pars = 0, code;
2384 regnode *scan = *scanp, *next;
2386 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2387 int is_inf_internal = 0; /* The studied chunk is infinite */
2388 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2389 scan_data_t data_fake;
2390 SV *re_trie_maxbuff = NULL;
2391 regnode *first_non_open = scan;
2392 I32 stopmin = I32_MAX;
2393 scan_frame *frame = NULL;
2395 GET_RE_DEBUG_FLAGS_DECL;
2398 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2402 while (first_non_open && OP(first_non_open) == OPEN)
2403 first_non_open=regnext(first_non_open);
2408 while ( scan && OP(scan) != END && scan < last ){
2409 /* Peephole optimizer: */
2410 DEBUG_STUDYDATA("Peep:", data,depth);
2411 DEBUG_PEEP("Peep",scan,depth);
2412 JOIN_EXACT(scan,&min,0);
2414 /* Follow the next-chain of the current node and optimize
2415 away all the NOTHINGs from it. */
2416 if (OP(scan) != CURLYX) {
2417 const int max = (reg_off_by_arg[OP(scan)]
2419 /* I32 may be smaller than U16 on CRAYs! */
2420 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2421 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2425 /* Skip NOTHING and LONGJMP. */
2426 while ((n = regnext(n))
2427 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2428 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2429 && off + noff < max)
2431 if (reg_off_by_arg[OP(scan)])
2434 NEXT_OFF(scan) = off;
2439 /* The principal pseudo-switch. Cannot be a switch, since we
2440 look into several different things. */
2441 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2442 || OP(scan) == IFTHEN) {
2443 next = regnext(scan);
2445 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2447 if (OP(next) == code || code == IFTHEN) {
2448 /* NOTE - There is similar code to this block below for handling
2449 TRIE nodes on a re-study. If you change stuff here check there
2451 I32 max1 = 0, min1 = I32_MAX, num = 0;
2452 struct regnode_charclass_class accum;
2453 regnode * const startbranch=scan;
2455 if (flags & SCF_DO_SUBSTR)
2456 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2457 if (flags & SCF_DO_STCLASS)
2458 cl_init_zero(pRExC_state, &accum);
2460 while (OP(scan) == code) {
2461 I32 deltanext, minnext, f = 0, fake;
2462 struct regnode_charclass_class this_class;
2465 data_fake.flags = 0;
2467 data_fake.whilem_c = data->whilem_c;
2468 data_fake.last_closep = data->last_closep;
2471 data_fake.last_closep = &fake;
2473 data_fake.pos_delta = delta;
2474 next = regnext(scan);
2475 scan = NEXTOPER(scan);
2477 scan = NEXTOPER(scan);
2478 if (flags & SCF_DO_STCLASS) {
2479 cl_init(pRExC_state, &this_class);
2480 data_fake.start_class = &this_class;
2481 f = SCF_DO_STCLASS_AND;
2483 if (flags & SCF_WHILEM_VISITED_POS)
2484 f |= SCF_WHILEM_VISITED_POS;
2486 /* we suppose the run is continuous, last=next...*/
2487 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2489 stopparen, recursed, NULL, f,depth+1);
2492 if (max1 < minnext + deltanext)
2493 max1 = minnext + deltanext;
2494 if (deltanext == I32_MAX)
2495 is_inf = is_inf_internal = 1;
2497 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2499 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2500 if ( stopmin > minnext)
2501 stopmin = min + min1;
2502 flags &= ~SCF_DO_SUBSTR;
2504 data->flags |= SCF_SEEN_ACCEPT;
2507 if (data_fake.flags & SF_HAS_EVAL)
2508 data->flags |= SF_HAS_EVAL;
2509 data->whilem_c = data_fake.whilem_c;
2511 if (flags & SCF_DO_STCLASS)
2512 cl_or(pRExC_state, &accum, &this_class);
2514 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2516 if (flags & SCF_DO_SUBSTR) {
2517 data->pos_min += min1;
2518 data->pos_delta += max1 - min1;
2519 if (max1 != min1 || is_inf)
2520 data->longest = &(data->longest_float);
2523 delta += max1 - min1;
2524 if (flags & SCF_DO_STCLASS_OR) {
2525 cl_or(pRExC_state, data->start_class, &accum);
2527 cl_and(data->start_class, and_withp);
2528 flags &= ~SCF_DO_STCLASS;
2531 else if (flags & SCF_DO_STCLASS_AND) {
2533 cl_and(data->start_class, &accum);
2534 flags &= ~SCF_DO_STCLASS;
2537 /* Switch to OR mode: cache the old value of
2538 * data->start_class */
2540 StructCopy(data->start_class, and_withp,
2541 struct regnode_charclass_class);
2542 flags &= ~SCF_DO_STCLASS_AND;
2543 StructCopy(&accum, data->start_class,
2544 struct regnode_charclass_class);
2545 flags |= SCF_DO_STCLASS_OR;
2546 data->start_class->flags |= ANYOF_EOS;
2550 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2553 Assuming this was/is a branch we are dealing with: 'scan' now
2554 points at the item that follows the branch sequence, whatever
2555 it is. We now start at the beginning of the sequence and look
2562 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2564 If we can find such a subseqence we need to turn the first
2565 element into a trie and then add the subsequent branch exact
2566 strings to the trie.
2570 1. patterns where the whole set of branch can be converted.
2572 2. patterns where only a subset can be converted.
2574 In case 1 we can replace the whole set with a single regop
2575 for the trie. In case 2 we need to keep the start and end
2578 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2579 becomes BRANCH TRIE; BRANCH X;
2581 There is an additional case, that being where there is a
2582 common prefix, which gets split out into an EXACT like node
2583 preceding the TRIE node.
2585 If x(1..n)==tail then we can do a simple trie, if not we make
2586 a "jump" trie, such that when we match the appropriate word
2587 we "jump" to the appopriate tail node. Essentailly we turn
2588 a nested if into a case structure of sorts.
2593 if (!re_trie_maxbuff) {
2594 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2595 if (!SvIOK(re_trie_maxbuff))
2596 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2598 if ( SvIV(re_trie_maxbuff)>=0 ) {
2600 regnode *first = (regnode *)NULL;
2601 regnode *last = (regnode *)NULL;
2602 regnode *tail = scan;
2607 SV * const mysv = sv_newmortal(); /* for dumping */
2609 /* var tail is used because there may be a TAIL
2610 regop in the way. Ie, the exacts will point to the
2611 thing following the TAIL, but the last branch will
2612 point at the TAIL. So we advance tail. If we
2613 have nested (?:) we may have to move through several
2617 while ( OP( tail ) == TAIL ) {
2618 /* this is the TAIL generated by (?:) */
2619 tail = regnext( tail );
2624 regprop(RExC_rx, mysv, tail );
2625 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2626 (int)depth * 2 + 2, "",
2627 "Looking for TRIE'able sequences. Tail node is: ",
2628 SvPV_nolen_const( mysv )
2634 step through the branches, cur represents each
2635 branch, noper is the first thing to be matched
2636 as part of that branch and noper_next is the
2637 regnext() of that node. if noper is an EXACT
2638 and noper_next is the same as scan (our current
2639 position in the regex) then the EXACT branch is
2640 a possible optimization target. Once we have
2641 two or more consequetive such branches we can
2642 create a trie of the EXACT's contents and stich
2643 it in place. If the sequence represents all of
2644 the branches we eliminate the whole thing and
2645 replace it with a single TRIE. If it is a
2646 subsequence then we need to stitch it in. This
2647 means the first branch has to remain, and needs
2648 to be repointed at the item on the branch chain
2649 following the last branch optimized. This could
2650 be either a BRANCH, in which case the
2651 subsequence is internal, or it could be the
2652 item following the branch sequence in which
2653 case the subsequence is at the end.
2657 /* dont use tail as the end marker for this traverse */
2658 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2659 regnode * const noper = NEXTOPER( cur );
2660 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2661 regnode * const noper_next = regnext( noper );
2665 regprop(RExC_rx, mysv, cur);
2666 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2667 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2669 regprop(RExC_rx, mysv, noper);
2670 PerlIO_printf( Perl_debug_log, " -> %s",
2671 SvPV_nolen_const(mysv));
2674 regprop(RExC_rx, mysv, noper_next );
2675 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2676 SvPV_nolen_const(mysv));
2678 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2679 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2681 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2682 : PL_regkind[ OP( noper ) ] == EXACT )
2683 || OP(noper) == NOTHING )
2685 && noper_next == tail
2690 if ( !first || optype == NOTHING ) {
2691 if (!first) first = cur;
2692 optype = OP( noper );
2698 make_trie( pRExC_state,
2699 startbranch, first, cur, tail, count,
2702 if ( PL_regkind[ OP( noper ) ] == EXACT
2704 && noper_next == tail
2709 optype = OP( noper );
2719 regprop(RExC_rx, mysv, cur);
2720 PerlIO_printf( Perl_debug_log,
2721 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2722 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2726 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2727 #ifdef TRIE_STUDY_OPT
2728 if ( ((made == MADE_EXACT_TRIE &&
2729 startbranch == first)
2730 || ( first_non_open == first )) &&
2732 flags |= SCF_TRIE_RESTUDY;
2733 if ( startbranch == first
2736 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2746 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2747 scan = NEXTOPER(NEXTOPER(scan));
2748 } else /* single branch is optimized. */
2749 scan = NEXTOPER(scan);
2751 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2752 scan_frame *newframe = NULL;
2757 if (OP(scan) != SUSPEND) {
2758 /* set the pointer */
2759 if (OP(scan) == GOSUB) {
2761 RExC_recurse[ARG2L(scan)] = scan;
2762 start = RExC_open_parens[paren-1];
2763 end = RExC_close_parens[paren-1];
2766 start = RExC_rxi->program + 1;
2770 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2771 SAVEFREEPV(recursed);
2773 if (!PAREN_TEST(recursed,paren+1)) {
2774 PAREN_SET(recursed,paren+1);
2775 Newx(newframe,1,scan_frame);
2777 if (flags & SCF_DO_SUBSTR) {
2778 SCAN_COMMIT(pRExC_state,data,minlenp);
2779 data->longest = &(data->longest_float);
2781 is_inf = is_inf_internal = 1;
2782 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2783 cl_anything(pRExC_state, data->start_class);
2784 flags &= ~SCF_DO_STCLASS;
2787 Newx(newframe,1,scan_frame);
2790 end = regnext(scan);
2795 SAVEFREEPV(newframe);
2796 newframe->next = regnext(scan);
2797 newframe->last = last;
2798 newframe->stop = stopparen;
2799 newframe->prev = frame;
2809 else if (OP(scan) == EXACT) {
2810 I32 l = STR_LEN(scan);
2813 const U8 * const s = (U8*)STRING(scan);
2814 l = utf8_length(s, s + l);
2815 uc = utf8_to_uvchr(s, NULL);
2817 uc = *((U8*)STRING(scan));
2820 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2821 /* The code below prefers earlier match for fixed
2822 offset, later match for variable offset. */
2823 if (data->last_end == -1) { /* Update the start info. */
2824 data->last_start_min = data->pos_min;
2825 data->last_start_max = is_inf
2826 ? I32_MAX : data->pos_min + data->pos_delta;
2828 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2830 SvUTF8_on(data->last_found);
2832 SV * const sv = data->last_found;
2833 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2834 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2835 if (mg && mg->mg_len >= 0)
2836 mg->mg_len += utf8_length((U8*)STRING(scan),
2837 (U8*)STRING(scan)+STR_LEN(scan));
2839 data->last_end = data->pos_min + l;
2840 data->pos_min += l; /* As in the first entry. */
2841 data->flags &= ~SF_BEFORE_EOL;
2843 if (flags & SCF_DO_STCLASS_AND) {
2844 /* Check whether it is compatible with what we know already! */
2848 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2849 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2850 && (!(data->start_class->flags & ANYOF_FOLD)
2851 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2854 ANYOF_CLASS_ZERO(data->start_class);
2855 ANYOF_BITMAP_ZERO(data->start_class);
2857 ANYOF_BITMAP_SET(data->start_class, uc);
2858 data->start_class->flags &= ~ANYOF_EOS;
2860 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2862 else if (flags & SCF_DO_STCLASS_OR) {
2863 /* false positive possible if the class is case-folded */
2865 ANYOF_BITMAP_SET(data->start_class, uc);
2867 data->start_class->flags |= ANYOF_UNICODE_ALL;
2868 data->start_class->flags &= ~ANYOF_EOS;
2869 cl_and(data->start_class, and_withp);
2871 flags &= ~SCF_DO_STCLASS;
2873 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2874 I32 l = STR_LEN(scan);
2875 UV uc = *((U8*)STRING(scan));
2877 /* Search for fixed substrings supports EXACT only. */
2878 if (flags & SCF_DO_SUBSTR) {
2880 SCAN_COMMIT(pRExC_state, data, minlenp);
2883 const U8 * const s = (U8 *)STRING(scan);
2884 l = utf8_length(s, s + l);
2885 uc = utf8_to_uvchr(s, NULL);
2888 if (flags & SCF_DO_SUBSTR)
2890 if (flags & SCF_DO_STCLASS_AND) {
2891 /* Check whether it is compatible with what we know already! */
2895 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2896 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2897 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2899 ANYOF_CLASS_ZERO(data->start_class);
2900 ANYOF_BITMAP_ZERO(data->start_class);
2902 ANYOF_BITMAP_SET(data->start_class, uc);
2903 data->start_class->flags &= ~ANYOF_EOS;
2904 data->start_class->flags |= ANYOF_FOLD;
2905 if (OP(scan) == EXACTFL)
2906 data->start_class->flags |= ANYOF_LOCALE;
2909 else if (flags & SCF_DO_STCLASS_OR) {
2910 if (data->start_class->flags & ANYOF_FOLD) {
2911 /* false positive possible if the class is case-folded.
2912 Assume that the locale settings are the same... */
2914 ANYOF_BITMAP_SET(data->start_class, uc);
2915 data->start_class->flags &= ~ANYOF_EOS;
2917 cl_and(data->start_class, and_withp);
2919 flags &= ~SCF_DO_STCLASS;
2921 else if (strchr((const char*)PL_varies,OP(scan))) {
2922 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2923 I32 f = flags, pos_before = 0;
2924 regnode * const oscan = scan;
2925 struct regnode_charclass_class this_class;
2926 struct regnode_charclass_class *oclass = NULL;
2927 I32 next_is_eval = 0;
2929 switch (PL_regkind[OP(scan)]) {
2930 case WHILEM: /* End of (?:...)* . */
2931 scan = NEXTOPER(scan);
2934 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2935 next = NEXTOPER(scan);
2936 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2938 maxcount = REG_INFTY;
2939 next = regnext(scan);
2940 scan = NEXTOPER(scan);
2944 if (flags & SCF_DO_SUBSTR)
2949 if (flags & SCF_DO_STCLASS) {
2951 maxcount = REG_INFTY;
2952 next = regnext(scan);
2953 scan = NEXTOPER(scan);
2956 is_inf = is_inf_internal = 1;
2957 scan = regnext(scan);
2958 if (flags & SCF_DO_SUBSTR) {
2959 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2960 data->longest = &(data->longest_float);
2962 goto optimize_curly_tail;
2964 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2965 && (scan->flags == stopparen))
2970 mincount = ARG1(scan);
2971 maxcount = ARG2(scan);
2973 next = regnext(scan);
2974 if (OP(scan) == CURLYX) {
2975 I32 lp = (data ? *(data->last_closep) : 0);
2976 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2978 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2979 next_is_eval = (OP(scan) == EVAL);
2981 if (flags & SCF_DO_SUBSTR) {
2982 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2983 pos_before = data->pos_min;
2987 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2989 data->flags |= SF_IS_INF;
2991 if (flags & SCF_DO_STCLASS) {
2992 cl_init(pRExC_state, &this_class);
2993 oclass = data->start_class;
2994 data->start_class = &this_class;
2995 f |= SCF_DO_STCLASS_AND;
2996 f &= ~SCF_DO_STCLASS_OR;
2998 /* These are the cases when once a subexpression
2999 fails at a particular position, it cannot succeed
3000 even after backtracking at the enclosing scope.
3002 XXXX what if minimal match and we are at the
3003 initial run of {n,m}? */
3004 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3005 f &= ~SCF_WHILEM_VISITED_POS;
3007 /* This will finish on WHILEM, setting scan, or on NULL: */
3008 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3009 last, data, stopparen, recursed, NULL,
3011 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3013 if (flags & SCF_DO_STCLASS)
3014 data->start_class = oclass;
3015 if (mincount == 0 || minnext == 0) {
3016 if (flags & SCF_DO_STCLASS_OR) {
3017 cl_or(pRExC_state, data->start_class, &this_class);
3019 else if (flags & SCF_DO_STCLASS_AND) {
3020 /* Switch to OR mode: cache the old value of
3021 * data->start_class */
3023 StructCopy(data->start_class, and_withp,
3024 struct regnode_charclass_class);
3025 flags &= ~SCF_DO_STCLASS_AND;
3026 StructCopy(&this_class, data->start_class,
3027 struct regnode_charclass_class);
3028 flags |= SCF_DO_STCLASS_OR;
3029 data->start_class->flags |= ANYOF_EOS;
3031 } else { /* Non-zero len */
3032 if (flags & SCF_DO_STCLASS_OR) {
3033 cl_or(pRExC_state, data->start_class, &this_class);
3034 cl_and(data->start_class, and_withp);
3036 else if (flags & SCF_DO_STCLASS_AND)
3037 cl_and(data->start_class, &this_class);
3038 flags &= ~SCF_DO_STCLASS;
3040 if (!scan) /* It was not CURLYX, but CURLY. */
3042 if ( /* ? quantifier ok, except for (?{ ... }) */
3043 (next_is_eval || !(mincount == 0 && maxcount == 1))
3044 && (minnext == 0) && (deltanext == 0)
3045 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3046 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3047 && ckWARN(WARN_REGEXP))
3050 "Quantifier unexpected on zero-length expression");
3053 min += minnext * mincount;
3054 is_inf_internal |= ((maxcount == REG_INFTY
3055 && (minnext + deltanext) > 0)
3056 || deltanext == I32_MAX);
3057 is_inf |= is_inf_internal;
3058 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3060 /* Try powerful optimization CURLYX => CURLYN. */
3061 if ( OP(oscan) == CURLYX && data
3062 && data->flags & SF_IN_PAR
3063 && !(data->flags & SF_HAS_EVAL)
3064 && !deltanext && minnext == 1 ) {
3065 /* Try to optimize to CURLYN. */
3066 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3067 regnode * const nxt1 = nxt;
3074 if (!strchr((const char*)PL_simple,OP(nxt))
3075 && !(PL_regkind[OP(nxt)] == EXACT
3076 && STR_LEN(nxt) == 1))
3082 if (OP(nxt) != CLOSE)
3084 if (RExC_open_parens) {
3085 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3086 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3088 /* Now we know that nxt2 is the only contents: */
3089 oscan->flags = (U8)ARG(nxt);
3091 OP(nxt1) = NOTHING; /* was OPEN. */
3094 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3095 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3096 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3097 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3098 OP(nxt + 1) = OPTIMIZED; /* was count. */
3099 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3104 /* Try optimization CURLYX => CURLYM. */
3105 if ( OP(oscan) == CURLYX && data
3106 && !(data->flags & SF_HAS_PAR)
3107 && !(data->flags & SF_HAS_EVAL)
3108 && !deltanext /* atom is fixed width */
3109 && minnext != 0 /* CURLYM can't handle zero width */
3111 /* XXXX How to optimize if data == 0? */
3112 /* Optimize to a simpler form. */
3113 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3117 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3118 && (OP(nxt2) != WHILEM))
3120 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3121 /* Need to optimize away parenths. */
3122 if (data->flags & SF_IN_PAR) {
3123 /* Set the parenth number. */
3124 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3126 if (OP(nxt) != CLOSE)
3127 FAIL("Panic opt close");
3128 oscan->flags = (U8)ARG(nxt);
3129 if (RExC_open_parens) {
3130 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3131 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3133 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3137 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3138 OP(nxt + 1) = OPTIMIZED; /* was count. */
3139 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3140 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3143 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3144 regnode *nnxt = regnext(nxt1);
3147 if (reg_off_by_arg[OP(nxt1)])
3148 ARG_SET(nxt1, nxt2 - nxt1);
3149 else if (nxt2 - nxt1 < U16_MAX)
3150 NEXT_OFF(nxt1) = nxt2 - nxt1;
3152 OP(nxt) = NOTHING; /* Cannot beautify */
3157 /* Optimize again: */
3158 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3159 NULL, stopparen, recursed, NULL, 0,depth+1);
3164 else if ((OP(oscan) == CURLYX)
3165 && (flags & SCF_WHILEM_VISITED_POS)
3166 /* See the comment on a similar expression above.
3167 However, this time it not a subexpression
3168 we care about, but the expression itself. */
3169 && (maxcount == REG_INFTY)
3170 && data && ++data->whilem_c < 16) {
3171 /* This stays as CURLYX, we can put the count/of pair. */
3172 /* Find WHILEM (as in regexec.c) */
3173 regnode *nxt = oscan + NEXT_OFF(oscan);
3175 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3177 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3178 | (RExC_whilem_seen << 4)); /* On WHILEM */
3180 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3182 if (flags & SCF_DO_SUBSTR) {
3183 SV *last_str = NULL;
3184 int counted = mincount != 0;
3186 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3187 #if defined(SPARC64_GCC_WORKAROUND)
3190 const char *s = NULL;
3193 if (pos_before >= data->last_start_min)
3196 b = data->last_start_min;
3199 s = SvPV_const(data->last_found, l);
3200 old = b - data->last_start_min;
3203 I32 b = pos_before >= data->last_start_min
3204 ? pos_before : data->last_start_min;
3206 const char * const s = SvPV_const(data->last_found, l);
3207 I32 old = b - data->last_start_min;
3211 old = utf8_hop((U8*)s, old) - (U8*)s;
3214 /* Get the added string: */
3215 last_str = newSVpvn(s + old, l);
3217 SvUTF8_on(last_str);
3218 if (deltanext == 0 && pos_before == b) {
3219 /* What was added is a constant string */
3221 SvGROW(last_str, (mincount * l) + 1);
3222 repeatcpy(SvPVX(last_str) + l,
3223 SvPVX_const(last_str), l, mincount - 1);
3224 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3225 /* Add additional parts. */
3226 SvCUR_set(data->last_found,
3227 SvCUR(data->last_found) - l);
3228 sv_catsv(data->last_found, last_str);
3230 SV * sv = data->last_found;
3232 SvUTF8(sv) && SvMAGICAL(sv) ?
3233 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3234 if (mg && mg->mg_len >= 0)
3235 mg->mg_len += CHR_SVLEN(last_str);
3237 data->last_end += l * (mincount - 1);
3240 /* start offset must point into the last copy */
3241 data->last_start_min += minnext * (mincount - 1);
3242 data->last_start_max += is_inf ? I32_MAX
3243 : (maxcount - 1) * (minnext + data->pos_delta);
3246 /* It is counted once already... */
3247 data->pos_min += minnext * (mincount - counted);
3248 data->pos_delta += - counted * deltanext +
3249 (minnext + deltanext) * maxcount - minnext * mincount;
3250 if (mincount != maxcount) {
3251 /* Cannot extend fixed substrings found inside
3253 SCAN_COMMIT(pRExC_state,data,minlenp);
3254 if (mincount && last_str) {
3255 SV * const sv = data->last_found;
3256 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3261 sv_setsv(sv, last_str);
3262 data->last_end = data->pos_min;
3263 data->last_start_min =
3264 data->pos_min - CHR_SVLEN(last_str);
3265 data->last_start_max = is_inf
3267 : data->pos_min + data->pos_delta
3268 - CHR_SVLEN(last_str);
3270 data->longest = &(data->longest_float);
3272 SvREFCNT_dec(last_str);
3274 if (data && (fl & SF_HAS_EVAL))
3275 data->flags |= SF_HAS_EVAL;
3276 optimize_curly_tail:
3277 if (OP(oscan) != CURLYX) {
3278 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3280 NEXT_OFF(oscan) += NEXT_OFF(next);
3283 default: /* REF and CLUMP only? */
3284 if (flags & SCF_DO_SUBSTR) {
3285 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3286 data->longest = &(data->longest_float);
3288 is_inf = is_inf_internal = 1;
3289 if (flags & SCF_DO_STCLASS_OR)
3290 cl_anything(pRExC_state, data->start_class);
3291 flags &= ~SCF_DO_STCLASS;
3295 else if (strchr((const char*)PL_simple,OP(scan))) {
3298 if (flags & SCF_DO_SUBSTR) {
3299 SCAN_COMMIT(pRExC_state,data,minlenp);
3303 if (flags & SCF_DO_STCLASS) {
3304 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3306 /* Some of the logic below assumes that switching
3307 locale on will only add false positives. */
3308 switch (PL_regkind[OP(scan)]) {
3312 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3313 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3314 cl_anything(pRExC_state, data->start_class);
3317 if (OP(scan) == SANY)
3319 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3320 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3321 || (data->start_class->flags & ANYOF_CLASS));
3322 cl_anything(pRExC_state, data->start_class);
3324 if (flags & SCF_DO_STCLASS_AND || !value)
3325 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3328 if (flags & SCF_DO_STCLASS_AND)
3329 cl_and(data->start_class,
3330 (struct regnode_charclass_class*)scan);
3332 cl_or(pRExC_state, data->start_class,
3333 (struct regnode_charclass_class*)scan);
3336 if (flags & SCF_DO_STCLASS_AND) {
3337 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3338 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3339 for (value = 0; value < 256; value++)
3340 if (!isALNUM(value))
3341 ANYOF_BITMAP_CLEAR(data->start_class, value);
3345 if (data->start_class->flags & ANYOF_LOCALE)
3346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3348 for (value = 0; value < 256; value++)
3350 ANYOF_BITMAP_SET(data->start_class, value);
3355 if (flags & SCF_DO_STCLASS_AND) {
3356 if (data->start_class->flags & ANYOF_LOCALE)
3357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3361 data->start_class->flags |= ANYOF_LOCALE;
3365 if (flags & SCF_DO_STCLASS_AND) {
3366 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3367 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3368 for (value = 0; value < 256; value++)
3370 ANYOF_BITMAP_CLEAR(data->start_class, value);
3374 if (data->start_class->flags & ANYOF_LOCALE)
3375 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3377 for (value = 0; value < 256; value++)
3378 if (!isALNUM(value))
3379 ANYOF_BITMAP_SET(data->start_class, value);
3384 if (flags & SCF_DO_STCLASS_AND) {
3385 if (data->start_class->flags & ANYOF_LOCALE)
3386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3389 data->start_class->flags |= ANYOF_LOCALE;
3390 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3394 if (flags & SCF_DO_STCLASS_AND) {
3395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3397 for (value = 0; value < 256; value++)
3398 if (!isSPACE(value))
3399 ANYOF_BITMAP_CLEAR(data->start_class, value);
3403 if (data->start_class->flags & ANYOF_LOCALE)
3404 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3406 for (value = 0; value < 256; value++)
3408 ANYOF_BITMAP_SET(data->start_class, value);
3413 if (flags & SCF_DO_STCLASS_AND) {
3414 if (data->start_class->flags & ANYOF_LOCALE)
3415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3418 data->start_class->flags |= ANYOF_LOCALE;
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3423 if (flags & SCF_DO_STCLASS_AND) {
3424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3426 for (value = 0; value < 256; value++)
3428 ANYOF_BITMAP_CLEAR(data->start_class, value);
3432 if (data->start_class->flags & ANYOF_LOCALE)
3433 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3435 for (value = 0; value < 256; value++)
3436 if (!isSPACE(value))
3437 ANYOF_BITMAP_SET(data->start_class, value);
3442 if (flags & SCF_DO_STCLASS_AND) {
3443 if (data->start_class->flags & ANYOF_LOCALE) {
3444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3445 for (value = 0; value < 256; value++)
3446 if (!isSPACE(value))
3447 ANYOF_BITMAP_CLEAR(data->start_class, value);
3451 data->start_class->flags |= ANYOF_LOCALE;
3452 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3456 if (flags & SCF_DO_STCLASS_AND) {
3457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3458 for (value = 0; value < 256; value++)
3459 if (!isDIGIT(value))
3460 ANYOF_BITMAP_CLEAR(data->start_class, value);
3463 if (data->start_class->flags & ANYOF_LOCALE)
3464 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3466 for (value = 0; value < 256; value++)
3468 ANYOF_BITMAP_SET(data->start_class, value);
3473 if (flags & SCF_DO_STCLASS_AND) {
3474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3475 for (value = 0; value < 256; value++)
3477 ANYOF_BITMAP_CLEAR(data->start_class, value);
3480 if (data->start_class->flags & ANYOF_LOCALE)
3481 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3483 for (value = 0; value < 256; value++)
3484 if (!isDIGIT(value))
3485 ANYOF_BITMAP_SET(data->start_class, value);
3490 if (flags & SCF_DO_STCLASS_OR)
3491 cl_and(data->start_class, and_withp);
3492 flags &= ~SCF_DO_STCLASS;
3495 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3496 data->flags |= (OP(scan) == MEOL
3500 else if ( PL_regkind[OP(scan)] == BRANCHJ
3501 /* Lookbehind, or need to calculate parens/evals/stclass: */
3502 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3503 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3504 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3505 || OP(scan) == UNLESSM )
3507 /* Negative Lookahead/lookbehind
3508 In this case we can't do fixed string optimisation.
3511 I32 deltanext, minnext, fake = 0;
3513 struct regnode_charclass_class intrnl;
3516 data_fake.flags = 0;
3518 data_fake.whilem_c = data->whilem_c;
3519 data_fake.last_closep = data->last_closep;
3522 data_fake.last_closep = &fake;
3523 data_fake.pos_delta = delta;
3524 if ( flags & SCF_DO_STCLASS && !scan->flags
3525 && OP(scan) == IFMATCH ) { /* Lookahead */
3526 cl_init(pRExC_state, &intrnl);
3527 data_fake.start_class = &intrnl;
3528 f |= SCF_DO_STCLASS_AND;
3530 if (flags & SCF_WHILEM_VISITED_POS)
3531 f |= SCF_WHILEM_VISITED_POS;
3532 next = regnext(scan);
3533 nscan = NEXTOPER(NEXTOPER(scan));
3534 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3535 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3538 FAIL("Variable length lookbehind not implemented");
3540 else if (minnext > (I32)U8_MAX) {
3541 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3543 scan->flags = (U8)minnext;
3546 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3548 if (data_fake.flags & SF_HAS_EVAL)
3549 data->flags |= SF_HAS_EVAL;
3550 data->whilem_c = data_fake.whilem_c;
3552 if (f & SCF_DO_STCLASS_AND) {
3553 const int was = (data->start_class->flags & ANYOF_EOS);
3555 cl_and(data->start_class, &intrnl);
3557 data->start_class->flags |= ANYOF_EOS;
3560 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3562 /* Positive Lookahead/lookbehind
3563 In this case we can do fixed string optimisation,
3564 but we must be careful about it. Note in the case of
3565 lookbehind the positions will be offset by the minimum
3566 length of the pattern, something we won't know about
3567 until after the recurse.
3569 I32 deltanext, fake = 0;
3571 struct regnode_charclass_class intrnl;
3573 /* We use SAVEFREEPV so that when the full compile
3574 is finished perl will clean up the allocated
3575 minlens when its all done. This was we don't
3576 have to worry about freeing them when we know
3577 they wont be used, which would be a pain.
3580 Newx( minnextp, 1, I32 );
3581 SAVEFREEPV(minnextp);
3584 StructCopy(data, &data_fake, scan_data_t);
3585 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3588 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3589 data_fake.last_found=newSVsv(data->last_found);
3593 data_fake.last_closep = &fake;
3594 data_fake.flags = 0;
3595 data_fake.pos_delta = delta;
3597 data_fake.flags |= SF_IS_INF;
3598 if ( flags & SCF_DO_STCLASS && !scan->flags
3599 && OP(scan) == IFMATCH ) { /* Lookahead */
3600 cl_init(pRExC_state, &intrnl);
3601 data_fake.start_class = &intrnl;
3602 f |= SCF_DO_STCLASS_AND;
3604 if (flags & SCF_WHILEM_VISITED_POS)
3605 f |= SCF_WHILEM_VISITED_POS;
3606 next = regnext(scan);
3607 nscan = NEXTOPER(NEXTOPER(scan));
3609 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3610 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3613 FAIL("Variable length lookbehind not implemented");
3615 else if (*minnextp > (I32)U8_MAX) {
3616 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3618 scan->flags = (U8)*minnextp;
3623 if (f & SCF_DO_STCLASS_AND) {
3624 const int was = (data->start_class->flags & ANYOF_EOS);
3626 cl_and(data->start_class, &intrnl);
3628 data->start_class->flags |= ANYOF_EOS;
3631 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3633 if (data_fake.flags & SF_HAS_EVAL)
3634 data->flags |= SF_HAS_EVAL;
3635 data->whilem_c = data_fake.whilem_c;
3636 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3637 if (RExC_rx->minlen<*minnextp)
3638 RExC_rx->minlen=*minnextp;
3639 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3640 SvREFCNT_dec(data_fake.last_found);
3642 if ( data_fake.minlen_fixed != minlenp )
3644 data->offset_fixed= data_fake.offset_fixed;
3645 data->minlen_fixed= data_fake.minlen_fixed;
3646 data->lookbehind_fixed+= scan->flags;
3648 if ( data_fake.minlen_float != minlenp )
3650 data->minlen_float= data_fake.minlen_float;
3651 data->offset_float_min=data_fake.offset_float_min;
3652 data->offset_float_max=data_fake.offset_float_max;
3653 data->lookbehind_float+= scan->flags;
3662 else if (OP(scan) == OPEN) {
3663 if (stopparen != (I32)ARG(scan))
3666 else if (OP(scan) == CLOSE) {
3667 if (stopparen == (I32)ARG(scan)) {
3670 if ((I32)ARG(scan) == is_par) {
3671 next = regnext(scan);
3673 if ( next && (OP(next) != WHILEM) && next < last)
3674 is_par = 0; /* Disable optimization */
3677 *(data->last_closep) = ARG(scan);
3679 else if (OP(scan) == EVAL) {
3681 data->flags |= SF_HAS_EVAL;
3683 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3684 if (flags & SCF_DO_SUBSTR) {
3685 SCAN_COMMIT(pRExC_state,data,minlenp);
3686 flags &= ~SCF_DO_SUBSTR;
3688 if (data && OP(scan)==ACCEPT) {
3689 data->flags |= SCF_SEEN_ACCEPT;
3694 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3696 if (flags & SCF_DO_SUBSTR) {
3697 SCAN_COMMIT(pRExC_state,data,minlenp);
3698 data->longest = &(data->longest_float);
3700 is_inf = is_inf_internal = 1;
3701 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3702 cl_anything(pRExC_state, data->start_class);
3703 flags &= ~SCF_DO_STCLASS;
3705 else if (OP(scan) == GPOS) {
3706 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3707 !(delta || is_inf || (data && data->pos_delta)))
3709 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3710 RExC_rx->extflags |= RXf_ANCH_GPOS;
3711 if (RExC_rx->gofs < (U32)min)
3712 RExC_rx->gofs = min;
3714 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3718 #ifdef TRIE_STUDY_OPT
3719 #ifdef FULL_TRIE_STUDY
3720 else if (PL_regkind[OP(scan)] == TRIE) {
3721 /* NOTE - There is similar code to this block above for handling
3722 BRANCH nodes on the initial study. If you change stuff here
3724 regnode *trie_node= scan;
3725 regnode *tail= regnext(scan);
3726 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3727 I32 max1 = 0, min1 = I32_MAX;
3728 struct regnode_charclass_class accum;
3730 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3731 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3732 if (flags & SCF_DO_STCLASS)
3733 cl_init_zero(pRExC_state, &accum);
3739 const regnode *nextbranch= NULL;
3742 for ( word=1 ; word <= trie->wordcount ; word++)
3744 I32 deltanext=0, minnext=0, f = 0, fake;
3745 struct regnode_charclass_class this_class;
3747 data_fake.flags = 0;
3749 data_fake.whilem_c = data->whilem_c;
3750 data_fake.last_closep = data->last_closep;
3753 data_fake.last_closep = &fake;
3754 data_fake.pos_delta = delta;
3755 if (flags & SCF_DO_STCLASS) {
3756 cl_init(pRExC_state, &this_class);
3757 data_fake.start_class = &this_class;
3758 f = SCF_DO_STCLASS_AND;
3760 if (flags & SCF_WHILEM_VISITED_POS)
3761 f |= SCF_WHILEM_VISITED_POS;
3763 if (trie->jump[word]) {
3765 nextbranch = trie_node + trie->jump[0];
3766 scan= trie_node + trie->jump[word];
3767 /* We go from the jump point to the branch that follows
3768 it. Note this means we need the vestigal unused branches
3769 even though they arent otherwise used.
3771 minnext = study_chunk(pRExC_state, &scan, minlenp,
3772 &deltanext, (regnode *)nextbranch, &data_fake,
3773 stopparen, recursed, NULL, f,depth+1);
3775 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3776 nextbranch= regnext((regnode*)nextbranch);
3778 if (min1 > (I32)(minnext + trie->minlen))
3779 min1 = minnext + trie->minlen;
3780 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3781 max1 = minnext + deltanext + trie->maxlen;
3782 if (deltanext == I32_MAX)
3783 is_inf = is_inf_internal = 1;
3785 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3787 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3788 if ( stopmin > min + min1)
3789 stopmin = min + min1;
3790 flags &= ~SCF_DO_SUBSTR;
3792 data->flags |= SCF_SEEN_ACCEPT;
3795 if (data_fake.flags & SF_HAS_EVAL)
3796 data->flags |= SF_HAS_EVAL;
3797 data->whilem_c = data_fake.whilem_c;
3799 if (flags & SCF_DO_STCLASS)
3800 cl_or(pRExC_state, &accum, &this_class);
3803 if (flags & SCF_DO_SUBSTR) {
3804 data->pos_min += min1;
3805 data->pos_delta += max1 - min1;
3806 if (max1 != min1 || is_inf)
3807 data->longest = &(data->longest_float);
3810 delta += max1 - min1;