5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1056 PERL_UNUSED_ARG(depth);
1058 GET_RE_DEBUG_FLAGS_DECL;
1060 ARG_SET( stclass, data_slot );
1061 Newxz( aho, 1, reg_ac_data );
1062 RExC_rx->data->data[ data_slot ] = (void*)aho;
1064 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1065 (trie->laststate+1)*sizeof(reg_trie_state));
1066 Newxz( q, numstates, U32);
1067 Newxz( aho->fail, numstates, U32 );
1070 fail[ 0 ] = fail[ 1 ] = 1;
1072 for ( charid = 0; charid < ucharcount ; charid++ ) {
1073 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1075 q[ q_write ] = newstate;
1076 /* set to point at the root */
1077 fail[ q[ q_write++ ] ]=1;
1080 while ( q_read < q_write) {
1081 const U32 cur = q[ q_read++ % numstates ];
1082 base = trie->states[ cur ].trans.base;
1084 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1085 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1087 U32 fail_state = cur;
1090 fail_state = fail[ fail_state ];
1091 fail_base = aho->states[ fail_state ].trans.base;
1092 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1094 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1095 fail[ ch_state ] = fail_state;
1096 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1098 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1100 q[ q_write++ % numstates] = ch_state;
1105 DEBUG_TRIE_COMPILE_MORE_r({
1106 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1107 for( q_read=2; q_read<numstates; q_read++ ) {
1108 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1110 PerlIO_printf(Perl_debug_log, "\n");
1113 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1119 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1122 /* first pass, loop through and scan words */
1123 reg_trie_data *trie;
1125 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1130 /* we just use folder as a flag in utf8 */
1131 const U8 * const folder = ( flags == EXACTF
1133 : ( flags == EXACTFL
1139 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1140 SV *re_trie_maxbuff;
1142 /* these are only used during construction but are useful during
1143 * debugging so we store them in the struct when debugging.
1144 * Wordcount is actually superfluous in debugging as we have
1145 * (AV*)trie->words to use for it, but that's not available when
1146 * not debugging... We could make the macro use the AV during
1147 * debugging though...
1149 U16 trie_wordcount=0;
1150 STRLEN trie_charcount=0;
1151 /*U32 trie_laststate=0;*/
1152 AV *trie_revcharmap;
1153 PERL_UNUSED_ARG(depth);
1155 GET_RE_DEBUG_FLAGS_DECL;
1157 Newxz( trie, 1, reg_trie_data );
1159 trie->startstate = 1;
1160 RExC_rx->data->data[ data_slot ] = (void*)trie;
1161 Newxz( trie->charmap, 256, U16 );
1162 if (!(UTF && folder))
1163 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1165 trie->words = newAV();
1167 TRIE_REVCHARMAP(trie) = newAV();
1169 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1170 if (!SvIOK(re_trie_maxbuff)) {
1171 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1174 PerlIO_printf( Perl_debug_log,
1175 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1176 (int)depth * 2 + 2, "",
1177 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1178 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1180 /* -- First loop and Setup --
1182 We first traverse the branches and scan each word to determine if it
1183 contains widechars, and how many unique chars there are, this is
1184 important as we have to build a table with at least as many columns as we
1187 We use an array of integers to represent the character codes 0..255
1188 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1189 native representation of the character value as the key and IV's for the
1192 *TODO* If we keep track of how many times each character is used we can
1193 remap the columns so that the table compression later on is more
1194 efficient in terms of memory by ensuring most common value is in the
1195 middle and the least common are on the outside. IMO this would be better
1196 than a most to least common mapping as theres a decent chance the most
1197 common letter will share a node with the least common, meaning the node
1198 will not be compressable. With a middle is most common approach the worst
1199 case is when we have the least common nodes twice.
1203 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1204 regnode * const noper = NEXTOPER( cur );
1205 const U8 *uc = (U8*)STRING( noper );
1206 const U8 * const e = uc + STR_LEN( noper );
1208 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1209 const U8 *scan = (U8*)NULL;
1210 U32 wordlen = 0; /* required init */
1213 TRIE_WORDCOUNT(trie)++;
1214 if (OP(noper) == NOTHING) {
1219 TRIE_BITMAP_SET(trie,*uc);
1220 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1222 for ( ; uc < e ; uc += len ) {
1223 TRIE_CHARCOUNT(trie)++;
1227 if ( !trie->charmap[ uvc ] ) {
1228 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1230 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1235 if ( !trie->widecharmap )
1236 trie->widecharmap = newHV();
1238 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1241 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1243 if ( !SvTRUE( *svpp ) ) {
1244 sv_setiv( *svpp, ++trie->uniquecharcount );
1249 if( cur == first ) {
1252 } else if (chars < trie->minlen) {
1254 } else if (chars > trie->maxlen) {
1258 } /* end first pass */
1259 DEBUG_TRIE_COMPILE_r(
1260 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1261 (int)depth * 2 + 2,"",
1262 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1263 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1264 (int)trie->minlen, (int)trie->maxlen )
1266 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1269 We now know what we are dealing with in terms of unique chars and
1270 string sizes so we can calculate how much memory a naive
1271 representation using a flat table will take. If it's over a reasonable
1272 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1273 conservative but potentially much slower representation using an array
1276 At the end we convert both representations into the same compressed
1277 form that will be used in regexec.c for matching with. The latter
1278 is a form that cannot be used to construct with but has memory
1279 properties similar to the list form and access properties similar
1280 to the table form making it both suitable for fast searches and
1281 small enough that its feasable to store for the duration of a program.
1283 See the comment in the code where the compressed table is produced
1284 inplace from the flat tabe representation for an explanation of how
1285 the compression works.
1290 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1292 Second Pass -- Array Of Lists Representation
1294 Each state will be represented by a list of charid:state records
1295 (reg_trie_trans_le) the first such element holds the CUR and LEN
1296 points of the allocated array. (See defines above).
1298 We build the initial structure using the lists, and then convert
1299 it into the compressed table form which allows faster lookups
1300 (but cant be modified once converted).
1303 STRLEN transcount = 1;
1305 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1309 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1311 regnode * const noper = NEXTOPER( cur );
1312 U8 *uc = (U8*)STRING( noper );
1313 const U8 * const e = uc + STR_LEN( noper );
1314 U32 state = 1; /* required init */
1315 U16 charid = 0; /* sanity init */
1316 U8 *scan = (U8*)NULL; /* sanity init */
1317 STRLEN foldlen = 0; /* required init */
1318 U32 wordlen = 0; /* required init */
1319 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1321 if (OP(noper) != NOTHING) {
1322 for ( ; uc < e ; uc += len ) {
1327 charid = trie->charmap[ uvc ];
1329 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1333 charid=(U16)SvIV( *svpp );
1342 if ( !trie->states[ state ].trans.list ) {
1343 TRIE_LIST_NEW( state );
1345 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1346 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1347 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1352 newstate = next_alloc++;
1353 TRIE_LIST_PUSH( state, charid, newstate );
1358 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1360 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1363 TRIE_HANDLE_WORD(state);
1365 } /* end second pass */
1367 TRIE_LASTSTATE(trie) = next_alloc;
1368 Renew( trie->states, next_alloc, reg_trie_state );
1370 /* and now dump it out before we compress it */
1371 DEBUG_TRIE_COMPILE_MORE_r(
1372 dump_trie_interim_list(trie,next_alloc,depth+1)
1375 Newxz( trie->trans, transcount ,reg_trie_trans );
1382 for( state=1 ; state < next_alloc ; state ++ ) {
1386 DEBUG_TRIE_COMPILE_MORE_r(
1387 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1391 if (trie->states[state].trans.list) {
1392 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1396 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1397 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1398 if ( forid < minid ) {
1400 } else if ( forid > maxid ) {
1404 if ( transcount < tp + maxid - minid + 1) {
1406 Renew( trie->trans, transcount, reg_trie_trans );
1407 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1409 base = trie->uniquecharcount + tp - minid;
1410 if ( maxid == minid ) {
1412 for ( ; zp < tp ; zp++ ) {
1413 if ( ! trie->trans[ zp ].next ) {
1414 base = trie->uniquecharcount + zp - minid;
1415 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1416 trie->trans[ zp ].check = state;
1422 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1423 trie->trans[ tp ].check = state;
1428 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1429 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1430 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1431 trie->trans[ tid ].check = state;
1433 tp += ( maxid - minid + 1 );
1435 Safefree(trie->states[ state ].trans.list);
1438 DEBUG_TRIE_COMPILE_MORE_r(
1439 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1442 trie->states[ state ].trans.base=base;
1444 trie->lasttrans = tp + 1;
1448 Second Pass -- Flat Table Representation.
1450 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1451 We know that we will need Charcount+1 trans at most to store the data
1452 (one row per char at worst case) So we preallocate both structures
1453 assuming worst case.
1455 We then construct the trie using only the .next slots of the entry
1458 We use the .check field of the first entry of the node temporarily to
1459 make compression both faster and easier by keeping track of how many non
1460 zero fields are in the node.
1462 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1465 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1466 number representing the first entry of the node, and state as a
1467 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1468 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1469 are 2 entrys per node. eg:
1477 The table is internally in the right hand, idx form. However as we also
1478 have to deal with the states array which is indexed by nodenum we have to
1479 use TRIE_NODENUM() to convert.
1484 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1486 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1487 next_alloc = trie->uniquecharcount + 1;
1490 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1492 regnode * const noper = NEXTOPER( cur );
1493 const U8 *uc = (U8*)STRING( noper );
1494 const U8 * const e = uc + STR_LEN( noper );
1496 U32 state = 1; /* required init */
1498 U16 charid = 0; /* sanity init */
1499 U32 accept_state = 0; /* sanity init */
1500 U8 *scan = (U8*)NULL; /* sanity init */
1502 STRLEN foldlen = 0; /* required init */
1503 U32 wordlen = 0; /* required init */
1504 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1506 if ( OP(noper) != NOTHING ) {
1507 for ( ; uc < e ; uc += len ) {
1512 charid = trie->charmap[ uvc ];
1514 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1515 charid = svpp ? (U16)SvIV(*svpp) : 0;
1519 if ( !trie->trans[ state + charid ].next ) {
1520 trie->trans[ state + charid ].next = next_alloc;
1521 trie->trans[ state ].check++;
1522 next_alloc += trie->uniquecharcount;
1524 state = trie->trans[ state + charid ].next;
1526 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1528 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1531 accept_state = TRIE_NODENUM( state );
1532 TRIE_HANDLE_WORD(accept_state);
1534 } /* end second pass */
1536 /* and now dump it out before we compress it */
1537 DEBUG_TRIE_COMPILE_MORE_r(
1538 dump_trie_interim_table(trie,next_alloc,depth+1)
1543 * Inplace compress the table.*
1545 For sparse data sets the table constructed by the trie algorithm will
1546 be mostly 0/FAIL transitions or to put it another way mostly empty.
1547 (Note that leaf nodes will not contain any transitions.)
1549 This algorithm compresses the tables by eliminating most such
1550 transitions, at the cost of a modest bit of extra work during lookup:
1552 - Each states[] entry contains a .base field which indicates the
1553 index in the state[] array wheres its transition data is stored.
1555 - If .base is 0 there are no valid transitions from that node.
1557 - If .base is nonzero then charid is added to it to find an entry in
1560 -If trans[states[state].base+charid].check!=state then the
1561 transition is taken to be a 0/Fail transition. Thus if there are fail
1562 transitions at the front of the node then the .base offset will point
1563 somewhere inside the previous nodes data (or maybe even into a node
1564 even earlier), but the .check field determines if the transition is
1567 The following process inplace converts the table to the compressed
1568 table: We first do not compress the root node 1,and mark its all its
1569 .check pointers as 1 and set its .base pointer as 1 as well. This
1570 allows to do a DFA construction from the compressed table later, and
1571 ensures that any .base pointers we calculate later are greater than
1574 - We set 'pos' to indicate the first entry of the second node.
1576 - We then iterate over the columns of the node, finding the first and
1577 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1578 and set the .check pointers accordingly, and advance pos
1579 appropriately and repreat for the next node. Note that when we copy
1580 the next pointers we have to convert them from the original
1581 NODEIDX form to NODENUM form as the former is not valid post
1584 - If a node has no transitions used we mark its base as 0 and do not
1585 advance the pos pointer.
1587 - If a node only has one transition we use a second pointer into the
1588 structure to fill in allocated fail transitions from other states.
1589 This pointer is independent of the main pointer and scans forward
1590 looking for null transitions that are allocated to a state. When it
1591 finds one it writes the single transition into the "hole". If the
1592 pointer doesnt find one the single transition is appeneded as normal.
1594 - Once compressed we can Renew/realloc the structures to release the
1597 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1598 specifically Fig 3.47 and the associated pseudocode.
1602 const U32 laststate = TRIE_NODENUM( next_alloc );
1605 TRIE_LASTSTATE(trie) = laststate;
1607 for ( state = 1 ; state < laststate ; state++ ) {
1609 const U32 stateidx = TRIE_NODEIDX( state );
1610 const U32 o_used = trie->trans[ stateidx ].check;
1611 U32 used = trie->trans[ stateidx ].check;
1612 trie->trans[ stateidx ].check = 0;
1614 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1615 if ( flag || trie->trans[ stateidx + charid ].next ) {
1616 if ( trie->trans[ stateidx + charid ].next ) {
1618 for ( ; zp < pos ; zp++ ) {
1619 if ( ! trie->trans[ zp ].next ) {
1623 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1624 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1625 trie->trans[ zp ].check = state;
1626 if ( ++zp > pos ) pos = zp;
1633 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1635 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1636 trie->trans[ pos ].check = state;
1641 trie->lasttrans = pos + 1;
1642 Renew( trie->states, laststate + 1, reg_trie_state);
1643 DEBUG_TRIE_COMPILE_MORE_r(
1644 PerlIO_printf( Perl_debug_log,
1645 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1646 (int)depth * 2 + 2,"",
1647 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1650 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1653 } /* end table compress */
1655 /* resize the trans array to remove unused space */
1656 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1658 /* and now dump out the compressed format */
1659 DEBUG_TRIE_COMPILE_r(
1660 dump_trie(trie,depth+1)
1663 { /* Modify the program and insert the new TRIE node*/
1665 U8 nodetype =(U8)(flags & 0xFF);
1672 This means we convert either the first branch or the first Exact,
1673 depending on whether the thing following (in 'last') is a branch
1674 or not and whther first is the startbranch (ie is it a sub part of
1675 the alternation or is it the whole thing.)
1676 Assuming its a sub part we conver the EXACT otherwise we convert
1677 the whole branch sequence, including the first.
1679 /* Find the node we are going to overwrite */
1680 if ( first == startbranch && OP( last ) != BRANCH ) {
1681 /* whole branch chain */
1684 const regnode *nop = NEXTOPER( convert );
1685 mjd_offset= Node_Offset((nop));
1686 mjd_nodelen= Node_Length((nop));
1689 /* branch sub-chain */
1690 convert = NEXTOPER( first );
1691 NEXT_OFF( first ) = (U16)(last - first);
1693 mjd_offset= Node_Offset((convert));
1694 mjd_nodelen= Node_Length((convert));
1698 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1699 (int)depth * 2 + 2, "",
1700 mjd_offset,mjd_nodelen)
1703 /* But first we check to see if there is a common prefix we can
1704 split out as an EXACT and put in front of the TRIE node. */
1705 trie->startstate= 1;
1706 if ( trie->bitmap && !trie->widecharmap ) {
1709 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1710 (int)depth * 2 + 2, "",
1711 TRIE_LASTSTATE(trie))
1713 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1717 const U32 base = trie->states[ state ].trans.base;
1719 if ( trie->states[state].wordnum )
1722 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1723 if ( ( base + ofs >= trie->uniquecharcount ) &&
1724 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1725 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1727 if ( ++count > 1 ) {
1728 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1729 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1730 if ( state == 1 ) break;
1732 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1734 PerlIO_printf(Perl_debug_log,
1735 "%*sNew Start State=%"UVuf" Class: [",
1736 (int)depth * 2 + 2, "",
1739 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1740 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1742 TRIE_BITMAP_SET(trie,*ch);
1744 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1746 PerlIO_printf(Perl_debug_log, (char*)ch)
1750 TRIE_BITMAP_SET(trie,*ch);
1752 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1753 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1759 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1760 const char *ch = SvPV_nolen_const( *tmp );
1762 PerlIO_printf( Perl_debug_log,
1763 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1764 (int)depth * 2 + 2, "",
1768 OP( convert ) = nodetype;
1769 str=STRING(convert);
1778 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1784 regnode *n = convert+NODE_SZ_STR(convert);
1785 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1786 trie->startstate = state;
1787 trie->minlen -= (state - 1);
1788 trie->maxlen -= (state - 1);
1790 regnode *fix = convert;
1792 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1793 while( ++fix < n ) {
1794 Set_Node_Offset_Length(fix, 0, 0);
1800 NEXT_OFF(convert) = (U16)(tail - convert);
1804 if ( trie->maxlen ) {
1805 OP( convert ) = TRIE;
1806 NEXT_OFF( convert ) = (U16)(tail - convert);
1807 ARG_SET( convert, data_slot );
1809 /* store the type in the flags */
1810 convert->flags = nodetype;
1811 /* XXX We really should free up the resource in trie now, as we wont use them */
1813 /* needed for dumping*/
1815 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1816 regnode *opt = convert;
1817 while (++opt<optimize) {
1818 Set_Node_Offset_Length(opt,0,0);
1820 /* We now need to mark all of the space originally used by the
1821 branches as optimized away. This keeps the dumpuntil from
1822 throwing a wobbly as it doesnt use regnext() to traverse the
1824 We also "fix" the offsets
1826 while( optimize < last ) {
1827 mjd_nodelen += Node_Length((optimize));
1828 OP( optimize ) = OPTIMIZED;
1829 Set_Node_Offset_Length(optimize,0,0);
1832 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1834 } /* end node insert */
1836 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1842 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1843 * These need to be revisited when a newer toolchain becomes available.
1845 #if defined(__sparc64__) && defined(__GNUC__)
1846 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1847 # undef SPARC64_GCC_WORKAROUND
1848 # define SPARC64_GCC_WORKAROUND 1
1852 #define DEBUG_PEEP(str,scan,depth) \
1853 DEBUG_OPTIMISE_r({ \
1854 SV * const mysv=sv_newmortal(); \
1855 regnode *Next = regnext(scan); \
1856 regprop(RExC_rx, mysv, scan); \
1857 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1858 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1859 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1862 #define JOIN_EXACT(scan,min,flags) \
1863 if (PL_regkind[OP(scan)] == EXACT) \
1864 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1867 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1868 /* Merge several consecutive EXACTish nodes into one. */
1869 regnode *n = regnext(scan);
1871 regnode *next = scan + NODE_SZ_STR(scan);
1875 regnode *stop = scan;
1877 PERL_UNUSED_ARG(flags);
1878 PERL_UNUSED_ARG(val);
1879 PERL_UNUSED_ARG(depth);
1881 GET_RE_DEBUG_FLAGS_DECL;
1882 DEBUG_PEEP("join",scan,depth);
1884 /* Skip NOTHING, merge EXACT*. */
1886 ( PL_regkind[OP(n)] == NOTHING ||
1887 (stringok && (OP(n) == OP(scan))))
1889 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1891 if (OP(n) == TAIL || n > next)
1893 if (PL_regkind[OP(n)] == NOTHING) {
1895 DEBUG_PEEP("skip:",n,depth);
1896 NEXT_OFF(scan) += NEXT_OFF(n);
1897 next = n + NODE_STEP_REGNODE;
1904 else if (stringok) {
1905 const int oldl = STR_LEN(scan);
1906 regnode * const nnext = regnext(n);
1908 DEBUG_PEEP("merg",n,depth);
1911 if (oldl + STR_LEN(n) > U8_MAX)
1913 NEXT_OFF(scan) += NEXT_OFF(n);
1914 STR_LEN(scan) += STR_LEN(n);
1915 next = n + NODE_SZ_STR(n);
1916 /* Now we can overwrite *n : */
1917 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1925 #ifdef EXPERIMENTAL_INPLACESCAN
1926 if (flags && !NEXT_OFF(n)) {
1927 DEBUG_PEEP("atch",val,depth);
1928 if (reg_off_by_arg[OP(n)]) {
1929 ARG_SET(n, val - n);
1932 NEXT_OFF(n) = val - n;
1939 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1941 Two problematic code points in Unicode casefolding of EXACT nodes:
1943 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1944 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1950 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1951 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1953 This means that in case-insensitive matching (or "loose matching",
1954 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1955 length of the above casefolded versions) can match a target string
1956 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1957 This would rather mess up the minimum length computation.
1959 What we'll do is to look for the tail four bytes, and then peek
1960 at the preceding two bytes to see whether we need to decrease
1961 the minimum length by four (six minus two).
1963 Thanks to the design of UTF-8, there cannot be false matches:
1964 A sequence of valid UTF-8 bytes cannot be a subsequence of
1965 another valid sequence of UTF-8 bytes.
1968 char * const s0 = STRING(scan), *s, *t;
1969 char * const s1 = s0 + STR_LEN(scan) - 1;
1970 char * const s2 = s1 - 4;
1971 const char t0[] = "\xcc\x88\xcc\x81";
1972 const char * const t1 = t0 + 3;
1975 s < s2 && (t = ninstr(s, s1, t0, t1));
1977 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1978 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1985 n = scan + NODE_SZ_STR(scan);
1987 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1994 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1998 /* REx optimizer. Converts nodes into quickier variants "in place".
1999 Finds fixed substrings. */
2001 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2002 to the position after last scanned or to NULL. */
2007 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2008 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2009 /* scanp: Start here (read-write). */
2010 /* deltap: Write maxlen-minlen here. */
2011 /* last: Stop before this one. */
2014 I32 min = 0, pars = 0, code;
2015 regnode *scan = *scanp, *next;
2017 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2018 int is_inf_internal = 0; /* The studied chunk is infinite */
2019 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2020 scan_data_t data_fake;
2021 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2022 SV *re_trie_maxbuff = NULL;
2024 GET_RE_DEBUG_FLAGS_DECL;
2026 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2029 while (scan && OP(scan) != END && scan < last) {
2030 /* Peephole optimizer: */
2031 DEBUG_PEEP("Peep",scan,depth);
2033 JOIN_EXACT(scan,&min,0);
2035 /* Follow the next-chain of the current node and optimize
2036 away all the NOTHINGs from it. */
2037 if (OP(scan) != CURLYX) {
2038 const int max = (reg_off_by_arg[OP(scan)]
2040 /* I32 may be smaller than U16 on CRAYs! */
2041 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2042 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2046 /* Skip NOTHING and LONGJMP. */
2047 while ((n = regnext(n))
2048 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2049 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2050 && off + noff < max)
2052 if (reg_off_by_arg[OP(scan)])
2055 NEXT_OFF(scan) = off;
2060 /* The principal pseudo-switch. Cannot be a switch, since we
2061 look into several different things. */
2062 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2063 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2064 next = regnext(scan);
2066 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2068 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2069 I32 max1 = 0, min1 = I32_MAX, num = 0;
2070 struct regnode_charclass_class accum;
2071 regnode * const startbranch=scan;
2073 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2074 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2075 if (flags & SCF_DO_STCLASS)
2076 cl_init_zero(pRExC_state, &accum);
2078 while (OP(scan) == code) {
2079 I32 deltanext, minnext, f = 0, fake;
2080 struct regnode_charclass_class this_class;
2083 data_fake.flags = 0;
2085 data_fake.whilem_c = data->whilem_c;
2086 data_fake.last_closep = data->last_closep;
2089 data_fake.last_closep = &fake;
2090 next = regnext(scan);
2091 scan = NEXTOPER(scan);
2093 scan = NEXTOPER(scan);
2094 if (flags & SCF_DO_STCLASS) {
2095 cl_init(pRExC_state, &this_class);
2096 data_fake.start_class = &this_class;
2097 f = SCF_DO_STCLASS_AND;
2099 if (flags & SCF_WHILEM_VISITED_POS)
2100 f |= SCF_WHILEM_VISITED_POS;
2102 /* we suppose the run is continuous, last=next...*/
2103 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2104 next, &data_fake, f,depth+1);
2107 if (max1 < minnext + deltanext)
2108 max1 = minnext + deltanext;
2109 if (deltanext == I32_MAX)
2110 is_inf = is_inf_internal = 1;
2112 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2115 if (data_fake.flags & SF_HAS_EVAL)
2116 data->flags |= SF_HAS_EVAL;
2117 data->whilem_c = data_fake.whilem_c;
2119 if (flags & SCF_DO_STCLASS)
2120 cl_or(pRExC_state, &accum, &this_class);
2121 if (code == SUSPEND)
2124 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2126 if (flags & SCF_DO_SUBSTR) {
2127 data->pos_min += min1;
2128 data->pos_delta += max1 - min1;
2129 if (max1 != min1 || is_inf)
2130 data->longest = &(data->longest_float);
2133 delta += max1 - min1;
2134 if (flags & SCF_DO_STCLASS_OR) {
2135 cl_or(pRExC_state, data->start_class, &accum);
2137 cl_and(data->start_class, &and_with);
2138 flags &= ~SCF_DO_STCLASS;
2141 else if (flags & SCF_DO_STCLASS_AND) {
2143 cl_and(data->start_class, &accum);
2144 flags &= ~SCF_DO_STCLASS;
2147 /* Switch to OR mode: cache the old value of
2148 * data->start_class */
2149 StructCopy(data->start_class, &and_with,
2150 struct regnode_charclass_class);
2151 flags &= ~SCF_DO_STCLASS_AND;
2152 StructCopy(&accum, data->start_class,
2153 struct regnode_charclass_class);
2154 flags |= SCF_DO_STCLASS_OR;
2155 data->start_class->flags |= ANYOF_EOS;
2161 Assuming this was/is a branch we are dealing with: 'scan' now
2162 points at the item that follows the branch sequence, whatever
2163 it is. We now start at the beginning of the sequence and look
2169 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2171 If we can find such a subseqence we need to turn the first
2172 element into a trie and then add the subsequent branch exact
2173 strings to the trie.
2177 1. patterns where the whole set of branch can be converted to a trie,
2179 2. patterns where only a subset of the alternations can be
2180 converted to a trie.
2182 In case 1 we can replace the whole set with a single regop
2183 for the trie. In case 2 we need to keep the start and end
2186 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2187 becomes BRANCH TRIE; BRANCH X;
2189 Hypthetically when we know the regex isnt anchored we can
2190 turn a case 1 into a DFA and let it rip... Every time it finds a match
2191 it would just call its tail, no WHILEM/CURLY needed.
2194 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2196 if (!re_trie_maxbuff) {
2197 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2198 if (!SvIOK(re_trie_maxbuff))
2199 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2201 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2203 regnode *first = (regnode *)NULL;
2204 regnode *last = (regnode *)NULL;
2205 regnode *tail = scan;
2210 SV * const mysv = sv_newmortal(); /* for dumping */
2212 /* var tail is used because there may be a TAIL
2213 regop in the way. Ie, the exacts will point to the
2214 thing following the TAIL, but the last branch will
2215 point at the TAIL. So we advance tail. If we
2216 have nested (?:) we may have to move through several
2220 while ( OP( tail ) == TAIL ) {
2221 /* this is the TAIL generated by (?:) */
2222 tail = regnext( tail );
2227 regprop(RExC_rx, mysv, tail );
2228 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2229 (int)depth * 2 + 2, "",
2230 "Looking for TRIE'able sequences. Tail node is: ",
2231 SvPV_nolen_const( mysv )
2237 step through the branches, cur represents each
2238 branch, noper is the first thing to be matched
2239 as part of that branch and noper_next is the
2240 regnext() of that node. if noper is an EXACT
2241 and noper_next is the same as scan (our current
2242 position in the regex) then the EXACT branch is
2243 a possible optimization target. Once we have
2244 two or more consequetive such branches we can
2245 create a trie of the EXACT's contents and stich
2246 it in place. If the sequence represents all of
2247 the branches we eliminate the whole thing and
2248 replace it with a single TRIE. If it is a
2249 subsequence then we need to stitch it in. This
2250 means the first branch has to remain, and needs
2251 to be repointed at the item on the branch chain
2252 following the last branch optimized. This could
2253 be either a BRANCH, in which case the
2254 subsequence is internal, or it could be the
2255 item following the branch sequence in which
2256 case the subsequence is at the end.
2260 /* dont use tail as the end marker for this traverse */
2261 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2262 regnode * const noper = NEXTOPER( cur );
2263 regnode * const noper_next = regnext( noper );
2266 regprop(RExC_rx, mysv, cur);
2267 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2268 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2270 regprop(RExC_rx, mysv, noper);
2271 PerlIO_printf( Perl_debug_log, " -> %s",
2272 SvPV_nolen_const(mysv));
2275 regprop(RExC_rx, mysv, noper_next );
2276 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2277 SvPV_nolen_const(mysv));
2279 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2280 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2282 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2283 : PL_regkind[ OP( noper ) ] == EXACT )
2284 || OP(noper) == NOTHING )
2285 && noper_next == tail && count<U16_MAX)
2288 if ( !first || optype == NOTHING ) {
2289 if (!first) first = cur;
2290 optype = OP( noper );
2296 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2298 if ( PL_regkind[ OP( noper ) ] == EXACT
2299 && noper_next == tail )
2303 optype = OP( noper );
2313 regprop(RExC_rx, mysv, cur);
2314 PerlIO_printf( Perl_debug_log,
2315 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2316 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2320 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2321 #ifdef TRIE_STUDY_OPT
2322 if ( made && startbranch == first ) {
2323 if ( OP(first)!=TRIE )
2324 flags |= SCF_EXACT_TRIE;
2326 regnode *chk=*scanp;
2327 while ( OP( chk ) == OPEN )
2328 chk = regnext( chk );
2330 flags |= SCF_EXACT_TRIE;
2339 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2340 scan = NEXTOPER(NEXTOPER(scan));
2341 } else /* single branch is optimized. */
2342 scan = NEXTOPER(scan);
2345 else if (OP(scan) == EXACT) {
2346 I32 l = STR_LEN(scan);
2349 const U8 * const s = (U8*)STRING(scan);
2350 l = utf8_length(s, s + l);
2351 uc = utf8_to_uvchr(s, NULL);
2353 uc = *((U8*)STRING(scan));
2356 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2357 /* The code below prefers earlier match for fixed
2358 offset, later match for variable offset. */
2359 if (data->last_end == -1) { /* Update the start info. */
2360 data->last_start_min = data->pos_min;
2361 data->last_start_max = is_inf
2362 ? I32_MAX : data->pos_min + data->pos_delta;
2364 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2366 SvUTF8_on(data->last_found);
2368 SV * const sv = data->last_found;
2369 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2370 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2371 if (mg && mg->mg_len >= 0)
2372 mg->mg_len += utf8_length((U8*)STRING(scan),
2373 (U8*)STRING(scan)+STR_LEN(scan));
2375 data->last_end = data->pos_min + l;
2376 data->pos_min += l; /* As in the first entry. */
2377 data->flags &= ~SF_BEFORE_EOL;
2379 if (flags & SCF_DO_STCLASS_AND) {
2380 /* Check whether it is compatible with what we know already! */
2384 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2385 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2386 && (!(data->start_class->flags & ANYOF_FOLD)
2387 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2390 ANYOF_CLASS_ZERO(data->start_class);
2391 ANYOF_BITMAP_ZERO(data->start_class);
2393 ANYOF_BITMAP_SET(data->start_class, uc);
2394 data->start_class->flags &= ~ANYOF_EOS;
2396 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2398 else if (flags & SCF_DO_STCLASS_OR) {
2399 /* false positive possible if the class is case-folded */
2401 ANYOF_BITMAP_SET(data->start_class, uc);
2403 data->start_class->flags |= ANYOF_UNICODE_ALL;
2404 data->start_class->flags &= ~ANYOF_EOS;
2405 cl_and(data->start_class, &and_with);
2407 flags &= ~SCF_DO_STCLASS;
2409 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2410 I32 l = STR_LEN(scan);
2411 UV uc = *((U8*)STRING(scan));
2413 /* Search for fixed substrings supports EXACT only. */
2414 if (flags & SCF_DO_SUBSTR) {
2416 scan_commit(pRExC_state, data);
2419 const U8 * const s = (U8 *)STRING(scan);
2420 l = utf8_length(s, s + l);
2421 uc = utf8_to_uvchr(s, NULL);
2424 if (flags & SCF_DO_SUBSTR)
2426 if (flags & SCF_DO_STCLASS_AND) {
2427 /* Check whether it is compatible with what we know already! */
2431 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2432 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2433 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2435 ANYOF_CLASS_ZERO(data->start_class);
2436 ANYOF_BITMAP_ZERO(data->start_class);
2438 ANYOF_BITMAP_SET(data->start_class, uc);
2439 data->start_class->flags &= ~ANYOF_EOS;
2440 data->start_class->flags |= ANYOF_FOLD;
2441 if (OP(scan) == EXACTFL)
2442 data->start_class->flags |= ANYOF_LOCALE;
2445 else if (flags & SCF_DO_STCLASS_OR) {
2446 if (data->start_class->flags & ANYOF_FOLD) {
2447 /* false positive possible if the class is case-folded.
2448 Assume that the locale settings are the same... */
2450 ANYOF_BITMAP_SET(data->start_class, uc);
2451 data->start_class->flags &= ~ANYOF_EOS;
2453 cl_and(data->start_class, &and_with);
2455 flags &= ~SCF_DO_STCLASS;
2457 #ifdef TRIE_STUDY_OPT
2458 else if (OP(scan) == TRIE) {
2459 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2460 min += trie->minlen;
2461 delta += (trie->maxlen - trie->minlen);
2462 flags &= ~SCF_DO_STCLASS; /* xxx */
2463 if (flags & SCF_DO_SUBSTR) {
2464 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2465 data->pos_min += trie->minlen;
2466 data->pos_delta += (trie->maxlen - trie->minlen);
2467 if (trie->maxlen != trie->minlen)
2468 data->longest = &(data->longest_float);
2472 else if (strchr((const char*)PL_varies,OP(scan))) {
2473 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2474 I32 f = flags, pos_before = 0;
2475 regnode * const oscan = scan;
2476 struct regnode_charclass_class this_class;
2477 struct regnode_charclass_class *oclass = NULL;
2478 I32 next_is_eval = 0;
2480 switch (PL_regkind[OP(scan)]) {
2481 case WHILEM: /* End of (?:...)* . */
2482 scan = NEXTOPER(scan);
2485 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2486 next = NEXTOPER(scan);
2487 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2489 maxcount = REG_INFTY;
2490 next = regnext(scan);
2491 scan = NEXTOPER(scan);
2495 if (flags & SCF_DO_SUBSTR)
2500 if (flags & SCF_DO_STCLASS) {
2502 maxcount = REG_INFTY;
2503 next = regnext(scan);
2504 scan = NEXTOPER(scan);
2507 is_inf = is_inf_internal = 1;
2508 scan = regnext(scan);
2509 if (flags & SCF_DO_SUBSTR) {
2510 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2511 data->longest = &(data->longest_float);
2513 goto optimize_curly_tail;
2515 mincount = ARG1(scan);
2516 maxcount = ARG2(scan);
2517 next = regnext(scan);
2518 if (OP(scan) == CURLYX) {
2519 I32 lp = (data ? *(data->last_closep) : 0);
2520 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2522 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2523 next_is_eval = (OP(scan) == EVAL);
2525 if (flags & SCF_DO_SUBSTR) {
2526 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2527 pos_before = data->pos_min;
2531 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2533 data->flags |= SF_IS_INF;
2535 if (flags & SCF_DO_STCLASS) {
2536 cl_init(pRExC_state, &this_class);
2537 oclass = data->start_class;
2538 data->start_class = &this_class;
2539 f |= SCF_DO_STCLASS_AND;
2540 f &= ~SCF_DO_STCLASS_OR;
2542 /* These are the cases when once a subexpression
2543 fails at a particular position, it cannot succeed
2544 even after backtracking at the enclosing scope.
2546 XXXX what if minimal match and we are at the
2547 initial run of {n,m}? */
2548 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2549 f &= ~SCF_WHILEM_VISITED_POS;
2551 /* This will finish on WHILEM, setting scan, or on NULL: */
2552 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2554 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2556 if (flags & SCF_DO_STCLASS)
2557 data->start_class = oclass;
2558 if (mincount == 0 || minnext == 0) {
2559 if (flags & SCF_DO_STCLASS_OR) {
2560 cl_or(pRExC_state, data->start_class, &this_class);
2562 else if (flags & SCF_DO_STCLASS_AND) {
2563 /* Switch to OR mode: cache the old value of
2564 * data->start_class */
2565 StructCopy(data->start_class, &and_with,
2566 struct regnode_charclass_class);
2567 flags &= ~SCF_DO_STCLASS_AND;
2568 StructCopy(&this_class, data->start_class,
2569 struct regnode_charclass_class);
2570 flags |= SCF_DO_STCLASS_OR;
2571 data->start_class->flags |= ANYOF_EOS;
2573 } else { /* Non-zero len */
2574 if (flags & SCF_DO_STCLASS_OR) {
2575 cl_or(pRExC_state, data->start_class, &this_class);
2576 cl_and(data->start_class, &and_with);
2578 else if (flags & SCF_DO_STCLASS_AND)
2579 cl_and(data->start_class, &this_class);
2580 flags &= ~SCF_DO_STCLASS;
2582 if (!scan) /* It was not CURLYX, but CURLY. */
2584 if ( /* ? quantifier ok, except for (?{ ... }) */
2585 (next_is_eval || !(mincount == 0 && maxcount == 1))
2586 && (minnext == 0) && (deltanext == 0)
2587 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2588 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2589 && ckWARN(WARN_REGEXP))
2592 "Quantifier unexpected on zero-length expression");
2595 min += minnext * mincount;
2596 is_inf_internal |= ((maxcount == REG_INFTY
2597 && (minnext + deltanext) > 0)
2598 || deltanext == I32_MAX);
2599 is_inf |= is_inf_internal;
2600 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2602 /* Try powerful optimization CURLYX => CURLYN. */
2603 if ( OP(oscan) == CURLYX && data
2604 && data->flags & SF_IN_PAR
2605 && !(data->flags & SF_HAS_EVAL)
2606 && !deltanext && minnext == 1 ) {
2607 /* Try to optimize to CURLYN. */
2608 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2609 regnode * const nxt1 = nxt;
2616 if (!strchr((const char*)PL_simple,OP(nxt))
2617 && !(PL_regkind[OP(nxt)] == EXACT
2618 && STR_LEN(nxt) == 1))
2624 if (OP(nxt) != CLOSE)
2626 /* Now we know that nxt2 is the only contents: */
2627 oscan->flags = (U8)ARG(nxt);
2629 OP(nxt1) = NOTHING; /* was OPEN. */
2631 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2632 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2633 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2634 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2635 OP(nxt + 1) = OPTIMIZED; /* was count. */
2636 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2641 /* Try optimization CURLYX => CURLYM. */
2642 if ( OP(oscan) == CURLYX && data
2643 && !(data->flags & SF_HAS_PAR)
2644 && !(data->flags & SF_HAS_EVAL)
2645 && !deltanext /* atom is fixed width */
2646 && minnext != 0 /* CURLYM can't handle zero width */
2648 /* XXXX How to optimize if data == 0? */
2649 /* Optimize to a simpler form. */
2650 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2654 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2655 && (OP(nxt2) != WHILEM))
2657 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2658 /* Need to optimize away parenths. */
2659 if (data->flags & SF_IN_PAR) {
2660 /* Set the parenth number. */
2661 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2663 if (OP(nxt) != CLOSE)
2664 FAIL("Panic opt close");
2665 oscan->flags = (U8)ARG(nxt);
2666 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2667 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2669 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2670 OP(nxt + 1) = OPTIMIZED; /* was count. */
2671 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2672 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2675 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2676 regnode *nnxt = regnext(nxt1);
2679 if (reg_off_by_arg[OP(nxt1)])
2680 ARG_SET(nxt1, nxt2 - nxt1);
2681 else if (nxt2 - nxt1 < U16_MAX)
2682 NEXT_OFF(nxt1) = nxt2 - nxt1;
2684 OP(nxt) = NOTHING; /* Cannot beautify */
2689 /* Optimize again: */
2690 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2696 else if ((OP(oscan) == CURLYX)
2697 && (flags & SCF_WHILEM_VISITED_POS)
2698 /* See the comment on a similar expression above.
2699 However, this time it not a subexpression
2700 we care about, but the expression itself. */
2701 && (maxcount == REG_INFTY)
2702 && data && ++data->whilem_c < 16) {
2703 /* This stays as CURLYX, we can put the count/of pair. */
2704 /* Find WHILEM (as in regexec.c) */
2705 regnode *nxt = oscan + NEXT_OFF(oscan);
2707 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2709 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2710 | (RExC_whilem_seen << 4)); /* On WHILEM */
2712 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2714 if (flags & SCF_DO_SUBSTR) {
2715 SV *last_str = NULL;
2716 int counted = mincount != 0;
2718 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2719 #if defined(SPARC64_GCC_WORKAROUND)
2722 const char *s = NULL;
2725 if (pos_before >= data->last_start_min)
2728 b = data->last_start_min;
2731 s = SvPV_const(data->last_found, l);
2732 old = b - data->last_start_min;
2735 I32 b = pos_before >= data->last_start_min
2736 ? pos_before : data->last_start_min;
2738 const char * const s = SvPV_const(data->last_found, l);
2739 I32 old = b - data->last_start_min;
2743 old = utf8_hop((U8*)s, old) - (U8*)s;
2746 /* Get the added string: */
2747 last_str = newSVpvn(s + old, l);
2749 SvUTF8_on(last_str);
2750 if (deltanext == 0 && pos_before == b) {
2751 /* What was added is a constant string */
2753 SvGROW(last_str, (mincount * l) + 1);
2754 repeatcpy(SvPVX(last_str) + l,
2755 SvPVX_const(last_str), l, mincount - 1);
2756 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2757 /* Add additional parts. */
2758 SvCUR_set(data->last_found,
2759 SvCUR(data->last_found) - l);
2760 sv_catsv(data->last_found, last_str);
2762 SV * sv = data->last_found;
2764 SvUTF8(sv) && SvMAGICAL(sv) ?
2765 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2766 if (mg && mg->mg_len >= 0)
2767 mg->mg_len += CHR_SVLEN(last_str);
2769 data->last_end += l * (mincount - 1);
2772 /* start offset must point into the last copy */
2773 data->last_start_min += minnext * (mincount - 1);
2774 data->last_start_max += is_inf ? I32_MAX
2775 : (maxcount - 1) * (minnext + data->pos_delta);
2778 /* It is counted once already... */
2779 data->pos_min += minnext * (mincount - counted);
2780 data->pos_delta += - counted * deltanext +
2781 (minnext + deltanext) * maxcount - minnext * mincount;
2782 if (mincount != maxcount) {
2783 /* Cannot extend fixed substrings found inside
2785 scan_commit(pRExC_state,data);
2786 if (mincount && last_str) {
2787 SV * const sv = data->last_found;
2788 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2789 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2793 sv_setsv(sv, last_str);
2794 data->last_end = data->pos_min;
2795 data->last_start_min =
2796 data->pos_min - CHR_SVLEN(last_str);
2797 data->last_start_max = is_inf
2799 : data->pos_min + data->pos_delta
2800 - CHR_SVLEN(last_str);
2802 data->longest = &(data->longest_float);
2804 SvREFCNT_dec(last_str);
2806 if (data && (fl & SF_HAS_EVAL))
2807 data->flags |= SF_HAS_EVAL;
2808 optimize_curly_tail:
2809 if (OP(oscan) != CURLYX) {
2810 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2812 NEXT_OFF(oscan) += NEXT_OFF(next);
2815 default: /* REF and CLUMP only? */
2816 if (flags & SCF_DO_SUBSTR) {
2817 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2818 data->longest = &(data->longest_float);
2820 is_inf = is_inf_internal = 1;
2821 if (flags & SCF_DO_STCLASS_OR)
2822 cl_anything(pRExC_state, data->start_class);
2823 flags &= ~SCF_DO_STCLASS;
2827 else if (strchr((const char*)PL_simple,OP(scan))) {
2830 if (flags & SCF_DO_SUBSTR) {
2831 scan_commit(pRExC_state,data);
2835 if (flags & SCF_DO_STCLASS) {
2836 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2838 /* Some of the logic below assumes that switching
2839 locale on will only add false positives. */
2840 switch (PL_regkind[OP(scan)]) {
2844 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2845 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2846 cl_anything(pRExC_state, data->start_class);
2849 if (OP(scan) == SANY)
2851 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2852 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2853 || (data->start_class->flags & ANYOF_CLASS));
2854 cl_anything(pRExC_state, data->start_class);
2856 if (flags & SCF_DO_STCLASS_AND || !value)
2857 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2860 if (flags & SCF_DO_STCLASS_AND)
2861 cl_and(data->start_class,
2862 (struct regnode_charclass_class*)scan);
2864 cl_or(pRExC_state, data->start_class,
2865 (struct regnode_charclass_class*)scan);
2868 if (flags & SCF_DO_STCLASS_AND) {
2869 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2870 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2871 for (value = 0; value < 256; value++)
2872 if (!isALNUM(value))
2873 ANYOF_BITMAP_CLEAR(data->start_class, value);
2877 if (data->start_class->flags & ANYOF_LOCALE)
2878 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2880 for (value = 0; value < 256; value++)
2882 ANYOF_BITMAP_SET(data->start_class, value);
2887 if (flags & SCF_DO_STCLASS_AND) {
2888 if (data->start_class->flags & ANYOF_LOCALE)
2889 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2892 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2893 data->start_class->flags |= ANYOF_LOCALE;
2897 if (flags & SCF_DO_STCLASS_AND) {
2898 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2899 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2900 for (value = 0; value < 256; value++)
2902 ANYOF_BITMAP_CLEAR(data->start_class, value);
2906 if (data->start_class->flags & ANYOF_LOCALE)
2907 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2909 for (value = 0; value < 256; value++)
2910 if (!isALNUM(value))
2911 ANYOF_BITMAP_SET(data->start_class, value);
2916 if (flags & SCF_DO_STCLASS_AND) {
2917 if (data->start_class->flags & ANYOF_LOCALE)
2918 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2921 data->start_class->flags |= ANYOF_LOCALE;
2922 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2926 if (flags & SCF_DO_STCLASS_AND) {
2927 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2928 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2929 for (value = 0; value < 256; value++)
2930 if (!isSPACE(value))
2931 ANYOF_BITMAP_CLEAR(data->start_class, value);
2935 if (data->start_class->flags & ANYOF_LOCALE)
2936 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2938 for (value = 0; value < 256; value++)
2940 ANYOF_BITMAP_SET(data->start_class, value);
2945 if (flags & SCF_DO_STCLASS_AND) {
2946 if (data->start_class->flags & ANYOF_LOCALE)
2947 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2950 data->start_class->flags |= ANYOF_LOCALE;
2951 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2955 if (flags & SCF_DO_STCLASS_AND) {
2956 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2957 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2958 for (value = 0; value < 256; value++)
2960 ANYOF_BITMAP_CLEAR(data->start_class, value);
2964 if (data->start_class->flags & ANYOF_LOCALE)
2965 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2967 for (value = 0; value < 256; value++)
2968 if (!isSPACE(value))
2969 ANYOF_BITMAP_SET(data->start_class, value);
2974 if (flags & SCF_DO_STCLASS_AND) {
2975 if (data->start_class->flags & ANYOF_LOCALE) {
2976 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2977 for (value = 0; value < 256; value++)
2978 if (!isSPACE(value))
2979 ANYOF_BITMAP_CLEAR(data->start_class, value);
2983 data->start_class->flags |= ANYOF_LOCALE;
2984 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2988 if (flags & SCF_DO_STCLASS_AND) {
2989 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2990 for (value = 0; value < 256; value++)
2991 if (!isDIGIT(value))
2992 ANYOF_BITMAP_CLEAR(data->start_class, value);
2995 if (data->start_class->flags & ANYOF_LOCALE)
2996 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2998 for (value = 0; value < 256; value++)
3000 ANYOF_BITMAP_SET(data->start_class, value);
3005 if (flags & SCF_DO_STCLASS_AND) {
3006 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3007 for (value = 0; value < 256; value++)
3009 ANYOF_BITMAP_CLEAR(data->start_class, value);
3012 if (data->start_class->flags & ANYOF_LOCALE)
3013 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3015 for (value = 0; value < 256; value++)
3016 if (!isDIGIT(value))
3017 ANYOF_BITMAP_SET(data->start_class, value);
3022 if (flags & SCF_DO_STCLASS_OR)
3023 cl_and(data->start_class, &and_with);
3024 flags &= ~SCF_DO_STCLASS;
3027 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3028 data->flags |= (OP(scan) == MEOL
3032 else if ( PL_regkind[OP(scan)] == BRANCHJ
3033 /* Lookbehind, or need to calculate parens/evals/stclass: */
3034 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3035 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3036 /* Lookahead/lookbehind */
3037 I32 deltanext, minnext, fake = 0;
3039 struct regnode_charclass_class intrnl;
3042 data_fake.flags = 0;
3044 data_fake.whilem_c = data->whilem_c;
3045 data_fake.last_closep = data->last_closep;
3048 data_fake.last_closep = &fake;
3049 if ( flags & SCF_DO_STCLASS && !scan->flags
3050 && OP(scan) == IFMATCH ) { /* Lookahead */
3051 cl_init(pRExC_state, &intrnl);
3052 data_fake.start_class = &intrnl;
3053 f |= SCF_DO_STCLASS_AND;
3055 if (flags & SCF_WHILEM_VISITED_POS)
3056 f |= SCF_WHILEM_VISITED_POS;
3057 next = regnext(scan);
3058 nscan = NEXTOPER(NEXTOPER(scan));
3059 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3062 vFAIL("Variable length lookbehind not implemented");
3064 else if (minnext > U8_MAX) {
3065 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3067 scan->flags = (U8)minnext;
3070 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3072 if (data_fake.flags & SF_HAS_EVAL)
3073 data->flags |= SF_HAS_EVAL;
3074 data->whilem_c = data_fake.whilem_c;
3076 if (f & SCF_DO_STCLASS_AND) {
3077 const int was = (data->start_class->flags & ANYOF_EOS);
3079 cl_and(data->start_class, &intrnl);
3081 data->start_class->flags |= ANYOF_EOS;
3084 else if (OP(scan) == OPEN) {
3087 else if (OP(scan) == CLOSE) {
3088 if ((I32)ARG(scan) == is_par) {
3089 next = regnext(scan);
3091 if ( next && (OP(next) != WHILEM) && next < last)
3092 is_par = 0; /* Disable optimization */
3095 *(data->last_closep) = ARG(scan);
3097 else if (OP(scan) == EVAL) {
3099 data->flags |= SF_HAS_EVAL;
3101 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3102 if (flags & SCF_DO_SUBSTR) {
3103 scan_commit(pRExC_state,data);
3104 data->longest = &(data->longest_float);
3106 is_inf = is_inf_internal = 1;
3107 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3108 cl_anything(pRExC_state, data->start_class);
3109 flags &= ~SCF_DO_STCLASS;
3111 /* Else: zero-length, ignore. */
3112 scan = regnext(scan);
3117 *deltap = is_inf_internal ? I32_MAX : delta;
3118 if (flags & SCF_DO_SUBSTR && is_inf)
3119 data->pos_delta = I32_MAX - data->pos_min;
3120 if (is_par > U8_MAX)
3122 if (is_par && pars==1 && data) {
3123 data->flags |= SF_IN_PAR;
3124 data->flags &= ~SF_HAS_PAR;
3126 else if (pars && data) {
3127 data->flags |= SF_HAS_PAR;
3128 data->flags &= ~SF_IN_PAR;
3130 if (flags & SCF_DO_STCLASS_OR)
3131 cl_and(data->start_class, &and_with);
3132 if (flags & SCF_EXACT_TRIE)
3133 data->flags |= SCF_EXACT_TRIE;
3138 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3140 if (RExC_rx->data) {
3141 Renewc(RExC_rx->data,
3142 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3143 char, struct reg_data);
3144 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3145 RExC_rx->data->count += n;
3148 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3149 char, struct reg_data);
3150 Newx(RExC_rx->data->what, n, U8);
3151 RExC_rx->data->count = n;
3153 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3154 return RExC_rx->data->count - n;
3157 #ifndef PERL_IN_XSUB_RE
3159 Perl_reginitcolors(pTHX)
3162 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3164 char *t = savepv(s);
3168 t = strchr(t, '\t');
3174 PL_colors[i] = t = (char *)"";
3179 PL_colors[i++] = (char *)"";
3187 - pregcomp - compile a regular expression into internal code
3189 * We can't allocate space until we know how big the compiled form will be,
3190 * but we can't compile it (and thus know how big it is) until we've got a
3191 * place to put the code. So we cheat: we compile it twice, once with code
3192 * generation turned off and size counting turned on, and once "for real".
3193 * This also means that we don't allocate space until we are sure that the
3194 * thing really will compile successfully, and we never have to move the
3195 * code and thus invalidate pointers into it. (Note that it has to be in
3196 * one piece because free() must be able to free it all.) [NB: not true in perl]
3198 * Beware that the optimization-preparation code in here knows about some
3199 * of the structure of the compiled regexp. [I'll say.]
3202 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3213 RExC_state_t RExC_state;
3214 RExC_state_t * const pRExC_state = &RExC_state;
3215 #ifdef TRIE_STUDY_OPT
3217 RExC_state_t copyRExC_state;
3220 GET_RE_DEBUG_FLAGS_DECL;
3223 FAIL("NULL regexp argument");
3225 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3228 DEBUG_r(if (!PL_colorset) reginitcolors());
3230 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3231 PL_colors[4],PL_colors[5],PL_colors[0],
3232 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3234 RExC_flags = pm->op_pmflags;
3238 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3239 RExC_seen_evals = 0;
3242 /* First pass: determine size, legality. */
3249 RExC_emit = &PL_regdummy;
3250 RExC_whilem_seen = 0;
3251 #if 0 /* REGC() is (currently) a NOP at the first pass.
3252 * Clever compilers notice this and complain. --jhi */
3253 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3255 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3256 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3257 RExC_precomp = NULL;
3260 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3261 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3262 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3265 RExC_lastparse=NULL;
3269 /* Small enough for pointer-storage convention?
3270 If extralen==0, this means that we will not need long jumps. */
3271 if (RExC_size >= 0x10000L && RExC_extralen)
3272 RExC_size += RExC_extralen;
3275 if (RExC_whilem_seen > 15)
3276 RExC_whilem_seen = 15;
3278 /* Allocate space and initialize. */
3279 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3282 FAIL("Regexp out of space");
3285 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3286 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3289 r->prelen = xend - exp;
3290 r->precomp = savepvn(RExC_precomp, r->prelen);
3292 #ifdef PERL_OLD_COPY_ON_WRITE
3293 r->saved_copy = NULL;
3295 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3296 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3297 r->lastparen = 0; /* mg.c reads this. */
3299 r->substrs = 0; /* Useful during FAIL. */
3300 r->startp = 0; /* Useful during FAIL. */
3301 r->endp = 0; /* Useful during FAIL. */
3303 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3305 r->offsets[0] = RExC_size;
3307 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3308 "%s %"UVuf" bytes for offset annotations.\n",
3309 r->offsets ? "Got" : "Couldn't get",
3310 (UV)((2*RExC_size+1) * sizeof(U32))));
3314 /* Second pass: emit code. */
3315 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3320 RExC_emit_start = r->program;
3321 RExC_emit = r->program;
3322 /* Store the count of eval-groups for security checks: */
3323 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3324 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3326 if (reg(pRExC_state, 0, &flags,1) == NULL)
3328 /* XXXX To minimize changes to RE engine we always allocate
3329 3-units-long substrs field. */
3330 Newx(r->substrs, 1, struct reg_substr_data);
3333 Zero(r->substrs, 1, struct reg_substr_data);
3334 StructCopy(&zero_scan_data, &data, scan_data_t);
3336 #ifdef TRIE_STUDY_OPT
3338 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3339 RExC_state=copyRExC_state;
3340 if (data.longest_fixed)
3341 SvREFCNT_dec(data.longest_fixed);
3342 if (data.longest_float)
3343 SvREFCNT_dec(data.longest_float);
3344 if (data.last_found)
3345 SvREFCNT_dec(data.last_found);
3347 copyRExC_state=RExC_state;
3350 /* Dig out information for optimizations. */
3351 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3352 pm->op_pmflags = RExC_flags;
3354 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3355 r->regstclass = NULL;
3356 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3357 r->reganch |= ROPT_NAUGHTY;
3358 scan = r->program + 1; /* First BRANCH. */
3360 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3361 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3363 STRLEN longest_float_length, longest_fixed_length;
3364 struct regnode_charclass_class ch_class; /* pointed to by data */
3366 I32 last_close = 0; /* pointed to by data */
3369 /* Skip introductions and multiplicators >= 1. */
3370 while ((OP(first) == OPEN && (sawopen = 1)) ||
3371 /* An OR of *one* alternative - should not happen now. */
3372 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3373 /* for now we can't handle lookbehind IFMATCH*/
3374 (OP(first) == IFMATCH && !first->flags) ||
3375 (OP(first) == PLUS) ||
3376 (OP(first) == MINMOD) ||
3377 /* An {n,m} with n>0 */
3378 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3380 DEBUG_PEEP("first:",first,0);
3381 if (OP(first) == PLUS)
3384 first += regarglen[OP(first)];
3385 if (OP(first) == IFMATCH) {
3386 first = NEXTOPER(first);
3387 first += EXTRA_STEP_2ARGS;
3388 } else /*xxx possible optimisation for /(?=)/*/
3389 first = NEXTOPER(first);
3392 /* Starting-point info. */
3394 /* Ignore EXACT as we deal with it later. */
3395 if (PL_regkind[OP(first)] == EXACT) {
3396 if (OP(first) == EXACT)
3397 NOOP; /* Empty, get anchored substr later. */
3398 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3399 r->regstclass = first;
3402 else if (OP(first) == TRIE &&
3403 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3405 /* this can happen only on restudy */
3406 struct regnode_1 *trie_op;
3407 Newxz(trie_op,1,struct regnode_1);
3408 StructCopy(first,trie_op,struct regnode_1);
3409 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3410 r->regstclass = (regnode *)trie_op;
3413 else if (strchr((const char*)PL_simple,OP(first)))
3414 r->regstclass = first;
3415 else if (PL_regkind[OP(first)] == BOUND ||
3416 PL_regkind[OP(first)] == NBOUND)
3417 r->regstclass = first;
3418 else if (PL_regkind[OP(first)] == BOL) {
3419 r->reganch |= (OP(first) == MBOL
3421 : (OP(first) == SBOL
3424 first = NEXTOPER(first);
3427 else if (OP(first) == GPOS) {
3428 r->reganch |= ROPT_ANCH_GPOS;
3429 first = NEXTOPER(first);
3432 else if (!sawopen && (OP(first) == STAR &&
3433 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3434 !(r->reganch & ROPT_ANCH) )
3436 /* turn .* into ^.* with an implied $*=1 */
3438 (OP(NEXTOPER(first)) == REG_ANY)
3441 r->reganch |= type | ROPT_IMPLICIT;
3442 first = NEXTOPER(first);
3445 if (sawplus && (!sawopen || !RExC_sawback)
3446 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3447 /* x+ must match at the 1st pos of run of x's */
3448 r->reganch |= ROPT_SKIP;
3450 /* Scan is after the zeroth branch, first is atomic matcher. */
3451 #ifdef TRIE_STUDY_OPT
3454 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3455 (IV)(first - scan + 1))
3459 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3460 (IV)(first - scan + 1))
3466 * If there's something expensive in the r.e., find the
3467 * longest literal string that must appear and make it the
3468 * regmust. Resolve ties in favor of later strings, since
3469 * the regstart check works with the beginning of the r.e.
3470 * and avoiding duplication strengthens checking. Not a
3471 * strong reason, but sufficient in the absence of others.
3472 * [Now we resolve ties in favor of the earlier string if
3473 * it happens that c_offset_min has been invalidated, since the
3474 * earlier string may buy us something the later one won't.]
3478 data.longest_fixed = newSVpvs("");
3479 data.longest_float = newSVpvs("");
3480 data.last_found = newSVpvs("");
3481 data.longest = &(data.longest_fixed);
3483 if (!r->regstclass) {
3484 cl_init(pRExC_state, &ch_class);
3485 data.start_class = &ch_class;
3486 stclass_flag = SCF_DO_STCLASS_AND;
3487 } else /* XXXX Check for BOUND? */
3489 data.last_closep = &last_close;
3491 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3492 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3494 #ifdef TRIE_STUDY_OPT
3495 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3500 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3501 && data.last_start_min == 0 && data.last_end > 0
3502 && !RExC_seen_zerolen
3503 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3504 r->reganch |= ROPT_CHECK_ALL;
3505 scan_commit(pRExC_state, &data);
3506 SvREFCNT_dec(data.last_found);
3508 longest_float_length = CHR_SVLEN(data.longest_float);
3509 if (longest_float_length
3510 || (data.flags & SF_FL_BEFORE_EOL
3511 && (!(data.flags & SF_FL_BEFORE_MEOL)
3512 || (RExC_flags & PMf_MULTILINE)))) {
3515 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3516 && data.offset_fixed == data.offset_float_min
3517 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3518 goto remove_float; /* As in (a)+. */
3520 if (SvUTF8(data.longest_float)) {
3521 r->float_utf8 = data.longest_float;
3522 r->float_substr = NULL;
3524 r->float_substr = data.longest_float;
3525 r->float_utf8 = NULL;
3527 r->float_min_offset = data.offset_float_min;
3528 r->float_max_offset = data.offset_float_max;
3529 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3530 && (!(data.flags & SF_FL_BEFORE_MEOL)
3531 || (RExC_flags & PMf_MULTILINE)));
3532 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3536 r->float_substr = r->float_utf8 = NULL;
3537 SvREFCNT_dec(data.longest_float);
3538 longest_float_length = 0;
3541 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3542 if (longest_fixed_length
3543 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3544 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3545 || (RExC_flags & PMf_MULTILINE)))) {
3548 if (SvUTF8(data.longest_fixed)) {
3549 r->anchored_utf8 = data.longest_fixed;
3550 r->anchored_substr = NULL;
3552 r->anchored_substr = data.longest_fixed;
3553 r->anchored_utf8 = NULL;
3555 r->anchored_offset = data.offset_fixed;
3556 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3557 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3558 || (RExC_flags & PMf_MULTILINE)));
3559 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3562 r->anchored_substr = r->anchored_utf8 = NULL;
3563 SvREFCNT_dec(data.longest_fixed);
3564 longest_fixed_length = 0;
3567 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3568 r->regstclass = NULL;
3569 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3571 && !(data.start_class->flags & ANYOF_EOS)
3572 && !cl_is_anything(data.start_class))
3574 const I32 n = add_data(pRExC_state, 1, "f");
3576 Newx(RExC_rx->data->data[n], 1,
3577 struct regnode_charclass_class);
3578 StructCopy(data.start_class,
3579 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3580 struct regnode_charclass_class);
3581 r->regstclass = (regnode*)RExC_rx->data->data[n];
3582 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3583 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3584 regprop(r, sv, (regnode*)data.start_class);
3585 PerlIO_printf(Perl_debug_log,
3586 "synthetic stclass \"%s\".\n",
3587 SvPVX_const(sv));});
3590 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3591 if (longest_fixed_length > longest_float_length) {
3592 r->check_substr = r->anchored_substr;
3593 r->check_utf8 = r->anchored_utf8;
3594 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3595 if (r->reganch & ROPT_ANCH_SINGLE)
3596 r->reganch |= ROPT_NOSCAN;
3599 r->check_substr = r->float_substr;
3600 r->check_utf8 = r->float_utf8;
3601 r->check_offset_min = data.offset_float_min;
3602 r->check_offset_max = data.offset_float_max;
3604 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3605 This should be changed ASAP! */
3606 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3607 r->reganch |= RE_USE_INTUIT;
3608 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3609 r->reganch |= RE_INTUIT_TAIL;
3613 /* Several toplevels. Best we can is to set minlen. */
3615 struct regnode_charclass_class ch_class;
3618 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3620 scan = r->program + 1;
3621 cl_init(pRExC_state, &ch_class);
3622 data.start_class = &ch_class;
3623 data.last_closep = &last_close;
3625 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3626 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3628 #ifdef TRIE_STUDY_OPT
3629 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3634 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3635 = r->float_substr = r->float_utf8 = NULL;
3636 if (!(data.start_class->flags & ANYOF_EOS)
3637 && !cl_is_anything(data.start_class))
3639 const I32 n = add_data(pRExC_state, 1, "f");
3641 Newx(RExC_rx->data->data[n], 1,
3642 struct regnode_charclass_class);
3643 StructCopy(data.start_class,
3644 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3645 struct regnode_charclass_class);
3646 r->regstclass = (regnode*)RExC_rx->data->data[n];
3647 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3648 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3649 regprop(r, sv, (regnode*)data.start_class);
3650 PerlIO_printf(Perl_debug_log,
3651 "synthetic stclass \"%s\".\n",
3652 SvPVX_const(sv));});
3657 if (RExC_seen & REG_SEEN_GPOS)
3658 r->reganch |= ROPT_GPOS_SEEN;
3659 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3660 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3661 if (RExC_seen & REG_SEEN_EVAL)
3662 r->reganch |= ROPT_EVAL_SEEN;
3663 if (RExC_seen & REG_SEEN_CANY)
3664 r->reganch |= ROPT_CANY_SEEN;
3665 Newxz(r->startp, RExC_npar, I32);
3666 Newxz(r->endp, RExC_npar, I32);
3668 DEBUG_r( RX_DEBUG_on(r) );
3670 PerlIO_printf(Perl_debug_log,"Final program:\n");
3673 DEBUG_OFFSETS_r(if (r->offsets) {
3674 const U32 len = r->offsets[0];
3676 GET_RE_DEBUG_FLAGS_DECL;
3677 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3678 for (i = 1; i <= len; i++) {
3679 if (r->offsets[i*2-1] || r->offsets[i*2])
3680 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3681 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3683 PerlIO_printf(Perl_debug_log, "\n");
3689 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3690 int rem=(int)(RExC_end - RExC_parse); \
3699 if (RExC_lastparse!=RExC_parse) \
3700 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3703 iscut ? "..." : "<" \
3706 PerlIO_printf(Perl_debug_log,"%16s",""); \
3711 num=REG_NODE_NUM(RExC_emit); \
3712 if (RExC_lastnum!=num) \
3713 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3715 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3716 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3717 (int)((depth*2)), "", \
3721 RExC_lastparse=RExC_parse; \
3726 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3727 DEBUG_PARSE_MSG((funcname)); \
3728 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3731 - reg - regular expression, i.e. main body or parenthesized thing
3733 * Caller must absorb opening parenthesis.
3735 * Combining parenthesis handling with the base level of regular expression
3736 * is a trifle forced, but the need to tie the tails of the branches to what
3737 * follows makes it hard to avoid.
3739 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3741 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3743 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3747 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3748 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3751 register regnode *ret; /* Will be the head of the group. */
3752 register regnode *br;
3753 register regnode *lastbr;
3754 register regnode *ender = NULL;
3755 register I32 parno = 0;
3757 const I32 oregflags = RExC_flags;
3758 bool have_branch = 0;
3761 /* for (?g), (?gc), and (?o) warnings; warning
3762 about (?c) will warn about (?g) -- japhy */
3764 #define WASTED_O 0x01
3765 #define WASTED_G 0x02
3766 #define WASTED_C 0x04
3767 #define WASTED_GC (0x02|0x04)
3768 I32 wastedflags = 0x00;
3770 char * parse_start = RExC_parse; /* MJD */
3771 char * const oregcomp_parse = RExC_parse;
3773 GET_RE_DEBUG_FLAGS_DECL;
3774 DEBUG_PARSE("reg ");
3777 *flagp = 0; /* Tentatively. */
3780 /* Make an OPEN node, if parenthesized. */
3782 if (*RExC_parse == '?') { /* (?...) */
3783 U32 posflags = 0, negflags = 0;
3784 U32 *flagsp = &posflags;
3785 bool is_logical = 0;
3786 const char * const seqstart = RExC_parse;
3789 paren = *RExC_parse++;
3790 ret = NULL; /* For look-ahead/behind. */
3792 case '<': /* (?<...) */
3793 RExC_seen |= REG_SEEN_LOOKBEHIND;
3794 if (*RExC_parse == '!')
3796 if (*RExC_parse != '=' && *RExC_parse != '!')
3799 case '=': /* (?=...) */
3800 case '!': /* (?!...) */
3801 RExC_seen_zerolen++;
3802 case ':': /* (?:...) */
3803 case '>': /* (?>...) */
3805 case '$': /* (?$...) */
3806 case '@': /* (?@...) */
3807 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3809 case '#': /* (?#...) */
3810 while (*RExC_parse && *RExC_parse != ')')
3812 if (*RExC_parse != ')')
3813 FAIL("Sequence (?#... not terminated");
3814 nextchar(pRExC_state);
3817 case 'p': /* (?p...) */
3818 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3819 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3821 case '?': /* (??...) */
3823 if (*RExC_parse != '{')
3825 paren = *RExC_parse++;
3827 case '{': /* (?{...}) */
3829 I32 count = 1, n = 0;
3831 char *s = RExC_parse;
3833 RExC_seen_zerolen++;
3834 RExC_seen |= REG_SEEN_EVAL;
3835 while (count && (c = *RExC_parse)) {
3846 if (*RExC_parse != ')') {
3848 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3852 OP_4tree *sop, *rop;
3853 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3856 Perl_save_re_context(aTHX);
3857 rop = sv_compile_2op(sv, &sop, "re", &pad);
3858 sop->op_private |= OPpREFCOUNTED;
3859 /* re_dup will OpREFCNT_inc */
3860 OpREFCNT_set(sop, 1);
3863 n = add_data(pRExC_state, 3, "nop");
3864 RExC_rx->data->data[n] = (void*)rop;
3865 RExC_rx->data->data[n+1] = (void*)sop;
3866 RExC_rx->data->data[n+2] = (void*)pad;
3869 else { /* First pass */
3870 if (PL_reginterp_cnt < ++RExC_seen_evals
3872 /* No compiled RE interpolated, has runtime
3873 components ===> unsafe. */
3874 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3875 if (PL_tainting && PL_tainted)
3876 FAIL("Eval-group in insecure regular expression");
3877 #if PERL_VERSION > 8
3878 if (IN_PERL_COMPILETIME)
3883 nextchar(pRExC_state);
3885 ret = reg_node(pRExC_state, LOGICAL);
3888 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3889 /* deal with the length of this later - MJD */
3892 ret = reganode(pRExC_state, EVAL, n);
3893 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3894 Set_Node_Offset(ret, parse_start);
3897 case '(': /* (?(?{...})...) and (?(?=...)...) */
3899 if (RExC_parse[0] == '?') { /* (?(?...)) */
3900 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3901 || RExC_parse[1] == '<'
3902 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3905 ret = reg_node(pRExC_state, LOGICAL);
3908 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3912 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {