5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
174 #define FULL_TRIE_STUDY
177 /* Length of a variant. */
179 typedef struct scan_data_t {
185 I32 last_end; /* min value, <0 unless valid. */
188 SV **longest; /* Either &l_fixed, or &l_float. */
192 I32 offset_float_min;
193 I32 offset_float_max;
197 struct regnode_charclass_class *start_class;
201 * Forward declarations for pregcomp()'s friends.
204 static const scan_data_t zero_scan_data =
205 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
207 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
208 #define SF_BEFORE_SEOL 0x0001
209 #define SF_BEFORE_MEOL 0x0002
210 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
214 # define SF_FIX_SHIFT_EOL (0+2)
215 # define SF_FL_SHIFT_EOL (0+4)
217 # define SF_FIX_SHIFT_EOL (+2)
218 # define SF_FL_SHIFT_EOL (+4)
221 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
224 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
226 #define SF_IS_INF 0x0040
227 #define SF_HAS_PAR 0x0080
228 #define SF_IN_PAR 0x0100
229 #define SF_HAS_EVAL 0x0200
230 #define SCF_DO_SUBSTR 0x0400
231 #define SCF_DO_STCLASS_AND 0x0800
232 #define SCF_DO_STCLASS_OR 0x1000
233 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
234 #define SCF_WHILEM_VISITED_POS 0x2000
236 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
239 #define UTF (RExC_utf8 != 0)
240 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
241 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
243 #define OOB_UNICODE 12345678
244 #define OOB_NAMEDCLASS -1
246 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
247 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250 /* length of regex to show in messages that don't mark a position within */
251 #define RegexLengthToShowInErrorMessages 127
254 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
255 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
256 * op/pragma/warn/regcomp.
258 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
259 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
261 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
264 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
265 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 #define FAIL(msg) STMT_START { \
269 const char *ellipses = ""; \
270 IV len = RExC_end - RExC_precomp; \
273 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
274 if (len > RegexLengthToShowInErrorMessages) { \
275 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
276 len = RegexLengthToShowInErrorMessages - 10; \
279 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
280 msg, (int)len, RExC_precomp, ellipses); \
284 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
286 #define Simple_vFAIL(m) STMT_START { \
287 const IV offset = RExC_parse - RExC_precomp; \
288 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
289 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
293 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
295 #define vFAIL(m) STMT_START { \
297 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
302 * Like Simple_vFAIL(), but accepts two arguments.
304 #define Simple_vFAIL2(m,a1) STMT_START { \
305 const IV offset = RExC_parse - RExC_precomp; \
306 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
307 (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
313 #define vFAIL2(m,a1) STMT_START { \
315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
316 Simple_vFAIL2(m, a1); \
321 * Like Simple_vFAIL(), but accepts three arguments.
323 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
324 const IV offset = RExC_parse - RExC_precomp; \
325 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
326 (int)offset, RExC_precomp, RExC_precomp + offset); \
330 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
332 #define vFAIL3(m,a1,a2) STMT_START { \
334 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
335 Simple_vFAIL3(m, a1, a2); \
339 * Like Simple_vFAIL(), but accepts four arguments.
341 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
342 const IV offset = RExC_parse - RExC_precomp; \
343 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
344 (int)offset, RExC_precomp, RExC_precomp + offset); \
347 #define vWARN(loc,m) STMT_START { \
348 const IV offset = loc - RExC_precomp; \
349 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
350 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
353 #define vWARNdep(loc,m) STMT_START { \
354 const IV offset = loc - RExC_precomp; \
355 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
356 "%s" REPORT_LOCATION, \
357 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
361 #define vWARN2(loc, m, a1) STMT_START { \
362 const IV offset = loc - RExC_precomp; \
363 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
364 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
367 #define vWARN3(loc, m, a1, a2) STMT_START { \
368 const IV offset = loc - RExC_precomp; \
369 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
370 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
373 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
374 const IV offset = loc - RExC_precomp; \
375 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
376 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
379 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
380 const IV offset = loc - RExC_precomp; \
381 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
382 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 /* Allow for side effects in s */
387 #define REGC(c,s) STMT_START { \
388 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
391 /* Macros for recording node offsets. 20001227 mjd@plover.com
392 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
393 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
394 * Element 0 holds the number n.
395 * Position is 1 indexed.
398 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
400 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
401 __LINE__, (node), (int)(byte))); \
403 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
405 RExC_offsets[2*(node)-1] = (byte); \
410 #define Set_Node_Offset(node,byte) \
411 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
412 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
414 #define Set_Node_Length_To_R(node,len) STMT_START { \
416 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
417 __LINE__, (int)(node), (int)(len))); \
419 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
421 RExC_offsets[2*(node)] = (len); \
426 #define Set_Node_Length(node,len) \
427 Set_Node_Length_To_R((node)-RExC_emit_start, len)
428 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
429 #define Set_Node_Cur_Length(node) \
430 Set_Node_Length(node, RExC_parse - parse_start)
432 /* Get offsets and lengths */
433 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
434 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
436 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
438 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
442 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
443 #define EXPERIMENTAL_INPLACESCAN
446 static void clear_re(pTHX_ void *r);
448 /* Mark that we cannot extend a found fixed substring at this point.
449 Update the longest found anchored substring and the longest found
450 floating substrings if needed. */
453 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
455 const STRLEN l = CHR_SVLEN(data->last_found);
456 const STRLEN old_l = CHR_SVLEN(*data->longest);
458 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
459 SvSetMagicSV(*data->longest, data->last_found);
460 if (*data->longest == data->longest_fixed) {
461 data->offset_fixed = l ? data->last_start_min : data->pos_min;
462 if (data->flags & SF_BEFORE_EOL)
464 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
466 data->flags &= ~SF_FIX_BEFORE_EOL;
469 data->offset_float_min = l ? data->last_start_min : data->pos_min;
470 data->offset_float_max = (l
471 ? data->last_start_max
472 : data->pos_min + data->pos_delta);
473 if ((U32)data->offset_float_max > (U32)I32_MAX)
474 data->offset_float_max = I32_MAX;
475 if (data->flags & SF_BEFORE_EOL)
477 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
479 data->flags &= ~SF_FL_BEFORE_EOL;
482 SvCUR_set(data->last_found, 0);
484 SV * const sv = data->last_found;
485 if (SvUTF8(sv) && SvMAGICAL(sv)) {
486 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
492 data->flags &= ~SF_BEFORE_EOL;
495 /* Can match anything (initialization) */
497 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
499 ANYOF_CLASS_ZERO(cl);
500 ANYOF_BITMAP_SETALL(cl);
501 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
503 cl->flags |= ANYOF_LOCALE;
506 /* Can match anything (initialization) */
508 S_cl_is_anything(const struct regnode_charclass_class *cl)
512 for (value = 0; value <= ANYOF_MAX; value += 2)
513 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
515 if (!(cl->flags & ANYOF_UNICODE_ALL))
517 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
522 /* Can match anything (initialization) */
524 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
526 Zero(cl, 1, struct regnode_charclass_class);
528 cl_anything(pRExC_state, cl);
532 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
534 Zero(cl, 1, struct regnode_charclass_class);
536 cl_anything(pRExC_state, cl);
538 cl->flags |= ANYOF_LOCALE;
541 /* 'And' a given class with another one. Can create false positives */
542 /* We assume that cl is not inverted */
544 S_cl_and(struct regnode_charclass_class *cl,
545 const struct regnode_charclass_class *and_with)
547 if (!(and_with->flags & ANYOF_CLASS)
548 && !(cl->flags & ANYOF_CLASS)
549 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
550 && !(and_with->flags & ANYOF_FOLD)
551 && !(cl->flags & ANYOF_FOLD)) {
554 if (and_with->flags & ANYOF_INVERT)
555 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
556 cl->bitmap[i] &= ~and_with->bitmap[i];
558 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
559 cl->bitmap[i] &= and_with->bitmap[i];
560 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
561 if (!(and_with->flags & ANYOF_EOS))
562 cl->flags &= ~ANYOF_EOS;
564 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
565 !(and_with->flags & ANYOF_INVERT)) {
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 cl->flags |= ANYOF_UNICODE;
568 ARG_SET(cl, ARG(and_with));
570 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
571 !(and_with->flags & ANYOF_INVERT))
572 cl->flags &= ~ANYOF_UNICODE_ALL;
573 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
574 !(and_with->flags & ANYOF_INVERT))
575 cl->flags &= ~ANYOF_UNICODE;
578 /* 'OR' a given class with another one. Can create false positives */
579 /* We assume that cl is not inverted */
581 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
583 if (or_with->flags & ANYOF_INVERT) {
585 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
586 * <= (B1 | !B2) | (CL1 | !CL2)
587 * which is wasteful if CL2 is small, but we ignore CL2:
588 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
589 * XXXX Can we handle case-fold? Unclear:
590 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
591 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
593 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
594 && !(or_with->flags & ANYOF_FOLD)
595 && !(cl->flags & ANYOF_FOLD) ) {
598 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
599 cl->bitmap[i] |= ~or_with->bitmap[i];
600 } /* XXXX: logic is complicated otherwise */
602 cl_anything(pRExC_state, cl);
605 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
606 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
607 && (!(or_with->flags & ANYOF_FOLD)
608 || (cl->flags & ANYOF_FOLD)) ) {
611 /* OR char bitmap and class bitmap separately */
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= or_with->bitmap[i];
614 if (or_with->flags & ANYOF_CLASS) {
615 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
616 cl->classflags[i] |= or_with->classflags[i];
617 cl->flags |= ANYOF_CLASS;
620 else { /* XXXX: logic is complicated, leave it along for a moment. */
621 cl_anything(pRExC_state, cl);
624 if (or_with->flags & ANYOF_EOS)
625 cl->flags |= ANYOF_EOS;
627 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
628 ARG(cl) != ARG(or_with)) {
629 cl->flags |= ANYOF_UNICODE_ALL;
630 cl->flags &= ~ANYOF_UNICODE;
632 if (or_with->flags & ANYOF_UNICODE_ALL) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
638 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
639 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
640 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
641 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
647 dump_trie_interim_list(trie,next_alloc)
648 dump_trie_interim_table(trie,next_alloc)
650 These routines dump out a trie in a somewhat readable format.
651 The _interim_ variants are used for debugging the interim
652 tables that are used to generate the final compressed
653 representation which is what dump_trie expects.
655 Part of the reason for their existance is to provide a form
656 of documentation as to how the different representations function.
662 Dumps the final compressed table form of the trie to Perl_debug_log.
663 Used for debugging make_trie().
667 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
670 SV *sv=sv_newmortal();
671 int colwidth= trie->widecharmap ? 6 : 4;
672 GET_RE_DEBUG_FLAGS_DECL;
675 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
676 (int)depth * 2 + 2,"",
677 "Match","Base","Ofs" );
679 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
680 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
682 PerlIO_printf( Perl_debug_log, "%*s",
684 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
685 PL_colors[0], PL_colors[1],
686 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
687 PERL_PV_ESCAPE_FIRSTCHAR
692 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
693 (int)depth * 2 + 2,"");
695 for( state = 0 ; state < trie->uniquecharcount ; state++ )
696 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
697 PerlIO_printf( Perl_debug_log, "\n");
699 for( state = 1 ; state < trie->laststate ; state++ ) {
700 const U32 base = trie->states[ state ].trans.base;
702 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
704 if ( trie->states[ state ].wordnum ) {
705 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
707 PerlIO_printf( Perl_debug_log, "%6s", "" );
710 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
715 while( ( base + ofs < trie->uniquecharcount ) ||
716 ( base + ofs - trie->uniquecharcount < trie->lasttrans
717 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
720 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
722 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
723 if ( ( base + ofs >= trie->uniquecharcount ) &&
724 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
725 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
727 PerlIO_printf( Perl_debug_log, "%*"UVXf,
729 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
731 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
735 PerlIO_printf( Perl_debug_log, "]");
738 PerlIO_printf( Perl_debug_log, "\n" );
742 dump_trie_interim_list(trie,next_alloc)
743 Dumps a fully constructed but uncompressed trie in list form.
744 List tries normally only are used for construction when the number of
745 possible chars (trie->uniquecharcount) is very high.
746 Used for debugging make_trie().
749 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
752 SV *sv=sv_newmortal();
753 int colwidth= trie->widecharmap ? 6 : 4;
754 GET_RE_DEBUG_FLAGS_DECL;
755 /* print out the table precompression. */
756 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
757 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
758 "------:-----+-----------------\n" );
760 for( state=1 ; state < next_alloc ; state ++ ) {
763 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
764 (int)depth * 2 + 2,"", (UV)state );
765 if ( ! trie->states[ state ].wordnum ) {
766 PerlIO_printf( Perl_debug_log, "%5s| ","");
768 PerlIO_printf( Perl_debug_log, "W%4x| ",
769 trie->states[ state ].wordnum
772 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
773 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
775 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
777 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
778 PL_colors[0], PL_colors[1],
779 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
780 PERL_PV_ESCAPE_FIRSTCHAR
782 TRIE_LIST_ITEM(state,charid).forid,
783 (UV)TRIE_LIST_ITEM(state,charid).newstate
787 PerlIO_printf( Perl_debug_log, "\n");
792 dump_trie_interim_table(trie,next_alloc)
793 Dumps a fully constructed but uncompressed trie in table form.
794 This is the normal DFA style state transition table, with a few
795 twists to facilitate compression later.
796 Used for debugging make_trie().
799 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
803 SV *sv=sv_newmortal();
804 int colwidth= trie->widecharmap ? 6 : 4;
805 GET_RE_DEBUG_FLAGS_DECL;
808 print out the table precompression so that we can do a visual check
809 that they are identical.
812 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
814 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
815 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
817 PerlIO_printf( Perl_debug_log, "%*s",
819 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
820 PL_colors[0], PL_colors[1],
821 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
822 PERL_PV_ESCAPE_FIRSTCHAR
828 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
830 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
831 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
834 PerlIO_printf( Perl_debug_log, "\n" );
836 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
838 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
839 (int)depth * 2 + 2,"",
840 (UV)TRIE_NODENUM( state ) );
842 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
843 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
845 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
847 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
849 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
850 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
853 trie->states[ TRIE_NODENUM( state ) ].wordnum );
860 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
861 startbranch: the first branch in the whole branch sequence
862 first : start branch of sequence of branch-exact nodes.
863 May be the same as startbranch
864 last : Thing following the last branch.
865 May be the same as tail.
866 tail : item following the branch sequence
867 count : words in the sequence
868 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
871 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
873 A trie is an N'ary tree where the branches are determined by digital
874 decomposition of the key. IE, at the root node you look up the 1st character and
875 follow that branch repeat until you find the end of the branches. Nodes can be
876 marked as "accepting" meaning they represent a complete word. Eg:
880 would convert into the following structure. Numbers represent states, letters
881 following numbers represent valid transitions on the letter from that state, if
882 the number is in square brackets it represents an accepting state, otherwise it
883 will be in parenthesis.
885 +-h->+-e->[3]-+-r->(8)-+-s->[9]
889 (1) +-i->(6)-+-s->[7]
891 +-s->(3)-+-h->(4)-+-e->[5]
893 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
895 This shows that when matching against the string 'hers' we will begin at state 1
896 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
897 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
898 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
899 single traverse. We store a mapping from accepting to state to which word was
900 matched, and then when we have multiple possibilities we try to complete the
901 rest of the regex in the order in which they occured in the alternation.
903 The only prior NFA like behaviour that would be changed by the TRIE support is
904 the silent ignoring of duplicate alternations which are of the form:
906 / (DUPE|DUPE) X? (?{ ... }) Y /x
908 Thus EVAL blocks follwing a trie may be called a different number of times with
909 and without the optimisation. With the optimisations dupes will be silently
910 ignored. This inconsistant behaviour of EVAL type nodes is well established as
911 the following demonstrates:
913 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
915 which prints out 'word' three times, but
917 'words'=~/(word|word|word)(?{ print $1 })S/
919 which doesnt print it out at all. This is due to other optimisations kicking in.
921 Example of what happens on a structural level:
923 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
925 1: CURLYM[1] {1,32767}(18)
936 This would be optimizable with startbranch=5, first=5, last=16, tail=16
937 and should turn into:
939 1: CURLYM[1] {1,32767}(18)
941 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
949 Cases where tail != last would be like /(?foo|bar)baz/:
959 which would be optimizable with startbranch=1, first=1, last=7, tail=8
960 and would end up looking like:
963 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
970 d = uvuni_to_utf8_flags(d, uv, 0);
972 is the recommended Unicode-aware way of saying
977 #define TRIE_STORE_REVCHAR \
979 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
980 if (UTF) SvUTF8_on(tmp); \
981 av_push( TRIE_REVCHARMAP(trie), tmp ); \
984 #define TRIE_READ_CHAR STMT_START { \
988 if ( foldlen > 0 ) { \
989 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
994 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
995 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
996 foldlen -= UNISKIP( uvc ); \
997 scan = foldbuf + UNISKIP( uvc ); \
1000 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1010 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1011 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1012 TRIE_LIST_LEN( state ) *= 2; \
1013 Renew( trie->states[ state ].trans.list, \
1014 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1016 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1017 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1018 TRIE_LIST_CUR( state )++; \
1021 #define TRIE_LIST_NEW(state) STMT_START { \
1022 Newxz( trie->states[ state ].trans.list, \
1023 4, reg_trie_trans_le ); \
1024 TRIE_LIST_CUR( state ) = 1; \
1025 TRIE_LIST_LEN( state ) = 4; \
1028 #define TRIE_HANDLE_WORD(state) STMT_START { \
1029 U16 dupe= trie->states[ state ].wordnum; \
1030 regnode * const noper_next = regnext( noper ); \
1032 if (trie->wordlen) \
1033 trie->wordlen[ curword ] = wordlen; \
1035 /* store the word for dumping */ \
1037 if (OP(noper) != NOTHING) \
1038 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1040 tmp = newSVpvn( "", 0 ); \
1041 if ( UTF ) SvUTF8_on( tmp ); \
1042 av_push( trie->words, tmp ); \
1047 if ( noper_next < tail ) { \
1049 Newxz( trie->jump, word_count + 1, U16); \
1050 trie->jump[curword] = (U16)(tail - noper_next); \
1052 jumper = noper_next; \
1054 nextbranch= regnext(cur); \
1058 /* So it's a dupe. This means we need to maintain a */\
1059 /* linked-list from the first to the next. */\
1060 /* we only allocate the nextword buffer when there */\
1061 /* a dupe, so first time we have to do the allocation */\
1062 if (!trie->nextword) \
1063 Newxz( trie->nextword, word_count + 1, U16); \
1064 while ( trie->nextword[dupe] ) \
1065 dupe= trie->nextword[dupe]; \
1066 trie->nextword[dupe]= curword; \
1068 /* we haven't inserted this word yet. */ \
1069 trie->states[ state ].wordnum = curword; \
1074 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1075 ( ( base + charid >= ucharcount \
1076 && base + charid < ubound \
1077 && state == trie->trans[ base - ucharcount + charid ].check \
1078 && trie->trans[ base - ucharcount + charid ].next ) \
1079 ? trie->trans[ base - ucharcount + charid ].next \
1080 : ( state==1 ? special : 0 ) \
1084 #define MADE_JUMP_TRIE 2
1085 #define MADE_EXACT_TRIE 4
1088 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1091 /* first pass, loop through and scan words */
1092 reg_trie_data *trie;
1094 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1099 regnode *jumper = NULL;
1100 regnode *nextbranch = NULL;
1101 /* we just use folder as a flag in utf8 */
1102 const U8 * const folder = ( flags == EXACTF
1104 : ( flags == EXACTFL
1110 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1111 SV *re_trie_maxbuff;
1113 /* these are only used during construction but are useful during
1114 * debugging so we store them in the struct when debugging.
1116 STRLEN trie_charcount=0;
1117 AV *trie_revcharmap;
1119 GET_RE_DEBUG_FLAGS_DECL;
1121 PERL_UNUSED_ARG(depth);
1124 Newxz( trie, 1, reg_trie_data );
1126 trie->startstate = 1;
1127 trie->wordcount = word_count;
1128 RExC_rx->data->data[ data_slot ] = (void*)trie;
1129 Newxz( trie->charmap, 256, U16 );
1130 if (!(UTF && folder))
1131 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1133 trie->words = newAV();
1135 TRIE_REVCHARMAP(trie) = newAV();
1137 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1138 if (!SvIOK(re_trie_maxbuff)) {
1139 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1142 PerlIO_printf( Perl_debug_log,
1143 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1144 (int)depth * 2 + 2, "",
1145 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1146 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1149 /* -- First loop and Setup --
1151 We first traverse the branches and scan each word to determine if it
1152 contains widechars, and how many unique chars there are, this is
1153 important as we have to build a table with at least as many columns as we
1156 We use an array of integers to represent the character codes 0..255
1157 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1158 native representation of the character value as the key and IV's for the
1161 *TODO* If we keep track of how many times each character is used we can
1162 remap the columns so that the table compression later on is more
1163 efficient in terms of memory by ensuring most common value is in the
1164 middle and the least common are on the outside. IMO this would be better
1165 than a most to least common mapping as theres a decent chance the most
1166 common letter will share a node with the least common, meaning the node
1167 will not be compressable. With a middle is most common approach the worst
1168 case is when we have the least common nodes twice.
1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173 regnode * const noper = NEXTOPER( cur );
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
1177 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1178 const U8 *scan = (U8*)NULL;
1179 U32 wordlen = 0; /* required init */
1182 if (OP(noper) == NOTHING) {
1187 TRIE_BITMAP_SET(trie,*uc);
1188 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1190 for ( ; uc < e ; uc += len ) {
1191 TRIE_CHARCOUNT(trie)++;
1195 if ( !trie->charmap[ uvc ] ) {
1196 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1198 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1203 if ( !trie->widecharmap )
1204 trie->widecharmap = newHV();
1206 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1209 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1211 if ( !SvTRUE( *svpp ) ) {
1212 sv_setiv( *svpp, ++trie->uniquecharcount );
1217 if( cur == first ) {
1220 } else if (chars < trie->minlen) {
1222 } else if (chars > trie->maxlen) {
1226 } /* end first pass */
1227 DEBUG_TRIE_COMPILE_r(
1228 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1229 (int)depth * 2 + 2,"",
1230 ( trie->widecharmap ? "UTF8" : "NATIVE" ), word_count,
1231 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1232 (int)trie->minlen, (int)trie->maxlen )
1234 Newxz( trie->wordlen, word_count, U32 );
1237 We now know what we are dealing with in terms of unique chars and
1238 string sizes so we can calculate how much memory a naive
1239 representation using a flat table will take. If it's over a reasonable
1240 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1241 conservative but potentially much slower representation using an array
1244 At the end we convert both representations into the same compressed
1245 form that will be used in regexec.c for matching with. The latter
1246 is a form that cannot be used to construct with but has memory
1247 properties similar to the list form and access properties similar
1248 to the table form making it both suitable for fast searches and
1249 small enough that its feasable to store for the duration of a program.
1251 See the comment in the code where the compressed table is produced
1252 inplace from the flat tabe representation for an explanation of how
1253 the compression works.
1258 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1260 Second Pass -- Array Of Lists Representation
1262 Each state will be represented by a list of charid:state records
1263 (reg_trie_trans_le) the first such element holds the CUR and LEN
1264 points of the allocated array. (See defines above).
1266 We build the initial structure using the lists, and then convert
1267 it into the compressed table form which allows faster lookups
1268 (but cant be modified once converted).
1271 STRLEN transcount = 1;
1273 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1277 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1279 regnode * const noper = NEXTOPER( cur );
1280 U8 *uc = (U8*)STRING( noper );
1281 const U8 * const e = uc + STR_LEN( noper );
1282 U32 state = 1; /* required init */
1283 U16 charid = 0; /* sanity init */
1284 U8 *scan = (U8*)NULL; /* sanity init */
1285 STRLEN foldlen = 0; /* required init */
1286 U32 wordlen = 0; /* required init */
1287 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1289 if (OP(noper) != NOTHING) {
1290 for ( ; uc < e ; uc += len ) {
1295 charid = trie->charmap[ uvc ];
1297 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1301 charid=(U16)SvIV( *svpp );
1304 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1311 if ( !trie->states[ state ].trans.list ) {
1312 TRIE_LIST_NEW( state );
1314 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1315 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1316 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1321 newstate = next_alloc++;
1322 TRIE_LIST_PUSH( state, charid, newstate );
1327 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1331 TRIE_HANDLE_WORD(state);
1333 } /* end second pass */
1335 trie->laststate = next_alloc;
1336 Renew( trie->states, next_alloc, reg_trie_state );
1338 /* and now dump it out before we compress it */
1339 DEBUG_TRIE_COMPILE_MORE_r(
1340 dump_trie_interim_list(trie,next_alloc,depth+1)
1343 Newxz( trie->trans, transcount ,reg_trie_trans );
1350 for( state=1 ; state < next_alloc ; state ++ ) {
1354 DEBUG_TRIE_COMPILE_MORE_r(
1355 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1359 if (trie->states[state].trans.list) {
1360 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1364 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1365 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1366 if ( forid < minid ) {
1368 } else if ( forid > maxid ) {
1372 if ( transcount < tp + maxid - minid + 1) {
1374 Renew( trie->trans, transcount, reg_trie_trans );
1375 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1377 base = trie->uniquecharcount + tp - minid;
1378 if ( maxid == minid ) {
1380 for ( ; zp < tp ; zp++ ) {
1381 if ( ! trie->trans[ zp ].next ) {
1382 base = trie->uniquecharcount + zp - minid;
1383 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1384 trie->trans[ zp ].check = state;
1390 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1391 trie->trans[ tp ].check = state;
1396 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1397 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1398 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1399 trie->trans[ tid ].check = state;
1401 tp += ( maxid - minid + 1 );
1403 Safefree(trie->states[ state ].trans.list);
1406 DEBUG_TRIE_COMPILE_MORE_r(
1407 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1410 trie->states[ state ].trans.base=base;
1412 trie->lasttrans = tp + 1;
1416 Second Pass -- Flat Table Representation.
1418 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1419 We know that we will need Charcount+1 trans at most to store the data
1420 (one row per char at worst case) So we preallocate both structures
1421 assuming worst case.
1423 We then construct the trie using only the .next slots of the entry
1426 We use the .check field of the first entry of the node temporarily to
1427 make compression both faster and easier by keeping track of how many non
1428 zero fields are in the node.
1430 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1433 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1434 number representing the first entry of the node, and state as a
1435 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1436 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1437 are 2 entrys per node. eg:
1445 The table is internally in the right hand, idx form. However as we also
1446 have to deal with the states array which is indexed by nodenum we have to
1447 use TRIE_NODENUM() to convert.
1452 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1454 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1455 next_alloc = trie->uniquecharcount + 1;
1458 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1460 regnode * const noper = NEXTOPER( cur );
1461 const U8 *uc = (U8*)STRING( noper );
1462 const U8 * const e = uc + STR_LEN( noper );
1464 U32 state = 1; /* required init */
1466 U16 charid = 0; /* sanity init */
1467 U32 accept_state = 0; /* sanity init */
1468 U8 *scan = (U8*)NULL; /* sanity init */
1470 STRLEN foldlen = 0; /* required init */
1471 U32 wordlen = 0; /* required init */
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1474 if ( OP(noper) != NOTHING ) {
1475 for ( ; uc < e ; uc += len ) {
1480 charid = trie->charmap[ uvc ];
1482 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1483 charid = svpp ? (U16)SvIV(*svpp) : 0;
1487 if ( !trie->trans[ state + charid ].next ) {
1488 trie->trans[ state + charid ].next = next_alloc;
1489 trie->trans[ state ].check++;
1490 next_alloc += trie->uniquecharcount;
1492 state = trie->trans[ state + charid ].next;
1494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1496 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1499 accept_state = TRIE_NODENUM( state );
1500 TRIE_HANDLE_WORD(accept_state);
1502 } /* end second pass */
1504 /* and now dump it out before we compress it */
1505 DEBUG_TRIE_COMPILE_MORE_r(
1506 dump_trie_interim_table(trie,next_alloc,depth+1)
1511 * Inplace compress the table.*
1513 For sparse data sets the table constructed by the trie algorithm will
1514 be mostly 0/FAIL transitions or to put it another way mostly empty.
1515 (Note that leaf nodes will not contain any transitions.)
1517 This algorithm compresses the tables by eliminating most such
1518 transitions, at the cost of a modest bit of extra work during lookup:
1520 - Each states[] entry contains a .base field which indicates the
1521 index in the state[] array wheres its transition data is stored.
1523 - If .base is 0 there are no valid transitions from that node.
1525 - If .base is nonzero then charid is added to it to find an entry in
1528 -If trans[states[state].base+charid].check!=state then the
1529 transition is taken to be a 0/Fail transition. Thus if there are fail
1530 transitions at the front of the node then the .base offset will point
1531 somewhere inside the previous nodes data (or maybe even into a node
1532 even earlier), but the .check field determines if the transition is
1536 The following process inplace converts the table to the compressed
1537 table: We first do not compress the root node 1,and mark its all its
1538 .check pointers as 1 and set its .base pointer as 1 as well. This
1539 allows to do a DFA construction from the compressed table later, and
1540 ensures that any .base pointers we calculate later are greater than
1543 - We set 'pos' to indicate the first entry of the second node.
1545 - We then iterate over the columns of the node, finding the first and
1546 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1547 and set the .check pointers accordingly, and advance pos
1548 appropriately and repreat for the next node. Note that when we copy
1549 the next pointers we have to convert them from the original
1550 NODEIDX form to NODENUM form as the former is not valid post
1553 - If a node has no transitions used we mark its base as 0 and do not
1554 advance the pos pointer.
1556 - If a node only has one transition we use a second pointer into the
1557 structure to fill in allocated fail transitions from other states.
1558 This pointer is independent of the main pointer and scans forward
1559 looking for null transitions that are allocated to a state. When it
1560 finds one it writes the single transition into the "hole". If the
1561 pointer doesnt find one the single transition is appended as normal.
1563 - Once compressed we can Renew/realloc the structures to release the
1566 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1567 specifically Fig 3.47 and the associated pseudocode.
1571 const U32 laststate = TRIE_NODENUM( next_alloc );
1574 trie->laststate = laststate;
1576 for ( state = 1 ; state < laststate ; state++ ) {
1578 const U32 stateidx = TRIE_NODEIDX( state );
1579 const U32 o_used = trie->trans[ stateidx ].check;
1580 U32 used = trie->trans[ stateidx ].check;
1581 trie->trans[ stateidx ].check = 0;
1583 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1584 if ( flag || trie->trans[ stateidx + charid ].next ) {
1585 if ( trie->trans[ stateidx + charid ].next ) {
1587 for ( ; zp < pos ; zp++ ) {
1588 if ( ! trie->trans[ zp ].next ) {
1592 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1593 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1594 trie->trans[ zp ].check = state;
1595 if ( ++zp > pos ) pos = zp;
1602 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1604 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1605 trie->trans[ pos ].check = state;
1610 trie->lasttrans = pos + 1;
1611 Renew( trie->states, laststate + 1, reg_trie_state);
1612 DEBUG_TRIE_COMPILE_MORE_r(
1613 PerlIO_printf( Perl_debug_log,
1614 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1615 (int)depth * 2 + 2,"",
1616 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1619 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1622 } /* end table compress */
1624 /* resize the trans array to remove unused space */
1625 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1627 /* and now dump out the compressed format */
1628 DEBUG_TRIE_COMPILE_r(
1629 dump_trie(trie,depth+1)
1632 { /* Modify the program and insert the new TRIE node*/
1634 U8 nodetype =(U8)(flags & 0xFF);
1643 This means we convert either the first branch or the first Exact,
1644 depending on whether the thing following (in 'last') is a branch
1645 or not and whther first is the startbranch (ie is it a sub part of
1646 the alternation or is it the whole thing.)
1647 Assuming its a sub part we conver the EXACT otherwise we convert
1648 the whole branch sequence, including the first.
1650 /* Find the node we are going to overwrite */
1651 if ( first == startbranch && OP( last ) != BRANCH ) {
1652 /* whole branch chain */
1655 const regnode *nop = NEXTOPER( convert );
1656 mjd_offset= Node_Offset((nop));
1657 mjd_nodelen= Node_Length((nop));
1660 /* branch sub-chain */
1661 convert = NEXTOPER( first );
1662 NEXT_OFF( first ) = (U16)(last - first);
1664 mjd_offset= Node_Offset((convert));
1665 mjd_nodelen= Node_Length((convert));
1669 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1670 (int)depth * 2 + 2, "",
1671 (UV)mjd_offset, (UV)mjd_nodelen)
1674 /* But first we check to see if there is a common prefix we can
1675 split out as an EXACT and put in front of the TRIE node. */
1676 trie->startstate= 1;
1677 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1680 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1681 (int)depth * 2 + 2, "",
1682 (UV)trie->laststate)
1684 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
1688 const U32 base = trie->states[ state ].trans.base;
1690 if ( trie->states[state].wordnum )
1693 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1694 if ( ( base + ofs >= trie->uniquecharcount ) &&
1695 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1696 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1698 if ( ++count > 1 ) {
1699 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1700 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1701 if ( state == 1 ) break;
1703 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1705 PerlIO_printf(Perl_debug_log,
1706 "%*sNew Start State=%"UVuf" Class: [",
1707 (int)depth * 2 + 2, "",
1710 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1711 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1713 TRIE_BITMAP_SET(trie,*ch);
1715 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1717 PerlIO_printf(Perl_debug_log, (char*)ch)
1721 TRIE_BITMAP_SET(trie,*ch);
1723 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1724 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1730 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1731 const char *ch = SvPV_nolen_const( *tmp );
1733 PerlIO_printf( Perl_debug_log,
1734 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1735 (int)depth * 2 + 2, "",
1736 (UV)state, (UV)idx, ch)
1739 OP( convert ) = nodetype;
1740 str=STRING(convert);
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1755 regnode *n = convert+NODE_SZ_STR(convert);
1756 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1757 trie->startstate = state;
1758 trie->minlen -= (state - 1);
1759 trie->maxlen -= (state - 1);
1761 regnode *fix = convert;
1763 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1764 while( ++fix < n ) {
1765 Set_Node_Offset_Length(fix, 0, 0);
1771 NEXT_OFF(convert) = (U16)(tail - convert);
1775 if ( trie->maxlen ) {
1776 NEXT_OFF( convert ) = (U16)(tail - convert);
1777 ARG_SET( convert, data_slot );
1778 /* Store the offset to the first unabsorbed branch in
1779 jump[0], which is otherwise unused by the jump logic.
1780 We use this when dumping a trie and during optimisation. */
1782 trie->jump[0] = (U16)(tail - nextbranch);
1786 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1787 ( (char *)jumper - (char *)convert) >= sizeof(struct regnode_charclass) )
1789 OP( convert ) = TRIEC;
1790 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1791 Safefree(trie->bitmap);
1794 OP( convert ) = TRIE;
1796 /* store the type in the flags */
1797 convert->flags = nodetype;
1798 /* XXX We really should free up the resource in trie now, as we wont use them */
1800 /* needed for dumping*/
1802 regnode *optimize = convert
1804 + regarglen[ OP( convert ) ];
1805 regnode *opt = convert;
1806 while (++opt<optimize) {
1807 Set_Node_Offset_Length(opt,0,0);
1810 Try to clean up some of the debris left after the
1813 while( optimize < jumper ) {
1814 mjd_nodelen += Node_Length((optimize));
1815 OP( optimize ) = OPTIMIZED;
1816 Set_Node_Offset_Length(optimize,0,0);
1819 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1821 } /* end node insert */
1823 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1827 : trie->startstate>1
1833 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1835 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1837 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1838 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1841 We find the fail state for each state in the trie, this state is the longest proper
1842 suffix of the current states 'word' that is also a proper prefix of another word in our
1843 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1844 the DFA not to have to restart after its tried and failed a word at a given point, it
1845 simply continues as though it had been matching the other word in the first place.
1847 'abcdgu'=~/abcdefg|cdgu/
1848 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1849 fail, which would bring use to the state representing 'd' in the second word where we would
1850 try 'g' and succeed, prodceding to match 'cdgu'.
1852 /* add a fail transition */
1853 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1855 const U32 ucharcount = trie->uniquecharcount;
1856 const U32 numstates = trie->laststate;
1857 const U32 ubound = trie->lasttrans + ucharcount;
1861 U32 base = trie->states[ 1 ].trans.base;
1864 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1865 GET_RE_DEBUG_FLAGS_DECL;
1867 PERL_UNUSED_ARG(depth);
1871 ARG_SET( stclass, data_slot );
1872 Newxz( aho, 1, reg_ac_data );
1873 RExC_rx->data->data[ data_slot ] = (void*)aho;
1875 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1876 (trie->laststate+1)*sizeof(reg_trie_state));
1877 Newxz( q, numstates, U32);
1878 Newxz( aho->fail, numstates, U32 );
1881 /* initialize fail[0..1] to be 1 so that we always have
1882 a valid final fail state */
1883 fail[ 0 ] = fail[ 1 ] = 1;
1885 for ( charid = 0; charid < ucharcount ; charid++ ) {
1886 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1888 q[ q_write ] = newstate;
1889 /* set to point at the root */
1890 fail[ q[ q_write++ ] ]=1;
1893 while ( q_read < q_write) {
1894 const U32 cur = q[ q_read++ % numstates ];
1895 base = trie->states[ cur ].trans.base;
1897 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1898 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1900 U32 fail_state = cur;
1903 fail_state = fail[ fail_state ];
1904 fail_base = aho->states[ fail_state ].trans.base;
1905 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1907 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1908 fail[ ch_state ] = fail_state;
1909 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1911 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1913 q[ q_write++ % numstates] = ch_state;
1917 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
1918 when we fail in state 1, this allows us to use the
1919 charclass scan to find a valid start char. This is based on the principle
1920 that theres a good chance the string being searched contains lots of stuff
1921 that cant be a start char.
1923 fail[ 0 ] = fail[ 1 ] = 0;
1924 DEBUG_TRIE_COMPILE_r({
1925 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
1926 for( q_read=1; q_read<numstates; q_read++ ) {
1927 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
1929 PerlIO_printf(Perl_debug_log, "\n");
1932 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1937 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1938 * These need to be revisited when a newer toolchain becomes available.
1940 #if defined(__sparc64__) && defined(__GNUC__)
1941 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1942 # undef SPARC64_GCC_WORKAROUND
1943 # define SPARC64_GCC_WORKAROUND 1
1947 #define DEBUG_PEEP(str,scan,depth) \
1948 DEBUG_OPTIMISE_r({ \
1949 SV * const mysv=sv_newmortal(); \
1950 regnode *Next = regnext(scan); \
1951 regprop(RExC_rx, mysv, scan); \
1952 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1953 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1954 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1957 #define JOIN_EXACT(scan,min,flags) \
1958 if (PL_regkind[OP(scan)] == EXACT) \
1959 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1962 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1963 /* Merge several consecutive EXACTish nodes into one. */
1964 regnode *n = regnext(scan);
1966 regnode *next = scan + NODE_SZ_STR(scan);
1970 regnode *stop = scan;
1971 GET_RE_DEBUG_FLAGS_DECL;
1973 PERL_UNUSED_ARG(depth);
1975 #ifndef EXPERIMENTAL_INPLACESCAN
1976 PERL_UNUSED_ARG(flags);
1977 PERL_UNUSED_ARG(val);
1979 DEBUG_PEEP("join",scan,depth);
1981 /* Skip NOTHING, merge EXACT*. */
1983 ( PL_regkind[OP(n)] == NOTHING ||
1984 (stringok && (OP(n) == OP(scan))))
1986 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1988 if (OP(n) == TAIL || n > next)
1990 if (PL_regkind[OP(n)] == NOTHING) {
1991 DEBUG_PEEP("skip:",n,depth);
1992 NEXT_OFF(scan) += NEXT_OFF(n);
1993 next = n + NODE_STEP_REGNODE;
2000 else if (stringok) {
2001 const unsigned int oldl = STR_LEN(scan);
2002 regnode * const nnext = regnext(n);
2004 DEBUG_PEEP("merg",n,depth);
2007 if (oldl + STR_LEN(n) > U8_MAX)
2009 NEXT_OFF(scan) += NEXT_OFF(n);
2010 STR_LEN(scan) += STR_LEN(n);
2011 next = n + NODE_SZ_STR(n);
2012 /* Now we can overwrite *n : */
2013 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2021 #ifdef EXPERIMENTAL_INPLACESCAN
2022 if (flags && !NEXT_OFF(n)) {
2023 DEBUG_PEEP("atch", val, depth);
2024 if (reg_off_by_arg[OP(n)]) {
2025 ARG_SET(n, val - n);
2028 NEXT_OFF(n) = val - n;
2035 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2037 Two problematic code points in Unicode casefolding of EXACT nodes:
2039 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2040 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2046 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2047 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2049 This means that in case-insensitive matching (or "loose matching",
2050 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2051 length of the above casefolded versions) can match a target string
2052 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2053 This would rather mess up the minimum length computation.
2055 What we'll do is to look for the tail four bytes, and then peek
2056 at the preceding two bytes to see whether we need to decrease
2057 the minimum length by four (six minus two).
2059 Thanks to the design of UTF-8, there cannot be false matches:
2060 A sequence of valid UTF-8 bytes cannot be a subsequence of
2061 another valid sequence of UTF-8 bytes.
2064 char * const s0 = STRING(scan), *s, *t;
2065 char * const s1 = s0 + STR_LEN(scan) - 1;
2066 char * const s2 = s1 - 4;
2067 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2068 const char t0[] = "\xaf\x49\xaf\x42";
2070 const char t0[] = "\xcc\x88\xcc\x81";
2072 const char * const t1 = t0 + 3;
2075 s < s2 && (t = ninstr(s, s1, t0, t1));
2078 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2079 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2081 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2082 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2090 n = scan + NODE_SZ_STR(scan);
2092 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2099 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2103 /* REx optimizer. Converts nodes into quickier variants "in place".
2104 Finds fixed substrings. */
2106 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2107 to the position after last scanned or to NULL. */
2112 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2113 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2114 /* scanp: Start here (read-write). */
2115 /* deltap: Write maxlen-minlen here. */
2116 /* last: Stop before this one. */
2119 I32 min = 0, pars = 0, code;
2120 regnode *scan = *scanp, *next;
2122 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2123 int is_inf_internal = 0; /* The studied chunk is infinite */
2124 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2125 scan_data_t data_fake;
2126 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2127 SV *re_trie_maxbuff = NULL;
2128 regnode *first_non_open = scan;
2131 GET_RE_DEBUG_FLAGS_DECL;
2133 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2136 while (first_non_open && OP(first_non_open) == OPEN)
2137 first_non_open=regnext(first_non_open);
2141 while (scan && OP(scan) != END && scan < last) {
2142 /* Peephole optimizer: */
2143 DEBUG_PEEP("Peep",scan,depth);
2145 JOIN_EXACT(scan,&min,0);
2147 /* Follow the next-chain of the current node and optimize
2148 away all the NOTHINGs from it. */
2149 if (OP(scan) != CURLYX) {
2150 const int max = (reg_off_by_arg[OP(scan)]
2152 /* I32 may be smaller than U16 on CRAYs! */
2153 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2154 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2158 /* Skip NOTHING and LONGJMP. */
2159 while ((n = regnext(n))
2160 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2161 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2162 && off + noff < max)
2164 if (reg_off_by_arg[OP(scan)])
2167 NEXT_OFF(scan) = off;
2172 /* The principal pseudo-switch. Cannot be a switch, since we
2173 look into several different things. */
2174 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2175 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2176 next = regnext(scan);
2178 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2180 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2181 /* NOTE - There is similar code to this block below for handling
2182 TRIE nodes on a re-study. If you change stuff here check there
2184 I32 max1 = 0, min1 = I32_MAX, num = 0;
2185 struct regnode_charclass_class accum;
2186 regnode * const startbranch=scan;
2188 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2189 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2190 if (flags & SCF_DO_STCLASS)
2191 cl_init_zero(pRExC_state, &accum);
2193 while (OP(scan) == code) {
2194 I32 deltanext, minnext, f = 0, fake;
2195 struct regnode_charclass_class this_class;
2198 data_fake.flags = 0;
2200 data_fake.whilem_c = data->whilem_c;
2201 data_fake.last_closep = data->last_closep;
2204 data_fake.last_closep = &fake;
2205 next = regnext(scan);
2206 scan = NEXTOPER(scan);
2208 scan = NEXTOPER(scan);
2209 if (flags & SCF_DO_STCLASS) {
2210 cl_init(pRExC_state, &this_class);
2211 data_fake.start_class = &this_class;
2212 f = SCF_DO_STCLASS_AND;
2214 if (flags & SCF_WHILEM_VISITED_POS)
2215 f |= SCF_WHILEM_VISITED_POS;
2217 /* we suppose the run is continuous, last=next...*/
2218 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2219 next, &data_fake, f,depth+1);
2222 if (max1 < minnext + deltanext)
2223 max1 = minnext + deltanext;
2224 if (deltanext == I32_MAX)
2225 is_inf = is_inf_internal = 1;
2227 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2230 if (data_fake.flags & SF_HAS_EVAL)
2231 data->flags |= SF_HAS_EVAL;
2232 data->whilem_c = data_fake.whilem_c;
2234 if (flags & SCF_DO_STCLASS)
2235 cl_or(pRExC_state, &accum, &this_class);
2236 if (code == SUSPEND)
2239 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2241 if (flags & SCF_DO_SUBSTR) {
2242 data->pos_min += min1;
2243 data->pos_delta += max1 - min1;
2244 if (max1 != min1 || is_inf)
2245 data->longest = &(data->longest_float);
2248 delta += max1 - min1;
2249 if (flags & SCF_DO_STCLASS_OR) {
2250 cl_or(pRExC_state, data->start_class, &accum);
2252 cl_and(data->start_class, &and_with);
2253 flags &= ~SCF_DO_STCLASS;
2256 else if (flags & SCF_DO_STCLASS_AND) {
2258 cl_and(data->start_class, &accum);
2259 flags &= ~SCF_DO_STCLASS;
2262 /* Switch to OR mode: cache the old value of
2263 * data->start_class */
2264 StructCopy(data->start_class, &and_with,
2265 struct regnode_charclass_class);
2266 flags &= ~SCF_DO_STCLASS_AND;
2267 StructCopy(&accum, data->start_class,
2268 struct regnode_charclass_class);
2269 flags |= SCF_DO_STCLASS_OR;
2270 data->start_class->flags |= ANYOF_EOS;
2274 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2277 Assuming this was/is a branch we are dealing with: 'scan' now
2278 points at the item that follows the branch sequence, whatever
2279 it is. We now start at the beginning of the sequence and look
2286 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2288 If we can find such a subseqence we need to turn the first
2289 element into a trie and then add the subsequent branch exact
2290 strings to the trie.
2294 1. patterns where the whole set of branch can be converted.
2296 2. patterns where only a subset can be converted.
2298 In case 1 we can replace the whole set with a single regop
2299 for the trie. In case 2 we need to keep the start and end
2302 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2303 becomes BRANCH TRIE; BRANCH X;
2305 There is an additional case, that being where there is a
2306 common prefix, which gets split out into an EXACT like node
2307 preceding the TRIE node.
2309 If x(1..n)==tail then we can do a simple trie, if not we make
2310 a "jump" trie, such that when we match the appropriate word
2311 we "jump" to the appopriate tail node. Essentailly we turn
2312 a nested if into a case structure of sorts.
2317 if (!re_trie_maxbuff) {
2318 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2319 if (!SvIOK(re_trie_maxbuff))
2320 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2322 if ( SvIV(re_trie_maxbuff)>=0 ) {
2324 regnode *first = (regnode *)NULL;
2325 regnode *last = (regnode *)NULL;
2326 regnode *tail = scan;
2331 SV * const mysv = sv_newmortal(); /* for dumping */
2333 /* var tail is used because there may be a TAIL
2334 regop in the way. Ie, the exacts will point to the
2335 thing following the TAIL, but the last branch will
2336 point at the TAIL. So we advance tail. If we
2337 have nested (?:) we may have to move through several
2341 while ( OP( tail ) == TAIL ) {
2342 /* this is the TAIL generated by (?:) */
2343 tail = regnext( tail );
2348 regprop(RExC_rx, mysv, tail );
2349 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2350 (int)depth * 2 + 2, "",
2351 "Looking for TRIE'able sequences. Tail node is: ",
2352 SvPV_nolen_const( mysv )
2358 step through the branches, cur represents each
2359 branch, noper is the first thing to be matched
2360 as part of that branch and noper_next is the
2361 regnext() of that node. if noper is an EXACT
2362 and noper_next is the same as scan (our current
2363 position in the regex) then the EXACT branch is
2364 a possible optimization target. Once we have
2365 two or more consequetive such branches we can
2366 create a trie of the EXACT's contents and stich
2367 it in place. If the sequence represents all of
2368 the branches we eliminate the whole thing and
2369 replace it with a single TRIE. If it is a
2370 subsequence then we need to stitch it in. This
2371 means the first branch has to remain, and needs
2372 to be repointed at the item on the branch chain
2373 following the last branch optimized. This could
2374 be either a BRANCH, in which case the
2375 subsequence is internal, or it could be the
2376 item following the branch sequence in which
2377 case the subsequence is at the end.
2381 /* dont use tail as the end marker for this traverse */
2382 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2383 regnode * const noper = NEXTOPER( cur );
2384 regnode * const noper_next = regnext( noper );
2387 regprop(RExC_rx, mysv, cur);
2388 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2389 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2391 regprop(RExC_rx, mysv, noper);
2392 PerlIO_printf( Perl_debug_log, " -> %s",
2393 SvPV_nolen_const(mysv));
2396 regprop(RExC_rx, mysv, noper_next );
2397 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2398 SvPV_nolen_const(mysv));
2400 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2401 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2403 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2404 : PL_regkind[ OP( noper ) ] == EXACT )
2405 || OP(noper) == NOTHING )
2407 && noper_next == tail
2412 if ( !first || optype == NOTHING ) {
2413 if (!first) first = cur;
2414 optype = OP( noper );
2420 make_trie( pRExC_state,
2421 startbranch, first, cur, tail, count,
2424 if ( PL_regkind[ OP( noper ) ] == EXACT
2426 && noper_next == tail
2431 optype = OP( noper );
2441 regprop(RExC_rx, mysv, cur);
2442 PerlIO_printf( Perl_debug_log,
2443 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2444 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2448 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2449 #ifdef TRIE_STUDY_OPT
2450 if ( ((made == MADE_EXACT_TRIE &&
2451 startbranch == first)
2452 || ( first_non_open == first )) &&
2454 flags |= SCF_TRIE_RESTUDY;
2462 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2463 scan = NEXTOPER(NEXTOPER(scan));
2464 } else /* single branch is optimized. */
2465 scan = NEXTOPER(scan);
2468 else if (OP(scan) == EXACT) {
2469 I32 l = STR_LEN(scan);
2472 const U8 * const s = (U8*)STRING(scan);
2473 l = utf8_length(s, s + l);
2474 uc = utf8_to_uvchr(s, NULL);
2476 uc = *((U8*)STRING(scan));
2479 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2480 /* The code below prefers earlier match for fixed
2481 offset, later match for variable offset. */
2482 if (data->last_end == -1) { /* Update the start info. */
2483 data->last_start_min = data->pos_min;
2484 data->last_start_max = is_inf
2485 ? I32_MAX : data->pos_min + data->pos_delta;
2487 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2489 SvUTF8_on(data->last_found);
2491 SV * const sv = data->last_found;
2492 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2493 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2494 if (mg && mg->mg_len >= 0)
2495 mg->mg_len += utf8_length((U8*)STRING(scan),
2496 (U8*)STRING(scan)+STR_LEN(scan));
2498 data->last_end = data->pos_min + l;
2499 data->pos_min += l; /* As in the first entry. */
2500 data->flags &= ~SF_BEFORE_EOL;
2502 if (flags & SCF_DO_STCLASS_AND) {
2503 /* Check whether it is compatible with what we know already! */
2507 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2508 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2509 && (!(data->start_class->flags & ANYOF_FOLD)
2510 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2513 ANYOF_CLASS_ZERO(data->start_class);
2514 ANYOF_BITMAP_ZERO(data->start_class);
2516 ANYOF_BITMAP_SET(data->start_class, uc);
2517 data->start_class->flags &= ~ANYOF_EOS;
2519 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2521 else if (flags & SCF_DO_STCLASS_OR) {
2522 /* false positive possible if the class is case-folded */
2524 ANYOF_BITMAP_SET(data->start_class, uc);
2526 data->start_class->flags |= ANYOF_UNICODE_ALL;
2527 data->start_class->flags &= ~ANYOF_EOS;
2528 cl_and(data->start_class, &and_with);
2530 flags &= ~SCF_DO_STCLASS;
2532 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2533 I32 l = STR_LEN(scan);
2534 UV uc = *((U8*)STRING(scan));
2536 /* Search for fixed substrings supports EXACT only. */
2537 if (flags & SCF_DO_SUBSTR) {
2539 scan_commit(pRExC_state, data);
2542 const U8 * const s = (U8 *)STRING(scan);
2543 l = utf8_length(s, s + l);
2544 uc = utf8_to_uvchr(s, NULL);
2547 if (flags & SCF_DO_SUBSTR)
2549 if (flags & SCF_DO_STCLASS_AND) {
2550 /* Check whether it is compatible with what we know already! */
2554 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2555 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2556 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2558 ANYOF_CLASS_ZERO(data->start_class);
2559 ANYOF_BITMAP_ZERO(data->start_class);
2561 ANYOF_BITMAP_SET(data->start_class, uc);
2562 data->start_class->flags &= ~ANYOF_EOS;
2563 data->start_class->flags |= ANYOF_FOLD;
2564 if (OP(scan) == EXACTFL)
2565 data->start_class->flags |= ANYOF_LOCALE;
2568 else if (flags & SCF_DO_STCLASS_OR) {
2569 if (data->start_class->flags & ANYOF_FOLD) {
2570 /* false positive possible if the class is case-folded.
2571 Assume that the locale settings are the same... */
2573 ANYOF_BITMAP_SET(data->start_class, uc);
2574 data->start_class->flags &= ~ANYOF_EOS;
2576 cl_and(data->start_class, &and_with);
2578 flags &= ~SCF_DO_STCLASS;
2580 else if (strchr((const char*)PL_varies,OP(scan))) {
2581 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2582 I32 f = flags, pos_before = 0;
2583 regnode * const oscan = scan;
2584 struct regnode_charclass_class this_class;
2585 struct regnode_charclass_class *oclass = NULL;
2586 I32 next_is_eval = 0;
2588 switch (PL_regkind[OP(scan)]) {
2589 case WHILEM: /* End of (?:...)* . */
2590 scan = NEXTOPER(scan);
2593 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2594 next = NEXTOPER(scan);
2595 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2597 maxcount = REG_INFTY;
2598 next = regnext(scan);
2599 scan = NEXTOPER(scan);
2603 if (flags & SCF_DO_SUBSTR)
2608 if (flags & SCF_DO_STCLASS) {
2610 maxcount = REG_INFTY;
2611 next = regnext(scan);
2612 scan = NEXTOPER(scan);
2615 is_inf = is_inf_internal = 1;
2616 scan = regnext(scan);
2617 if (flags & SCF_DO_SUBSTR) {
2618 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2619 data->longest = &(data->longest_float);
2621 goto optimize_curly_tail;
2623 mincount = ARG1(scan);
2624 maxcount = ARG2(scan);
2625 next = regnext(scan);
2626 if (OP(scan) == CURLYX) {
2627 I32 lp = (data ? *(data->last_closep) : 0);
2628 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2630 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2631 next_is_eval = (OP(scan) == EVAL);
2633 if (flags & SCF_DO_SUBSTR) {
2634 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2635 pos_before = data->pos_min;
2639 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2641 data->flags |= SF_IS_INF;
2643 if (flags & SCF_DO_STCLASS) {
2644 cl_init(pRExC_state, &this_class);
2645 oclass = data->start_class;
2646 data->start_class = &this_class;
2647 f |= SCF_DO_STCLASS_AND;
2648 f &= ~SCF_DO_STCLASS_OR;
2650 /* These are the cases when once a subexpression
2651 fails at a particular position, it cannot succeed
2652 even after backtracking at the enclosing scope.
2654 XXXX what if minimal match and we are at the
2655 initial run of {n,m}? */
2656 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2657 f &= ~SCF_WHILEM_VISITED_POS;
2659 /* This will finish on WHILEM, setting scan, or on NULL: */
2660 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2662 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2664 if (flags & SCF_DO_STCLASS)
2665 data->start_class = oclass;
2666 if (mincount == 0 || minnext == 0) {
2667 if (flags & SCF_DO_STCLASS_OR) {
2668 cl_or(pRExC_state, data->start_class, &this_class);
2670 else if (flags & SCF_DO_STCLASS_AND) {
2671 /* Switch to OR mode: cache the old value of
2672 * data->start_class */
2673 StructCopy(data->start_class, &and_with,
2674 struct regnode_charclass_class);
2675 flags &= ~SCF_DO_STCLASS_AND;
2676 StructCopy(&this_class, data->start_class,
2677 struct regnode_charclass_class);
2678 flags |= SCF_DO_STCLASS_OR;
2679 data->start_class->flags |= ANYOF_EOS;
2681 } else { /* Non-zero len */
2682 if (flags & SCF_DO_STCLASS_OR) {
2683 cl_or(pRExC_state, data->start_class, &this_class);
2684 cl_and(data->start_class, &and_with);
2686 else if (flags & SCF_DO_STCLASS_AND)
2687 cl_and(data->start_class, &this_class);
2688 flags &= ~SCF_DO_STCLASS;
2690 if (!scan) /* It was not CURLYX, but CURLY. */
2692 if ( /* ? quantifier ok, except for (?{ ... }) */
2693 (next_is_eval || !(mincount == 0 && maxcount == 1))
2694 && (minnext == 0) && (deltanext == 0)
2695 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2696 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2697 && ckWARN(WARN_REGEXP))
2700 "Quantifier unexpected on zero-length expression");
2703 min += minnext * mincount;
2704 is_inf_internal |= ((maxcount == REG_INFTY
2705 && (minnext + deltanext) > 0)
2706 || deltanext == I32_MAX);
2707 is_inf |= is_inf_internal;
2708 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2710 /* Try powerful optimization CURLYX => CURLYN. */
2711 if ( OP(oscan) == CURLYX && data
2712 && data->flags & SF_IN_PAR
2713 && !(data->flags & SF_HAS_EVAL)
2714 && !deltanext && minnext == 1 ) {
2715 /* Try to optimize to CURLYN. */
2716 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2717 regnode * const nxt1 = nxt;
2724 if (!strchr((const char*)PL_simple,OP(nxt))
2725 && !(PL_regkind[OP(nxt)] == EXACT
2726 && STR_LEN(nxt) == 1))
2732 if (OP(nxt) != CLOSE)
2734 /* Now we know that nxt2 is the only contents: */
2735 oscan->flags = (U8)ARG(nxt);
2737 OP(nxt1) = NOTHING; /* was OPEN. */
2739 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2740 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2741 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2742 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2743 OP(nxt + 1) = OPTIMIZED; /* was count. */
2744 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2749 /* Try optimization CURLYX => CURLYM. */
2750 if ( OP(oscan) == CURLYX && data
2751 && !(data->flags & SF_HAS_PAR)
2752 && !(data->flags & SF_HAS_EVAL)
2753 && !deltanext /* atom is fixed width */
2754 && minnext != 0 /* CURLYM can't handle zero width */
2756 /* XXXX How to optimize if data == 0? */
2757 /* Optimize to a simpler form. */
2758 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2762 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2763 && (OP(nxt2) != WHILEM))
2765 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2766 /* Need to optimize away parenths. */
2767 if (data->flags & SF_IN_PAR) {
2768 /* Set the parenth number. */
2769 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2771 if (OP(nxt) != CLOSE)
2772 FAIL("Panic opt close");
2773 oscan->flags = (U8)ARG(nxt);
2774 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2775 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2777 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2778 OP(nxt + 1) = OPTIMIZED; /* was count. */
2779 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2780 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2783 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2784 regnode *nnxt = regnext(nxt1);
2787 if (reg_off_by_arg[OP(nxt1)])
2788 ARG_SET(nxt1, nxt2 - nxt1);
2789 else if (nxt2 - nxt1 < U16_MAX)
2790 NEXT_OFF(nxt1) = nxt2 - nxt1;
2792 OP(nxt) = NOTHING; /* Cannot beautify */
2797 /* Optimize again: */
2798 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2804 else if ((OP(oscan) == CURLYX)
2805 && (flags & SCF_WHILEM_VISITED_POS)
2806 /* See the comment on a similar expression above.
2807 However, this time it not a subexpression
2808 we care about, but the expression itself. */
2809 && (maxcount == REG_INFTY)
2810 && data && ++data->whilem_c < 16) {
2811 /* This stays as CURLYX, we can put the count/of pair. */
2812 /* Find WHILEM (as in regexec.c) */
2813 regnode *nxt = oscan + NEXT_OFF(oscan);
2815 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2817 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2818 | (RExC_whilem_seen << 4)); /* On WHILEM */
2820 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2822 if (flags & SCF_DO_SUBSTR) {
2823 SV *last_str = NULL;
2824 int counted = mincount != 0;
2826 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2827 #if defined(SPARC64_GCC_WORKAROUND)
2830 const char *s = NULL;
2833 if (pos_before >= data->last_start_min)
2836 b = data->last_start_min;
2839 s = SvPV_const(data->last_found, l);
2840 old = b - data->last_start_min;
2843 I32 b = pos_before >= data->last_start_min
2844 ? pos_before : data->last_start_min;
2846 const char * const s = SvPV_const(data->last_found, l);
2847 I32 old = b - data->last_start_min;
2851 old = utf8_hop((U8*)s, old) - (U8*)s;
2854 /* Get the added string: */
2855 last_str = newSVpvn(s + old, l);
2857 SvUTF8_on(last_str);
2858 if (deltanext == 0 && pos_before == b) {
2859 /* What was added is a constant string */
2861 SvGROW(last_str, (mincount * l) + 1);
2862 repeatcpy(SvPVX(last_str) + l,
2863 SvPVX_const(last_str), l, mincount - 1);
2864 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2865 /* Add additional parts. */
2866 SvCUR_set(data->last_found,
2867 SvCUR(data->last_found) - l);
2868 sv_catsv(data->last_found, last_str);
2870 SV * sv = data->last_found;
2872 SvUTF8(sv) && SvMAGICAL(sv) ?
2873 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2874 if (mg && mg->mg_len >= 0)
2875 mg->mg_len += CHR_SVLEN(last_str);
2877 data->last_end += l * (mincount - 1);
2880 /* start offset must point into the last copy */
2881 data->last_start_min += minnext * (mincount - 1);
2882 data->last_start_max += is_inf ? I32_MAX
2883 : (maxcount - 1) * (minnext + data->pos_delta);
2886 /* It is counted once already... */
2887 data->pos_min += minnext * (mincount - counted);
2888 data->pos_delta += - counted * deltanext +
2889 (minnext + deltanext) * maxcount - minnext * mincount;
2890 if (mincount != maxcount) {
2891 /* Cannot extend fixed substrings found inside
2893 scan_commit(pRExC_state,data);
2894 if (mincount && last_str) {
2895 SV * const sv = data->last_found;
2896 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2897 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2901 sv_setsv(sv, last_str);
2902 data->last_end = data->pos_min;
2903 data->last_start_min =
2904 data->pos_min - CHR_SVLEN(last_str);
2905 data->last_start_max = is_inf
2907 : data->pos_min + data->pos_delta
2908 - CHR_SVLEN(last_str);
2910 data->longest = &(data->longest_float);
2912 SvREFCNT_dec(last_str);
2914 if (data && (fl & SF_HAS_EVAL))
2915 data->flags |= SF_HAS_EVAL;
2916 optimize_curly_tail:
2917 if (OP(oscan) != CURLYX) {
2918 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2920 NEXT_OFF(oscan) += NEXT_OFF(next);
2923 default: /* REF and CLUMP only? */
2924 if (flags & SCF_DO_SUBSTR) {
2925 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2926 data->longest = &(data->longest_float);
2928 is_inf = is_inf_internal = 1;
2929 if (flags & SCF_DO_STCLASS_OR)
2930 cl_anything(pRExC_state, data->start_class);
2931 flags &= ~SCF_DO_STCLASS;
2935 else if (strchr((const char*)PL_simple,OP(scan))) {
2938 if (flags & SCF_DO_SUBSTR) {
2939 scan_commit(pRExC_state,data);
2943 if (flags & SCF_DO_STCLASS) {
2944 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2946 /* Some of the logic below assumes that switching
2947 locale on will only add false positives. */
2948 switch (PL_regkind[OP(scan)]) {
2952 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2953 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2954 cl_anything(pRExC_state, data->start_class);
2957 if (OP(scan) == SANY)
2959 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2960 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2961 || (data->start_class->flags & ANYOF_CLASS));
2962 cl_anything(pRExC_state, data->start_class);
2964 if (flags & SCF_DO_STCLASS_AND || !value)
2965 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2968 if (flags & SCF_DO_STCLASS_AND)
2969 cl_and(data->start_class,
2970 (struct regnode_charclass_class*)scan);
2972 cl_or(pRExC_state, data->start_class,
2973 (struct regnode_charclass_class*)scan);
2976 if (flags & SCF_DO_STCLASS_AND) {
2977 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2978 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2979 for (value = 0; value < 256; value++)
2980 if (!isALNUM(value))
2981 ANYOF_BITMAP_CLEAR(data->start_class, value);
2985 if (data->start_class->flags & ANYOF_LOCALE)
2986 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2988 for (value = 0; value < 256; value++)
2990 ANYOF_BITMAP_SET(data->start_class, value);
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 if (data->start_class->flags & ANYOF_LOCALE)
2997 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3001 data->start_class->flags |= ANYOF_LOCALE;
3005 if (flags & SCF_DO_STCLASS_AND) {
3006 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3007 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3008 for (value = 0; value < 256; value++)
3010 ANYOF_BITMAP_CLEAR(data->start_class, value);
3014 if (data->start_class->flags & ANYOF_LOCALE)
3015 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3017 for (value = 0; value < 256; value++)
3018 if (!isALNUM(value))
3019 ANYOF_BITMAP_SET(data->start_class, value);
3024 if (flags & SCF_DO_STCLASS_AND) {
3025 if (data->start_class->flags & ANYOF_LOCALE)
3026 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3029 data->start_class->flags |= ANYOF_LOCALE;
3030 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3034 if (flags & SCF_DO_STCLASS_AND) {
3035 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3036 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3037 for (value = 0; value < 256; value++)
3038 if (!isSPACE(value))
3039 ANYOF_BITMAP_CLEAR(data->start_class, value);
3043 if (data->start_class->flags & ANYOF_LOCALE)
3044 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3046 for (value = 0; value < 256; value++)
3048 ANYOF_BITMAP_SET(data->start_class, value);
3053 if (flags & SCF_DO_STCLASS_AND) {
3054 if (data->start_class->flags & ANYOF_LOCALE)
3055 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3058 data->start_class->flags |= ANYOF_LOCALE;
3059 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3063 if (flags & SCF_DO_STCLASS_AND) {
3064 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3065 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3066 for (value = 0; value < 256; value++)
3068 ANYOF_BITMAP_CLEAR(data->start_class, value);
3072 if (data->start_class->flags & ANYOF_LOCALE)
3073 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3075 for (value = 0; value < 256; value++)
3076 if (!isSPACE(value))
3077 ANYOF_BITMAP_SET(data->start_class, value);
3082 if (flags & SCF_DO_STCLASS_AND) {
3083 if (data->start_class->flags & ANYOF_LOCALE) {
3084 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3085 for (value = 0; value < 256; value++)
3086 if (!isSPACE(value))
3087 ANYOF_BITMAP_CLEAR(data->start_class, value);
3091 data->start_class->flags |= ANYOF_LOCALE;
3092 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3096 if (flags & SCF_DO_STCLASS_AND) {
3097 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3098 for (value = 0; value < 256; value++)
3099 if (!isDIGIT(value))
3100 ANYOF_BITMAP_CLEAR(data->start_class, value);
3103 if (data->start_class->flags & ANYOF_LOCALE)
3104 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3106 for (value = 0; value < 256; value++)
3108 ANYOF_BITMAP_SET(data->start_class, value);
3113 if (flags & SCF_DO_STCLASS_AND) {
3114 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3115 for (value = 0; value < 256; value++)
3117 ANYOF_BITMAP_CLEAR(data->start_class, value);
3120 if (data->start_class->flags & ANYOF_LOCALE)
3121 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3123 for (value = 0; value < 256; value++)
3124 if (!isDIGIT(value))
3125 ANYOF_BITMAP_SET(data->start_class, value);
3130 if (flags & SCF_DO_STCLASS_OR)
3131 cl_and(data->start_class, &and_with);
3132 flags &= ~SCF_DO_STCLASS;
3135 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3136 data->flags |= (OP(scan) == MEOL
3140 else if ( PL_regkind[OP(scan)] == BRANCHJ
3141 /* Lookbehind, or need to calculate parens/evals/stclass: */
3142 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3143 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3144 /* Lookahead/lookbehind */
3145 I32 deltanext, minnext, fake = 0;
3147 struct regnode_charclass_class intrnl;
3150 data_fake.flags = 0;
3152 data_fake.whilem_c = data->whilem_c;
3153 data_fake.last_closep = data->last_closep;
3156 data_fake.last_closep = &fake;
3157 if ( flags & SCF_DO_STCLASS && !scan->flags
3158 && OP(scan) == IFMATCH ) { /* Lookahead */
3159 cl_init(pRExC_state, &intrnl);
3160 data_fake.start_class = &intrnl;
3161 f |= SCF_DO_STCLASS_AND;
3163 if (flags & SCF_WHILEM_VISITED_POS)
3164 f |= SCF_WHILEM_VISITED_POS;
3165 next = regnext(scan);
3166 nscan = NEXTOPER(NEXTOPER(scan));
3167 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3170 vFAIL("Variable length lookbehind not implemented");
3172 else if (minnext > (I32)U8_MAX) {
3173 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3175 scan->flags = (U8)minnext;
3178 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3180 if (data_fake.flags & SF_HAS_EVAL)
3181 data->flags |= SF_HAS_EVAL;
3182 data->whilem_c = data_fake.whilem_c;
3184 if (f & SCF_DO_STCLASS_AND) {
3185 const int was = (data->start_class->flags & ANYOF_EOS);
3187 cl_and(data->start_class, &intrnl);
3189 data->start_class->flags |= ANYOF_EOS;
3192 else if (OP(scan) == OPEN) {
3195 else if (OP(scan) == CLOSE) {
3196 if ((I32)ARG(scan) == is_par) {
3197 next = regnext(scan);
3199 if ( next && (OP(next) != WHILEM) && next < last)
3200 is_par = 0; /* Disable optimization */
3203 *(data->last_closep) = ARG(scan);
3205 else if (OP(scan) == EVAL) {
3207 data->flags |= SF_HAS_EVAL;
3209 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3210 if (flags & SCF_DO_SUBSTR) {
3211 scan_commit(pRExC_state,data);
3212 data->longest = &(data->longest_float);
3214 is_inf = is_inf_internal = 1;
3215 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3216 cl_anything(pRExC_state, data->start_class);
3217 flags &= ~SCF_DO_STCLASS;
3219 #ifdef TRIE_STUDY_OPT
3220 #ifdef FULL_TRIE_STUDY
3221 else if (PL_regkind[OP(scan)] == TRIE) {
3222 /* NOTE - There is similar code to this block above for handling
3223 BRANCH nodes on the initial study. If you change stuff here
3225 regnode *tail= regnext(scan);
3226 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3227 I32 max1 = 0, min1 = I32_MAX;
3228 struct regnode_charclass_class accum;
3230 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3231 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
3232 if (flags & SCF_DO_STCLASS)
3233 cl_init_zero(pRExC_state, &accum);
3239 const regnode *nextbranch= NULL;
3242 for ( word=1 ; word <= trie->wordcount ; word++)
3244 I32 deltanext=0, minnext=0, f = 0, fake;
3245 struct regnode_charclass_class this_class;
3247 data_fake.flags = 0;
3249 data_fake.whilem_c = data->whilem_c;
3250 data_fake.last_closep = data->last_closep;
3253 data_fake.last_closep = &fake;
3255 if (flags & SCF_DO_STCLASS) {
3256 cl_init(pRExC_state, &this_class);
3257 data_fake.start_class = &this_class;
3258 f = SCF_DO_STCLASS_AND;
3260 if (flags & SCF_WHILEM_VISITED_POS)
3261 f |= SCF_WHILEM_VISITED_POS;
3263 if (trie->jump[word]) {
3265 nextbranch = tail - trie->jump[0];
3266 scan= tail - trie->jump[word];
3267 /* We go from the jump point to the branch that follows
3268 it. Note this means we need the vestigal unused branches
3269 even though they arent otherwise used.
3271 minnext = study_chunk(pRExC_state, &scan, &deltanext,
3272 (regnode *)nextbranch, &data_fake, f,depth+1);
3274 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3275 nextbranch= regnext((regnode*)nextbranch);
3277 if (min1 > (I32)(minnext + trie->minlen))
3278 min1 = minnext + trie->minlen;
3279 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3280 max1 = minnext + deltanext + trie->maxlen;
3281 if (deltanext == I32_MAX)
3282 is_inf = is_inf_internal = 1;
3284 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3288 if (data_fake.flags & SF_HAS_EVAL)
3289 data->flags |= SF_HAS_EVAL;
3290 data->whilem_c = data_fake.whilem_c;
3292 if (flags & SCF_DO_STCLASS)
3293 cl_or(pRExC_state, &accum, &this_class);
3296 if (flags & SCF_DO_SUBSTR) {
3297 data->pos_min += min1;
3298 data->pos_delta += max1 - min1;
3299 if (max1 != min1 || is_inf)
3300 data->longest = &(data->longest_float);
3303 delta += max1 - min1;
3304 if (flags & SCF_DO_STCLASS_OR) {
3305 cl_or(pRExC_state, data->start_class, &accum);
3307 cl_and(data->start_class, &and_with);
3308 flags &= ~SCF_DO_STCLASS;
3311 else if (flags & SCF_DO_STCLASS_AND) {
3313 cl_and(data->start_class, &accum);
3314 flags &= ~SCF_DO_STCLASS;
3317 /* Switch to OR mode: cache the old value of
3318 * data->start_class */
3319 StructCopy(data->start_class, &and_with,
3320 struct regnode_charclass_class);
3321 flags &= ~SCF_DO_STCLASS_AND;
3322 StructCopy(&accum, data->start_class,
3323 struct regnode_charclass_class);
3324 flags |= SCF_DO_STCLASS_OR;
3325 data->start_class->flags |= ANYOF_EOS;
3332 else if (PL_regkind[OP(scan)] == TRIE) {
3333 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3336 min += trie->minlen;
3337 delta += (trie->maxlen - trie->minlen);
3338 flags &= ~SCF_DO_STCLASS; /* xxx */
3339 if (flags & SCF_DO_SUBSTR) {
3340 scan_commit(pRExC_state,data); /* Cannot expect anything... */
3341 data->pos_min += trie->minlen;
3342 data->pos_delta += (trie->maxlen - trie->minlen);
3343 if (trie->maxlen != trie->minlen)
3344 data->longest = &(data->longest_float);
3346 if (trie->jump) /* no more substrings -- for now /grr*/
3347 flags &= ~SCF_DO_SUBSTR;
3349 #endif /* old or new */
3350 #endif /* TRIE_STUDY_OPT */
3351 /* Else: zero-length, ignore. */
3352 scan = regnext(scan);
3357 *deltap = is_inf_internal ? I32_MAX : delta;
3358 if (flags & SCF_DO_SUBSTR && is_inf)
3359 data->pos_delta = I32_MAX - data->pos_min;
3360 if (is_par > (I32)U8_MAX)
3362 if (is_par && pars==1 && data) {
3363 data->flags |= SF_IN_PAR;
3364 data->flags &= ~SF_HAS_PAR;
3366 else if (pars && data) {
3367 data->flags |= SF_HAS_PAR;
3368 data->flags &= ~SF_IN_PAR;
3370 if (flags & SCF_DO_STCLASS_OR)
3371 cl_and(data->start_class, &and_with);
3372 if (flags & SCF_TRIE_RESTUDY)
3373 data->flags |= SCF_TRIE_RESTUDY;
3378 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3380 if (RExC_rx->data) {
3381 Renewc(RExC_rx->data,
3382 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3383 char, struct reg_data);
3384 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3385 RExC_rx->data->count += n;
3388 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3389 char, struct reg_data);
3390 Newx(RExC_rx->data->what, n, U8);
3391 RExC_rx->data->count = n;
3393 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3394 return RExC_rx->data->count - n;
3397 #ifndef PERL_IN_XSUB_RE
3399 Perl_reginitcolors(pTHX)
3402 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3404 char *t = savepv(s);
3408 t = strchr(t, '\t');
3414 PL_colors[i] = t = (char *)"";
3419 PL_colors[i++] = (char *)"";
3426 #ifdef TRIE_STUDY_OPT
3427 #define CHECK_RESTUDY_GOTO \
3429 (data.flags & SCF_TRIE_RESTUDY) \
3433 #define CHECK_RESTUDY_GOTO
3436 - pregcomp - compile a regular expression into internal code
3438 * We can't allocate space until we know how big the compiled form will be,
3439 * but we can't compile it (and thus know how big it is) until we've got a
3440 * place to put the code. So we cheat: we compile it twice, once with code
3441 * generation turned off and size counting turned on, and once "for real".
3442 * This also means that we don't allocate space until we are sure that the
3443 * thing really will compile successfully, and we never have to move the
3444 * code and thus invalidate pointers into it. (Note that it has to be in
3445 * one piece because free() must be able to free it all.) [NB: not true in perl]
3447 * Beware that the optimization-preparation code in here knows about some
3448 * of the structure of the compiled regexp. [I'll say.]
3451 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3462 RExC_state_t RExC_state;
3463 RExC_state_t * const pRExC_state = &RExC_state;
3464 #ifdef TRIE_STUDY_OPT
3466 RExC_state_t copyRExC_state;
3469 GET_RE_DEBUG_FLAGS_DECL;
3472 FAIL("NULL regexp argument");
3474 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3477 DEBUG_r(if (!PL_colorset) reginitcolors());
3479 SV *dsv= sv_newmortal();
3480 RE_PV_QUOTED_DECL(s, RExC_utf8,
3481 dsv, RExC_precomp, (xend - exp), 60);
3482 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3483 PL_colors[4],PL_colors[5],s);
3485 RExC_flags = pm->op_pmflags;
3489 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3490 RExC_seen_evals = 0;
3493 /* First pass: determine size, legality. */
3500 RExC_emit = &PL_regdummy;
3501 RExC_whilem_seen = 0;
3502 #if 0 /* REGC() is (currently) a NOP at the first pass.
3503 * Clever compilers notice this and complain. --jhi */
3504 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3506 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3507 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3508 RExC_precomp = NULL;
3511 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3512 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3513 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3516 RExC_lastparse=NULL;
3520 /* Small enough for pointer-storage convention?
3521 If extralen==0, this means that we will not need long jumps. */
3522 if (RExC_size >= 0x10000L && RExC_extralen)
3523 RExC_size += RExC_extralen;
3526 if (RExC_whilem_seen > 15)
3527 RExC_whilem_seen = 15;
3529 /* Allocate space and initialize. */
3530 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3533 FAIL("Regexp out of space");
3536 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3537 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3540 r->prelen = xend - exp;
3541 r->precomp = savepvn(RExC_precomp, r->prelen);
3543 #ifdef PERL_OLD_COPY_ON_WRITE
3544 r->saved_copy = NULL;
3546 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3547 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3548 r->lastparen = 0; /* mg.c reads this. */
3550 r->substrs = 0; /* Useful during FAIL. */
3551 r->startp = 0; /* Useful during FAIL. */
3552 r->endp = 0; /* Useful during FAIL. */
3554 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3556 r->offsets[0] = RExC_size;
3558 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3559 "%s %"UVuf" bytes for offset annotations.\n",
3560 r->offsets ? "Got" : "Couldn't get",
3561 (UV)((2*RExC_size+1) * sizeof(U32))));
3565 /* Second pass: emit code. */
3566 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3571 RExC_emit_start = r->program;
3572 RExC_emit = r->program;
3573 /* Store the count of eval-groups for security checks: */
3574 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
3575 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3577 if (reg(pRExC_state, 0, &flags,1) == NULL)
3579 /* XXXX To minimize changes to RE engine we always allocate
3580 3-units-long substrs field. */
3581 Newx(r->substrs, 1, struct reg_substr_data);
3584 minlen=sawplus=sawopen=0;
3585 Zero(r->substrs, 1, struct reg_substr_data);
3586 StructCopy(&zero_scan_data, &data, scan_data_t);
3588 #ifdef TRIE_STUDY_OPT
3590 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3591 RExC_state=copyRExC_state;
3592 if (data.longest_fixed)
3593 SvREFCNT_dec(data.longest_fixed);
3594 if (data.longest_float)
3595 SvREFCNT_dec(data.longest_float);
3596 if (data.last_found)
3597 SvREFCNT_dec(data.last_found);
3599 copyRExC_state=RExC_state;
3602 /* Dig out information for optimizations. */
3603 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3604 pm->op_pmflags = RExC_flags;
3606 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3607 r->regstclass = NULL;
3608 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3609 r->reganch |= ROPT_NAUGHTY;
3610 scan = r->program + 1; /* First BRANCH. */
3612 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3613 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3615 STRLEN longest_float_length, longest_fixed_length;
3616 struct regnode_charclass_class ch_class; /* pointed to by data */
3618 I32 last_close = 0; /* pointed to by data */
3621 /* Skip introductions and multiplicators >= 1. */
3622 while ((OP(first) == OPEN && (sawopen = 1)) ||
3623 /* An OR of *one* alternative - should not happen now. */
3624 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3625 /* for now we can't handle lookbehind IFMATCH*/
3626 (OP(first) == IFMATCH && !first->flags) ||
3627 (OP(first) == PLUS) ||
3628 (OP(first) == MINMOD) ||
3629 /* An {n,m} with n>0 */
3630 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3633 if (OP(first) == PLUS)
3636 first += regarglen[OP(first)];
3637 if (OP(first) == IFMATCH) {
3638 first = NEXTOPER(first);
3639 first += EXTRA_STEP_2ARGS;
3640 } else /* XXX possible optimisation for /(?=)/ */
3641 first = NEXTOPER(first);
3644 /* Starting-point info. */
3646 DEBUG_PEEP("first:",first,0);
3647 /* Ignore EXACT as we deal with it later. */
3648 if (PL_regkind[OP(first)] == EXACT) {
3649 if (OP(first) == EXACT)
3650 NOOP; /* Empty, get anchored substr later. */
3651 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3652 r->regstclass = first;
3655 else if (PL_regkind[OP(first)] == TRIE &&
3656 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3659 /* this can happen only on restudy */
3660 if ( OP(first) == TRIE ) {
3661 struct regnode_1 *trieop;
3662 Newxz(trieop,1,struct regnode_1);
3663 StructCopy(first,trieop,struct regnode_1);
3664 trie_op=(regnode *)trieop;
3666 struct regnode_charclass *trieop;
3667 Newxz(trieop,1,struct regnode_charclass);
3668 StructCopy(first,trieop,struct regnode_charclass);
3669 trie_op=(regnode *)trieop;
3671 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3672 r->regstclass = trie_op;
3675 else if (strchr((const char*)PL_simple,OP(first)))
3676 r->regstclass = first;
3677 else if (PL_regkind[OP(first)] == BOUND ||
3678 PL_regkind[OP(first)] == NBOUND)
3679 r->regstclass = first;
3680 else if (PL_regkind[OP(first)] == BOL) {
3681 r->reganch |= (OP(first) == MBOL
3683 : (OP(first) == SBOL
3686 first = NEXTOPER(first);
3689 else if (OP(first) == GPOS) {
3690 r->reganch |= ROPT_ANCH_GPOS;
3691 first = NEXTOPER(first);
3694 else if (!sawopen && (OP(first) == STAR &&
3695 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3696 !(r->reganch & ROPT_ANCH) )
3698 /* turn .* into ^.* with an implied $*=1 */
3700 (OP(NEXTOPER(first)) == REG_ANY)
3703 r->reganch |= type | ROPT_IMPLICIT;
3704 first = NEXTOPER(first);
3707 if (sawplus && (!sawopen || !RExC_sawback)
3708 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3709 /* x+ must match at the 1st pos of run of x's */
3710 r->reganch |= ROPT_SKIP;
3712 /* Scan is after the zeroth branch, first is atomic matcher. */
3713 #ifdef TRIE_STUDY_OPT
3716 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3717 (IV)(first - scan + 1))
3721 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3722 (IV)(first - scan + 1))
3728 * If there's something expensive in the r.e., find the
3729 * longest literal string that must appear and make it the
3730 * regmust. Resolve ties in favor of later strings, since
3731 * the regstart check works with the beginning of the r.e.
3732 * and avoiding duplication strengthens checking. Not a
3733 * strong reason, but sufficient in the absence of others.
3734 * [Now we resolve ties in favor of the earlier string if
3735 * it happens that c_offset_min has been invalidated, since the
3736 * earlier string may buy us something the later one won't.]
3740 data.longest_fixed = newSVpvs("");
3741 data.longest_float = newSVpvs("");
3742 data.last_found = newSVpvs("");
3743 data.longest = &(data.longest_fixed);
3745 if (!r->regstclass) {
3746 cl_init(pRExC_state, &ch_class);
3747 data.start_class = &ch_class;
3748 stclass_flag = SCF_DO_STCLASS_AND;
3749 } else /* XXXX Check for BOUND? */
3751 data.last_closep = &last_close;
3753 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3754 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3760 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3761 && data.last_start_min == 0 && data.last_end > 0
3762 && !RExC_seen_zerolen
3763 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3764 r->reganch |= ROPT_CHECK_ALL;
3765 scan_commit(pRExC_state, &data);
3766 SvREFCNT_dec(data.last_found);
3768 longest_float_length = CHR_SVLEN(data.longest_float);
3769 if (longest_float_length
3770 || (data.flags & SF_FL_BEFORE_EOL
3771 && (!(data.flags & SF_FL_BEFORE_MEOL)
3772 || (RExC_flags & PMf_MULTILINE)))) {
3775 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3776 && data.offset_fixed == data.offset_float_min
3777 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3778 goto remove_float; /* As in (a)+. */
3780 if (SvUTF8(data.longest_float)) {
3781 r->float_utf8 = data.longest_float;
3782 r->float_substr = NULL;
3784 r->float_substr = data.longest_float;
3785 r->float_utf8 = NULL;
3787 r->float_min_offset = data.offset_float_min;
3788 r->float_max_offset = data.offset_float_max;
3789 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3790 && (!(data.flags & SF_FL_BEFORE_MEOL)
3791 || (RExC_flags & PMf_MULTILINE)));
3792 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3796 r->float_substr = r->float_utf8 = NULL;
3797 SvREFCNT_dec(data.longest_float);
3798 longest_float_length = 0;
3801 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3802 if (longest_fixed_length
3803 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3804 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3805 || (RExC_flags & PMf_MULTILINE)))) {
3808 if (SvUTF8(data.longest_fixed)) {
3809 r->anchored_utf8 = data.longest_fixed;
3810 r->anchored_substr = NULL;
3812 r->anchored_substr = data.longest_fixed;
3813 r->anchored_utf8 = NULL;
3815 r->anchored_offset = data.offset_fixed;
3816 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3817 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3818 || (RExC_flags & PMf_MULTILINE)));
3819 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3822 r->anchored_substr = r->anchored_utf8 = NULL;
3823 SvREFCNT_dec(data.longest_fixed);
3824 longest_fixed_length = 0;
3827 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3828 r->regstclass = NULL;
3829 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3831 && !(data.start_class->flags & ANYOF_EOS)
3832 && !cl_is_anything(data.start_class))
3834 const I32 n = add_data(pRExC_state, 1, "f");
3836 Newx(RExC_rx->data->data[n], 1,
3837 struct regnode_charclass_class);
3838 StructCopy(data.start_class,
3839 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3840 struct regnode_charclass_class);
3841 r->regstclass = (regnode*)RExC_rx->data->data[n];
3842 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3843 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3844 regprop(r, sv, (regnode*)data.start_class);
3845 PerlIO_printf(Perl_debug_log,
3846 "synthetic stclass \"%s\".\n",
3847 SvPVX_const(sv));});
3850 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3851 if (longest_fixed_length > longest_float_length) {
3852 r->check_substr = r->anchored_substr;
3853 r->check_utf8 = r->anchored_utf8;
3854 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3855 if (r->reganch & ROPT_ANCH_SINGLE)
3856 r->reganch |= ROPT_NOSCAN;
3859 r->check_substr = r->float_substr;
3860 r->check_utf8 = r->float_utf8;
3861 r->check_offset_min = data.offset_float_min;
3862 r->check_offset_max = data.offset_float_max;
3864 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3865 This should be changed ASAP! */
3866 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3867 r->reganch |= RE_USE_INTUIT;
3868 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3869 r->reganch |= RE_INTUIT_TAIL;
3873 /* Several toplevels. Best we can is to set minlen. */
3875 struct regnode_charclass_class ch_class;
3878 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3880 scan = r->program + 1;
3881 cl_init(pRExC_state, &ch_class);
3882 data.start_class = &ch_class;
3883 data.last_closep = &last_close;
3885 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3886 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3890 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3891 = r->float_substr = r->float_utf8 = NULL;
3892 if (!(data.start_class->flags & ANYOF_EOS)
3893 && !cl_is_anything(data.start_class))
3895 const I32 n = add_data(pRExC_state, 1, "f");
3897 Newx(RExC_rx->data->data[n], 1,
3898 struct regnode_charclass_class);
3899 StructCopy(data.start_class,
3900 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3901 struct regnode_charclass_class);
3902 r->regstclass = (regnode*)RExC_rx->data->data[n];
3903 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3904 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3905 regprop(r, sv, (regnode*)data.start_class);
3906 PerlIO_printf(Perl_debug_log,
3907 "synthetic stclass \"%s\".\n",
3908 SvPVX_const(sv));});
3913 if (RExC_seen & REG_SEEN_GPOS)
3914 r->reganch |= ROPT_GPOS_SEEN;
3915 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3916 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3917 if (RExC_seen & REG_SEEN_EVAL)
3918 r->reganch |= ROPT_EVAL_SEEN;
3919 if (RExC_seen & REG_SEEN_CANY)
3920 r->reganch |= ROPT_CANY_SEEN;
3921 Newxz(r->startp, RExC_npar, I32);