5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
480 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
481 if (mg && mg->mg_len > 0)
485 data->flags &= ~SF_BEFORE_EOL;
488 /* Can match anything (initialization) */
490 S_cl_anything(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
492 ANYOF_CLASS_ZERO(cl);
493 ANYOF_BITMAP_SETALL(cl);
494 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
496 cl->flags |= ANYOF_LOCALE;
499 /* Can match anything (initialization) */
501 S_cl_is_anything(const struct regnode_charclass_class *cl)
505 for (value = 0; value <= ANYOF_MAX; value += 2)
506 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
508 if (!(cl->flags & ANYOF_UNICODE_ALL))
510 if (!ANYOF_BITMAP_TESTALLSET(cl))
515 /* Can match anything (initialization) */
517 S_cl_init(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
519 Zero(cl, 1, struct regnode_charclass_class);
521 cl_anything(pRExC_state, cl);
525 S_cl_init_zero(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
527 Zero(cl, 1, struct regnode_charclass_class);
529 cl_anything(pRExC_state, cl);
531 cl->flags |= ANYOF_LOCALE;
534 /* 'And' a given class with another one. Can create false positives */
535 /* We assume that cl is not inverted */
537 S_cl_and(struct regnode_charclass_class *cl,
538 const struct regnode_charclass_class *and_with)
540 if (!(and_with->flags & ANYOF_CLASS)
541 && !(cl->flags & ANYOF_CLASS)
542 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
543 && !(and_with->flags & ANYOF_FOLD)
544 && !(cl->flags & ANYOF_FOLD)) {
547 if (and_with->flags & ANYOF_INVERT)
548 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
549 cl->bitmap[i] &= ~and_with->bitmap[i];
551 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
552 cl->bitmap[i] &= and_with->bitmap[i];
553 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
554 if (!(and_with->flags & ANYOF_EOS))
555 cl->flags &= ~ANYOF_EOS;
557 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
558 !(and_with->flags & ANYOF_INVERT)) {
559 cl->flags &= ~ANYOF_UNICODE_ALL;
560 cl->flags |= ANYOF_UNICODE;
561 ARG_SET(cl, ARG(and_with));
563 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
564 !(and_with->flags & ANYOF_INVERT))
565 cl->flags &= ~ANYOF_UNICODE_ALL;
566 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
567 !(and_with->flags & ANYOF_INVERT))
568 cl->flags &= ~ANYOF_UNICODE;
571 /* 'OR' a given class with another one. Can create false positives */
572 /* We assume that cl is not inverted */
574 S_cl_or(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
576 if (or_with->flags & ANYOF_INVERT) {
578 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
579 * <= (B1 | !B2) | (CL1 | !CL2)
580 * which is wasteful if CL2 is small, but we ignore CL2:
581 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
582 * XXXX Can we handle case-fold? Unclear:
583 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
584 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
586 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
587 && !(or_with->flags & ANYOF_FOLD)
588 && !(cl->flags & ANYOF_FOLD) ) {
591 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
592 cl->bitmap[i] |= ~or_with->bitmap[i];
593 } /* XXXX: logic is complicated otherwise */
595 cl_anything(pRExC_state, cl);
598 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
599 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
600 && (!(or_with->flags & ANYOF_FOLD)
601 || (cl->flags & ANYOF_FOLD)) ) {
604 /* OR char bitmap and class bitmap separately */
605 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
606 cl->bitmap[i] |= or_with->bitmap[i];
607 if (or_with->flags & ANYOF_CLASS) {
608 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
609 cl->classflags[i] |= or_with->classflags[i];
610 cl->flags |= ANYOF_CLASS;
613 else { /* XXXX: logic is complicated, leave it along for a moment. */
614 cl_anything(pRExC_state, cl);
617 if (or_with->flags & ANYOF_EOS)
618 cl->flags |= ANYOF_EOS;
620 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
621 ARG(cl) != ARG(or_with)) {
622 cl->flags |= ANYOF_UNICODE_ALL;
623 cl->flags &= ~ANYOF_UNICODE;
625 if (or_with->flags & ANYOF_UNICODE_ALL) {
626 cl->flags |= ANYOF_UNICODE_ALL;
627 cl->flags &= ~ANYOF_UNICODE;
633 make_trie(startbranch,first,last,tail,flags)
634 startbranch: the first branch in the whole branch sequence
635 first : start branch of sequence of branch-exact nodes.
636 May be the same as startbranch
637 last : Thing following the last branch.
638 May be the same as tail.
639 tail : item following the branch sequence
640 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
642 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
644 A trie is an N'ary tree where the branches are determined by digital
645 decomposition of the key. IE, at the root node you look up the 1st character and
646 follow that branch repeat until you find the end of the branches. Nodes can be
647 marked as "accepting" meaning they represent a complete word. Eg:
651 would convert into the following structure. Numbers represent states, letters
652 following numbers represent valid transitions on the letter from that state, if
653 the number is in square brackets it represents an accepting state, otherwise it
654 will be in parenthesis.
656 +-h->+-e->[3]-+-r->(8)-+-s->[9]
660 (1) +-i->(6)-+-s->[7]
662 +-s->(3)-+-h->(4)-+-e->[5]
664 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
666 This shows that when matching against the string 'hers' we will begin at state 1
667 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
668 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
669 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
670 single traverse. We store a mapping from accepting to state to which word was
671 matched, and then when we have multiple possibilities we try to complete the
672 rest of the regex in the order in which they occured in the alternation.
674 The only prior NFA like behaviour that would be changed by the TRIE support is
675 the silent ignoring of duplicate alternations which are of the form:
677 / (DUPE|DUPE) X? (?{ ... }) Y /x
679 Thus EVAL blocks follwing a trie may be called a different number of times with
680 and without the optimisation. With the optimisations dupes will be silently
681 ignored. This inconsistant behaviour of EVAL type nodes is well established as
682 the following demonstrates:
684 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
686 which prints out 'word' three times, but
688 'words'=~/(word|word|word)(?{ print $1 })S/
690 which doesnt print it out at all. This is due to other optimisations kicking in.
692 Example of what happens on a structural level:
694 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
696 1: CURLYM[1] {1,32767}(18)
707 This would be optimizable with startbranch=5, first=5, last=16, tail=16
708 and should turn into:
710 1: CURLYM[1] {1,32767}(18)
712 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
720 Cases where tail != last would be like /(?foo|bar)baz/:
730 which would be optimizable with startbranch=1, first=1, last=7, tail=8
731 and would end up looking like:
734 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
743 #define TRIE_DEBUG_CHAR \
744 DEBUG_TRIE_COMPILE_r({ \
747 tmp = newSVpvs( "" ); \
748 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
750 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
752 av_push( trie->revcharmap, tmp ); \
755 #define TRIE_READ_CHAR STMT_START { \
758 if ( foldlen > 0 ) { \
759 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
764 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
765 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
766 foldlen -= UNISKIP( uvc ); \
767 scan = foldbuf + UNISKIP( uvc ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
779 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
780 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
781 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
782 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
785 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
786 TRIE_LIST_LEN( state ) *= 2; \
787 Renew( trie->states[ state ].trans.list, \
788 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
790 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
792 TRIE_LIST_CUR( state )++; \
795 #define TRIE_LIST_NEW(state) STMT_START { \
796 Newxz( trie->states[ state ].trans.list, \
797 4, reg_trie_trans_le ); \
798 TRIE_LIST_CUR( state ) = 1; \
799 TRIE_LIST_LEN( state ) = 4; \
803 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
806 /* first pass, loop through and scan words */
809 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
814 /* we just use folder as a flag in utf8 */
815 const U8 * const folder = ( flags == EXACTF
823 const U32 data_slot = add_data( pRExC_state, 1, "t" );
826 GET_RE_DEBUG_FLAGS_DECL;
828 Newxz( trie, 1, reg_trie_data );
830 RExC_rx->data->data[ data_slot ] = (void*)trie;
831 Newxz( trie->charmap, 256, U16 );
833 trie->words = newAV();
834 trie->revcharmap = newAV();
838 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
839 if (!SvIOK(re_trie_maxbuff)) {
840 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
843 /* -- First loop and Setup --
845 We first traverse the branches and scan each word to determine if it
846 contains widechars, and how many unique chars there are, this is
847 important as we have to build a table with at least as many columns as we
850 We use an array of integers to represent the character codes 0..255
851 (trie->charmap) and we use a an HV* to store unicode characters. We use the
852 native representation of the character value as the key and IV's for the
855 *TODO* If we keep track of how many times each character is used we can
856 remap the columns so that the table compression later on is more
857 efficient in terms of memory by ensuring most common value is in the
858 middle and the least common are on the outside. IMO this would be better
859 than a most to least common mapping as theres a decent chance the most
860 common letter will share a node with the least common, meaning the node
861 will not be compressable. With a middle is most common approach the worst
862 case is when we have the least common nodes twice.
867 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
868 regnode * const noper = NEXTOPER( cur );
869 const U8 *uc = (U8*)STRING( noper );
870 const U8 * const e = uc + STR_LEN( noper );
872 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
873 const U8 *scan = (U8*)NULL;
875 for ( ; uc < e ; uc += len ) {
879 if ( !trie->charmap[ uvc ] ) {
880 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
882 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
887 if ( !trie->widecharmap )
888 trie->widecharmap = newHV();
890 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
893 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
895 if ( !SvTRUE( *svpp ) ) {
896 sv_setiv( *svpp, ++trie->uniquecharcount );
902 } /* end first pass */
903 DEBUG_TRIE_COMPILE_r(
904 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
905 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
906 (int)trie->charcount, trie->uniquecharcount )
911 We now know what we are dealing with in terms of unique chars and
912 string sizes so we can calculate how much memory a naive
913 representation using a flat table will take. If it's over a reasonable
914 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
915 conservative but potentially much slower representation using an array
918 At the end we convert both representations into the same compressed
919 form that will be used in regexec.c for matching with. The latter
920 is a form that cannot be used to construct with but has memory
921 properties similar to the list form and access properties similar
922 to the table form making it both suitable for fast searches and
923 small enough that its feasable to store for the duration of a program.
925 See the comment in the code where the compressed table is produced
926 inplace from the flat tabe representation for an explanation of how
927 the compression works.
932 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
934 Second Pass -- Array Of Lists Representation
936 Each state will be represented by a list of charid:state records
937 (reg_trie_trans_le) the first such element holds the CUR and LEN
938 points of the allocated array. (See defines above).
940 We build the initial structure using the lists, and then convert
941 it into the compressed table form which allows faster lookups
942 (but cant be modified once converted).
948 STRLEN transcount = 1;
950 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
954 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
956 regnode * const noper = NEXTOPER( cur );
957 U8 *uc = (U8*)STRING( noper );
958 const U8 * const e = uc + STR_LEN( noper );
959 U32 state = 1; /* required init */
960 U16 charid = 0; /* sanity init */
961 U8 *scan = (U8*)NULL; /* sanity init */
962 STRLEN foldlen = 0; /* required init */
963 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
965 for ( ; uc < e ; uc += len ) {
970 charid = trie->charmap[ uvc ];
972 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
976 charid=(U16)SvIV( *svpp );
985 if ( !trie->states[ state ].trans.list ) {
986 TRIE_LIST_NEW( state );
988 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
989 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
990 newstate = TRIE_LIST_ITEM( state, check ).newstate;
995 newstate = next_alloc++;
996 TRIE_LIST_PUSH( state, charid, newstate );
1001 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1003 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1006 if ( !trie->states[ state ].wordnum ) {
1007 /* we havent inserted this word into the structure yet. */
1008 trie->states[ state ].wordnum = ++curword;
1011 /* store the word for dumping */
1012 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1013 if ( UTF ) SvUTF8_on( tmp );
1014 av_push( trie->words, tmp );
1018 /*EMPTY*/; /* It's a dupe. So ignore it. */
1021 } /* end second pass */
1023 trie->laststate = next_alloc;
1024 Renew( trie->states, next_alloc, reg_trie_state );
1026 DEBUG_TRIE_COMPILE_MORE_r({
1029 /* print out the table precompression. */
1031 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1032 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1034 for( state=1 ; state < next_alloc ; state ++ ) {
1037 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1038 if ( ! trie->states[ state ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, "%5s| ","");
1041 PerlIO_printf( Perl_debug_log, "W%04x| ",
1042 trie->states[ state ].wordnum
1045 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1046 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1047 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1048 SvPV_nolen_const( *tmp ),
1049 TRIE_LIST_ITEM(state,charid).forid,
1050 (UV)TRIE_LIST_ITEM(state,charid).newstate
1055 PerlIO_printf( Perl_debug_log, "\n\n" );
1058 Newxz( trie->trans, transcount ,reg_trie_trans );
1065 for( state=1 ; state < next_alloc ; state ++ ) {
1069 DEBUG_TRIE_COMPILE_MORE_r(
1070 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1074 if (trie->states[state].trans.list) {
1075 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1079 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1080 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1081 if ( forid < minid ) {
1083 } else if ( forid > maxid ) {
1087 if ( transcount < tp + maxid - minid + 1) {
1089 Renew( trie->trans, transcount, reg_trie_trans );
1090 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1092 base = trie->uniquecharcount + tp - minid;
1093 if ( maxid == minid ) {
1095 for ( ; zp < tp ; zp++ ) {
1096 if ( ! trie->trans[ zp ].next ) {
1097 base = trie->uniquecharcount + zp - minid;
1098 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1099 trie->trans[ zp ].check = state;
1105 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1106 trie->trans[ tp ].check = state;
1111 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1112 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1113 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1114 trie->trans[ tid ].check = state;
1116 tp += ( maxid - minid + 1 );
1118 Safefree(trie->states[ state ].trans.list);
1121 DEBUG_TRIE_COMPILE_MORE_r(
1122 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1125 trie->states[ state ].trans.base=base;
1127 trie->lasttrans = tp + 1;
1131 Second Pass -- Flat Table Representation.
1133 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1134 We know that we will need Charcount+1 trans at most to store the data
1135 (one row per char at worst case) So we preallocate both structures
1136 assuming worst case.
1138 We then construct the trie using only the .next slots of the entry
1141 We use the .check field of the first entry of the node temporarily to
1142 make compression both faster and easier by keeping track of how many non
1143 zero fields are in the node.
1145 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1148 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1149 number representing the first entry of the node, and state as a
1150 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1151 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1152 are 2 entrys per node. eg:
1160 The table is internally in the right hand, idx form. However as we also
1161 have to deal with the states array which is indexed by nodenum we have to
1162 use TRIE_NODENUM() to convert.
1166 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1168 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1169 next_alloc = trie->uniquecharcount + 1;
1171 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173 regnode * const noper = NEXTOPER( cur );
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
1177 U32 state = 1; /* required init */
1179 U16 charid = 0; /* sanity init */
1180 U32 accept_state = 0; /* sanity init */
1181 U8 *scan = (U8*)NULL; /* sanity init */
1183 STRLEN foldlen = 0; /* required init */
1184 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1187 for ( ; uc < e ; uc += len ) {
1192 charid = trie->charmap[ uvc ];
1194 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1195 charid = svpp ? (U16)SvIV(*svpp) : 0;
1199 if ( !trie->trans[ state + charid ].next ) {
1200 trie->trans[ state + charid ].next = next_alloc;
1201 trie->trans[ state ].check++;
1202 next_alloc += trie->uniquecharcount;
1204 state = trie->trans[ state + charid ].next;
1206 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1208 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1211 accept_state = TRIE_NODENUM( state );
1212 if ( !trie->states[ accept_state ].wordnum ) {
1213 /* we havent inserted this word into the structure yet. */
1214 trie->states[ accept_state ].wordnum = ++curword;
1217 /* store the word for dumping */
1218 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1219 if ( UTF ) SvUTF8_on( tmp );
1220 av_push( trie->words, tmp );
1224 /*EMPTY*/; /* Its a dupe. So ignore it. */
1227 } /* end second pass */
1229 DEBUG_TRIE_COMPILE_MORE_r({
1231 print out the table precompression so that we can do a visual check
1232 that they are identical.
1236 PerlIO_printf( Perl_debug_log, "\nChar : " );
1238 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1239 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1241 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1245 PerlIO_printf( Perl_debug_log, "\nState+-" );
1247 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1248 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1251 PerlIO_printf( Perl_debug_log, "\n" );
1253 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1255 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1257 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1258 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1259 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1261 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1262 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1264 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1265 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1268 PerlIO_printf( Perl_debug_log, "\n\n" );
1272 * Inplace compress the table.*
1274 For sparse data sets the table constructed by the trie algorithm will
1275 be mostly 0/FAIL transitions or to put it another way mostly empty.
1276 (Note that leaf nodes will not contain any transitions.)
1278 This algorithm compresses the tables by eliminating most such
1279 transitions, at the cost of a modest bit of extra work during lookup:
1281 - Each states[] entry contains a .base field which indicates the
1282 index in the state[] array wheres its transition data is stored.
1284 - If .base is 0 there are no valid transitions from that node.
1286 - If .base is nonzero then charid is added to it to find an entry in
1289 -If trans[states[state].base+charid].check!=state then the
1290 transition is taken to be a 0/Fail transition. Thus if there are fail
1291 transitions at the front of the node then the .base offset will point
1292 somewhere inside the previous nodes data (or maybe even into a node
1293 even earlier), but the .check field determines if the transition is
1296 The following process inplace converts the table to the compressed
1297 table: We first do not compress the root node 1,and mark its all its
1298 .check pointers as 1 and set its .base pointer as 1 as well. This
1299 allows to do a DFA construction from the compressed table later, and
1300 ensures that any .base pointers we calculate later are greater than
1303 - We set 'pos' to indicate the first entry of the second node.
1305 - We then iterate over the columns of the node, finding the first and
1306 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1307 and set the .check pointers accordingly, and advance pos
1308 appropriately and repreat for the next node. Note that when we copy
1309 the next pointers we have to convert them from the original
1310 NODEIDX form to NODENUM form as the former is not valid post
1313 - If a node has no transitions used we mark its base as 0 and do not
1314 advance the pos pointer.
1316 - If a node only has one transition we use a second pointer into the
1317 structure to fill in allocated fail transitions from other states.
1318 This pointer is independent of the main pointer and scans forward
1319 looking for null transitions that are allocated to a state. When it
1320 finds one it writes the single transition into the "hole". If the
1321 pointer doesnt find one the single transition is appeneded as normal.
1323 - Once compressed we can Renew/realloc the structures to release the
1326 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1327 specifically Fig 3.47 and the associated pseudocode.
1331 const U32 laststate = TRIE_NODENUM( next_alloc );
1334 trie->laststate = laststate;
1336 for ( state = 1 ; state < laststate ; state++ ) {
1338 const U32 stateidx = TRIE_NODEIDX( state );
1339 const U32 o_used = trie->trans[ stateidx ].check;
1340 U32 used = trie->trans[ stateidx ].check;
1341 trie->trans[ stateidx ].check = 0;
1343 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1344 if ( flag || trie->trans[ stateidx + charid ].next ) {
1345 if ( trie->trans[ stateidx + charid ].next ) {
1347 for ( ; zp < pos ; zp++ ) {
1348 if ( ! trie->trans[ zp ].next ) {
1352 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1353 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1354 trie->trans[ zp ].check = state;
1355 if ( ++zp > pos ) pos = zp;
1362 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1364 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1365 trie->trans[ pos ].check = state;
1370 trie->lasttrans = pos + 1;
1371 Renew( trie->states, laststate + 1, reg_trie_state);
1372 DEBUG_TRIE_COMPILE_MORE_r(
1373 PerlIO_printf( Perl_debug_log,
1374 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1375 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1378 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1381 } /* end table compress */
1383 /* resize the trans array to remove unused space */
1384 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1386 DEBUG_TRIE_COMPILE_r({
1389 Now we print it out again, in a slightly different form as there is additional
1390 info we want to be able to see when its compressed. They are close enough for
1391 visual comparison though.
1393 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1395 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1396 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1398 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1401 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1403 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1404 PerlIO_printf( Perl_debug_log, "-----");
1405 PerlIO_printf( Perl_debug_log, "\n");
1407 for( state = 1 ; state < trie->laststate ; state++ ) {
1408 const U32 base = trie->states[ state ].trans.base;
1410 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1412 if ( trie->states[ state ].wordnum ) {
1413 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1415 PerlIO_printf( Perl_debug_log, "%6s", "" );
1418 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1423 while( ( base + ofs < trie->uniquecharcount ) ||
1424 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1425 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1428 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1430 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1431 if ( ( base + ofs >= trie->uniquecharcount ) &&
1432 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1433 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1435 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1436 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1438 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1442 PerlIO_printf( Perl_debug_log, "]");
1445 PerlIO_printf( Perl_debug_log, "\n" );
1450 /* now finally we "stitch in" the new TRIE node
1451 This means we convert either the first branch or the first Exact,
1452 depending on whether the thing following (in 'last') is a branch
1453 or not and whther first is the startbranch (ie is it a sub part of
1454 the alternation or is it the whole thing.)
1455 Assuming its a sub part we conver the EXACT otherwise we convert
1456 the whole branch sequence, including the first.
1463 if ( first == startbranch && OP( last ) != BRANCH ) {
1466 convert = NEXTOPER( first );
1467 NEXT_OFF( first ) = (U16)(last - first);
1470 OP( convert ) = TRIE + (U8)( flags - EXACT );
1471 NEXT_OFF( convert ) = (U16)(tail - convert);
1472 ARG_SET( convert, data_slot );
1474 /* tells us if we need to handle accept buffers specially */
1475 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1478 /* needed for dumping*/
1480 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1481 /* We now need to mark all of the space originally used by the
1482 branches as optimized away. This keeps the dumpuntil from
1483 throwing a wobbly as it doesnt use regnext() to traverse the
1486 while( optimize < last ) {
1487 OP( optimize ) = OPTIMIZED;
1491 } /* end node insert */
1498 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1499 * These need to be revisited when a newer toolchain becomes available.
1501 #if defined(__sparc64__) && defined(__GNUC__)
1502 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1503 # undef SPARC64_GCC_WORKAROUND
1504 # define SPARC64_GCC_WORKAROUND 1
1508 /* REx optimizer. Converts nodes into quickier variants "in place".
1509 Finds fixed substrings. */
1511 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1512 to the position after last scanned or to NULL. */
1516 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1517 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1518 /* scanp: Start here (read-write). */
1519 /* deltap: Write maxlen-minlen here. */
1520 /* last: Stop before this one. */
1523 I32 min = 0, pars = 0, code;
1524 regnode *scan = *scanp, *next;
1526 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1527 int is_inf_internal = 0; /* The studied chunk is infinite */
1528 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1529 scan_data_t data_fake;
1530 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1531 SV *re_trie_maxbuff = NULL;
1533 GET_RE_DEBUG_FLAGS_DECL;
1535 while (scan && OP(scan) != END && scan < last) {
1536 /* Peephole optimizer: */
1538 SV * const mysv=sv_newmortal();
1539 regprop( mysv, scan);
1540 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1541 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1544 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1545 /* Merge several consecutive EXACTish nodes into one. */
1546 regnode *n = regnext(scan);
1549 regnode *stop = scan;
1552 next = scan + NODE_SZ_STR(scan);
1553 /* Skip NOTHING, merge EXACT*. */
1555 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1556 (stringok && (OP(n) == OP(scan))))
1558 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1559 if (OP(n) == TAIL || n > next)
1561 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1562 NEXT_OFF(scan) += NEXT_OFF(n);
1563 next = n + NODE_STEP_REGNODE;
1570 else if (stringok) {
1571 const int oldl = STR_LEN(scan);
1572 regnode * const nnext = regnext(n);
1574 if (oldl + STR_LEN(n) > U8_MAX)
1576 NEXT_OFF(scan) += NEXT_OFF(n);
1577 STR_LEN(scan) += STR_LEN(n);
1578 next = n + NODE_SZ_STR(n);
1579 /* Now we can overwrite *n : */
1580 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1588 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1590 Two problematic code points in Unicode casefolding of EXACT nodes:
1592 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1593 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1599 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1600 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1602 This means that in case-insensitive matching (or "loose matching",
1603 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1604 length of the above casefolded versions) can match a target string
1605 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1606 This would rather mess up the minimum length computation.
1608 What we'll do is to look for the tail four bytes, and then peek
1609 at the preceding two bytes to see whether we need to decrease
1610 the minimum length by four (six minus two).
1612 Thanks to the design of UTF-8, there cannot be false matches:
1613 A sequence of valid UTF-8 bytes cannot be a subsequence of
1614 another valid sequence of UTF-8 bytes.
1617 char * const s0 = STRING(scan), *s, *t;
1618 char * const s1 = s0 + STR_LEN(scan) - 1;
1619 char * const s2 = s1 - 4;
1620 const char * const t0 = "\xcc\x88\xcc\x81";
1621 const char * const t1 = t0 + 3;
1624 s < s2 && (t = ninstr(s, s1, t0, t1));
1626 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1627 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1634 n = scan + NODE_SZ_STR(scan);
1636 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1647 /* Follow the next-chain of the current node and optimize
1648 away all the NOTHINGs from it. */
1649 if (OP(scan) != CURLYX) {
1650 const int max = (reg_off_by_arg[OP(scan)]
1652 /* I32 may be smaller than U16 on CRAYs! */
1653 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1654 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1658 /* Skip NOTHING and LONGJMP. */
1659 while ((n = regnext(n))
1660 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1661 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1662 && off + noff < max)
1664 if (reg_off_by_arg[OP(scan)])
1667 NEXT_OFF(scan) = off;
1670 /* The principal pseudo-switch. Cannot be a switch, since we
1671 look into several different things. */
1672 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1673 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1674 next = regnext(scan);
1676 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1678 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1679 I32 max1 = 0, min1 = I32_MAX, num = 0;
1680 struct regnode_charclass_class accum;
1681 regnode *startbranch=scan;
1683 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1684 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1685 if (flags & SCF_DO_STCLASS)
1686 cl_init_zero(pRExC_state, &accum);
1688 while (OP(scan) == code) {
1689 I32 deltanext, minnext, f = 0, fake;
1690 struct regnode_charclass_class this_class;
1693 data_fake.flags = 0;
1695 data_fake.whilem_c = data->whilem_c;
1696 data_fake.last_closep = data->last_closep;
1699 data_fake.last_closep = &fake;
1700 next = regnext(scan);
1701 scan = NEXTOPER(scan);
1703 scan = NEXTOPER(scan);
1704 if (flags & SCF_DO_STCLASS) {
1705 cl_init(pRExC_state, &this_class);
1706 data_fake.start_class = &this_class;
1707 f = SCF_DO_STCLASS_AND;
1709 if (flags & SCF_WHILEM_VISITED_POS)
1710 f |= SCF_WHILEM_VISITED_POS;
1712 /* we suppose the run is continuous, last=next...*/
1713 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1714 next, &data_fake, f,depth+1);
1717 if (max1 < minnext + deltanext)
1718 max1 = minnext + deltanext;
1719 if (deltanext == I32_MAX)
1720 is_inf = is_inf_internal = 1;
1722 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1724 if (data && (data_fake.flags & SF_HAS_EVAL))
1725 data->flags |= SF_HAS_EVAL;
1727 data->whilem_c = data_fake.whilem_c;
1728 if (flags & SCF_DO_STCLASS)
1729 cl_or(pRExC_state, &accum, &this_class);
1730 if (code == SUSPEND)
1733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1735 if (flags & SCF_DO_SUBSTR) {
1736 data->pos_min += min1;
1737 data->pos_delta += max1 - min1;
1738 if (max1 != min1 || is_inf)
1739 data->longest = &(data->longest_float);
1742 delta += max1 - min1;
1743 if (flags & SCF_DO_STCLASS_OR) {
1744 cl_or(pRExC_state, data->start_class, &accum);
1746 cl_and(data->start_class, &and_with);
1747 flags &= ~SCF_DO_STCLASS;
1750 else if (flags & SCF_DO_STCLASS_AND) {
1752 cl_and(data->start_class, &accum);
1753 flags &= ~SCF_DO_STCLASS;
1756 /* Switch to OR mode: cache the old value of
1757 * data->start_class */
1758 StructCopy(data->start_class, &and_with,
1759 struct regnode_charclass_class);
1760 flags &= ~SCF_DO_STCLASS_AND;
1761 StructCopy(&accum, data->start_class,
1762 struct regnode_charclass_class);
1763 flags |= SCF_DO_STCLASS_OR;
1764 data->start_class->flags |= ANYOF_EOS;
1770 Assuming this was/is a branch we are dealing with: 'scan' now
1771 points at the item that follows the branch sequence, whatever
1772 it is. We now start at the beginning of the sequence and look
1778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1780 If we can find such a subseqence we need to turn the first
1781 element into a trie and then add the subsequent branch exact
1782 strings to the trie.
1786 1. patterns where the whole set of branch can be converted to a trie,
1788 2. patterns where only a subset of the alternations can be
1789 converted to a trie.
1791 In case 1 we can replace the whole set with a single regop
1792 for the trie. In case 2 we need to keep the start and end
1795 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1796 becomes BRANCH TRIE; BRANCH X;
1798 Hypthetically when we know the regex isnt anchored we can
1799 turn a case 1 into a DFA and let it rip... Every time it finds a match
1800 it would just call its tail, no WHILEM/CURLY needed.
1804 if (!re_trie_maxbuff) {
1805 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1806 if (!SvIOK(re_trie_maxbuff))
1807 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1809 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1811 regnode *first = (regnode *)NULL;
1812 regnode *last = (regnode *)NULL;
1813 regnode *tail = scan;
1818 SV * const mysv = sv_newmortal(); /* for dumping */
1820 /* var tail is used because there may be a TAIL
1821 regop in the way. Ie, the exacts will point to the
1822 thing following the TAIL, but the last branch will
1823 point at the TAIL. So we advance tail. If we
1824 have nested (?:) we may have to move through several
1828 while ( OP( tail ) == TAIL ) {
1829 /* this is the TAIL generated by (?:) */
1830 tail = regnext( tail );
1834 regprop( mysv, tail );
1835 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1836 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1837 (RExC_seen_evals) ? "[EVAL]" : ""
1842 step through the branches, cur represents each
1843 branch, noper is the first thing to be matched
1844 as part of that branch and noper_next is the
1845 regnext() of that node. if noper is an EXACT
1846 and noper_next is the same as scan (our current
1847 position in the regex) then the EXACT branch is
1848 a possible optimization target. Once we have
1849 two or more consequetive such branches we can
1850 create a trie of the EXACT's contents and stich
1851 it in place. If the sequence represents all of
1852 the branches we eliminate the whole thing and
1853 replace it with a single TRIE. If it is a
1854 subsequence then we need to stitch it in. This
1855 means the first branch has to remain, and needs
1856 to be repointed at the item on the branch chain
1857 following the last branch optimized. This could
1858 be either a BRANCH, in which case the
1859 subsequence is internal, or it could be the
1860 item following the branch sequence in which
1861 case the subsequence is at the end.
1865 /* dont use tail as the end marker for this traverse */
1866 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1867 regnode * const noper = NEXTOPER( cur );
1868 regnode * const noper_next = regnext( noper );
1871 regprop( mysv, cur);
1872 PerlIO_printf( Perl_debug_log, "%*s%s",
1873 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1875 regprop( mysv, noper);
1876 PerlIO_printf( Perl_debug_log, " -> %s",
1877 SvPV_nolen_const(mysv));
1880 regprop( mysv, noper_next );
1881 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1882 SvPV_nolen_const(mysv));
1884 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1887 if ( ( first ? OP( noper ) == optype
1888 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1889 && noper_next == tail && count<U16_MAX)
1894 optype = OP( noper );
1898 regprop( mysv, first);
1899 PerlIO_printf( Perl_debug_log, "%*s%s",
1900 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1901 regprop( mysv, NEXTOPER(first) );
1902 PerlIO_printf( Perl_debug_log, " -> %s\n",
1903 SvPV_nolen_const( mysv ) );
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
1910 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1911 regprop( mysv, noper );
1912 PerlIO_printf( Perl_debug_log, " -> %s\n",
1913 SvPV_nolen_const( mysv ) );
1919 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1920 (int)depth * 2 + 2, "E:", "**END**" );
1922 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1924 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1925 && noper_next == tail )
1929 optype = OP( noper );
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log,
1941 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1942 " ", SvPV_nolen_const( mysv ), first, last, cur);
1947 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1948 (int)depth * 2 + 2, "E:", "==END==" );
1950 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1956 scan = NEXTOPER(NEXTOPER(scan));
1957 } else /* single branch is optimized. */
1958 scan = NEXTOPER(scan);
1961 else if (OP(scan) == EXACT) {
1962 I32 l = STR_LEN(scan);
1965 const U8 * const s = (U8*)STRING(scan);
1966 l = utf8_length(s, s + l);
1967 uc = utf8_to_uvchr(s, NULL);
1969 uc = *((U8*)STRING(scan));
1972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1973 /* The code below prefers earlier match for fixed
1974 offset, later match for variable offset. */
1975 if (data->last_end == -1) { /* Update the start info. */
1976 data->last_start_min = data->pos_min;
1977 data->last_start_max = is_inf
1978 ? I32_MAX : data->pos_min + data->pos_delta;
1980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
1982 SV * const sv = data->last_found;
1983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1985 if (mg && mg->mg_len >= 0)
1986 mg->mg_len += utf8_length((U8*)STRING(scan),
1987 (U8*)STRING(scan)+STR_LEN(scan));
1990 SvUTF8_on(data->last_found);
1991 data->last_end = data->pos_min + l;
1992 data->pos_min += l; /* As in the first entry. */
1993 data->flags &= ~SF_BEFORE_EOL;
1995 if (flags & SCF_DO_STCLASS_AND) {
1996 /* Check whether it is compatible with what we know already! */
2000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2002 && (!(data->start_class->flags & ANYOF_FOLD)
2003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2006 ANYOF_CLASS_ZERO(data->start_class);
2007 ANYOF_BITMAP_ZERO(data->start_class);
2009 ANYOF_BITMAP_SET(data->start_class, uc);
2010 data->start_class->flags &= ~ANYOF_EOS;
2012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2014 else if (flags & SCF_DO_STCLASS_OR) {
2015 /* false positive possible if the class is case-folded */
2017 ANYOF_BITMAP_SET(data->start_class, uc);
2019 data->start_class->flags |= ANYOF_UNICODE_ALL;
2020 data->start_class->flags &= ~ANYOF_EOS;
2021 cl_and(data->start_class, &and_with);
2023 flags &= ~SCF_DO_STCLASS;
2025 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2026 I32 l = STR_LEN(scan);
2027 UV uc = *((U8*)STRING(scan));
2029 /* Search for fixed substrings supports EXACT only. */
2030 if (flags & SCF_DO_SUBSTR)
2031 scan_commit(pRExC_state, data);
2033 const U8 * const s = (U8 *)STRING(scan);
2034 l = utf8_length(s, s + l);
2035 uc = utf8_to_uvchr(s, NULL);
2038 if (data && (flags & SCF_DO_SUBSTR))
2040 if (flags & SCF_DO_STCLASS_AND) {
2041 /* Check whether it is compatible with what we know already! */
2045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2049 ANYOF_CLASS_ZERO(data->start_class);
2050 ANYOF_BITMAP_ZERO(data->start_class);
2052 ANYOF_BITMAP_SET(data->start_class, uc);
2053 data->start_class->flags &= ~ANYOF_EOS;
2054 data->start_class->flags |= ANYOF_FOLD;
2055 if (OP(scan) == EXACTFL)
2056 data->start_class->flags |= ANYOF_LOCALE;
2059 else if (flags & SCF_DO_STCLASS_OR) {
2060 if (data->start_class->flags & ANYOF_FOLD) {
2061 /* false positive possible if the class is case-folded.
2062 Assume that the locale settings are the same... */
2064 ANYOF_BITMAP_SET(data->start_class, uc);
2065 data->start_class->flags &= ~ANYOF_EOS;
2067 cl_and(data->start_class, &and_with);
2069 flags &= ~SCF_DO_STCLASS;
2071 else if (strchr((const char*)PL_varies,OP(scan))) {
2072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2073 I32 f = flags, pos_before = 0;
2074 regnode *oscan = scan;
2075 struct regnode_charclass_class this_class;
2076 struct regnode_charclass_class *oclass = NULL;
2077 I32 next_is_eval = 0;
2079 switch (PL_regkind[(U8)OP(scan)]) {
2080 case WHILEM: /* End of (?:...)* . */
2081 scan = NEXTOPER(scan);
2084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2085 next = NEXTOPER(scan);
2086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2088 maxcount = REG_INFTY;
2089 next = regnext(scan);
2090 scan = NEXTOPER(scan);
2094 if (flags & SCF_DO_SUBSTR)
2099 if (flags & SCF_DO_STCLASS) {
2101 maxcount = REG_INFTY;
2102 next = regnext(scan);
2103 scan = NEXTOPER(scan);
2106 is_inf = is_inf_internal = 1;
2107 scan = regnext(scan);
2108 if (flags & SCF_DO_SUBSTR) {
2109 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2110 data->longest = &(data->longest_float);
2112 goto optimize_curly_tail;
2114 mincount = ARG1(scan);
2115 maxcount = ARG2(scan);
2116 next = regnext(scan);
2117 if (OP(scan) == CURLYX) {
2118 I32 lp = (data ? *(data->last_closep) : 0);
2119 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2121 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2122 next_is_eval = (OP(scan) == EVAL);
2124 if (flags & SCF_DO_SUBSTR) {
2125 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2126 pos_before = data->pos_min;
2130 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2132 data->flags |= SF_IS_INF;
2134 if (flags & SCF_DO_STCLASS) {
2135 cl_init(pRExC_state, &this_class);
2136 oclass = data->start_class;
2137 data->start_class = &this_class;
2138 f |= SCF_DO_STCLASS_AND;
2139 f &= ~SCF_DO_STCLASS_OR;
2141 /* These are the cases when once a subexpression
2142 fails at a particular position, it cannot succeed
2143 even after backtracking at the enclosing scope.
2145 XXXX what if minimal match and we are at the
2146 initial run of {n,m}? */
2147 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2148 f &= ~SCF_WHILEM_VISITED_POS;
2150 /* This will finish on WHILEM, setting scan, or on NULL: */
2151 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2153 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2155 if (flags & SCF_DO_STCLASS)
2156 data->start_class = oclass;
2157 if (mincount == 0 || minnext == 0) {
2158 if (flags & SCF_DO_STCLASS_OR) {
2159 cl_or(pRExC_state, data->start_class, &this_class);
2161 else if (flags & SCF_DO_STCLASS_AND) {
2162 /* Switch to OR mode: cache the old value of
2163 * data->start_class */
2164 StructCopy(data->start_class, &and_with,
2165 struct regnode_charclass_class);
2166 flags &= ~SCF_DO_STCLASS_AND;
2167 StructCopy(&this_class, data->start_class,
2168 struct regnode_charclass_class);
2169 flags |= SCF_DO_STCLASS_OR;
2170 data->start_class->flags |= ANYOF_EOS;
2172 } else { /* Non-zero len */
2173 if (flags & SCF_DO_STCLASS_OR) {
2174 cl_or(pRExC_state, data->start_class, &this_class);
2175 cl_and(data->start_class, &and_with);
2177 else if (flags & SCF_DO_STCLASS_AND)
2178 cl_and(data->start_class, &this_class);
2179 flags &= ~SCF_DO_STCLASS;
2181 if (!scan) /* It was not CURLYX, but CURLY. */
2183 if ( /* ? quantifier ok, except for (?{ ... }) */
2184 (next_is_eval || !(mincount == 0 && maxcount == 1))
2185 && (minnext == 0) && (deltanext == 0)
2186 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2187 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2188 && ckWARN(WARN_REGEXP))
2191 "Quantifier unexpected on zero-length expression");
2194 min += minnext * mincount;
2195 is_inf_internal |= ((maxcount == REG_INFTY
2196 && (minnext + deltanext) > 0)
2197 || deltanext == I32_MAX);
2198 is_inf |= is_inf_internal;
2199 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2201 /* Try powerful optimization CURLYX => CURLYN. */
2202 if ( OP(oscan) == CURLYX && data
2203 && data->flags & SF_IN_PAR
2204 && !(data->flags & SF_HAS_EVAL)
2205 && !deltanext && minnext == 1 ) {
2206 /* Try to optimize to CURLYN. */
2207 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2208 regnode *nxt1 = nxt;
2215 if (!strchr((const char*)PL_simple,OP(nxt))
2216 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2217 && STR_LEN(nxt) == 1))
2223 if (OP(nxt) != CLOSE)
2225 /* Now we know that nxt2 is the only contents: */
2226 oscan->flags = (U8)ARG(nxt);
2228 OP(nxt1) = NOTHING; /* was OPEN. */
2230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2234 OP(nxt + 1) = OPTIMIZED; /* was count. */
2235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2240 /* Try optimization CURLYX => CURLYM. */
2241 if ( OP(oscan) == CURLYX && data
2242 && !(data->flags & SF_HAS_PAR)
2243 && !(data->flags & SF_HAS_EVAL)
2244 && !deltanext /* atom is fixed width */
2245 && minnext != 0 /* CURLYM can't handle zero width */
2247 /* XXXX How to optimize if data == 0? */
2248 /* Optimize to a simpler form. */
2249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2254 && (OP(nxt2) != WHILEM))
2256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2257 /* Need to optimize away parenths. */
2258 if (data->flags & SF_IN_PAR) {
2259 /* Set the parenth number. */
2260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2262 if (OP(nxt) != CLOSE)
2263 FAIL("Panic opt close");
2264 oscan->flags = (U8)ARG(nxt);
2265 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2266 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2271 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2274 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2275 regnode *nnxt = regnext(nxt1);
2278 if (reg_off_by_arg[OP(nxt1)])
2279 ARG_SET(nxt1, nxt2 - nxt1);
2280 else if (nxt2 - nxt1 < U16_MAX)
2281 NEXT_OFF(nxt1) = nxt2 - nxt1;
2283 OP(nxt) = NOTHING; /* Cannot beautify */
2288 /* Optimize again: */
2289 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2295 else if ((OP(oscan) == CURLYX)
2296 && (flags & SCF_WHILEM_VISITED_POS)
2297 /* See the comment on a similar expression above.
2298 However, this time it not a subexpression
2299 we care about, but the expression itself. */
2300 && (maxcount == REG_INFTY)
2301 && data && ++data->whilem_c < 16) {
2302 /* This stays as CURLYX, we can put the count/of pair. */
2303 /* Find WHILEM (as in regexec.c) */
2304 regnode *nxt = oscan + NEXT_OFF(oscan);
2306 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2308 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2309 | (RExC_whilem_seen << 4)); /* On WHILEM */
2311 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2313 if (flags & SCF_DO_SUBSTR) {
2314 SV *last_str = NULL;
2315 int counted = mincount != 0;
2317 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2318 #if defined(SPARC64_GCC_WORKAROUND)
2321 const char *s = NULL;
2324 if (pos_before >= data->last_start_min)
2327 b = data->last_start_min;
2330 s = SvPV_const(data->last_found, l);
2331 old = b - data->last_start_min;
2334 I32 b = pos_before >= data->last_start_min
2335 ? pos_before : data->last_start_min;
2337 const char *s = SvPV_const(data->last_found, l);
2338 I32 old = b - data->last_start_min;
2342 old = utf8_hop((U8*)s, old) - (U8*)s;
2345 /* Get the added string: */
2346 last_str = newSVpvn(s + old, l);
2348 SvUTF8_on(last_str);
2349 if (deltanext == 0 && pos_before == b) {
2350 /* What was added is a constant string */
2352 SvGROW(last_str, (mincount * l) + 1);
2353 repeatcpy(SvPVX(last_str) + l,
2354 SvPVX_const(last_str), l, mincount - 1);
2355 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2356 /* Add additional parts. */
2357 SvCUR_set(data->last_found,
2358 SvCUR(data->last_found) - l);
2359 sv_catsv(data->last_found, last_str);
2361 SV * sv = data->last_found;
2363 SvUTF8(sv) && SvMAGICAL(sv) ?
2364 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2365 if (mg && mg->mg_len >= 0)
2366 mg->mg_len += CHR_SVLEN(last_str);
2368 data->last_end += l * (mincount - 1);
2371 /* start offset must point into the last copy */
2372 data->last_start_min += minnext * (mincount - 1);
2373 data->last_start_max += is_inf ? I32_MAX
2374 : (maxcount - 1) * (minnext + data->pos_delta);
2377 /* It is counted once already... */
2378 data->pos_min += minnext * (mincount - counted);
2379 data->pos_delta += - counted * deltanext +
2380 (minnext + deltanext) * maxcount - minnext * mincount;
2381 if (mincount != maxcount) {
2382 /* Cannot extend fixed substrings found inside
2384 scan_commit(pRExC_state,data);
2385 if (mincount && last_str) {
2386 sv_setsv(data->last_found, last_str);
2387 data->last_end = data->pos_min;
2388 data->last_start_min =
2389 data->pos_min - CHR_SVLEN(last_str);
2390 data->last_start_max = is_inf
2392 : data->pos_min + data->pos_delta
2393 - CHR_SVLEN(last_str);
2395 data->longest = &(data->longest_float);
2397 SvREFCNT_dec(last_str);
2399 if (data && (fl & SF_HAS_EVAL))
2400 data->flags |= SF_HAS_EVAL;
2401 optimize_curly_tail:
2402 if (OP(oscan) != CURLYX) {
2403 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2405 NEXT_OFF(oscan) += NEXT_OFF(next);
2408 default: /* REF and CLUMP only? */
2409 if (flags & SCF_DO_SUBSTR) {
2410 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2411 data->longest = &(data->longest_float);
2413 is_inf = is_inf_internal = 1;
2414 if (flags & SCF_DO_STCLASS_OR)
2415 cl_anything(pRExC_state, data->start_class);
2416 flags &= ~SCF_DO_STCLASS;
2420 else if (strchr((const char*)PL_simple,OP(scan))) {
2423 if (flags & SCF_DO_SUBSTR) {
2424 scan_commit(pRExC_state,data);
2428 if (flags & SCF_DO_STCLASS) {
2429 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2431 /* Some of the logic below assumes that switching
2432 locale on will only add false positives. */
2433 switch (PL_regkind[(U8)OP(scan)]) {
2437 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2438 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2439 cl_anything(pRExC_state, data->start_class);
2442 if (OP(scan) == SANY)
2444 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2445 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2446 || (data->start_class->flags & ANYOF_CLASS));
2447 cl_anything(pRExC_state, data->start_class);
2449 if (flags & SCF_DO_STCLASS_AND || !value)
2450 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2453 if (flags & SCF_DO_STCLASS_AND)
2454 cl_and(data->start_class,
2455 (struct regnode_charclass_class*)scan);
2457 cl_or(pRExC_state, data->start_class,
2458 (struct regnode_charclass_class*)scan);
2461 if (flags & SCF_DO_STCLASS_AND) {
2462 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2464 for (value = 0; value < 256; value++)
2465 if (!isALNUM(value))
2466 ANYOF_BITMAP_CLEAR(data->start_class, value);
2470 if (data->start_class->flags & ANYOF_LOCALE)
2471 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2473 for (value = 0; value < 256; value++)
2475 ANYOF_BITMAP_SET(data->start_class, value);
2480 if (flags & SCF_DO_STCLASS_AND) {
2481 if (data->start_class->flags & ANYOF_LOCALE)
2482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2485 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2486 data->start_class->flags |= ANYOF_LOCALE;
2490 if (flags & SCF_DO_STCLASS_AND) {
2491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2493 for (value = 0; value < 256; value++)
2495 ANYOF_BITMAP_CLEAR(data->start_class, value);
2499 if (data->start_class->flags & ANYOF_LOCALE)
2500 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2502 for (value = 0; value < 256; value++)
2503 if (!isALNUM(value))
2504 ANYOF_BITMAP_SET(data->start_class, value);
2509 if (flags & SCF_DO_STCLASS_AND) {
2510 if (data->start_class->flags & ANYOF_LOCALE)
2511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2514 data->start_class->flags |= ANYOF_LOCALE;
2515 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2519 if (flags & SCF_DO_STCLASS_AND) {
2520 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2522 for (value = 0; value < 256; value++)
2523 if (!isSPACE(value))
2524 ANYOF_BITMAP_CLEAR(data->start_class, value);
2528 if (data->start_class->flags & ANYOF_LOCALE)
2529 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2531 for (value = 0; value < 256; value++)
2533 ANYOF_BITMAP_SET(data->start_class, value);
2538 if (flags & SCF_DO_STCLASS_AND) {
2539 if (data->start_class->flags & ANYOF_LOCALE)
2540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2543 data->start_class->flags |= ANYOF_LOCALE;
2544 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2548 if (flags & SCF_DO_STCLASS_AND) {
2549 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2550 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2551 for (value = 0; value < 256; value++)
2553 ANYOF_BITMAP_CLEAR(data->start_class, value);
2557 if (data->start_class->flags & ANYOF_LOCALE)
2558 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2560 for (value = 0; value < 256; value++)
2561 if (!isSPACE(value))
2562 ANYOF_BITMAP_SET(data->start_class, value);
2567 if (flags & SCF_DO_STCLASS_AND) {
2568 if (data->start_class->flags & ANYOF_LOCALE) {
2569 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2570 for (value = 0; value < 256; value++)
2571 if (!isSPACE(value))
2572 ANYOF_BITMAP_CLEAR(data->start_class, value);
2576 data->start_class->flags |= ANYOF_LOCALE;
2577 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2581 if (flags & SCF_DO_STCLASS_AND) {
2582 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2583 for (value = 0; value < 256; value++)
2584 if (!isDIGIT(value))
2585 ANYOF_BITMAP_CLEAR(data->start_class, value);
2588 if (data->start_class->flags & ANYOF_LOCALE)
2589 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2591 for (value = 0; value < 256; value++)
2593 ANYOF_BITMAP_SET(data->start_class, value);
2598 if (flags & SCF_DO_STCLASS_AND) {
2599 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2600 for (value = 0; value < 256; value++)
2602 ANYOF_BITMAP_CLEAR(data->start_class, value);
2605 if (data->start_class->flags & ANYOF_LOCALE)
2606 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2608 for (value = 0; value < 256; value++)
2609 if (!isDIGIT(value))
2610 ANYOF_BITMAP_SET(data->start_class, value);
2615 if (flags & SCF_DO_STCLASS_OR)
2616 cl_and(data->start_class, &and_with);
2617 flags &= ~SCF_DO_STCLASS;
2620 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2621 data->flags |= (OP(scan) == MEOL
2625 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2626 /* Lookbehind, or need to calculate parens/evals/stclass: */
2627 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2628 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2629 /* Lookahead/lookbehind */
2630 I32 deltanext, minnext, fake = 0;
2632 struct regnode_charclass_class intrnl;
2635 data_fake.flags = 0;
2637 data_fake.whilem_c = data->whilem_c;
2638 data_fake.last_closep = data->last_closep;
2641 data_fake.last_closep = &fake;
2642 if ( flags & SCF_DO_STCLASS && !scan->flags
2643 && OP(scan) == IFMATCH ) { /* Lookahead */
2644 cl_init(pRExC_state, &intrnl);
2645 data_fake.start_class = &intrnl;
2646 f |= SCF_DO_STCLASS_AND;
2648 if (flags & SCF_WHILEM_VISITED_POS)
2649 f |= SCF_WHILEM_VISITED_POS;
2650 next = regnext(scan);
2651 nscan = NEXTOPER(NEXTOPER(scan));
2652 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2655 vFAIL("Variable length lookbehind not implemented");
2657 else if (minnext > U8_MAX) {
2658 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2660 scan->flags = (U8)minnext;
2662 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2664 if (data && (data_fake.flags & SF_HAS_EVAL))
2665 data->flags |= SF_HAS_EVAL;
2667 data->whilem_c = data_fake.whilem_c;
2668 if (f & SCF_DO_STCLASS_AND) {
2669 const int was = (data->start_class->flags & ANYOF_EOS);
2671 cl_and(data->start_class, &intrnl);
2673 data->start_class->flags |= ANYOF_EOS;
2676 else if (OP(scan) == OPEN) {
2679 else if (OP(scan) == CLOSE) {
2680 if ((I32)ARG(scan) == is_par) {
2681 next = regnext(scan);
2683 if ( next && (OP(next) != WHILEM) && next < last)
2684 is_par = 0; /* Disable optimization */
2687 *(data->last_closep) = ARG(scan);
2689 else if (OP(scan) == EVAL) {
2691 data->flags |= SF_HAS_EVAL;
2693 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2694 if (flags & SCF_DO_SUBSTR) {
2695 scan_commit(pRExC_state,data);
2696 data->longest = &(data->longest_float);
2698 is_inf = is_inf_internal = 1;
2699 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2700 cl_anything(pRExC_state, data->start_class);
2701 flags &= ~SCF_DO_STCLASS;
2703 /* Else: zero-length, ignore. */
2704 scan = regnext(scan);
2709 *deltap = is_inf_internal ? I32_MAX : delta;
2710 if (flags & SCF_DO_SUBSTR && is_inf)
2711 data->pos_delta = I32_MAX - data->pos_min;
2712 if (is_par > U8_MAX)
2714 if (is_par && pars==1 && data) {
2715 data->flags |= SF_IN_PAR;
2716 data->flags &= ~SF_HAS_PAR;
2718 else if (pars && data) {
2719 data->flags |= SF_HAS_PAR;
2720 data->flags &= ~SF_IN_PAR;
2722 if (flags & SCF_DO_STCLASS_OR)
2723 cl_and(data->start_class, &and_with);
2728 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2730 if (RExC_rx->data) {
2731 Renewc(RExC_rx->data,
2732 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2733 char, struct reg_data);
2734 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2735 RExC_rx->data->count += n;
2738 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2739 char, struct reg_data);
2740 Newx(RExC_rx->data->what, n, U8);
2741 RExC_rx->data->count = n;
2743 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2744 return RExC_rx->data->count - n;
2748 Perl_reginitcolors(pTHX)
2751 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2753 char *t = savepv(s);
2757 t = strchr(t, '\t');
2763 PL_colors[i] = t = (char *)"";
2768 PL_colors[i++] = (char *)"";
2775 - pregcomp - compile a regular expression into internal code
2777 * We can't allocate space until we know how big the compiled form will be,
2778 * but we can't compile it (and thus know how big it is) until we've got a
2779 * place to put the code. So we cheat: we compile it twice, once with code
2780 * generation turned off and size counting turned on, and once "for real".
2781 * This also means that we don't allocate space until we are sure that the
2782 * thing really will compile successfully, and we never have to move the
2783 * code and thus invalidate pointers into it. (Note that it has to be in
2784 * one piece because free() must be able to free it all.) [NB: not true in perl]
2786 * Beware that the optimization-preparation code in here knows about some
2787 * of the structure of the compiled regexp. [I'll say.]
2790 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2801 RExC_state_t RExC_state;
2802 RExC_state_t *pRExC_state = &RExC_state;
2804 GET_RE_DEBUG_FLAGS_DECL;
2807 FAIL("NULL regexp argument");
2809 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2812 DEBUG_r(if (!PL_colorset) reginitcolors());
2814 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2815 PL_colors[4],PL_colors[5],PL_colors[0],
2816 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2818 RExC_flags = pm->op_pmflags;
2822 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2823 RExC_seen_evals = 0;
2826 /* First pass: determine size, legality. */
2833 RExC_emit = &PL_regdummy;
2834 RExC_whilem_seen = 0;
2835 #if 0 /* REGC() is (currently) a NOP at the first pass.
2836 * Clever compilers notice this and complain. --jhi */
2837 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2839 if (reg(pRExC_state, 0, &flags) == NULL) {
2840 RExC_precomp = NULL;
2843 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2845 /* Small enough for pointer-storage convention?
2846 If extralen==0, this means that we will not need long jumps. */
2847 if (RExC_size >= 0x10000L && RExC_extralen)
2848 RExC_size += RExC_extralen;
2851 if (RExC_whilem_seen > 15)
2852 RExC_whilem_seen = 15;
2854 /* Allocate space and initialize. */
2855 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2858 FAIL("Regexp out of space");
2861 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2862 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2865 r->prelen = xend - exp;
2866 r->precomp = savepvn(RExC_precomp, r->prelen);
2868 #ifdef PERL_OLD_COPY_ON_WRITE
2869 r->saved_copy = NULL;
2871 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2872 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2873 r->lastparen = 0; /* mg.c reads this. */
2875 r->substrs = 0; /* Useful during FAIL. */
2876 r->startp = 0; /* Useful during FAIL. */
2877 r->endp = 0; /* Useful during FAIL. */
2879 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2881 r->offsets[0] = RExC_size;
2883 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2884 "%s %"UVuf" bytes for offset annotations.\n",
2885 r->offsets ? "Got" : "Couldn't get",
2886 (UV)((2*RExC_size+1) * sizeof(U32))));
2890 /* Second pass: emit code. */
2891 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2896 RExC_emit_start = r->program;
2897 RExC_emit = r->program;
2898 /* Store the count of eval-groups for security checks: */
2899 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2900 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2902 if (reg(pRExC_state, 0, &flags) == NULL)
2906 /* Dig out information for optimizations. */
2907 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2908 pm->op_pmflags = RExC_flags;
2910 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2911 r->regstclass = NULL;
2912 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2913 r->reganch |= ROPT_NAUGHTY;
2914 scan = r->program + 1; /* First BRANCH. */
2916 /* XXXX To minimize changes to RE engine we always allocate
2917 3-units-long substrs field. */
2918 Newxz(r->substrs, 1, struct reg_substr_data);
2920 StructCopy(&zero_scan_data, &data, scan_data_t);
2921 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2922 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2924 STRLEN longest_float_length, longest_fixed_length;
2925 struct regnode_charclass_class ch_class;
2930 /* Skip introductions and multiplicators >= 1. */
2931 while ((OP(first) == OPEN && (sawopen = 1)) ||
2932 /* An OR of *one* alternative - should not happen now. */
2933 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2934 (OP(first) == PLUS) ||
2935 (OP(first) == MINMOD) ||
2936 /* An {n,m} with n>0 */
2937 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2938 if (OP(first) == PLUS)
2941 first += regarglen[(U8)OP(first)];
2942 first = NEXTOPER(first);
2945 /* Starting-point info. */
2947 if (PL_regkind[(U8)OP(first)] == EXACT) {
2948 if (OP(first) == EXACT)
2949 /*EMPTY*/; /* Empty, get anchored substr later. */
2950 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2951 r->regstclass = first;
2953 else if (strchr((const char*)PL_simple,OP(first)))
2954 r->regstclass = first;
2955 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2956 PL_regkind[(U8)OP(first)] == NBOUND)
2957 r->regstclass = first;
2958 else if (PL_regkind[(U8)OP(first)] == BOL) {
2959 r->reganch |= (OP(first) == MBOL
2961 : (OP(first) == SBOL
2964 first = NEXTOPER(first);
2967 else if (OP(first) == GPOS) {
2968 r->reganch |= ROPT_ANCH_GPOS;
2969 first = NEXTOPER(first);
2972 else if (!sawopen && (OP(first) == STAR &&
2973 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2974 !(r->reganch & ROPT_ANCH) )
2976 /* turn .* into ^.* with an implied $*=1 */
2978 (OP(NEXTOPER(first)) == REG_ANY)
2981 r->reganch |= type | ROPT_IMPLICIT;
2982 first = NEXTOPER(first);
2985 if (sawplus && (!sawopen || !RExC_sawback)
2986 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
2987 /* x+ must match at the 1st pos of run of x's */
2988 r->reganch |= ROPT_SKIP;
2990 /* Scan is after the zeroth branch, first is atomic matcher. */
2991 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
2992 (IV)(first - scan + 1)));
2994 * If there's something expensive in the r.e., find the
2995 * longest literal string that must appear and make it the
2996 * regmust. Resolve ties in favor of later strings, since
2997 * the regstart check works with the beginning of the r.e.
2998 * and avoiding duplication strengthens checking. Not a
2999 * strong reason, but sufficient in the absence of others.
3000 * [Now we resolve ties in favor of the earlier string if
3001 * it happens that c_offset_min has been invalidated, since the
3002 * earlier string may buy us something the later one won't.]
3006 data.longest_fixed = newSVpvs("");
3007 data.longest_float = newSVpvs("");
3008 data.last_found = newSVpvs("");
3009 data.longest = &(data.longest_fixed);
3011 if (!r->regstclass) {
3012 cl_init(pRExC_state, &ch_class);
3013 data.start_class = &ch_class;
3014 stclass_flag = SCF_DO_STCLASS_AND;
3015 } else /* XXXX Check for BOUND? */
3017 data.last_closep = &last_close;
3019 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3020 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3021 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3022 && data.last_start_min == 0 && data.last_end > 0
3023 && !RExC_seen_zerolen
3024 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3025 r->reganch |= ROPT_CHECK_ALL;
3026 scan_commit(pRExC_state, &data);
3027 SvREFCNT_dec(data.last_found);
3029 longest_float_length = CHR_SVLEN(data.longest_float);
3030 if (longest_float_length
3031 || (data.flags & SF_FL_BEFORE_EOL
3032 && (!(data.flags & SF_FL_BEFORE_MEOL)
3033 || (RExC_flags & PMf_MULTILINE)))) {
3036 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3037 && data.offset_fixed == data.offset_float_min
3038 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3039 goto remove_float; /* As in (a)+. */
3041 if (SvUTF8(data.longest_float)) {
3042 r->float_utf8 = data.longest_float;
3043 r->float_substr = NULL;
3045 r->float_substr = data.longest_float;
3046 r->float_utf8 = NULL;
3048 r->float_min_offset = data.offset_float_min;
3049 r->float_max_offset = data.offset_float_max;
3050 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3051 && (!(data.flags & SF_FL_BEFORE_MEOL)
3052 || (RExC_flags & PMf_MULTILINE)));
3053 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3057 r->float_substr = r->float_utf8 = NULL;
3058 SvREFCNT_dec(data.longest_float);
3059 longest_float_length = 0;
3062 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3063 if (longest_fixed_length
3064 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3065 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3066 || (RExC_flags & PMf_MULTILINE)))) {
3069 if (SvUTF8(data.longest_fixed)) {
3070 r->anchored_utf8 = data.longest_fixed;
3071 r->anchored_substr = NULL;
3073 r->anchored_substr = data.longest_fixed;
3074 r->anchored_utf8 = NULL;
3076 r->anchored_offset = data.offset_fixed;
3077 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3078 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3079 || (RExC_flags & PMf_MULTILINE)));
3080 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3083 r->anchored_substr = r->anchored_utf8 = NULL;
3084 SvREFCNT_dec(data.longest_fixed);
3085 longest_fixed_length = 0;
3088 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3089 r->regstclass = NULL;
3090 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3092 && !(data.start_class->flags & ANYOF_EOS)
3093 && !cl_is_anything(data.start_class))
3095 const I32 n = add_data(pRExC_state, 1, "f");
3097 Newx(RExC_rx->data->data[n], 1,
3098 struct regnode_charclass_class);
3099 StructCopy(data.start_class,
3100 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3101 struct regnode_charclass_class);
3102 r->regstclass = (regnode*)RExC_rx->data->data[n];
3103 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3104 PL_regdata = r->data; /* for regprop() */
3105 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3106 regprop(sv, (regnode*)data.start_class);
3107 PerlIO_printf(Perl_debug_log,
3108 "synthetic stclass \"%s\".\n",
3109 SvPVX_const(sv));});
3112 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3113 if (longest_fixed_length > longest_float_length) {
3114 r->check_substr = r->anchored_substr;
3115 r->check_utf8 = r->anchored_utf8;
3116 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3117 if (r->reganch & ROPT_ANCH_SINGLE)
3118 r->reganch |= ROPT_NOSCAN;
3121 r->check_substr = r->float_substr;
3122 r->check_utf8 = r->float_utf8;
3123 r->check_offset_min = data.offset_float_min;
3124 r->check_offset_max = data.offset_float_max;
3126 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3127 This should be changed ASAP! */
3128 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3129 r->reganch |= RE_USE_INTUIT;
3130 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3131 r->reganch |= RE_INTUIT_TAIL;
3135 /* Several toplevels. Best we can is to set minlen. */
3137 struct regnode_charclass_class ch_class;
3140 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3141 scan = r->program + 1;
3142 cl_init(pRExC_state, &ch_class);
3143 data.start_class = &ch_class;
3144 data.last_closep = &last_close;
3145 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3146 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3147 = r->float_substr = r->float_utf8 = NULL;
3148 if (!(data.start_class->flags & ANYOF_EOS)
3149 && !cl_is_anything(data.start_class))
3151 const I32 n = add_data(pRExC_state, 1, "f");
3153 Newx(RExC_rx->data->data[n], 1,
3154 struct regnode_charclass_class);
3155 StructCopy(data.start_class,
3156 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3157 struct regnode_charclass_class);
3158 r->regstclass = (regnode*)RExC_rx->data->data[n];
3159 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3160 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3161 regprop(sv, (regnode*)data.start_class);
3162 PerlIO_printf(Perl_debug_log,
3163 "synthetic stclass \"%s\".\n",
3164 SvPVX_const(sv));});
3169 if (RExC_seen & REG_SEEN_GPOS)
3170 r->reganch |= ROPT_GPOS_SEEN;
3171 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3172 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3173 if (RExC_seen & REG_SEEN_EVAL)
3174 r->reganch |= ROPT_EVAL_SEEN;
3175 if (RExC_seen & REG_SEEN_CANY)
3176 r->reganch |= ROPT_CANY_SEEN;
3177 Newxz(r->startp, RExC_npar, I32);
3178 Newxz(r->endp, RExC_npar, I32);
3179 PL_regdata = r->data; /* for regprop() */
3180 DEBUG_COMPILE_r(regdump(r));
3185 - reg - regular expression, i.e. main body or parenthesized thing
3187 * Caller must absorb opening parenthesis.
3189 * Combining parenthesis handling with the base level of regular expression
3190 * is a trifle forced, but the need to tie the tails of the branches to what
3191 * follows makes it hard to avoid.
3194 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3195 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3198 register regnode *ret; /* Will be the head of the group. */
3199 register regnode *br;
3200 register regnode *lastbr;
3201 register regnode *ender = NULL;
3202 register I32 parno = 0;
3204 const I32 oregflags = RExC_flags;
3205 bool have_branch = 0;
3208 /* for (?g), (?gc), and (?o) warnings; warning
3209 about (?c) will warn about (?g) -- japhy */
3211 #define WASTED_O 0x01
3212 #define WASTED_G 0x02
3213 #define WASTED_C 0x04
3214 #define WASTED_GC (0x02|0x04)
3215 I32 wastedflags = 0x00;
3217 char * parse_start = RExC_parse; /* MJD */
3218 char * const oregcomp_parse = RExC_parse;
3220 *flagp = 0; /* Tentatively. */
3223 /* Make an OPEN node, if parenthesized. */
3225 if (*RExC_parse == '?') { /* (?...) */
3226 U32 posflags = 0, negflags = 0;
3227 U32 *flagsp = &posflags;
3228 bool is_logical = 0;
3229 const char * const seqstart = RExC_parse;
3232 paren = *RExC_parse++;
3233 ret = NULL; /* For look-ahead/behind. */
3235 case '<': /* (?<...) */
3236 RExC_seen |= REG_SEEN_LOOKBEHIND;
3237 if (*RExC_parse == '!')
3239 if (*RExC_parse != '=' && *RExC_parse != '!')
3242 case '=': /* (?=...) */
3243 case '!': /* (?!...) */
3244 RExC_seen_zerolen++;
3245 case ':': /* (?:...) */
3246 case '>': /* (?>...) */
3248 case '$': /* (?$...) */
3249 case '@': /* (?@...) */
3250 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3252 case '#': /* (?#...) */
3253 while (*RExC_parse && *RExC_parse != ')')
3255 if (*RExC_parse != ')')
3256 FAIL("Sequence (?#... not terminated");
3257 nextchar(pRExC_state);
3260 case 'p': /* (?p...) */
3261 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3262 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3264 case '?': /* (??...) */
3266 if (*RExC_parse != '{')
3268 paren = *RExC_parse++;
3270 case '{': /* (?{...}) */
3272 I32 count = 1, n = 0;
3274 char *s = RExC_parse;
3276 RExC_seen_zerolen++;
3277 RExC_seen |= REG_SEEN_EVAL;
3278 while (count && (c = *RExC_parse)) {
3289 if (*RExC_parse != ')') {
3291 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3295 OP_4tree *sop, *rop;
3296 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3299 Perl_save_re_context(aTHX);
3300 rop = sv_compile_2op(sv, &sop, "re", &pad);
3301 sop->op_private |= OPpREFCOUNTED;
3302 /* re_dup will OpREFCNT_inc */
3303 OpREFCNT_set(sop, 1);
3306 n = add_data(pRExC_state, 3, "nop");
3307 RExC_rx->data->data[n] = (void*)rop;
3308 RExC_rx->data->data[n+1] = (void*)sop;
3309 RExC_rx->data->data[n+2] = (void*)pad;
3312 else { /* First pass */
3313 if (PL_reginterp_cnt < ++RExC_seen_evals
3315 /* No compiled RE interpolated, has runtime
3316 components ===> unsafe. */
3317 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3318 if (PL_tainting && PL_tainted)
3319 FAIL("Eval-group in insecure regular expression");
3320 if (IN_PERL_COMPILETIME)
3324 nextchar(pRExC_state);
3326 ret = reg_node(pRExC_state, LOGICAL);
3329 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3330 /* deal with the length of this later - MJD */
3333 ret = reganode(pRExC_state, EVAL, n);
3334 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3335 Set_Node_Offset(ret, parse_start);
3338 case '(': /* (?(?{...})...) and (?(?=...)...) */
3340 if (RExC_parse[0] == '?') { /* (?(?...)) */
3341 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3342 || RExC_parse[1] == '<'
3343 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3346 ret = reg_node(pRExC_state, LOGICAL);
3349 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3353 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3356 parno = atoi(RExC_parse++);
3358 while (isDIGIT(*RExC_parse))
3360 ret = reganode(pRExC_state, GROUPP, parno);
3362 if ((c = *nextchar(pRExC_state)) != ')')
3363 vFAIL("Switch condition not recognized");
3365 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3366 br = regbranch(pRExC_state, &flags, 1);
3368 br = reganode(pRExC_state, LONGJMP, 0);
3370 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3371 c = *nextchar(pRExC_state);
3375 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3376 regbranch(pRExC_state, &flags, 1);
3377 regtail(pRExC_state, ret, lastbr);
3380 c = *nextchar(pRExC_state);
3385 vFAIL("Switch (?(condition)... contains too many branches");
3386 ender = reg_node(pRExC_state, TAIL);
3387 regtail(pRExC_state, br, ender);
3389 regtail(pRExC_state, lastbr, ender);
3390 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3393 regtail(pRExC_state, ret, ender);
3397 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3401 RExC_parse--; /* for vFAIL to print correctly */
3402 vFAIL("Sequence (? incomplete");
3406 parse_flags: /* (?i) */
3407 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3408 /* (?g), (?gc) and (?o) are useless here
3409 and must be globally applied -- japhy */
3411 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3412 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3413 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3414 if (! (wastedflags & wflagbit) ) {
3415 wastedflags |= wflagbit;
3418 "Useless (%s%c) - %suse /%c modifier",
3419 flagsp == &negflags ? "?-" : "?",
3421 flagsp == &negflags ? "don't " : "",
3427 else if (*RExC_parse == 'c') {
3428 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3429 if (! (wastedflags & WASTED_C) ) {
3430 wastedflags |= WASTED_GC;
3433 "Useless (%sc) - %suse /gc modifier",
3434 flagsp == &negflags ? "?-" : "?",
3435 flagsp == &negflags ? "don't " : ""
3440 else { pmflag(flagsp, *RExC_parse); }
3444 if (*RExC_parse == '-') {
3446 wastedflags = 0; /* reset so (?g-c) warns twice */
3450 RExC_flags |= posflags;
3451 RExC_flags &= ~negflags;
3452 if (*RExC_parse == ':') {
3458 if (*RExC_parse != ')') {
3460 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3462 nextchar(pRExC_state);
3470 ret = reganode(pRExC_state, OPEN, parno);
3471 Set_Node_Length(ret, 1); /* MJD */
3472 Set_Node_Offset(ret, RExC_parse); /* MJD */
3479 /* Pick up the branches, linking them together. */
3480 parse_start = RExC_parse; /* MJD */
3481 br = regbranch(pRExC_state, &flags, 1);
3482 /* branch_len = (paren != 0); */
3486 if (*RExC_parse == '|') {
3487 if (!SIZE_ONLY && RExC_extralen) {
3488 reginsert(pRExC_state, BRANCHJ, br);
3491 reginsert(pRExC_state, BRANCH, br);
3492 Set_Node_Length(br, paren != 0);
3493 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3497 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3499 else if (paren == ':') {
3500 *flagp |= flags&SIMPLE;
3502 if (is_open) { /* Starts with OPEN. */
3503 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3505 else if (paren != '?') /* Not Conditional */
3507 *flagp |= flags & (SPSTART | HASWIDTH);
3509 while (*RExC_parse == '|') {
3510 if (!SIZE_ONLY && RExC_extralen) {
3511 ender = reganode(pRExC_state, LONGJMP,0);
3512 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3515 RExC_extralen += 2; /* Account for LONGJMP. */
3516 nextchar(pRExC_state);
3517 br = regbranch(pRExC_state, &flags, 0);
3521 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3525 *flagp |= flags&SPSTART;
3528 if (have_branch || paren != ':') {
3529 /* Make a closing node, and hook it on the end. */
3532 ender = reg_node(pRExC_state, TAIL);
3535 ender = reganode(pRExC_state, CLOSE, parno);
3536 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3537 Set_Node_Length(ender,1); /* MJD */
3543 *flagp &= ~HASWIDTH;
3546 ender = reg_node(pRExC_state, SUCCEED);
3549 ender = reg_node(pRExC_state, END);
3552 regtail(pRExC_state, lastbr, ender);
3555 /* Hook the tails of the branches to the closing node. */
3556 for (br = ret; br != NULL; br = regnext(br)) {
3557 regoptail(pRExC_state, br, ender);
3564 static const char parens[] = "=!<,>";
3566 if (paren && (p = strchr(parens, paren))) {
3567 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3568 int flag = (p - parens) > 1;
3571 node = SUSPEND, flag = 0;
3572 reginsert(pRExC_state, node,ret);
3573 Set_Node_Cur_Length(ret);
3574 Set_Node_Offset(ret, parse_start + 1);
3576 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3580 /* Check for proper termination. */
3582 RExC_flags = oregflags;
3583 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3584 RExC_parse = oregcomp_parse;
3585 vFAIL("Unmatched (");
3588 else if (!paren && RExC_parse < RExC_end) {
3589 if (*RExC_parse == ')') {
3591 vFAIL("Unmatched )");
3594 FAIL("Junk on end of regexp"); /* "Can't happen". */
3602 - regbranch - one alternative of an | operator
3604 * Implements the concatenation operator.
3607 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3610 register regnode *ret;
3611 register regnode *chain = NULL;
3612 register regnode *latest;
3613 I32 flags = 0, c = 0;
3618 if (!SIZE_ONLY && RExC_extralen)
3619 ret = reganode(pRExC_state, BRANCHJ,0);
3621 ret = reg_node(pRExC_state, BRANCH);
3622 Set_Node_Length(ret, 1);
3626 if (!first && SIZE_ONLY)
3627 RExC_extralen += 1; /* BRANCHJ */
3629 *flagp = WORST; /* Tentatively. */
3632 nextchar(pRExC_state);
3633 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3635 latest = regpiece(pRExC_state, &flags);
3636 if (latest == NULL) {
3637 if (flags & TRYAGAIN)
3641 else if (ret == NULL)
3643 *flagp |= flags&HASWIDTH;
3644 if (chain == NULL) /* First piece. */
3645 *flagp |= flags&SPSTART;
3648 regtail(pRExC_state, chain, latest);
3653 if (chain == NULL) { /* Loop ran zero times. */
3654 chain = reg_node(pRExC_state, NOTHING);
3659 *flagp |= flags&SIMPLE;
3666 - regpiece - something followed by possible [*+?]
3668 * Note that the branching code sequences used for ? and the general cases
3669 * of * and + are somewhat optimized: they use the same NOTHING node as
3670 * both the endmarker for their branch list and the body of the last branch.
3671 * It might seem that this node could be dispensed with entirely, but the
3672 * endmarker role is not redundant.
3675 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3678 register regnode *ret;
3680 register char *next;
3682 const char * const origparse = RExC_parse;
3685 I32 max = REG_INFTY;
3688 ret = regatom(pRExC_state, &flags);
3690 if (flags & TRYAGAIN)
3697 if (op == '{' && regcurly(RExC_parse)) {
3698 parse_start = RExC_parse; /* MJD */
3699 next = RExC_parse + 1;
3701 while (isDIGIT(*next) || *next == ',') {
3710 if (*next == '}') { /* got one */
3714 min = atoi(RExC_parse);
3718 maxpos = RExC_parse;
3720 if (!max && *maxpos != '0')
3721 max = REG_INFTY; /* meaning "infinity" */
3722 else if (max >= REG_INFTY)
3723 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3725 nextchar(pRExC_state);
3728 if ((flags&SIMPLE)) {
3729 RExC_naughty += 2 + RExC_naughty / 2;
3730 reginsert(pRExC_state, CURLY, ret);
3731 Set_Node_Offset(ret, parse_start+1); /* MJD */
3732 Set_Node_Cur_Length(ret);
3735 regnode *w = reg_node(pRExC_state, WHILEM);
3738 regtail(pRExC_state, ret, w);
3739 if (!SIZE_ONLY && RExC_extralen) {
3740 reginsert(pRExC_state, LONGJMP,ret);
3741 reginsert(pRExC_state, NOTHING,ret);
3742 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3744 reginsert(pRExC_state, CURLYX,ret);
3746 Set_Node_Offset(ret, parse_start+1);
3747 Set_Node_Length(ret,
3748 op == '{' ? (RExC_parse - parse_start) : 1);
3750 if (!SIZE_ONLY && RExC_extralen)
3751 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3752 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3754 RExC_whilem_seen++, RExC_extralen += 3;
3755 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3763 if (max && max < min)
3764 vFAIL("Can't do {n,m} with n > m");
3766 ARG1_SET(ret, (U16)min);
3767 ARG2_SET(ret, (U16)max);
3779 #if 0 /* Now runtime fix should be reliable. */
3781 /* if this is reinstated, don't forget to put this back into perldiag:
3783 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3785 (F) The part of the regexp subject to either the * or + quantifier
3786 could match an empty string. The {#} shows in the regular
3787 expression about where the problem was discovered.
3791 if (!(flags&HASWIDTH) && op != '?')
3792 vFAIL("Regexp *+ operand could be empty");
3795 parse_start = RExC_parse;
3796 nextchar(pRExC_state);
3798 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3800 if (op == '*' && (flags&SIMPLE)) {
3801 reginsert(pRExC_state, STAR, ret);
3805 else if (op == '*') {
3809 else if (op == '+' && (flags&SIMPLE)) {
3810 reginsert(pRExC_state, PLUS, ret);
3814 else if (op == '+') {
3818 else if (op == '?') {
3823 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3825 "%.*s matches null string many times",
3826 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3830 if (*RExC_parse == '?') {
3831 nextchar(pRExC_state);
3832 reginsert(pRExC_state, MINMOD, ret);
3833 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3835 if (ISMULT2(RExC_parse)) {
3837 vFAIL("Nested quantifiers");
3844 - regatom - the lowest level
3846 * Optimization: gobbles an entire sequence of ordinary characters so that
3847 * it can turn them into a single node, which is smaller to store and
3848 * faster to run. Backslashed characters are exceptions, each becoming a
3849 * separate node; the code is simpler that way and it's not worth fixing.
3851 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3853 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3856 register regnode *ret = NULL;
3858 char *parse_start = RExC_parse;
3860 *flagp = WORST; /* Tentatively. */
3863 switch (*RExC_parse) {
3865 RExC_seen_zerolen++;
3866 nextchar(pRExC_state);
3867 if (RExC_flags & PMf_MULTILINE)
3868 ret = reg_node(pRExC_state, MBOL);
3869 else if (RExC_flags & PMf_SINGLELINE)
3870 ret = reg_node(pRExC_state, SBOL);
3872 ret = reg_node(pRExC_state, BOL);
3873 Set_Node_Length(ret, 1); /* MJD */
3876 nextchar(pRExC_state);
3878 RExC_seen_zerolen++;
3879 if (RExC_flags & PMf_MULTILINE)
3880 ret = reg_node(pRExC_state, MEOL);
3881 else if (RExC_flags & PMf_SINGLELINE)
3882 ret = reg_node(pRExC_state, SEOL);
3884 ret = reg_node(pRExC_state, EOL);
3885 Set_Node_Length(ret, 1); /* MJD */
3888 nextchar(pRExC_state);
3889 if (RExC_flags & PMf_SINGLELINE)
3890 ret = reg_node(pRExC_state, SANY);
3892 ret = reg_node(pRExC_state, REG_ANY);
3893 *flagp |= HASWIDTH|SIMPLE;
3895 Set_Node_Length(ret, 1); /* MJD */
3899 char *oregcomp_parse = ++RExC_parse;
3900 ret = regclass(pRExC_state);
3901 if (*RExC_parse != ']') {
3902 RExC_parse = oregcomp_parse;
3903 vFAIL("Unmatched [");
3905 nextchar(pRExC_state);
3906 *flagp |= HASWIDTH|SIMPLE;
3907 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3911 nextchar(pRExC_state);
3912 ret = reg(pRExC_state, 1, &flags);
3914 if (flags & TRYAGAIN) {
3915 if (RExC_parse == RExC_end) {
3916 /* Make parent create an empty node if needed. */
3924 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3928 if (flags & TRYAGAIN) {
3932 vFAIL("Internal urp");
3933 /* Supposed to be caught earlier. */