5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 const char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 const char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
674 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
676 A trie is an N'ary tree where the branches are determined by digital
677 decomposition of the key. IE, at the root node you look up the 1st character and
678 follow that branch repeat until you find the end of the branches. Nodes can be
679 marked as "accepting" meaning they represent a complete word. Eg:
683 would convert into the following structure. Numbers represent states, letters
684 following numbers represent valid transitions on the letter from that state, if
685 the number is in square brackets it represents an accepting state, otherwise it
686 will be in parenthesis.
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
692 (1) +-i->(6)-+-s->[7]
694 +-s->(3)-+-h->(4)-+-e->[5]
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
698 This shows that when matching against the string 'hers' we will begin at state 1
699 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702 single traverse. We store a mapping from accepting to state to which word was
703 matched, and then when we have multiple possibilities we try to complete the
704 rest of the regex in the order in which they occured in the alternation.
706 The only prior NFA like behaviour that would be changed by the TRIE support is
707 the silent ignoring of duplicate alternations which are of the form:
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
711 Thus EVAL blocks follwing a trie may be called a different number of times with
712 and without the optimisation. With the optimisations dupes will be silently
713 ignored. This inconsistant behaviour of EVAL type nodes is well established as
714 the following demonstrates:
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
718 which prints out 'word' three times, but
720 'words'=~/(word|word|word)(?{ print $1 })S/
722 which doesnt print it out at all. This is due to other optimisations kicking in.
724 Example of what happens on a structural level:
726 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
728 1: CURLYM[1] {1,32767}(18)
739 This would be optimizable with startbranch=5, first=5, last=16, tail=16
740 and should turn into:
742 1: CURLYM[1] {1,32767}(18)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
752 Cases where tail != last would be like /(?foo|bar)baz/:
762 which would be optimizable with startbranch=1, first=1, last=7, tail=8
763 and would end up looking like:
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
775 #define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
782 tmp = Perl_newSVpvf_nocontext( "%c", uvc ); \
784 av_push( trie->revcharmap, tmp ); \
787 #define TRIE_READ_CHAR STMT_START { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
796 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
802 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
811 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
816 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
827 #define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
835 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
837 /* first pass, loop through and scan words */
840 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
845 /* we just use folder as a flag in utf8 */
846 const U8 *folder=( flags == EXACTF
854 U32 data_slot = add_data( pRExC_state, 1, "t" );
857 GET_RE_DEBUG_FLAGS_DECL;
859 Newz( 848200, trie, 1, reg_trie_data );
861 RExC_rx->data->data[ data_slot ] = (void*)trie;
862 Newz( 848201, trie->charmap, 256, U16 );
864 trie->words = newAV();
865 trie->revcharmap = newAV();
869 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
870 if (!SvIOK(re_trie_maxbuff)) {
871 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
874 /* -- First loop and Setup --
876 We first traverse the branches and scan each word to determine if it
877 contains widechars, and how many unique chars there are, this is
878 important as we have to build a table with at least as many columns as we
881 We use an array of integers to represent the character codes 0..255
882 (trie->charmap) and we use a an HV* to store unicode characters. We use the
883 native representation of the character value as the key and IV's for the
886 *TODO* If we keep track of how many times each character is used we can
887 remap the columns so that the table compression later on is more
888 efficient in terms of memory by ensuring most common value is in the
889 middle and the least common are on the outside. IMO this would be better
890 than a most to least common mapping as theres a decent chance the most
891 common letter will share a node with the least common, meaning the node
892 will not be compressable. With a middle is most common approach the worst
893 case is when we have the least common nodes twice.
898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
899 regnode *noper = NEXTOPER( cur );
900 U8 *uc = (U8*)STRING( noper );
901 U8 *e = uc + STR_LEN( noper );
903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
906 for ( ; uc < e ; uc += len ) {
910 if ( !trie->charmap[ uvc ] ) {
911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
913 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
918 if ( !trie->widecharmap )
919 trie->widecharmap = newHV();
921 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
924 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%X", uvc );
926 if ( !SvTRUE( *svpp ) ) {
927 sv_setiv( *svpp, ++trie->uniquecharcount );
933 } /* end first pass */
934 DEBUG_TRIE_COMPILE_r(
935 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
936 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
937 trie->charcount, trie->uniquecharcount )
942 We now know what we are dealing with in terms of unique chars and
943 string sizes so we can calculate how much memory a naive
944 representation using a flat table will take. If its over a reasonable
945 limit (as specified by $^RE_TRIE_MAXBUFF) we use a more memory
946 conservative but potentially much slower representation using an array
949 At the end we convert both representations into the same compressed
950 form that will be used in regexec.c for matching with. The latter
951 is a form that cannot be used to construct with but has memory
952 properties similar to the list form and access properties similar
953 to the table form making it both suitable for fast searches and
954 small enough that its feasable to store for the duration of a program.
956 See the comment in the code where the compressed table is produced
957 inplace from the flat tabe representation for an explanation of how
958 the compression works.
963 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
965 Second Pass -- Array Of Lists Representation
967 Each state will be represented by a list of charid:state records
968 (reg_trie_trans_le) the first such element holds the CUR and LEN
969 points of the allocated array. (See defines above).
971 We build the initial structure using the lists, and then convert
972 it into the compressed table form which allows faster lookups
973 (but cant be modified once converted).
979 STRLEN transcount = 1;
981 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
985 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
987 regnode *noper = NEXTOPER( cur );
988 U8 *uc = (U8*)STRING( noper );
989 U8 *e = uc + STR_LEN( noper );
990 U32 state = 1; /* required init */
991 U16 charid = 0; /* sanity init */
992 U8 *scan = (U8*)NULL; /* sanity init */
993 STRLEN foldlen = 0; /* required init */
994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
997 for ( ; uc < e ; uc += len ) {
1002 charid = trie->charmap[ uvc ];
1004 SV** svpp=(SV**)NULL;
1005 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1009 charid=(U16)SvIV( *svpp );
1018 if ( !trie->states[ state ].trans.list ) {
1019 TRIE_LIST_NEW( state );
1021 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1022 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1023 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1028 newstate = next_alloc++;
1029 TRIE_LIST_PUSH( state, charid, newstate );
1035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1040 if ( !trie->states[ state ].wordnum ) {
1041 /* we havent inserted this word into the structure yet. */
1042 trie->states[ state ].wordnum = ++curword;
1045 /* store the word for dumping */
1046 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1047 if ( UTF ) SvUTF8_on( tmp );
1048 av_push( trie->words, tmp );
1052 /* Its a dupe. So ignore it. */
1055 } /* end second pass */
1057 trie->laststate = next_alloc;
1058 Renew( trie->states, next_alloc, reg_trie_state );
1060 DEBUG_TRIE_COMPILE_MORE_r({
1065 print out the table precompression.
1068 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1069 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1071 for( state=1 ; state < next_alloc ; state ++ ) {
1073 PerlIO_printf( Perl_debug_log, "\n %04X :", state );
1074 if ( ! trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, "%5s| ","");
1077 PerlIO_printf( Perl_debug_log, "W%04X| ",
1078 trie->states[ state ].wordnum
1081 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1082 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1083 PerlIO_printf( Perl_debug_log, "%s:%3X=%04X | ",
1085 TRIE_LIST_ITEM(state,charid).forid,
1086 TRIE_LIST_ITEM(state,charid).newstate
1091 PerlIO_printf( Perl_debug_log, "\n\n" );
1094 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1102 for( state=1 ; state < next_alloc ; state ++ ) {
1106 DEBUG_TRIE_COMPILE_MORE_r(
1107 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1111 if (trie->states[state].trans.list) {
1112 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1116 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1117 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1118 minid=TRIE_LIST_ITEM( state, idx).forid;
1119 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1120 maxid=TRIE_LIST_ITEM( state, idx).forid;
1123 if ( transcount < tp + maxid - minid + 1) {
1125 Renew( trie->trans, transcount, reg_trie_trans );
1126 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1128 base = trie->uniquecharcount + tp - minid;
1129 if ( maxid == minid ) {
1131 for ( ; zp < tp ; zp++ ) {
1132 if ( ! trie->trans[ zp ].next ) {
1133 base = trie->uniquecharcount + zp - minid;
1134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1135 trie->trans[ zp ].check = state;
1141 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1142 trie->trans[ tp ].check = state;
1147 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1148 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1149 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1150 trie->trans[ tid ].check = state;
1152 tp += ( maxid - minid + 1 );
1154 Safefree(trie->states[ state ].trans.list);
1157 DEBUG_TRIE_COMPILE_MORE_r(
1158 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1161 trie->states[ state ].trans.base=base;
1163 trie->lasttrans = tp + 1;
1167 Second Pass -- Flat Table Representation.
1169 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1170 We know that we will need Charcount+1 trans at most to store the data
1171 (one row per char at worst case) So we preallocate both structures
1172 assuming worst case.
1174 We then construct the trie using only the .next slots of the entry
1177 We use the .check field of the first entry of the node temporarily to
1178 make compression both faster and easier by keeping track of how many non
1179 zero fields are in the node.
1181 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1184 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1185 number representing the first entry of the node, and state as a
1186 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1187 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1188 are 2 entrys per node. eg:
1196 The table is internally in the right hand, idx form. However as we also
1197 have to deal with the states array which is indexed by nodenum we have to
1198 use TRIE_NODENUM() to convert.
1202 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1204 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1205 next_alloc = trie->uniquecharcount + 1;
1207 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1209 regnode *noper = NEXTOPER( cur );
1210 U8 *uc = (U8*)STRING( noper );
1211 U8 *e = uc + STR_LEN( noper );
1213 U32 state = 1; /* required init */
1215 U16 charid = 0; /* sanity init */
1216 U32 accept_state = 0; /* sanity init */
1217 U8 *scan = (U8*)NULL; /* sanity init */
1219 STRLEN foldlen = 0; /* required init */
1220 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1223 for ( ; uc < e ; uc += len ) {
1228 charid = trie->charmap[ uvc ];
1230 SV** svpp=(SV**)NULL;
1231 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1235 charid=(U16)SvIV( *svpp );
1240 if ( !trie->trans[ state + charid ].next ) {
1241 trie->trans[ state + charid ].next = next_alloc;
1242 trie->trans[ state ].check++;
1243 next_alloc += trie->uniquecharcount;
1245 state = trie->trans[ state + charid ].next;
1247 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1249 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1252 accept_state = TRIE_NODENUM( state );
1253 if ( !trie->states[ accept_state ].wordnum ) {
1254 /* we havent inserted this word into the structure yet. */
1255 trie->states[ accept_state ].wordnum = ++curword;
1258 /* store the word for dumping */
1259 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1260 if ( UTF ) SvUTF8_on( tmp );
1261 av_push( trie->words, tmp );
1265 /* Its a dupe. So ignore it. */
1268 } /* end second pass */
1270 DEBUG_TRIE_COMPILE_MORE_r({
1272 print out the table precompression so that we can do a visual check
1273 that they are identical.
1277 PerlIO_printf( Perl_debug_log, "\nChar : " );
1279 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1280 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1282 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1286 PerlIO_printf( Perl_debug_log, "\nState+-" );
1288 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1289 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1292 PerlIO_printf( Perl_debug_log, "\n" );
1294 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1296 PerlIO_printf( Perl_debug_log, "%04X : ", TRIE_NODENUM( state ) );
1298 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1299 PerlIO_printf( Perl_debug_log, "%04X ",
1300 SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1302 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1303 PerlIO_printf( Perl_debug_log, " (%04X)\n", trie->trans[ state ].check );
1305 PerlIO_printf( Perl_debug_log, " (%04X) W%04X\n", trie->trans[ state ].check,
1306 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1309 PerlIO_printf( Perl_debug_log, "\n\n" );
1313 * Inplace compress the table.*
1315 For sparse data sets the table constructed by the trie algorithm will
1316 be mostly 0/FAIL transitions or to put it another way mostly empty.
1317 (Note that leaf nodes will not contain any transitions.)
1319 This algorithm compresses the tables by eliminating most such
1320 transitions, at the cost of a modest bit of extra work during lookup:
1322 - Each states[] entry contains a .base field which indicates the
1323 index in the state[] array wheres its transition data is stored.
1325 - If .base is 0 there are no valid transitions from that node.
1327 - If .base is nonzero then charid is added to it to find an entry in
1330 -If trans[states[state].base+charid].check!=state then the
1331 transition is taken to be a 0/Fail transition. Thus if there are fail
1332 transitions at the front of the node then the .base offset will point
1333 somewhere inside the previous nodes data (or maybe even into a node
1334 even earlier), but the .check field determines if the transition is
1337 The following process inplace converts the table to the compressed
1338 table: We first do not compress the root node 1,and mark its all its
1339 .check pointers as 1 and set its .base pointer as 1 as well. This
1340 allows to do a DFA construction from the compressed table later, and
1341 ensures that any .base pointers we calculate later are greater than
1344 - We set 'pos' to indicate the first entry of the second node.
1346 - We then iterate over the columns of the node, finding the first and
1347 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1348 and set the .check pointers accordingly, and advance pos
1349 appropriately and repreat for the next node. Note that when we copy
1350 the next pointers we have to convert them from the original
1351 NODEIDX form to NODENUM form as the former is not valid post
1354 - If a node has no transitions used we mark its base as 0 and do not
1355 advance the pos pointer.
1357 - If a node only has one transition we use a second pointer into the
1358 structure to fill in allocated fail transitions from other states.
1359 This pointer is independent of the main pointer and scans forward
1360 looking for null transitions that are allocated to a state. When it
1361 finds one it writes the single transition into the "hole". If the
1362 pointer doesnt find one the single transition is appeneded as normal.
1364 - Once compressed we can Renew/realloc the structures to release the
1367 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1368 specifically Fig 3.47 and the associated pseudocode.
1372 U32 laststate = TRIE_NODENUM( next_alloc );
1373 U32 used , state, charid;
1375 trie->laststate = laststate;
1377 for ( state = 1 ; state < laststate ; state++ ) {
1379 U32 stateidx = TRIE_NODEIDX( state );
1380 U32 o_used=trie->trans[ stateidx ].check;
1381 used = trie->trans[ stateidx ].check;
1382 trie->trans[ stateidx ].check = 0;
1384 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1385 if ( flag || trie->trans[ stateidx + charid ].next ) {
1386 if ( trie->trans[ stateidx + charid ].next ) {
1388 for ( ; zp < pos ; zp++ ) {
1389 if ( ! trie->trans[ zp ].next ) {
1393 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1394 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1395 trie->trans[ zp ].check = state;
1396 if ( ++zp > pos ) pos = zp;
1403 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1405 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1406 trie->trans[ pos ].check = state;
1411 trie->lasttrans = pos + 1;
1412 Renew( trie->states, laststate + 1, reg_trie_state);
1413 DEBUG_TRIE_COMPILE_MORE_r(
1414 PerlIO_printf( Perl_debug_log, " Alloc: %d Orig: %d elements, Final:%d. Savings of %%%5.2f\n",
1415 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), next_alloc, pos,
1416 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1419 } /* end table compress */
1421 /* resize the trans array to remove unused space */
1422 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1424 DEBUG_TRIE_COMPILE_r({
1427 Now we print it out again, in a slightly different form as there is additional
1428 info we want to be able to see when its compressed. They are close enough for
1429 visual comparison though.
1431 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1433 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1434 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1436 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1439 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1441 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1442 PerlIO_printf( Perl_debug_log, "-----");
1443 PerlIO_printf( Perl_debug_log, "\n");
1445 for( state = 1 ; state < trie->laststate ; state++ ) {
1446 U32 base = trie->states[ state ].trans.base;
1448 PerlIO_printf( Perl_debug_log, "#%04X ", state);
1450 if ( trie->states[ state ].wordnum ) {
1451 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1453 PerlIO_printf( Perl_debug_log, "%6s", "" );
1456 PerlIO_printf( Perl_debug_log, " @%04X ", base );
1461 while( ( base + ofs < trie->uniquecharcount ) ||
1462 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1463 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1466 PerlIO_printf( Perl_debug_log, "+%02X[ ", ofs);
1468 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1469 if ( ( base + ofs >= trie->uniquecharcount ) &&
1470 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1471 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1473 PerlIO_printf( Perl_debug_log, "%04X ",
1474 trie->trans[ base + ofs - trie->uniquecharcount ].next );
1476 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1480 PerlIO_printf( Perl_debug_log, "]", ofs);
1483 PerlIO_printf( Perl_debug_log, "\n" );
1488 /* now finally we "stitch in" the new TRIE node
1489 This means we convert either the first branch or the first Exact,
1490 depending on whether the thing following (in 'last') is a branch
1491 or not and whther first is the startbranch (ie is it a sub part of
1492 the alternation or is it the whole thing.)
1493 Assuming its a sub part we conver the EXACT otherwise we convert
1494 the whole branch sequence, including the first.
1501 if ( first == startbranch && OP( last ) != BRANCH ) {
1504 convert = NEXTOPER( first );
1505 NEXT_OFF( first ) = (U16)(last - first);
1508 OP( convert ) = TRIE + (U8)( flags - EXACT );
1509 NEXT_OFF( convert ) = (U16)(tail - convert);
1510 ARG_SET( convert, data_slot );
1512 /* tells us if we need to handle accept buffers specially */
1513 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1516 /* needed for dumping*/
1518 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1519 /* We now need to mark all of the space originally used by the
1520 branches as optimized away. This keeps the dumpuntil from
1521 throwing a wobbly as it doesnt use regnext() to traverse the
1524 while( optimize < last ) {
1525 OP( optimize ) = OPTIMIZED;
1529 } /* end node insert */
1536 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1537 * These need to be revisited when a newer toolchain becomes available.
1539 #if defined(__sparc64__) && defined(__GNUC__)
1540 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1541 # undef SPARC64_GCC_WORKAROUND
1542 # define SPARC64_GCC_WORKAROUND 1
1546 /* REx optimizer. Converts nodes into quickier variants "in place".
1547 Finds fixed substrings. */
1549 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1550 to the position after last scanned or to NULL. */
1554 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1555 /* scanp: Start here (read-write). */
1556 /* deltap: Write maxlen-minlen here. */
1557 /* last: Stop before this one. */
1559 I32 min = 0, pars = 0, code;
1560 regnode *scan = *scanp, *next;
1562 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1563 int is_inf_internal = 0; /* The studied chunk is infinite */
1564 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1565 scan_data_t data_fake;
1566 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1567 SV *re_trie_maxbuff = NULL;
1569 GET_RE_DEBUG_FLAGS_DECL;
1571 while (scan && OP(scan) != END && scan < last) {
1572 /* Peephole optimizer: */
1574 SV *mysv=sv_newmortal();
1575 regprop( mysv, scan);
1576 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08X)\n",depth*2,"",SvPV_nolen(mysv),scan);
1579 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1580 /* Merge several consecutive EXACTish nodes into one. */
1581 regnode *n = regnext(scan);
1584 regnode *stop = scan;
1587 next = scan + NODE_SZ_STR(scan);
1588 /* Skip NOTHING, merge EXACT*. */
1590 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1591 (stringok && (OP(n) == OP(scan))))
1593 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1594 if (OP(n) == TAIL || n > next)
1596 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1597 NEXT_OFF(scan) += NEXT_OFF(n);
1598 next = n + NODE_STEP_REGNODE;
1605 else if (stringok) {
1606 int oldl = STR_LEN(scan);
1607 regnode *nnext = regnext(n);
1609 if (oldl + STR_LEN(n) > U8_MAX)
1611 NEXT_OFF(scan) += NEXT_OFF(n);
1612 STR_LEN(scan) += STR_LEN(n);
1613 next = n + NODE_SZ_STR(n);
1614 /* Now we can overwrite *n : */
1615 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1623 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1625 Two problematic code points in Unicode casefolding of EXACT nodes:
1627 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1628 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1634 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1635 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1637 This means that in case-insensitive matching (or "loose matching",
1638 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1639 length of the above casefolded versions) can match a target string
1640 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1641 This would rather mess up the minimum length computation.
1643 What we'll do is to look for the tail four bytes, and then peek
1644 at the preceding two bytes to see whether we need to decrease
1645 the minimum length by four (six minus two).
1647 Thanks to the design of UTF-8, there cannot be false matches:
1648 A sequence of valid UTF-8 bytes cannot be a subsequence of
1649 another valid sequence of UTF-8 bytes.
1652 char *s0 = STRING(scan), *s, *t;
1653 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1654 const char *t0 = "\xcc\x88\xcc\x81";
1655 const char *t1 = t0 + 3;
1658 s < s2 && (t = ninstr(s, s1, t0, t1));
1660 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1661 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1668 n = scan + NODE_SZ_STR(scan);
1670 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1681 /* Follow the next-chain of the current node and optimize
1682 away all the NOTHINGs from it. */
1683 if (OP(scan) != CURLYX) {
1684 int max = (reg_off_by_arg[OP(scan)]
1686 /* I32 may be smaller than U16 on CRAYs! */
1687 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1688 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1692 /* Skip NOTHING and LONGJMP. */
1693 while ((n = regnext(n))
1694 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1695 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1696 && off + noff < max)
1698 if (reg_off_by_arg[OP(scan)])
1701 NEXT_OFF(scan) = off;
1704 /* The principal pseudo-switch. Cannot be a switch, since we
1705 look into several different things. */
1706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1707 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1708 next = regnext(scan);
1710 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1712 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1713 I32 max1 = 0, min1 = I32_MAX, num = 0;
1714 struct regnode_charclass_class accum;
1715 regnode *startbranch=scan;
1717 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1718 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1719 if (flags & SCF_DO_STCLASS)
1720 cl_init_zero(pRExC_state, &accum);
1722 while (OP(scan) == code) {
1723 I32 deltanext, minnext, f = 0, fake;
1724 struct regnode_charclass_class this_class;
1727 data_fake.flags = 0;
1729 data_fake.whilem_c = data->whilem_c;
1730 data_fake.last_closep = data->last_closep;
1733 data_fake.last_closep = &fake;
1734 next = regnext(scan);
1735 scan = NEXTOPER(scan);
1737 scan = NEXTOPER(scan);
1738 if (flags & SCF_DO_STCLASS) {
1739 cl_init(pRExC_state, &this_class);
1740 data_fake.start_class = &this_class;
1741 f = SCF_DO_STCLASS_AND;
1743 if (flags & SCF_WHILEM_VISITED_POS)
1744 f |= SCF_WHILEM_VISITED_POS;
1746 /* we suppose the run is continuous, last=next...*/
1747 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1748 next, &data_fake, f,depth+1);
1751 if (max1 < minnext + deltanext)
1752 max1 = minnext + deltanext;
1753 if (deltanext == I32_MAX)
1754 is_inf = is_inf_internal = 1;
1756 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1758 if (data && (data_fake.flags & SF_HAS_EVAL))
1759 data->flags |= SF_HAS_EVAL;
1761 data->whilem_c = data_fake.whilem_c;
1762 if (flags & SCF_DO_STCLASS)
1763 cl_or(pRExC_state, &accum, &this_class);
1764 if (code == SUSPEND)
1767 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1769 if (flags & SCF_DO_SUBSTR) {
1770 data->pos_min += min1;
1771 data->pos_delta += max1 - min1;
1772 if (max1 != min1 || is_inf)
1773 data->longest = &(data->longest_float);
1776 delta += max1 - min1;
1777 if (flags & SCF_DO_STCLASS_OR) {
1778 cl_or(pRExC_state, data->start_class, &accum);
1780 cl_and(data->start_class, &and_with);
1781 flags &= ~SCF_DO_STCLASS;
1784 else if (flags & SCF_DO_STCLASS_AND) {
1786 cl_and(data->start_class, &accum);
1787 flags &= ~SCF_DO_STCLASS;
1790 /* Switch to OR mode: cache the old value of
1791 * data->start_class */
1792 StructCopy(data->start_class, &and_with,
1793 struct regnode_charclass_class);
1794 flags &= ~SCF_DO_STCLASS_AND;
1795 StructCopy(&accum, data->start_class,
1796 struct regnode_charclass_class);
1797 flags |= SCF_DO_STCLASS_OR;
1798 data->start_class->flags |= ANYOF_EOS;
1804 Assuming this was/is a branch we are dealing with: 'scan' now
1805 points at the item that follows the branch sequence, whatever
1806 it is. We now start at the beginning of the sequence and look
1812 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1814 If we can find such a subseqence we need to turn the first
1815 element into a trie and then add the subsequent branch exact
1816 strings to the trie.
1820 1. patterns where the whole set of branch can be converted to a trie,
1822 2. patterns where only a subset of the alternations can be
1823 converted to a trie.
1825 In case 1 we can replace the whole set with a single regop
1826 for the trie. In case 2 we need to keep the start and end
1829 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1830 becomes BRANCH TRIE; BRANCH X;
1832 Hypthetically when we know the regex isnt anchored we can
1833 turn a case 1 into a DFA and let it rip... Every time it finds a match
1834 it would just call its tail, no WHILEM/CURLY needed.
1838 if (!re_trie_maxbuff) {
1839 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
1840 if (!SvIOK(re_trie_maxbuff))
1841 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
1844 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1846 regnode *first = (regnode *)NULL;
1847 regnode *last = (regnode *)NULL;
1848 regnode *tail = scan;
1853 SV *mysv = sv_newmortal(); /* for dumping */
1855 /* var tail is used because there may be a TAIL
1856 regop in the way. Ie, the exacts will point to the
1857 thing following the TAIL, but the last branch will
1858 point at the TAIL. So we advance tail. If we
1859 have nested (?:) we may have to move through several
1863 while ( OP( tail ) == TAIL ) {
1864 /* this is the TAIL generated by (?:) */
1865 tail = regnext( tail );
1869 regprop( mysv, tail );
1870 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1871 depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1872 (RExC_seen_evals) ? "[EVAL]" : ""
1877 step through the branches, cur represents each
1878 branch, noper is the first thing to be matched
1879 as part of that branch and noper_next is the
1880 regnext() of that node. if noper is an EXACT
1881 and noper_next is the same as scan (our current
1882 position in the regex) then the EXACT branch is
1883 a possible optimization target. Once we have
1884 two or more consequetive such branches we can
1885 create a trie of the EXACT's contents and stich
1886 it in place. If the sequence represents all of
1887 the branches we eliminate the whole thing and
1888 replace it with a single TRIE. If it is a
1889 subsequence then we need to stitch it in. This
1890 means the first branch has to remain, and needs
1891 to be repointed at the item on the branch chain
1892 following the last branch optimized. This could
1893 be either a BRANCH, in which case the
1894 subsequence is internal, or it could be the
1895 item following the branch sequence in which
1896 case the subsequence is at the end.
1900 /* dont use tail as the end marker for this traverse */
1901 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1902 regnode *noper = NEXTOPER( cur );
1903 regnode *noper_next = regnext( noper );
1907 regprop( mysv, cur);
1908 PerlIO_printf( Perl_debug_log, "%*s%s",
1909 depth * 2 + 2," ", SvPV_nolen( mysv ) );
1911 regprop( mysv, noper);
1912 PerlIO_printf( Perl_debug_log, " -> %s",
1916 regprop( mysv, noper_next );
1917 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1920 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1923 if ( ( first ? OP( noper ) == optype
1924 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1925 && noper_next == tail && count<U16_MAX)
1930 optype = OP( noper );
1934 regprop( mysv, first);
1935 PerlIO_printf( Perl_debug_log, "%*s%s",
1936 depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1937 regprop( mysv, NEXTOPER(first) );
1938 PerlIO_printf( Perl_debug_log, " -> %s\n",
1939 SvPV_nolen( mysv ) );
1944 regprop( mysv, cur);
1945 PerlIO_printf( Perl_debug_log, "%*s%s",
1946 depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1947 regprop( mysv, noper );
1948 PerlIO_printf( Perl_debug_log, " -> %s\n",
1949 SvPV_nolen( mysv ) );
1955 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1956 depth * 2 + 2, "E:", "**END**" );
1958 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1960 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1961 && noper_next == tail )
1965 optype = OP( noper );
1975 regprop( mysv, cur);
1976 PerlIO_printf( Perl_debug_log,
1977 "%*s%s\t(0x%p,0x%p,0x%p)\n", depth * 2 + 2,
1978 " ", SvPV_nolen( mysv ), first, last, cur);
1983 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1984 depth * 2 + 2, "E:", "==END==" );
1986 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1991 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1992 scan = NEXTOPER(NEXTOPER(scan));
1993 } else /* single branch is optimized. */
1994 scan = NEXTOPER(scan);
1997 else if (OP(scan) == EXACT) {
1998 I32 l = STR_LEN(scan);
1999 UV uc = *((U8*)STRING(scan));
2001 U8 *s = (U8*)STRING(scan);
2002 l = utf8_length(s, s + l);
2003 uc = utf8_to_uvchr(s, NULL);
2006 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2007 /* The code below prefers earlier match for fixed
2008 offset, later match for variable offset. */
2009 if (data->last_end == -1) { /* Update the start info. */
2010 data->last_start_min = data->pos_min;
2011 data->last_start_max = is_inf
2012 ? I32_MAX : data->pos_min + data->pos_delta;
2014 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2016 SV * sv = data->last_found;
2017 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2018 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2019 if (mg && mg->mg_len >= 0)
2020 mg->mg_len += utf8_length((U8*)STRING(scan),
2021 (U8*)STRING(scan)+STR_LEN(scan));
2024 SvUTF8_on(data->last_found);
2025 data->last_end = data->pos_min + l;
2026 data->pos_min += l; /* As in the first entry. */
2027 data->flags &= ~SF_BEFORE_EOL;
2029 if (flags & SCF_DO_STCLASS_AND) {
2030 /* Check whether it is compatible with what we know already! */
2034 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2035 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2036 && (!(data->start_class->flags & ANYOF_FOLD)
2037 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2040 ANYOF_CLASS_ZERO(data->start_class);
2041 ANYOF_BITMAP_ZERO(data->start_class);
2043 ANYOF_BITMAP_SET(data->start_class, uc);
2044 data->start_class->flags &= ~ANYOF_EOS;
2046 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2048 else if (flags & SCF_DO_STCLASS_OR) {
2049 /* false positive possible if the class is case-folded */
2051 ANYOF_BITMAP_SET(data->start_class, uc);
2053 data->start_class->flags |= ANYOF_UNICODE_ALL;
2054 data->start_class->flags &= ~ANYOF_EOS;
2055 cl_and(data->start_class, &and_with);
2057 flags &= ~SCF_DO_STCLASS;
2059 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2060 I32 l = STR_LEN(scan);
2061 UV uc = *((U8*)STRING(scan));
2063 /* Search for fixed substrings supports EXACT only. */
2064 if (flags & SCF_DO_SUBSTR)
2065 scan_commit(pRExC_state, data);
2067 U8 *s = (U8 *)STRING(scan);
2068 l = utf8_length(s, s + l);
2069 uc = utf8_to_uvchr(s, NULL);
2072 if (data && (flags & SCF_DO_SUBSTR))
2074 if (flags & SCF_DO_STCLASS_AND) {
2075 /* Check whether it is compatible with what we know already! */
2079 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2080 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2081 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2083 ANYOF_CLASS_ZERO(data->start_class);
2084 ANYOF_BITMAP_ZERO(data->start_class);
2086 ANYOF_BITMAP_SET(data->start_class, uc);
2087 data->start_class->flags &= ~ANYOF_EOS;
2088 data->start_class->flags |= ANYOF_FOLD;
2089 if (OP(scan) == EXACTFL)
2090 data->start_class->flags |= ANYOF_LOCALE;
2093 else if (flags & SCF_DO_STCLASS_OR) {
2094 if (data->start_class->flags & ANYOF_FOLD) {
2095 /* false positive possible if the class is case-folded.
2096 Assume that the locale settings are the same... */
2098 ANYOF_BITMAP_SET(data->start_class, uc);
2099 data->start_class->flags &= ~ANYOF_EOS;
2101 cl_and(data->start_class, &and_with);
2103 flags &= ~SCF_DO_STCLASS;
2105 else if (strchr((const char*)PL_varies,OP(scan))) {
2106 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2107 I32 f = flags, pos_before = 0;
2108 regnode *oscan = scan;
2109 struct regnode_charclass_class this_class;
2110 struct regnode_charclass_class *oclass = NULL;
2111 I32 next_is_eval = 0;
2113 switch (PL_regkind[(U8)OP(scan)]) {
2114 case WHILEM: /* End of (?:...)* . */
2115 scan = NEXTOPER(scan);
2118 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2119 next = NEXTOPER(scan);
2120 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2122 maxcount = REG_INFTY;
2123 next = regnext(scan);
2124 scan = NEXTOPER(scan);
2128 if (flags & SCF_DO_SUBSTR)
2133 if (flags & SCF_DO_STCLASS) {
2135 maxcount = REG_INFTY;
2136 next = regnext(scan);
2137 scan = NEXTOPER(scan);
2140 is_inf = is_inf_internal = 1;
2141 scan = regnext(scan);
2142 if (flags & SCF_DO_SUBSTR) {
2143 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2144 data->longest = &(data->longest_float);
2146 goto optimize_curly_tail;
2148 mincount = ARG1(scan);
2149 maxcount = ARG2(scan);
2150 next = regnext(scan);
2151 if (OP(scan) == CURLYX) {
2152 I32 lp = (data ? *(data->last_closep) : 0);
2153 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2155 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2156 next_is_eval = (OP(scan) == EVAL);
2158 if (flags & SCF_DO_SUBSTR) {
2159 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2160 pos_before = data->pos_min;
2164 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2166 data->flags |= SF_IS_INF;
2168 if (flags & SCF_DO_STCLASS) {
2169 cl_init(pRExC_state, &this_class);
2170 oclass = data->start_class;
2171 data->start_class = &this_class;
2172 f |= SCF_DO_STCLASS_AND;
2173 f &= ~SCF_DO_STCLASS_OR;
2175 /* These are the cases when once a subexpression
2176 fails at a particular position, it cannot succeed
2177 even after backtracking at the enclosing scope.
2179 XXXX what if minimal match and we are at the
2180 initial run of {n,m}? */
2181 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2182 f &= ~SCF_WHILEM_VISITED_POS;
2184 /* This will finish on WHILEM, setting scan, or on NULL: */
2185 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2187 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2189 if (flags & SCF_DO_STCLASS)
2190 data->start_class = oclass;
2191 if (mincount == 0 || minnext == 0) {
2192 if (flags & SCF_DO_STCLASS_OR) {
2193 cl_or(pRExC_state, data->start_class, &this_class);
2195 else if (flags & SCF_DO_STCLASS_AND) {
2196 /* Switch to OR mode: cache the old value of
2197 * data->start_class */
2198 StructCopy(data->start_class, &and_with,
2199 struct regnode_charclass_class);
2200 flags &= ~SCF_DO_STCLASS_AND;
2201 StructCopy(&this_class, data->start_class,
2202 struct regnode_charclass_class);
2203 flags |= SCF_DO_STCLASS_OR;
2204 data->start_class->flags |= ANYOF_EOS;
2206 } else { /* Non-zero len */
2207 if (flags & SCF_DO_STCLASS_OR) {
2208 cl_or(pRExC_state, data->start_class, &this_class);
2209 cl_and(data->start_class, &and_with);
2211 else if (flags & SCF_DO_STCLASS_AND)
2212 cl_and(data->start_class, &this_class);
2213 flags &= ~SCF_DO_STCLASS;
2215 if (!scan) /* It was not CURLYX, but CURLY. */
2217 if (ckWARN(WARN_REGEXP)
2218 /* ? quantifier ok, except for (?{ ... }) */
2219 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2220 && (minnext == 0) && (deltanext == 0)
2221 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2222 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2225 "Quantifier unexpected on zero-length expression");
2228 min += minnext * mincount;
2229 is_inf_internal |= ((maxcount == REG_INFTY
2230 && (minnext + deltanext) > 0)
2231 || deltanext == I32_MAX);
2232 is_inf |= is_inf_internal;
2233 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2235 /* Try powerful optimization CURLYX => CURLYN. */
2236 if ( OP(oscan) == CURLYX && data
2237 && data->flags & SF_IN_PAR
2238 && !(data->flags & SF_HAS_EVAL)
2239 && !deltanext && minnext == 1 ) {
2240 /* Try to optimize to CURLYN. */
2241 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2242 regnode *nxt1 = nxt;
2249 if (!strchr((const char*)PL_simple,OP(nxt))
2250 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2251 && STR_LEN(nxt) == 1))
2257 if (OP(nxt) != CLOSE)
2259 /* Now we know that nxt2 is the only contents: */
2260 oscan->flags = (U8)ARG(nxt);
2262 OP(nxt1) = NOTHING; /* was OPEN. */
2264 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2265 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2266 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2267 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268 OP(nxt + 1) = OPTIMIZED; /* was count. */
2269 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2274 /* Try optimization CURLYX => CURLYM. */
2275 if ( OP(oscan) == CURLYX && data
2276 && !(data->flags & SF_HAS_PAR)
2277 && !(data->flags & SF_HAS_EVAL)
2278 && !deltanext /* atom is fixed width */
2279 && minnext != 0 /* CURLYM can't handle zero width */
2281 /* XXXX How to optimize if data == 0? */
2282 /* Optimize to a simpler form. */
2283 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2287 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2288 && (OP(nxt2) != WHILEM))
2290 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2291 /* Need to optimize away parenths. */
2292 if (data->flags & SF_IN_PAR) {
2293 /* Set the parenth number. */
2294 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2296 if (OP(nxt) != CLOSE)
2297 FAIL("Panic opt close");
2298 oscan->flags = (U8)ARG(nxt);
2299 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2300 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2303 OP(nxt + 1) = OPTIMIZED; /* was count. */
2304 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2305 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2308 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2309 regnode *nnxt = regnext(nxt1);
2312 if (reg_off_by_arg[OP(nxt1)])
2313 ARG_SET(nxt1, nxt2 - nxt1);
2314 else if (nxt2 - nxt1 < U16_MAX)
2315 NEXT_OFF(nxt1) = nxt2 - nxt1;
2317 OP(nxt) = NOTHING; /* Cannot beautify */
2322 /* Optimize again: */
2323 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2329 else if ((OP(oscan) == CURLYX)
2330 && (flags & SCF_WHILEM_VISITED_POS)
2331 /* See the comment on a similar expression above.
2332 However, this time it not a subexpression
2333 we care about, but the expression itself. */
2334 && (maxcount == REG_INFTY)
2335 && data && ++data->whilem_c < 16) {
2336 /* This stays as CURLYX, we can put the count/of pair. */
2337 /* Find WHILEM (as in regexec.c) */
2338 regnode *nxt = oscan + NEXT_OFF(oscan);
2340 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2342 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2343 | (RExC_whilem_seen << 4)); /* On WHILEM */
2345 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2347 if (flags & SCF_DO_SUBSTR) {
2348 SV *last_str = Nullsv;
2349 int counted = mincount != 0;
2351 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2352 #if defined(SPARC64_GCC_WORKAROUND)
2358 if (pos_before >= data->last_start_min)
2361 b = data->last_start_min;
2364 s = SvPV(data->last_found, l);
2365 old = b - data->last_start_min;
2368 I32 b = pos_before >= data->last_start_min
2369 ? pos_before : data->last_start_min;
2371 char *s = SvPV(data->last_found, l);
2372 I32 old = b - data->last_start_min;
2376 old = utf8_hop((U8*)s, old) - (U8*)s;
2379 /* Get the added string: */
2380 last_str = newSVpvn(s + old, l);
2382 SvUTF8_on(last_str);
2383 if (deltanext == 0 && pos_before == b) {
2384 /* What was added is a constant string */
2386 SvGROW(last_str, (mincount * l) + 1);
2387 repeatcpy(SvPVX(last_str) + l,
2388 SvPVX(last_str), l, mincount - 1);
2389 SvCUR(last_str) *= mincount;
2390 /* Add additional parts. */
2391 SvCUR_set(data->last_found,
2392 SvCUR(data->last_found) - l);
2393 sv_catsv(data->last_found, last_str);
2395 SV * sv = data->last_found;
2397 SvUTF8(sv) && SvMAGICAL(sv) ?
2398 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2399 if (mg && mg->mg_len >= 0)
2400 mg->mg_len += CHR_SVLEN(last_str);
2402 data->last_end += l * (mincount - 1);
2405 /* start offset must point into the last copy */
2406 data->last_start_min += minnext * (mincount - 1);
2407 data->last_start_max += is_inf ? I32_MAX
2408 : (maxcount - 1) * (minnext + data->pos_delta);
2411 /* It is counted once already... */
2412 data->pos_min += minnext * (mincount - counted);
2413 data->pos_delta += - counted * deltanext +
2414 (minnext + deltanext) * maxcount - minnext * mincount;
2415 if (mincount != maxcount) {
2416 /* Cannot extend fixed substrings found inside
2418 scan_commit(pRExC_state,data);
2419 if (mincount && last_str) {
2420 sv_setsv(data->last_found, last_str);
2421 data->last_end = data->pos_min;
2422 data->last_start_min =
2423 data->pos_min - CHR_SVLEN(last_str);
2424 data->last_start_max = is_inf
2426 : data->pos_min + data->pos_delta
2427 - CHR_SVLEN(last_str);
2429 data->longest = &(data->longest_float);
2431 SvREFCNT_dec(last_str);
2433 if (data && (fl & SF_HAS_EVAL))
2434 data->flags |= SF_HAS_EVAL;
2435 optimize_curly_tail:
2436 if (OP(oscan) != CURLYX) {
2437 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2439 NEXT_OFF(oscan) += NEXT_OFF(next);
2442 default: /* REF and CLUMP only? */
2443 if (flags & SCF_DO_SUBSTR) {
2444 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2445 data->longest = &(data->longest_float);
2447 is_inf = is_inf_internal = 1;
2448 if (flags & SCF_DO_STCLASS_OR)
2449 cl_anything(pRExC_state, data->start_class);
2450 flags &= ~SCF_DO_STCLASS;
2454 else if (strchr((const char*)PL_simple,OP(scan))) {
2457 if (flags & SCF_DO_SUBSTR) {
2458 scan_commit(pRExC_state,data);
2462 if (flags & SCF_DO_STCLASS) {
2463 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2465 /* Some of the logic below assumes that switching
2466 locale on will only add false positives. */
2467 switch (PL_regkind[(U8)OP(scan)]) {
2471 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2472 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2473 cl_anything(pRExC_state, data->start_class);
2476 if (OP(scan) == SANY)
2478 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2479 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2480 || (data->start_class->flags & ANYOF_CLASS));
2481 cl_anything(pRExC_state, data->start_class);
2483 if (flags & SCF_DO_STCLASS_AND || !value)
2484 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2487 if (flags & SCF_DO_STCLASS_AND)
2488 cl_and(data->start_class,
2489 (struct regnode_charclass_class*)scan);
2491 cl_or(pRExC_state, data->start_class,
2492 (struct regnode_charclass_class*)scan);
2495 if (flags & SCF_DO_STCLASS_AND) {
2496 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2498 for (value = 0; value < 256; value++)
2499 if (!isALNUM(value))
2500 ANYOF_BITMAP_CLEAR(data->start_class, value);
2504 if (data->start_class->flags & ANYOF_LOCALE)
2505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2507 for (value = 0; value < 256; value++)
2509 ANYOF_BITMAP_SET(data->start_class, value);
2514 if (flags & SCF_DO_STCLASS_AND) {
2515 if (data->start_class->flags & ANYOF_LOCALE)
2516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2519 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2520 data->start_class->flags |= ANYOF_LOCALE;
2524 if (flags & SCF_DO_STCLASS_AND) {
2525 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2527 for (value = 0; value < 256; value++)
2529 ANYOF_BITMAP_CLEAR(data->start_class, value);
2533 if (data->start_class->flags & ANYOF_LOCALE)
2534 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2536 for (value = 0; value < 256; value++)
2537 if (!isALNUM(value))
2538 ANYOF_BITMAP_SET(data->start_class, value);
2543 if (flags & SCF_DO_STCLASS_AND) {
2544 if (data->start_class->flags & ANYOF_LOCALE)
2545 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2548 data->start_class->flags |= ANYOF_LOCALE;
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2553 if (flags & SCF_DO_STCLASS_AND) {
2554 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2555 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2556 for (value = 0; value < 256; value++)
2557 if (!isSPACE(value))
2558 ANYOF_BITMAP_CLEAR(data->start_class, value);
2562 if (data->start_class->flags & ANYOF_LOCALE)
2563 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2565 for (value = 0; value < 256; value++)
2567 ANYOF_BITMAP_SET(data->start_class, value);
2572 if (flags & SCF_DO_STCLASS_AND) {
2573 if (data->start_class->flags & ANYOF_LOCALE)
2574 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2577 data->start_class->flags |= ANYOF_LOCALE;
2578 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2582 if (flags & SCF_DO_STCLASS_AND) {
2583 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2584 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2585 for (value = 0; value < 256; value++)
2587 ANYOF_BITMAP_CLEAR(data->start_class, value);
2591 if (data->start_class->flags & ANYOF_LOCALE)
2592 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2594 for (value = 0; value < 256; value++)
2595 if (!isSPACE(value))
2596 ANYOF_BITMAP_SET(data->start_class, value);
2601 if (flags & SCF_DO_STCLASS_AND) {
2602 if (data->start_class->flags & ANYOF_LOCALE) {
2603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2604 for (value = 0; value < 256; value++)
2605 if (!isSPACE(value))
2606 ANYOF_BITMAP_CLEAR(data->start_class, value);
2610 data->start_class->flags |= ANYOF_LOCALE;
2611 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2615 if (flags & SCF_DO_STCLASS_AND) {
2616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2617 for (value = 0; value < 256; value++)
2618 if (!isDIGIT(value))
2619 ANYOF_BITMAP_CLEAR(data->start_class, value);
2622 if (data->start_class->flags & ANYOF_LOCALE)
2623 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2625 for (value = 0; value < 256; value++)
2627 ANYOF_BITMAP_SET(data->start_class, value);
2632 if (flags & SCF_DO_STCLASS_AND) {
2633 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2634 for (value = 0; value < 256; value++)
2636 ANYOF_BITMAP_CLEAR(data->start_class, value);
2639 if (data->start_class->flags & ANYOF_LOCALE)
2640 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2642 for (value = 0; value < 256; value++)
2643 if (!isDIGIT(value))
2644 ANYOF_BITMAP_SET(data->start_class, value);
2649 if (flags & SCF_DO_STCLASS_OR)
2650 cl_and(data->start_class, &and_with);
2651 flags &= ~SCF_DO_STCLASS;
2654 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2655 data->flags |= (OP(scan) == MEOL
2659 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2660 /* Lookbehind, or need to calculate parens/evals/stclass: */
2661 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2662 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2663 /* Lookahead/lookbehind */
2664 I32 deltanext, minnext, fake = 0;
2666 struct regnode_charclass_class intrnl;
2669 data_fake.flags = 0;
2671 data_fake.whilem_c = data->whilem_c;
2672 data_fake.last_closep = data->last_closep;
2675 data_fake.last_closep = &fake;
2676 if ( flags & SCF_DO_STCLASS && !scan->flags
2677 && OP(scan) == IFMATCH ) { /* Lookahead */
2678 cl_init(pRExC_state, &intrnl);
2679 data_fake.start_class = &intrnl;
2680 f |= SCF_DO_STCLASS_AND;
2682 if (flags & SCF_WHILEM_VISITED_POS)
2683 f |= SCF_WHILEM_VISITED_POS;
2684 next = regnext(scan);
2685 nscan = NEXTOPER(NEXTOPER(scan));
2686 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2689 vFAIL("Variable length lookbehind not implemented");
2691 else if (minnext > U8_MAX) {
2692 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2694 scan->flags = (U8)minnext;
2696 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2698 if (data && (data_fake.flags & SF_HAS_EVAL))
2699 data->flags |= SF_HAS_EVAL;
2701 data->whilem_c = data_fake.whilem_c;
2702 if (f & SCF_DO_STCLASS_AND) {
2703 int was = (data->start_class->flags & ANYOF_EOS);
2705 cl_and(data->start_class, &intrnl);
2707 data->start_class->flags |= ANYOF_EOS;
2710 else if (OP(scan) == OPEN) {
2713 else if (OP(scan) == CLOSE) {
2714 if ((I32)ARG(scan) == is_par) {
2715 next = regnext(scan);
2717 if ( next && (OP(next) != WHILEM) && next < last)
2718 is_par = 0; /* Disable optimization */
2721 *(data->last_closep) = ARG(scan);
2723 else if (OP(scan) == EVAL) {
2725 data->flags |= SF_HAS_EVAL;
2727 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2728 if (flags & SCF_DO_SUBSTR) {
2729 scan_commit(pRExC_state,data);
2730 data->longest = &(data->longest_float);
2732 is_inf = is_inf_internal = 1;
2733 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2734 cl_anything(pRExC_state, data->start_class);
2735 flags &= ~SCF_DO_STCLASS;
2737 /* Else: zero-length, ignore. */
2738 scan = regnext(scan);
2743 *deltap = is_inf_internal ? I32_MAX : delta;
2744 if (flags & SCF_DO_SUBSTR && is_inf)
2745 data->pos_delta = I32_MAX - data->pos_min;
2746 if (is_par > U8_MAX)
2748 if (is_par && pars==1 && data) {
2749 data->flags |= SF_IN_PAR;
2750 data->flags &= ~SF_HAS_PAR;
2752 else if (pars && data) {
2753 data->flags |= SF_HAS_PAR;
2754 data->flags &= ~SF_IN_PAR;
2756 if (flags & SCF_DO_STCLASS_OR)
2757 cl_and(data->start_class, &and_with);
2762 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2764 if (RExC_rx->data) {
2765 Renewc(RExC_rx->data,
2766 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2767 char, struct reg_data);
2768 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2769 RExC_rx->data->count += n;
2772 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2773 char, struct reg_data);
2774 New(1208, RExC_rx->data->what, n, U8);
2775 RExC_rx->data->count = n;
2777 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2778 return RExC_rx->data->count - n;
2782 Perl_reginitcolors(pTHX)
2785 char *s = PerlEnv_getenv("PERL_RE_COLORS");
2788 PL_colors[0] = s = savepv(s);
2790 s = strchr(s, '\t');
2796 PL_colors[i] = s = "";
2800 PL_colors[i++] = "";
2807 - pregcomp - compile a regular expression into internal code
2809 * We can't allocate space until we know how big the compiled form will be,
2810 * but we can't compile it (and thus know how big it is) until we've got a
2811 * place to put the code. So we cheat: we compile it twice, once with code
2812 * generation turned off and size counting turned on, and once "for real".
2813 * This also means that we don't allocate space until we are sure that the
2814 * thing really will compile successfully, and we never have to move the
2815 * code and thus invalidate pointers into it. (Note that it has to be in
2816 * one piece because free() must be able to free it all.) [NB: not true in perl]
2818 * Beware that the optimization-preparation code in here knows about some
2819 * of the structure of the compiled regexp. [I'll say.]
2822 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2832 RExC_state_t RExC_state;
2833 RExC_state_t *pRExC_state = &RExC_state;
2835 GET_RE_DEBUG_FLAGS_DECL;
2838 FAIL("NULL regexp argument");
2840 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2843 DEBUG_r(if (!PL_colorset) reginitcolors());
2845 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2846 PL_colors[4],PL_colors[5],PL_colors[0],
2847 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2849 RExC_flags = pm->op_pmflags;
2853 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2854 RExC_seen_evals = 0;
2857 /* First pass: determine size, legality. */
2864 RExC_emit = &PL_regdummy;
2865 RExC_whilem_seen = 0;
2866 #if 0 /* REGC() is (currently) a NOP at the first pass.
2867 * Clever compilers notice this and complain. --jhi */
2868 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2870 if (reg(pRExC_state, 0, &flags) == NULL) {
2871 RExC_precomp = Nullch;
2874 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2876 /* Small enough for pointer-storage convention?
2877 If extralen==0, this means that we will not need long jumps. */
2878 if (RExC_size >= 0x10000L && RExC_extralen)
2879 RExC_size += RExC_extralen;
2882 if (RExC_whilem_seen > 15)
2883 RExC_whilem_seen = 15;
2885 /* Allocate space and initialize. */
2886 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2889 FAIL("Regexp out of space");
2892 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2893 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2896 r->prelen = xend - exp;
2897 r->precomp = savepvn(RExC_precomp, r->prelen);
2899 #ifdef PERL_COPY_ON_WRITE
2900 r->saved_copy = Nullsv;
2902 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2903 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2905 r->substrs = 0; /* Useful during FAIL. */
2906 r->startp = 0; /* Useful during FAIL. */
2907 r->endp = 0; /* Useful during FAIL. */
2909 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2911 r->offsets[0] = RExC_size;
2913 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2914 "%s %"UVuf" bytes for offset annotations.\n",
2915 r->offsets ? "Got" : "Couldn't get",
2916 (UV)((2*RExC_size+1) * sizeof(U32))));
2920 /* Second pass: emit code. */
2921 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2926 RExC_emit_start = r->program;
2927 RExC_emit = r->program;
2928 /* Store the count of eval-groups for security checks: */
2929 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2930 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2932 if (reg(pRExC_state, 0, &flags) == NULL)
2936 /* Dig out information for optimizations. */
2937 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2938 pm->op_pmflags = RExC_flags;
2940 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2941 r->regstclass = NULL;
2942 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2943 r->reganch |= ROPT_NAUGHTY;
2944 scan = r->program + 1; /* First BRANCH. */
2946 /* XXXX To minimize changes to RE engine we always allocate
2947 3-units-long substrs field. */
2948 Newz(1004, r->substrs, 1, struct reg_substr_data);
2950 StructCopy(&zero_scan_data, &data, scan_data_t);
2951 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2952 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2954 STRLEN longest_float_length, longest_fixed_length;
2955 struct regnode_charclass_class ch_class;
2960 /* Skip introductions and multiplicators >= 1. */
2961 while ((OP(first) == OPEN && (sawopen = 1)) ||
2962 /* An OR of *one* alternative - should not happen now. */
2963 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2964 (OP(first) == PLUS) ||
2965 (OP(first) == MINMOD) ||
2966 /* An {n,m} with n>0 */
2967 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2968 if (OP(first) == PLUS)
2971 first += regarglen[(U8)OP(first)];
2972 first = NEXTOPER(first);
2975 /* Starting-point info. */
2977 if (PL_regkind[(U8)OP(first)] == EXACT) {
2978 if (OP(first) == EXACT)
2979 ; /* Empty, get anchored substr later. */
2980 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2981 r->regstclass = first;
2983 else if (strchr((const char*)PL_simple,OP(first)))
2984 r->regstclass = first;
2985 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2986 PL_regkind[(U8)OP(first)] == NBOUND)
2987 r->regstclass = first;
2988 else if (PL_regkind[(U8)OP(first)] == BOL) {
2989 r->reganch |= (OP(first) == MBOL
2991 : (OP(first) == SBOL
2994 first = NEXTOPER(first);
2997 else if (OP(first) == GPOS) {
2998 r->reganch |= ROPT_ANCH_GPOS;
2999 first = NEXTOPER(first);
3002 else if (!sawopen && (OP(first) == STAR &&
3003 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
3004 !(r->reganch & ROPT_ANCH) )
3006 /* turn .* into ^.* with an implied $*=1 */
3007 int type = OP(NEXTOPER(first));
3009 if (type == REG_ANY)
3010 type = ROPT_ANCH_MBOL;
3012 type = ROPT_ANCH_SBOL;
3014 r->reganch |= type | ROPT_IMPLICIT;
3015 first = NEXTOPER(first);
3018 if (sawplus && (!sawopen || !RExC_sawback)
3019 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3020 /* x+ must match at the 1st pos of run of x's */
3021 r->reganch |= ROPT_SKIP;
3023 /* Scan is after the zeroth branch, first is atomic matcher. */
3024 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3025 (IV)(first - scan + 1)));
3027 * If there's something expensive in the r.e., find the
3028 * longest literal string that must appear and make it the
3029 * regmust. Resolve ties in favor of later strings, since
3030 * the regstart check works with the beginning of the r.e.
3031 * and avoiding duplication strengthens checking. Not a
3032 * strong reason, but sufficient in the absence of others.
3033 * [Now we resolve ties in favor of the earlier string if
3034 * it happens that c_offset_min has been invalidated, since the
3035 * earlier string may buy us something the later one won't.]
3039 data.longest_fixed = newSVpvn("",0);
3040 data.longest_float = newSVpvn("",0);
3041 data.last_found = newSVpvn("",0);
3042 data.longest = &(data.longest_fixed);
3044 if (!r->regstclass) {
3045 cl_init(pRExC_state, &ch_class);
3046 data.start_class = &ch_class;
3047 stclass_flag = SCF_DO_STCLASS_AND;
3048 } else /* XXXX Check for BOUND? */
3050 data.last_closep = &last_close;
3052 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3053 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3054 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3055 && data.last_start_min == 0 && data.last_end > 0
3056 && !RExC_seen_zerolen
3057 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3058 r->reganch |= ROPT_CHECK_ALL;
3059 scan_commit(pRExC_state, &data);
3060 SvREFCNT_dec(data.last_found);
3062 longest_float_length = CHR_SVLEN(data.longest_float);
3063 if (longest_float_length
3064 || (data.flags & SF_FL_BEFORE_EOL
3065 && (!(data.flags & SF_FL_BEFORE_MEOL)
3066 || (RExC_flags & PMf_MULTILINE)))) {
3069 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3070 && data.offset_fixed == data.offset_float_min
3071 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3072 goto remove_float; /* As in (a)+. */
3074 if (SvUTF8(data.longest_float)) {
3075 r->float_utf8 = data.longest_float;
3076 r->float_substr = Nullsv;
3078 r->float_substr = data.longest_float;
3079 r->float_utf8 = Nullsv;
3081 r->float_min_offset = data.offset_float_min;
3082 r->float_max_offset = data.offset_float_max;
3083 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3084 && (!(data.flags & SF_FL_BEFORE_MEOL)
3085 || (RExC_flags & PMf_MULTILINE)));
3086 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3090 r->float_substr = r->float_utf8 = Nullsv;
3091 SvREFCNT_dec(data.longest_float);
3092 longest_float_length = 0;
3095 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3096 if (longest_fixed_length
3097 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3099 || (RExC_flags & PMf_MULTILINE)))) {
3102 if (SvUTF8(data.longest_fixed)) {
3103 r->anchored_utf8 = data.longest_fixed;
3104 r->anchored_substr = Nullsv;
3106 r->anchored_substr = data.longest_fixed;
3107 r->anchored_utf8 = Nullsv;
3109 r->anchored_offset = data.offset_fixed;
3110 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3111 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3112 || (RExC_flags & PMf_MULTILINE)));
3113 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3116 r->anchored_substr = r->anchored_utf8 = Nullsv;
3117 SvREFCNT_dec(data.longest_fixed);
3118 longest_fixed_length = 0;
3121 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3122 r->regstclass = NULL;
3123 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3125 && !(data.start_class->flags & ANYOF_EOS)
3126 && !cl_is_anything(data.start_class))
3128 I32 n = add_data(pRExC_state, 1, "f");
3130 New(1006, RExC_rx->data->data[n], 1,
3131 struct regnode_charclass_class);
3132 StructCopy(data.start_class,
3133 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3134 struct regnode_charclass_class);
3135 r->regstclass = (regnode*)RExC_rx->data->data[n];
3136 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3137 PL_regdata = r->data; /* for regprop() */
3138 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3139 regprop(sv, (regnode*)data.start_class);
3140 PerlIO_printf(Perl_debug_log,
3141 "synthetic stclass `%s'.\n",
3145 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3146 if (longest_fixed_length > longest_float_length) {
3147 r->check_substr = r->anchored_substr;
3148 r->check_utf8 = r->anchored_utf8;
3149 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3150 if (r->reganch & ROPT_ANCH_SINGLE)
3151 r->reganch |= ROPT_NOSCAN;
3154 r->check_substr = r->float_substr;
3155 r->check_utf8 = r->float_utf8;
3156 r->check_offset_min = data.offset_float_min;
3157 r->check_offset_max = data.offset_float_max;
3159 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3160 This should be changed ASAP! */
3161 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3162 r->reganch |= RE_USE_INTUIT;
3163 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3164 r->reganch |= RE_INTUIT_TAIL;
3168 /* Several toplevels. Best we can is to set minlen. */
3170 struct regnode_charclass_class ch_class;
3173 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3174 scan = r->program + 1;
3175 cl_init(pRExC_state, &ch_class);
3176 data.start_class = &ch_class;
3177 data.last_closep = &last_close;
3178 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3179 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3180 = r->float_substr = r->float_utf8 = Nullsv;
3181 if (!(data.start_class->flags & ANYOF_EOS)
3182 && !cl_is_anything(data.start_class))
3184 I32 n = add_data(pRExC_state, 1, "f");
3186 New(1006, RExC_rx->data->data[n], 1,
3187 struct regnode_charclass_class);
3188 StructCopy(data.start_class,
3189 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3190 struct regnode_charclass_class);
3191 r->regstclass = (regnode*)RExC_rx->data->data[n];
3192 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3193 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3194 regprop(sv, (regnode*)data.start_class);
3195 PerlIO_printf(Perl_debug_log,
3196 "synthetic stclass `%s'.\n",
3202 if (RExC_seen & REG_SEEN_GPOS)
3203 r->reganch |= ROPT_GPOS_SEEN;
3204 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3205 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3206 if (RExC_seen & REG_SEEN_EVAL)
3207 r->reganch |= ROPT_EVAL_SEEN;
3208 if (RExC_seen & REG_SEEN_CANY)
3209 r->reganch |= ROPT_CANY_SEEN;
3210 Newz(1002, r->startp, RExC_npar, I32);
3211 Newz(1002, r->endp, RExC_npar, I32);
3212 PL_regdata = r->data; /* for regprop() */
3213 DEBUG_COMPILE_r(regdump(r));
3218 - reg - regular expression, i.e. main body or parenthesized thing
3220 * Caller must absorb opening parenthesis.
3222 * Combining parenthesis handling with the base level of regular expression
3223 * is a trifle forced, but the need to tie the tails of the branches to what
3224 * follows makes it hard to avoid.
3227 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3228 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3230 register regnode *ret; /* Will be the head of the group. */
3231 register regnode *br;
3232 register regnode *lastbr;
3233 register regnode *ender = 0;
3234 register I32 parno = 0;
3235 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3237 /* for (?g), (?gc), and (?o) warnings; warning
3238 about (?c) will warn about (?g) -- japhy */
3240 I32 wastedflags = 0x00,
3243 wasted_gc = 0x02 | 0x04,
3246 char * parse_start = RExC_parse; /* MJD */
3247 char *oregcomp_parse = RExC_parse;
3250 *flagp = 0; /* Tentatively. */
3253 /* Make an OPEN node, if parenthesized. */
3255 if (*RExC_parse == '?') { /* (?...) */
3256 U32 posflags = 0, negflags = 0;
3257 U32 *flagsp = &posflags;
3259 char *seqstart = RExC_parse;
3262 paren = *RExC_parse++;
3263 ret = NULL; /* For look-ahead/behind. */
3265 case '<': /* (?<...) */
3266 RExC_seen |= REG_SEEN_LOOKBEHIND;
3267 if (*RExC_parse == '!')
3269 if (*RExC_parse != '=' && *RExC_parse != '!')
3272 case '=': /* (?=...) */
3273 case '!': /* (?!...) */
3274 RExC_seen_zerolen++;
3275 case ':': /* (?:...) */
3276 case '>': /* (?>...) */
3278 case '$': /* (?$...) */
3279 case '@': /* (?@...) */
3280 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3282 case '#': /* (?#...) */
3283 while (*RExC_parse && *RExC_parse != ')')
3285 if (*RExC_parse != ')')
3286 FAIL("Sequence (?#... not terminated");
3287 nextchar(pRExC_state);
3290 case 'p': /* (?p...) */
3291 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3292 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3294 case '?': /* (??...) */
3296 if (*RExC_parse != '{')
3298 paren = *RExC_parse++;
3300 case '{': /* (?{...}) */
3302 I32 count = 1, n = 0;
3304 char *s = RExC_parse;
3306 OP_4tree *sop, *rop;
3308 RExC_seen_zerolen++;
3309 RExC_seen |= REG_SEEN_EVAL;
3310 while (count && (c = *RExC_parse)) {
3311 if (c == '\\' && RExC_parse[1])
3319 if (*RExC_parse != ')')
3322 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3327 if (RExC_parse - 1 - s)
3328 sv = newSVpvn(s, RExC_parse - 1 - s);
3330 sv = newSVpvn("", 0);
3333 Perl_save_re_context(aTHX);
3334 rop = sv_compile_2op(sv, &sop, "re", &pad);
3335 sop->op_private |= OPpREFCOUNTED;
3336 /* re_dup will OpREFCNT_inc */
3337 OpREFCNT_set(sop, 1);
3340 n = add_data(pRExC_state, 3, "nop");
3341 RExC_rx->data->data[n] = (void*)rop;
3342 RExC_rx->data->data[n+1] = (void*)sop;
3343 RExC_rx->data->data[n+2] = (void*)pad;
3346 else { /* First pass */
3347 if (PL_reginterp_cnt < ++RExC_seen_evals
3349 /* No compiled RE interpolated, has runtime
3350 components ===> unsafe. */
3351 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3352 if (PL_tainting && PL_tainted)
3353 FAIL("Eval-group in insecure regular expression");
3354 if (IN_PERL_COMPILETIME)
3358 nextchar(pRExC_state);
3360 ret = reg_node(pRExC_state, LOGICAL);
3363 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3364 /* deal with the length of this later - MJD */
3367 ret = reganode(pRExC_state, EVAL, n);
3368 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3369 Set_Node_Offset(ret, parse_start);
3372 case '(': /* (?(?{...})...) and (?(?=...)...) */
3374 if (RExC_parse[0] == '?') { /* (?(?...)) */
3375 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3376 || RExC_parse[1] == '<'
3377 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3380 ret = reg_node(pRExC_state, LOGICAL);
3383 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3387 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3389 parno = atoi(RExC_parse++);
3391 while (isDIGIT(*RExC_parse))
3393 ret = reganode(pRExC_state, GROUPP, parno);
3395 if ((c = *nextchar(pRExC_state)) != ')')
3396 vFAIL("Switch condition not recognized");
3398 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3399 br = regbranch(pRExC_state, &flags, 1);
3401 br = reganode(pRExC_state, LONGJMP, 0);
3403 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3404 c = *nextchar(pRExC_state);
3408 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3409 regbranch(pRExC_state, &flags, 1);
3410 regtail(pRExC_state, ret, lastbr);
3413 c = *nextchar(pRExC_state);
3418 vFAIL("Switch (?(condition)... contains too many branches");
3419 ender = reg_node(pRExC_state, TAIL);
3420 regtail(pRExC_state, br, ender);
3422 regtail(pRExC_state, lastbr, ender);
3423 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3426 regtail(pRExC_state, ret, ender);
3430 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3434 RExC_parse--; /* for vFAIL to print correctly */
3435 vFAIL("Sequence (? incomplete");
3439 parse_flags: /* (?i) */
3440 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3441 /* (?g), (?gc) and (?o) are useless here
3442 and must be globally applied -- japhy */
3444 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3445 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3446 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3447 if (! (wastedflags & wflagbit) ) {
3448 wastedflags |= wflagbit;
3451 "Useless (%s%c) - %suse /%c modifier",
3452 flagsp == &negflags ? "?-" : "?",
3454 flagsp == &negflags ? "don't " : "",
3460 else if (*RExC_parse == 'c') {
3461 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3462 if (! (wastedflags & wasted_c) ) {
3463 wastedflags |= wasted_gc;
3466 "Useless (%sc) - %suse /gc modifier",
3467 flagsp == &negflags ? "?-" : "?",
3468 flagsp == &negflags ? "don't " : ""
3473 else { pmflag(flagsp, *RExC_parse); }
3477 if (*RExC_parse == '-') {
3479 wastedflags = 0; /* reset so (?g-c) warns twice */
3483 RExC_flags |= posflags;
3484 RExC_flags &= ~negflags;
3485 if (*RExC_parse == ':') {
3491 if (*RExC_parse != ')') {
3493 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3495 nextchar(pRExC_state);
3503 ret = reganode(pRExC_state, OPEN, parno);
3504 Set_Node_Length(ret, 1); /* MJD */
3505 Set_Node_Offset(ret, RExC_parse); /* MJD */
3512 /* Pick up the branches, linking them together. */
3513 parse_start = RExC_parse; /* MJD */
3514 br = regbranch(pRExC_state, &flags, 1);
3515 /* branch_len = (paren != 0); */
3519 if (*RExC_parse == '|') {
3520 if (!SIZE_ONLY && RExC_extralen) {
3521 reginsert(pRExC_state, BRANCHJ, br);
3524 reginsert(pRExC_state, BRANCH, br);
3525 Set_Node_Length(br, paren != 0);
3526 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3530 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3532 else if (paren == ':') {
3533 *flagp |= flags&SIMPLE;
3535 if (open) { /* Starts with OPEN. */
3536 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3538 else if (paren != '?') /* Not Conditional */
3540 *flagp |= flags & (SPSTART | HASWIDTH);
3542 while (*RExC_parse == '|') {
3543 if (!SIZE_ONLY && RExC_extralen) {
3544 ender = reganode(pRExC_state, LONGJMP,0);
3545 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3548 RExC_extralen += 2; /* Account for LONGJMP. */
3549 nextchar(pRExC_state);
3550 br = regbranch(pRExC_state, &flags, 0);
3554 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3558 *flagp |= flags&SPSTART;
3561 if (have_branch || paren != ':') {
3562 /* Make a closing node, and hook it on the end. */
3565 ender = reg_node(pRExC_state, TAIL);
3568 ender = reganode(pRExC_state, CLOSE, parno);
3569 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3570 Set_Node_Length(ender,1); /* MJD */
3576 *flagp &= ~HASWIDTH;
3579 ender = reg_node(pRExC_state, SUCCEED);
3582 ender = reg_node(pRExC_state, END);
3585 regtail(pRExC_state, lastbr, ender);
3588 /* Hook the tails of the branches to the closing node. */
3589 for (br = ret; br != NULL; br = regnext(br)) {
3590 regoptail(pRExC_state, br, ender);
3597 static const char parens[] = "=!<,>";
3599 if (paren && (p = strchr(parens, paren))) {
3600 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3601 int flag = (p - parens) > 1;
3604 node = SUSPEND, flag = 0;
3605 reginsert(pRExC_state, node,ret);
3606 Set_Node_Cur_Length(ret);
3607 Set_Node_Offset(ret, parse_start + 1);
3609 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3613 /* Check for proper termination. */
3615 RExC_flags = oregflags;
3616 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3617 RExC_parse = oregcomp_parse;
3618 vFAIL("Unmatched (");
3621 else if (!paren && RExC_parse < RExC_end) {
3622 if (*RExC_parse == ')') {
3624 vFAIL("Unmatched )");
3627 FAIL("Junk on end of regexp"); /* "Can't happen". */
3635 - regbranch - one alternative of an | operator
3637 * Implements the concatenation operator.
3640 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3642 register regnode *ret;
3643 register regnode *chain = NULL;
3644 register regnode *latest;
3645 I32 flags = 0, c = 0;
3650 if (!SIZE_ONLY && RExC_extralen)
3651 ret = reganode(pRExC_state, BRANCHJ,0);
3653 ret = reg_node(pRExC_state, BRANCH);
3654 Set_Node_Length(ret, 1);
3658 if (!first && SIZE_ONLY)
3659 RExC_extralen += 1; /* BRANCHJ */
3661 *flagp = WORST; /* Tentatively. */
3664 nextchar(pRExC_state);
3665 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3667 latest = regpiece(pRExC_state, &flags);
3668 if (latest == NULL) {
3669 if (flags & TRYAGAIN)
3673 else if (ret == NULL)
3675 *flagp |= flags&HASWIDTH;
3676 if (chain == NULL) /* First piece. */
3677 *flagp |= flags&SPSTART;
3680 regtail(pRExC_state, chain, latest);
3685 if (chain == NULL) { /* Loop ran zero times. */
3686 chain = reg_node(pRExC_state, NOTHING);
3691 *flagp |= flags&SIMPLE;
3698 - regpiece - something followed by possible [*+?]
3700 * Note that the branching code sequences used for ? and the general cases
3701 * of * and + are somewhat optimized: they use the same NOTHING node as
3702 * both the endmarker for their branch list and the body of the last branch.
3703 * It might seem that this node could be dispensed with entirely, but the
3704 * endmarker role is not redundant.
3707 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3709 register regnode *ret;
3711 register char *next;
3713 char *origparse = RExC_parse;
3716 I32 max = REG_INFTY;
3719 ret = regatom(pRExC_state, &flags);
3721 if (flags & TRYAGAIN)
3728 if (op == '{' && regcurly(RExC_parse)) {
3729 parse_start = RExC_parse; /* MJD */
3730 next = RExC_parse + 1;
3732 while (isDIGIT(*next) || *next == ',') {
3741 if (*next == '}') { /* got one */
3745 min = atoi(RExC_parse);
3749 maxpos = RExC_parse;
3751 if (!max && *maxpos != '0')
3752 max = REG_INFTY; /* meaning "infinity" */
3753 else if (max >= REG_INFTY)
3754 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3756 nextchar(pRExC_state);
3759 if ((flags&SIMPLE)) {
3760 RExC_naughty += 2 + RExC_naughty / 2;
3761 reginsert(pRExC_state, CURLY, ret);
3762 Set_Node_Offset(ret, parse_start+1); /* MJD */
3763 Set_Node_Cur_Length(ret);
3766 regnode *w = reg_node(pRExC_state, WHILEM);
3769 regtail(pRExC_state, ret, w);
3770 if (!SIZE_ONLY && RExC_extralen) {
3771 reginsert(pRExC_state, LONGJMP,ret);
3772 reginsert(pRExC_state, NOTHING,ret);
3773 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3775 reginsert(pRExC_state, CURLYX,ret);
3777 Set_Node_Offset(ret, parse_start+1);
3778 Set_Node_Length(ret,
3779 op == '{' ? (RExC_parse - parse_start) : 1);
3781 if (!SIZE_ONLY && RExC_extralen)
3782 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3783 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3785 RExC_whilem_seen++, RExC_extralen += 3;
3786 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3794 if (max && max < min)
3795 vFAIL("Can't do {n,m} with n > m");
3797 ARG1_SET(ret, (U16)min);
3798 ARG2_SET(ret, (U16)max);
3810 #if 0 /* Now runtime fix should be reliable. */
3812 /* if this is reinstated, don't forget to put this back into perldiag:
3814 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3816 (F) The part of the regexp subject to either the * or + quantifier
3817 could match an empty string. The {#} shows in the regular
3818 expression about where the problem was discovered.
3822 if (!(flags&HASWIDTH) && op != '?')
3823 vFAIL("Regexp *+ operand could be empty");
3826 parse_start = RExC_parse;
3827 nextchar(pRExC_state);
3829 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3831 if (op == '*' && (flags&SIMPLE)) {
3832 reginsert(pRExC_state, STAR, ret);
3836 else if (op == '*') {
3840 else if (op == '+' && (flags&SIMPLE)) {
3841 reginsert(pRExC_state, PLUS, ret);
3845 else if (op == '+') {
3849 else if (op == '?') {
3854 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3856 "%.*s matches null string many times",
3857 RExC_parse - origparse,
3861 if (*RExC_parse == '?') {
3862 nextchar(pRExC_state);
3863 reginsert(pRExC_state, MINMOD, ret);
3864 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3866 if (ISMULT2(RExC_parse)) {
3868 vFAIL("Nested quantifiers");
3875 - regatom - the lowest level
3877 * Optimization: gobbles an entire sequence of ordinary characters so that
3878 * it can turn them into a single node, which is smaller to store and
3879 * faster to run. Backslashed characters are exceptions, each becoming a
3880 * separate node; the code is simpler that way and it's not worth fixing.
3882 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3884 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3886 register regnode *ret = 0;
3888 char *parse_start = RExC_parse;
3890 *flagp = WORST; /* Tentatively. */
3893 switch (*RExC_parse) {
3895 RExC_seen_zerolen++;
3896 nextchar(pRExC_state);
3897 if (RExC_flags & PMf_MULTILINE)
3898 ret = reg_node(pRExC_state, MBOL);
3899 else if (RExC_flags & PMf_SINGLELINE)
3900 ret = reg_node(pRExC_state, SBOL);
3902 ret = reg_node(pRExC_state, BOL);
3903 Set_Node_Length(ret, 1); /* MJD */
3906 nextchar(pRExC_state);
3908 RExC_seen_zerolen++;
3909 if (RExC_flags & PMf_MULTILINE)
3910 ret = reg_node(pRExC_state, MEOL);
3911 else if (RExC_flags & PMf_SINGLELINE)
3912 ret = reg_node(pRExC_state, SEOL);
3914 ret = reg_node(pRExC_state, EOL);
3915 Set_Node_Length(ret, 1); /* MJD */
3918 nextchar(pRExC_state);
3919 if (RExC_flags & PMf_SINGLELINE)
3920 ret = reg_node(pRExC_state, SANY);
3922 ret = reg_node(pRExC_state, REG_ANY);
3923 *flagp |= HASWIDTH|SIMPLE;
3925 Set_Node_Length(ret, 1); /* MJD */
3929 char *oregcomp_parse = ++RExC_parse;
3930 ret = regclass(pRExC_state);
3931 if (*RExC_parse != ']') {
<