5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 8
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHPTR(PL_regstartp);
186 SSPUSHPTR(PL_regendp);
187 SSPUSHINT(PL_regsize);
188 SSPUSHINT(*PL_reglastparen);
189 SSPUSHINT(*PL_reglastcloseparen);
190 SSPUSHPTR(PL_reginput);
191 #define REGCP_FRAME_ELEMS 2
192 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
193 * are needed for the regexp context stack bookkeeping. */
194 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
195 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
200 /* These are needed since we do not localize EVAL nodes: */
201 #define REGCP_SET(cp) \
203 PerlIO_printf(Perl_debug_log, \
204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
205 (IV)PL_savestack_ix)); \
208 #define REGCP_UNWIND(cp) \
210 if (cp != PL_savestack_ix) \
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
213 (IV)(cp), (IV)PL_savestack_ix)); \
217 S_regcppop(pTHX_ const regexp *rex)
223 GET_RE_DEBUG_FLAGS_DECL;
225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
229 input = (char *) SSPOPPTR;
230 *PL_reglastcloseparen = SSPOPINT;
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
233 PL_regendp=(I32 *) SSPOPPTR;
234 PL_regstartp=(I32 *) SSPOPPTR;
237 /* Now restore the parentheses context. */
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
241 U32 paren = (U32)SSPOPINT;
242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
243 PL_regstartp[paren] = SSPOPINT;
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
248 PerlIO_printf(Perl_debug_log,
249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
250 (UV)paren, (IV)PL_regstartp[paren],
251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
252 (IV)PL_regendp[paren],
253 (paren > *PL_reglastparen ? "(no)" : ""));
257 if (*PL_reglastparen + 1 <= rex->nparens) {
258 PerlIO_printf(Perl_debug_log,
259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
260 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
276 PL_regstartp[i] = -1;
283 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
286 * pregexec and friends
289 #ifndef PERL_IN_XSUB_RE
291 - pregexec - match a regexp against a string
294 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
295 char *strbeg, I32 minend, SV *screamer, U32 nosave)
296 /* strend: pointer to null at end of string */
297 /* strbeg: real beginning of string */
298 /* minend: end of match must be >=minend after stringarg. */
299 /* nosave: For optimizations. */
302 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
303 nosave ? 0 : REXEC_COPY_STR);
308 * Need to implement the following flags for reg_anch:
310 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
313 * INTUIT_AUTORITATIVE_ML
314 * INTUIT_ONCE_NOML - Intuit can match in one location only.
317 * Another flag for this function: SECOND_TIME (so that float substrs
318 * with giant delta may be not rechecked).
321 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
324 Otherwise, only SvCUR(sv) is used to get strbeg. */
326 /* XXXX We assume that strpos is strbeg unless sv. */
328 /* XXXX Some places assume that there is a fixed substring.
329 An update may be needed if optimizer marks as "INTUITable"
330 RExen without fixed substrings. Similarly, it is assumed that
331 lengths of all the strings are no more than minlen, thus they
332 cannot come from lookahead.
333 (Or minlen should take into account lookahead.) */
335 /* A failure to find a constant substring means that there is no need to make
336 an expensive call to REx engine, thus we celebrate a failure. Similarly,
337 finding a substring too deep into the string means that less calls to
338 regtry() should be needed.
340 REx compiler's optimizer found 4 possible hints:
341 a) Anchored substring;
343 c) Whether we are anchored (beginning-of-line or \G);
344 d) First node (of those at offset 0) which may distingush positions;
345 We use a)b)d) and multiline-part of c), and try to find a position in the
346 string which does not contradict any of them.
349 /* Most of decisions we do here should have been done at compile time.
350 The nodes of the REx which we used for the search should have been
351 deleted from the finite automaton. */
354 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
355 char *strend, U32 flags, re_scream_pos_data *data)
358 register I32 start_shift = 0;
359 /* Should be nonnegative! */
360 register I32 end_shift = 0;
365 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
367 register char *other_last = NULL; /* other substr checked before this */
368 char *check_at = NULL; /* check substr found at this pos */
369 const I32 multiline = prog->reganch & PMf_MULTILINE;
371 const char * const i_strpos = strpos;
374 GET_RE_DEBUG_FLAGS_DECL;
376 RX_MATCH_UTF8_set(prog,do_utf8);
378 if (prog->reganch & ROPT_UTF8) {
379 PL_reg_flags |= RF_utf8;
382 debug_start_match(prog, do_utf8, strpos, strend,
383 sv ? "Guessing start of match in sv for"
384 : "Guessing start of match in string for");
387 /* CHR_DIST() would be more correct here but it makes things slow. */
388 if (prog->minlen > strend - strpos) {
389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
390 "String too short... [re_intuit_start]\n"));
394 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
397 if (!prog->check_utf8 && prog->check_substr)
398 to_utf8_substr(prog);
399 check = prog->check_utf8;
401 if (!prog->check_substr && prog->check_utf8)
402 to_byte_substr(prog);
403 check = prog->check_substr;
405 if (check == &PL_sv_undef) {
406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
407 "Non-utf8 string cannot match utf8 check string\n"));
410 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
411 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
412 || ( (prog->reganch & ROPT_ANCH_BOL)
413 && !multiline ) ); /* Check after \n? */
416 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
417 | ROPT_IMPLICIT)) /* not a real BOL */
418 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
420 && (strpos != strbeg)) {
421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
424 if (prog->check_offset_min == prog->check_offset_max &&
425 !(prog->reganch & ROPT_CANY_SEEN)) {
426 /* Substring at constant offset from beg-of-str... */
429 s = HOP3c(strpos, prog->check_offset_min, strend);
432 slen = SvCUR(check); /* >= 1 */
434 if ( strend - s > slen || strend - s < slen - 1
435 || (strend - s == slen && strend[-1] != '\n')) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
439 /* Now should match s[0..slen-2] */
441 if (slen && (*SvPVX_const(check) != *s
443 && memNE(SvPVX_const(check), s, slen)))) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
449 else if (*SvPVX_const(check) != *s
450 || ((slen = SvCUR(check)) > 1
451 && memNE(SvPVX_const(check), s, slen)))
454 goto success_at_start;
457 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
459 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
460 end_shift = prog->check_end_shift;
463 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
464 - (SvTAIL(check) != 0);
465 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
467 if (end_shift < eshift)
471 else { /* Can match at random position */
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->check_end_shift;
477 /* end shift should be non negative here */
480 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
482 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
483 (IV)end_shift, prog->precomp);
487 /* Find a possible match in the region s..strend by looking for
488 the "check" substring in the region corrected by start/end_shift. */
491 I32 srch_start_shift = start_shift;
492 I32 srch_end_shift = end_shift;
493 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
494 srch_end_shift -= ((strbeg - s) - srch_start_shift);
495 srch_start_shift = strbeg - s;
497 DEBUG_OPTIMISE_MORE_r({
498 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
499 (IV)prog->check_offset_min,
500 (IV)srch_start_shift,
502 (IV)prog->check_end_shift);
505 if (flags & REXEC_SCREAM) {
506 I32 p = -1; /* Internal iterator of scream. */
507 I32 * const pp = data ? data->scream_pos : &p;
509 if (PL_screamfirst[BmRARE(check)] >= 0
510 || ( BmRARE(check) == '\n'
511 && (BmPREVIOUS(check) == SvCUR(check) - 1)
513 s = screaminstr(sv, check,
514 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
517 /* we may be pointing at the wrong string */
518 if (s && RX_MATCH_COPIED(prog))
519 s = strbeg + (s - SvPVX_const(sv));
521 *data->scream_olds = s;
526 if (prog->reganch & ROPT_CANY_SEEN) {
527 start_point= (U8*)(s + srch_start_shift);
528 end_point= (U8*)(strend - srch_end_shift);
530 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
531 end_point= HOP3(strend, -srch_end_shift, strbeg);
533 DEBUG_OPTIMISE_MORE_r({
534 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
535 (int)(end_point - start_point),
536 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
540 s = fbm_instr( start_point, end_point,
541 check, multiline ? FBMrf_MULTILINE : 0);
544 /* Update the count-of-usability, remove useless subpatterns,
548 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
549 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
550 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
551 (s ? "Found" : "Did not find"),
552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
553 ? "anchored" : "floating"),
556 (s ? " at offset " : "...\n") );
561 /* Finish the diagnostic message */
562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
564 /* XXX dmq: first branch is for positive lookbehind...
565 Our check string is offset from the beginning of the pattern.
566 So we need to do any stclass tests offset forward from that
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
578 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
584 : (prog->float_substr && prog->anchored_substr))
586 /* Take into account the "other" substring. */
587 /* XXXX May be hopelessly wrong for UTF... */
590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
593 char * const last = HOP3c(s, -start_shift, strbeg);
595 char * const saved_s = s;
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
606 t = HOP3c(t, prog->anchored_offset, strend);
607 if (t < other_last) /* These positions already checked */
609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
612 /* XXXX It is not documented what units *_offsets are in.
613 We assume bytes, but this is clearly wrong.
614 Meaning this code needs to be carefully reviewed for errors.
618 /* On end-of-str: see comment below. */
619 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
620 if (must == &PL_sv_undef) {
622 DEBUG_r(must = prog->anchored_utf8); /* for debug */
627 HOP3(HOP3(last1, prog->anchored_offset, strend)
628 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
630 multiline ? FBMrf_MULTILINE : 0
633 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
634 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
635 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
636 (s ? "Found" : "Contradicts"),
637 quoted, RE_SV_TAIL(must));
642 if (last1 >= last2) {
643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
644 ", giving up...\n"));
647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
648 ", trying floating at offset %ld...\n",
649 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
650 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
651 s = HOP3c(last, 1, strend);
655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
656 (long)(s - i_strpos)));
657 t = HOP3c(s, -prog->anchored_offset, strbeg);
658 other_last = HOP3c(s, 1, strend);
666 else { /* Take into account the floating substring. */
668 char * const saved_s = s;
671 t = HOP3c(s, -start_shift, strbeg);
673 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
674 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
675 last = HOP3c(t, prog->float_max_offset, strend);
676 s = HOP3c(t, prog->float_min_offset, strend);
679 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
680 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
681 /* fbm_instr() takes into account exact value of end-of-str
682 if the check is SvTAIL(ed). Since false positives are OK,
683 and end-of-str is not later than strend we are OK. */
684 if (must == &PL_sv_undef) {
686 DEBUG_r(must = prog->float_utf8); /* for debug message */
689 s = fbm_instr((unsigned char*)s,
690 (unsigned char*)last + SvCUR(must)
692 must, multiline ? FBMrf_MULTILINE : 0);
694 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
695 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
696 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
697 (s ? "Found" : "Contradicts"),
698 quoted, RE_SV_TAIL(must));
702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
703 ", giving up...\n"));
706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
707 ", trying anchored starting at offset %ld...\n",
708 (long)(saved_s + 1 - i_strpos)));
710 s = HOP3c(t, 1, strend);
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
715 (long)(s - i_strpos)));
716 other_last = s; /* Fix this later. --Hugo */
726 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
728 DEBUG_OPTIMISE_MORE_r(
729 PerlIO_printf(Perl_debug_log,
730 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
731 (IV)prog->check_offset_min,
732 (IV)prog->check_offset_max,
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
748 if (ml_anch && t[-1] != '\n') {
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
756 while (t < strend - prog->minlen) {
758 if (t < check_at - prog->check_offset_min) {
759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
769 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
770 goto do_other_anchored;
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
776 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
783 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
784 other_last = strpos = s = t + 1;
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
790 PL_colors[0], PL_colors[1]));
794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
795 PL_colors[0], PL_colors[1]));
799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
802 /* The found string does not prohibit matching at strpos,
803 - no optimization of calling REx engine can be performed,
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
810 && (strpos != strbeg) && strpos[-1] != '\n'
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
817 DEBUG_EXECUTE_r( if (ml_anch)
818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
819 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
833 /* If flags & SOMETHING - do not do it many times on the same match */
834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
838 prog->check_substr = prog->check_utf8 = NULL; /* disable */
839 prog->float_substr = prog->float_utf8 = NULL; /* clear */
840 check = NULL; /* abort */
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
845 prog->reganch &= ~RE_USE_INTUIT;
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
853 /* trie stclasses are too expensive to use here, we are better off to
854 leave it to regmatch itself */
855 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
864 const U8* const str = (U8*)STRING(prog->regstclass);
865 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
869 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
871 else if (prog->float_substr || prog->float_utf8)
872 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
877 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
880 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
883 const char *what = NULL;
885 if (endpos == strend) {
886 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
887 "Could not match STCLASS...\n") );
890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
891 "This position contradicts STCLASS...\n") );
892 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
894 /* Contradict one of substrings */
895 if (prog->anchored_substr || prog->anchored_utf8) {
896 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
897 DEBUG_EXECUTE_r( what = "anchored" );
899 s = HOP3c(t, 1, strend);
900 if (s + start_shift + end_shift > strend) {
901 /* XXXX Should be taken into account earlier? */
902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
903 "Could not match STCLASS...\n") );
908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
909 "Looking for %s substr starting at offset %ld...\n",
910 what, (long)(s + start_shift - i_strpos)) );
913 /* Have both, check_string is floating */
914 if (t + start_shift >= check_at) /* Contradicts floating=check */
915 goto retry_floating_check;
916 /* Recheck anchored substring, but not floating... */
920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
921 "Looking for anchored substr starting at offset %ld...\n",
922 (long)(other_last - i_strpos)) );
923 goto do_other_anchored;
925 /* Another way we could have checked stclass at the
926 current position only: */
931 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
932 "Looking for /%s^%s/m starting at offset %ld...\n",
933 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
936 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
938 /* Check is floating subtring. */
939 retry_floating_check:
940 t = check_at - start_shift;
941 DEBUG_EXECUTE_r( what = "floating" );
942 goto hop_and_restart;
945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946 "By STCLASS: moving %ld --> %ld\n",
947 (long)(t - i_strpos), (long)(s - i_strpos))
951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
952 "Does not contradict STCLASS...\n");
957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
958 PL_colors[4], (check ? "Guessed" : "Giving up"),
959 PL_colors[5], (long)(s - i_strpos)) );
962 fail_finish: /* Substring not found */
963 if (prog->check_substr || prog->check_utf8) /* could be removed already */
964 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
967 PL_colors[4], PL_colors[5]));
973 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
974 foldlen, foldbuf, uniflags) STMT_START { \
975 switch (trie_type) { \
976 case trie_utf8_fold: \
978 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
983 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
985 foldlen -= UNISKIP( uvc ); \
986 uscan = foldbuf + UNISKIP( uvc ); \
990 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
998 charid = trie->charmap[ uvc ]; \
1002 if (trie->widecharmap) { \
1003 SV** const svpp = hv_fetch(trie->widecharmap, \
1004 (char*)&uvc, sizeof(UV), 0); \
1006 charid = (U16)SvIV(*svpp); \
1011 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1014 ibcmp_utf8(s, NULL, 0, do_utf8, \
1015 m, NULL, ln, (bool)UTF)) \
1016 && (!reginfo || regtry(reginfo, s)) ) \
1019 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1020 uvchr_to_utf8(tmpbuf, c); \
1021 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1023 && (f == c1 || f == c2) \
1024 && (ln == foldlen || \
1025 !ibcmp_utf8((char *) foldbuf, \
1026 NULL, foldlen, do_utf8, \
1028 NULL, ln, (bool)UTF)) \
1029 && (!reginfo || regtry(reginfo, s)) ) \
1034 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1038 && (ln == 1 || !(OP(c) == EXACTF \
1040 : ibcmp_locale(s, m, ln))) \
1041 && (!reginfo || regtry(reginfo, s)) ) \
1047 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1049 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1055 #define REXEC_FBC_SCAN(CoDe) \
1057 while (s < strend) { \
1063 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1064 REXEC_FBC_UTF8_SCAN( \
1066 if (tmp && (!reginfo || regtry(reginfo, s))) \
1075 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1078 if (tmp && (!reginfo || regtry(reginfo, s))) \
1087 #define REXEC_FBC_TRYIT \
1088 if ((!reginfo || regtry(reginfo, s))) \
1091 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1094 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1097 REXEC_FBC_CLASS_SCAN(CoNd); \
1101 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1102 PL_reg_flags |= RF_tainted; \
1104 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1107 REXEC_FBC_CLASS_SCAN(CoNd); \
1111 #define DUMP_EXEC_POS(li,s,doutf8) \
1112 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1114 /* We know what class REx starts with. Try to find this position... */
1115 /* if reginfo is NULL, its a dryrun */
1116 /* annoyingly all the vars in this routine have different names from their counterparts
1117 in regmatch. /grrr */
1120 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1121 const char *strend, const regmatch_info *reginfo)
1124 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1128 register STRLEN uskip;
1132 register I32 tmp = 1; /* Scratch variable? */
1133 register const bool do_utf8 = PL_reg_match_utf8;
1135 /* We know what class it must start with. */
1139 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1140 !UTF8_IS_INVARIANT((U8)s[0]) ?
1141 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1142 REGINCLASS(prog, c, (U8*)s));
1145 while (s < strend) {
1148 if (REGINCLASS(prog, c, (U8*)s) ||
1149 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1150 /* The assignment of 2 is intentional:
1151 * for the folded sharp s, the skip is 2. */
1152 (skip = SHARP_S_SKIP))) {
1153 if (tmp && (!reginfo || regtry(reginfo, s)))
1166 if (tmp && (!reginfo || regtry(reginfo, s)))
1174 ln = STR_LEN(c); /* length to match in octets/bytes */
1175 lnc = (I32) ln; /* length to match in characters */
1177 STRLEN ulen1, ulen2;
1179 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1180 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1181 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1183 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1184 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1186 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1188 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1191 while (sm < ((U8 *) m + ln)) {
1206 c2 = PL_fold_locale[c1];
1208 e = HOP3c(strend, -((I32)lnc), s);
1210 if (!reginfo && e < s)
1211 e = s; /* Due to minlen logic of intuit() */
1213 /* The idea in the EXACTF* cases is to first find the
1214 * first character of the EXACTF* node and then, if
1215 * necessary, case-insensitively compare the full
1216 * text of the node. The c1 and c2 are the first
1217 * characters (though in Unicode it gets a bit
1218 * more complicated because there are more cases
1219 * than just upper and lower: one needs to use
1220 * the so-called folding case for case-insensitive
1221 * matching (called "loose matching" in Unicode).
1222 * ibcmp_utf8() will do just that. */
1226 U8 tmpbuf [UTF8_MAXBYTES+1];
1227 STRLEN len, foldlen;
1228 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1230 /* Upper and lower of 1st char are equal -
1231 * probably not a "letter". */
1233 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1235 REXEC_FBC_EXACTISH_CHECK(c == c1);
1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1243 /* Handle some of the three Greek sigmas cases.
1244 * Note that not all the possible combinations
1245 * are handled here: some of them are handled
1246 * by the standard folding rules, and some of
1247 * them (the character class or ANYOF cases)
1248 * are handled during compiletime in
1249 * regexec.c:S_regclass(). */
1250 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1251 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1252 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1254 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1260 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1262 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1266 PL_reg_flags |= RF_tainted;
1273 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1274 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1276 tmp = ((OP(c) == BOUND ?
1277 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1278 LOAD_UTF8_CHARCLASS_ALNUM();
1279 REXEC_FBC_UTF8_SCAN(
1280 if (tmp == !(OP(c) == BOUND ?
1281 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1282 isALNUM_LC_utf8((U8*)s)))
1290 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1291 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1294 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1300 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1304 PL_reg_flags |= RF_tainted;
1311 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1312 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1314 tmp = ((OP(c) == NBOUND ?
1315 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1316 LOAD_UTF8_CHARCLASS_ALNUM();
1317 REXEC_FBC_UTF8_SCAN(
1318 if (tmp == !(OP(c) == NBOUND ?
1319 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1320 isALNUM_LC_utf8((U8*)s)))
1322 else REXEC_FBC_TRYIT;
1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1327 tmp = ((OP(c) == NBOUND ?
1328 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1331 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1333 else REXEC_FBC_TRYIT;
1336 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1340 REXEC_FBC_CSCAN_PRELOAD(
1341 LOAD_UTF8_CHARCLASS_ALNUM(),
1342 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1346 REXEC_FBC_CSCAN_TAINT(
1347 isALNUM_LC_utf8((U8*)s),
1351 REXEC_FBC_CSCAN_PRELOAD(
1352 LOAD_UTF8_CHARCLASS_ALNUM(),
1353 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1357 REXEC_FBC_CSCAN_TAINT(
1358 !isALNUM_LC_utf8((U8*)s),
1362 REXEC_FBC_CSCAN_PRELOAD(
1363 LOAD_UTF8_CHARCLASS_SPACE(),
1364 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1368 REXEC_FBC_CSCAN_TAINT(
1369 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1373 REXEC_FBC_CSCAN_PRELOAD(
1374 LOAD_UTF8_CHARCLASS_SPACE(),
1375 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1379 REXEC_FBC_CSCAN_TAINT(
1380 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1384 REXEC_FBC_CSCAN_PRELOAD(
1385 LOAD_UTF8_CHARCLASS_DIGIT(),
1386 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1390 REXEC_FBC_CSCAN_TAINT(
1391 isDIGIT_LC_utf8((U8*)s),
1395 REXEC_FBC_CSCAN_PRELOAD(
1396 LOAD_UTF8_CHARCLASS_DIGIT(),
1397 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1401 REXEC_FBC_CSCAN_TAINT(
1402 !isDIGIT_LC_utf8((U8*)s),
1408 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1409 trie_type = do_utf8 ?
1410 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1412 /* what trie are we using right now */
1414 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1415 reg_trie_data *trie=aho->trie;
1417 const char *last_start = strend - trie->minlen;
1419 const char *real_start = s;
1421 STRLEN maxlen = trie->maxlen;
1423 U8 **points; /* map of where we were in the input string
1424 when reading a given char. For ASCII this
1425 is unnecessary overhead as the relationship
1426 is always 1:1, but for unicode, especially
1427 case folded unicode this is not true. */
1428 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1432 GET_RE_DEBUG_FLAGS_DECL;
1434 /* We can't just allocate points here. We need to wrap it in
1435 * an SV so it gets freed properly if there is a croak while
1436 * running the match */
1439 sv_points=newSV(maxlen * sizeof(U8 *));
1440 SvCUR_set(sv_points,
1441 maxlen * sizeof(U8 *));
1442 SvPOK_on(sv_points);
1443 sv_2mortal(sv_points);
1444 points=(U8**)SvPV_nolen(sv_points );
1445 if ( trie_type != trie_utf8_fold
1446 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1449 bitmap=(U8*)trie->bitmap;
1451 bitmap=(U8*)ANYOF_BITMAP(c);
1453 /* this is the Aho-Corasick algorithm modified a touch
1454 to include special handling for long "unknown char"
1455 sequences. The basic idea being that we use AC as long
1456 as we are dealing with a possible matching char, when
1457 we encounter an unknown char (and we have not encountered
1458 an accepting state) we scan forward until we find a legal
1460 AC matching is basically that of trie matching, except
1461 that when we encounter a failing transition, we fall back
1462 to the current states "fail state", and try the current char
1463 again, a process we repeat until we reach the root state,
1464 state 1, or a legal transition. If we fail on the root state
1465 then we can either terminate if we have reached an accepting
1466 state previously, or restart the entire process from the beginning
1470 while (s <= last_start) {
1471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1479 U8 *uscan = (U8*)NULL;
1480 U8 *leftmost = NULL;
1482 U32 accepted_word= 0;
1486 while ( state && uc <= (U8*)strend ) {
1488 U32 word = aho->states[ state ].wordnum;
1492 DEBUG_TRIE_EXECUTE_r(
1493 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1494 dump_exec_pos( (char *)uc, c, strend, real_start,
1495 (char *)uc, do_utf8 );
1496 PerlIO_printf( Perl_debug_log,
1497 " Scanning for legal start char...\n");
1500 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1505 if (uc >(U8*)last_start) break;
1509 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1510 if (!leftmost || lpos < leftmost) {
1511 DEBUG_r(accepted_word=word);
1517 points[pointpos++ % maxlen]= uc;
1518 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1519 uvc, charid, foldlen, foldbuf, uniflags);
1520 DEBUG_TRIE_EXECUTE_r({
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1523 PerlIO_printf(Perl_debug_log,
1524 " Charid:%3u CP:%4"UVxf" ",
1530 word = aho->states[ state ].wordnum;
1532 base = aho->states[ state ].trans.base;
1534 DEBUG_TRIE_EXECUTE_r({
1536 dump_exec_pos( (char *)uc, c, strend, real_start,
1538 PerlIO_printf( Perl_debug_log,
1539 "%sState: %4"UVxf", word=%"UVxf,
1540 failed ? " Fail transition to " : "",
1541 (UV)state, (UV)word);
1546 (base + charid > trie->uniquecharcount )
1547 && (base + charid - 1 - trie->uniquecharcount
1549 && trie->trans[base + charid - 1 -
1550 trie->uniquecharcount].check == state
1551 && (tmp=trie->trans[base + charid - 1 -
1552 trie->uniquecharcount ].next))
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - legal\n"));
1560 DEBUG_TRIE_EXECUTE_r(
1561 PerlIO_printf( Perl_debug_log," - fail\n"));
1563 state = aho->fail[state];
1567 /* we must be accepting here */
1568 DEBUG_TRIE_EXECUTE_r(
1569 PerlIO_printf( Perl_debug_log," - accepting\n"));
1578 if (!state) state = 1;
1581 if ( aho->states[ state ].wordnum ) {
1582 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1583 if (!leftmost || lpos < leftmost) {
1584 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1589 s = (char*)leftmost;
1590 DEBUG_TRIE_EXECUTE_r({
1592 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1593 (UV)accepted_word, s - real_start
1596 if (!reginfo || regtry(reginfo, s)) {
1602 DEBUG_TRIE_EXECUTE_r({
1603 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1606 DEBUG_TRIE_EXECUTE_r(
1607 PerlIO_printf( Perl_debug_log,"No match.\n"));
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1625 - regexec_flags - match a regexp against a string
1628 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630 /* strend: pointer to null at end of string */
1631 /* strbeg: real beginning of string */
1632 /* minend: end of match must be >=minend after stringarg. */
1633 /* data: May be used for some additional optimizations. */
1634 /* nosave: For optimizations. */
1638 register regnode *c;
1639 register char *startpos = stringarg;
1640 I32 minlen; /* must match at least this many chars */
1641 I32 dontbother = 0; /* how many characters not to try at end */
1642 I32 end_shift = 0; /* Same for the end. */ /* CC */
1643 I32 scream_pos = -1; /* Internal iterator of scream. */
1644 char *scream_olds = NULL;
1645 SV* const oreplsv = GvSV(PL_replgv);
1646 const bool do_utf8 = (bool)DO_UTF8(sv);
1649 regmatch_info reginfo; /* create some info to pass to regtry etc */
1651 GET_RE_DEBUG_FLAGS_DECL;
1653 PERL_UNUSED_ARG(data);
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1661 multiline = prog->reganch & PMf_MULTILINE;
1662 reginfo.prog = prog;
1664 RX_MATCH_UTF8_set(prog, do_utf8);
1666 debug_start_match(prog, do_utf8, startpos, strend,
1670 minlen = prog->minlen;
1672 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1674 "String too short [regexec_flags]...\n"));
1679 /* Check validity of program. */
1680 if (UCHARAT(prog->program) != REG_MAGIC) {
1681 Perl_croak(aTHX_ "corrupted regexp program");
1685 PL_reg_eval_set = 0;
1688 if (prog->reganch & ROPT_UTF8)
1689 PL_reg_flags |= RF_utf8;
1691 /* Mark beginning of line for ^ and lookbehind. */
1692 reginfo.bol = startpos; /* XXX not used ??? */
1696 /* Mark end of line for $ (and such) */
1699 /* see how far we have to get to not match where we matched before */
1700 reginfo.till = startpos+minend;
1702 /* If there is a "must appear" string, look for it. */
1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1709 reginfo.ganch = startpos;
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
1714 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1715 if (prog->reganch & ROPT_ANCH_GPOS) {
1716 if (s > reginfo.ganch)
1721 else /* pos() not defined */
1722 reginfo.ganch = strbeg;
1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1726 re_scream_pos_data d;
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1733 goto phooey; /* not present */
1739 /* Simplest case: anchored match need be tried only once. */
1740 /* [unless only anchor is BOL and multiline is set] */
1741 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1742 if (s == startpos && regtry(®info, startpos))
1744 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1745 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1750 dontbother = minlen - 1;
1751 end = HOP3c(strend, -dontbother, strbeg) - 1;
1752 /* for multiline we only have to try after newlines */
1753 if (prog->check_substr || prog->check_utf8) {
1757 if (regtry(®info, s))
1762 if (prog->reganch & RE_USE_INTUIT) {
1763 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1774 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1775 if (regtry(®info, s))
1782 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1784 /* the warning about reginfo.ganch being used without intialization
1785 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1786 and we only enter this block when the same bit is set. */
1787 if (regtry(®info, reginfo.ganch))
1792 /* Messy cases: unanchored match. */
1793 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1794 /* we have /x+whatever/ */
1795 /* it must be a one character string (XXXX Except UTF?) */
1800 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1801 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1802 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1807 DEBUG_EXECUTE_r( did_match = 1 );
1808 if (regtry(®info, s)) goto got_it;
1810 while (s < strend && *s == ch)
1818 DEBUG_EXECUTE_r( did_match = 1 );
1819 if (regtry(®info, s)) goto got_it;
1821 while (s < strend && *s == ch)
1826 DEBUG_EXECUTE_r(if (!did_match)
1827 PerlIO_printf(Perl_debug_log,
1828 "Did not find anchored character...\n")
1831 else if (prog->anchored_substr != NULL
1832 || prog->anchored_utf8 != NULL
1833 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1834 && prog->float_max_offset < strend - s)) {
1839 char *last1; /* Last position checked before */
1843 if (prog->anchored_substr || prog->anchored_utf8) {
1844 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1847 back_max = back_min = prog->anchored_offset;
1849 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1850 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1851 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1852 back_max = prog->float_max_offset;
1853 back_min = prog->float_min_offset;
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1864 last = HOP3c(strend, /* Cannot start after this */
1865 -(I32)(CHR_SVLEN(must)
1866 - (SvTAIL(must) != 0) + back_min), strbeg);
1869 last1 = HOPc(s, -1);
1871 last1 = s - 1; /* bogus */
1873 /* XXXX check_substr already used to find "s", can optimize if
1874 check_substr==must. */
1876 dontbother = end_shift;
1877 strend = HOPc(strend, -dontbother);
1878 while ( (s <= last) &&
1879 ((flags & REXEC_SCREAM)
1880 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1881 end_shift, &scream_pos, 0))
1882 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1883 (unsigned char*)strend, must,
1884 multiline ? FBMrf_MULTILINE : 0))) ) {
1885 /* we may be pointing at the wrong string */
1886 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1887 s = strbeg + (s - SvPVX_const(sv));
1888 DEBUG_EXECUTE_r( did_match = 1 );
1889 if (HOPc(s, -back_max) > last1) {
1890 last1 = HOPc(s, -back_min);
1891 s = HOPc(s, -back_max);
1894 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1896 last1 = HOPc(s, -back_min);
1900 while (s <= last1) {
1901 if (regtry(®info, s))
1907 while (s <= last1) {
1908 if (regtry(®info, s))
1914 DEBUG_EXECUTE_r(if (!did_match) {
1915 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1916 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1917 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1918 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1919 ? "anchored" : "floating"),
1920 quoted, RE_SV_TAIL(must));
1924 else if ( (c = prog->regstclass) ) {
1926 const OPCODE op = OP(prog->regstclass);
1927 /* don't bother with what can't match */
1928 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1929 strend = HOPc(strend, -(minlen - 1));
1932 SV * const prop = sv_newmortal();
1933 regprop(prog, prop, c);
1935 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1937 PerlIO_printf(Perl_debug_log,
1938 "Matching stclass %.*s against %s (%d chars)\n",
1939 (int)SvCUR(prop), SvPVX_const(prop),
1940 quoted, (int)(strend - s));
1943 if (find_byclass(prog, c, s, strend, ®info))
1945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1949 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1954 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1955 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1956 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1958 if (flags & REXEC_SCREAM) {
1959 last = screaminstr(sv, float_real, s - strbeg,
1960 end_shift, &scream_pos, 1); /* last one */
1962 last = scream_olds; /* Only one occurrence. */
1963 /* we may be pointing at the wrong string */
1964 else if (RX_MATCH_COPIED(prog))
1965 s = strbeg + (s - SvPVX_const(sv));
1969 const char * const little = SvPV_const(float_real, len);
1971 if (SvTAIL(float_real)) {
1972 if (memEQ(strend - len + 1, little, len - 1))
1973 last = strend - len + 1;
1974 else if (!multiline)
1975 last = memEQ(strend - len, little, len)
1976 ? strend - len : NULL;
1982 last = rninstr(s, strend, little, little + len);
1984 last = strend; /* matching "$" */
1989 PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
1992 goto phooey; /* Should not happen! */
1994 dontbother = strend - last + prog->float_min_offset;
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
2002 if (regtry(®info, s))
2011 if (regtry(®info, s))
2013 } while (s++ < strend);
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2029 restore_pos(aTHX_ prog);
2032 /* make sure $`, $&, $', and $digit will work later */
2033 if ( !(flags & REXEC_NOT_FIRST) ) {
2034 RX_MATCH_COPY_FREE(prog);
2035 if (flags & REXEC_COPY_STR) {
2036 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2037 #ifdef PERL_OLD_COPY_ON_WRITE
2039 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2041 PerlIO_printf(Perl_debug_log,
2042 "Copy on write: regexp capture, type %d\n",
2045 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2046 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2047 assert (SvPOKp(prog->saved_copy));
2051 RX_MATCH_COPIED_on(prog);
2052 s = savepvn(strbeg, i);
2058 prog->subbeg = strbeg;
2059 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2067 PL_colors[4], PL_colors[5]));
2068 if (PL_reg_eval_set)
2069 restore_pos(aTHX_ prog);
2075 - regtry - try match at specific point
2077 STATIC I32 /* 0 failure, 1 success */
2078 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2084 regexp *prog = reginfo->prog;
2085 GET_RE_DEBUG_FLAGS_DECL;
2087 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2090 PL_reg_eval_set = RS_init;
2091 DEBUG_EXECUTE_r(DEBUG_s(
2092 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2093 (IV)(PL_stack_sp - PL_stack_base));
2095 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2096 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2097 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2099 /* Apparently this is not needed, judging by wantarray. */
2100 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2101 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2104 /* Make $_ available to executed code. */
2105 if (reginfo->sv != DEFSV) {
2107 DEFSV = reginfo->sv;
2110 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2111 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2112 /* prepare for quick setting of pos */
2113 #ifdef PERL_OLD_COPY_ON_WRITE
2115 sv_force_normal_flags(sv, 0);
2117 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2118 &PL_vtbl_mglob, NULL, 0);
2122 PL_reg_oldpos = mg->mg_len;
2123 SAVEDESTRUCTOR_X(restore_pos, prog);
2125 if (!PL_reg_curpm) {
2126 Newxz(PL_reg_curpm, 1, PMOP);
2129 SV* const repointer = newSViv(0);
2130 /* so we know which PL_regex_padav element is PL_reg_curpm */
2131 SvFLAGS(repointer) |= SVf_BREAK;
2132 av_push(PL_regex_padav,repointer);
2133 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2134 PL_regex_pad = AvARRAY(PL_regex_padav);
2138 PM_SETRE(PL_reg_curpm, prog);
2139 PL_reg_oldcurpm = PL_curpm;
2140 PL_curpm = PL_reg_curpm;
2141 if (RX_MATCH_COPIED(prog)) {
2142 /* Here is a serious problem: we cannot rewrite subbeg,
2143 since it may be needed if this match fails. Thus
2144 $` inside (?{}) could fail... */
2145 PL_reg_oldsaved = prog->subbeg;
2146 PL_reg_oldsavedlen = prog->sublen;
2147 #ifdef PERL_OLD_COPY_ON_WRITE
2148 PL_nrs = prog->saved_copy;
2150 RX_MATCH_COPIED_off(prog);
2153 PL_reg_oldsaved = NULL;
2154 prog->subbeg = PL_bostr;
2155 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2157 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2158 prog->startp[0] = startpos - PL_bostr;
2159 PL_reginput = startpos;
2160 PL_reglastparen = &prog->lastparen;
2161 PL_reglastcloseparen = &prog->lastcloseparen;
2162 prog->lastparen = 0;
2163 prog->lastcloseparen = 0;
2165 PL_regstartp = prog->startp;
2166 PL_regendp = prog->endp;
2167 if (PL_reg_start_tmpl <= prog->nparens) {
2168 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2169 if(PL_reg_start_tmp)
2170 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2172 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2175 /* XXXX What this code is doing here?!!! There should be no need
2176 to do this again and again, PL_reglastparen should take care of
2179 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2180 * Actually, the code in regcppop() (which Ilya may be meaning by
2181 * PL_reglastparen), is not needed at all by the test suite
2182 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2183 * enough, for building DynaLoader, or otherwise this
2184 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2185 * will happen. Meanwhile, this code *is* needed for the
2186 * above-mentioned test suite tests to succeed. The common theme
2187 * on those tests seems to be returning null fields from matches.
2192 if (prog->nparens) {
2194 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2201 if (regmatch(reginfo, prog->program + 1)) {
2202 prog->endp[0] = PL_reginput - PL_bostr;
2205 REGCP_UNWIND(lastcp);
2210 #define sayYES goto yes
2211 #define sayNO goto no
2212 #define sayNO_SILENT goto no_silent
2214 /* we dont use STMT_START/END here because it leads to
2215 "unreachable code" warnings, which are bogus, but distracting. */
2216 #define CACHEsayNO \
2217 if (ST.cache_mask) \
2218 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2221 /* this is used to determine how far from the left messages like
2222 'failed...' are printed. It should be set such that messages
2223 are inline with the regop output that created them.
2225 #define REPORT_CODE_OFF 32
2228 /* Make sure there is a test for this +1 options in re_tests */
2229 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2231 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2232 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2234 #define SLAB_FIRST(s) (&(s)->states[0])
2235 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2237 /* grab a new slab and return the first slot in it */
2239 STATIC regmatch_state *
2242 #if PERL_VERSION < 9
2245 regmatch_slab *s = PL_regmatch_slab->next;
2247 Newx(s, 1, regmatch_slab);
2248 s->prev = PL_regmatch_slab;
2250 PL_regmatch_slab->next = s;
2252 PL_regmatch_slab = s;
2253 return SLAB_FIRST(s);
2257 /* push a new state then goto it */
2259 #define PUSH_STATE_GOTO(state, node) \
2261 st->resume_state = state; \
2264 /* push a new state with success backtracking, then goto it */
2266 #define PUSH_YES_STATE_GOTO(state, node) \
2268 st->resume_state = state; \
2269 goto push_yes_state;
2274 - regmatch - main matching routine
2276 * Conceptually the strategy is simple: check to see whether the current
2277 * node matches, call self recursively to see whether the rest matches,
2278 * and then act accordingly. In practice we make some effort to avoid
2279 * recursion, in particular by going through "ordinary" nodes (that don't
2280 * need to know whether the rest of the match failed) by a loop instead of
2283 /* [lwall] I've hoisted the register declarations to the outer block in order to
2284 * maybe save a little bit of pushing and popping on the stack. It also takes
2285 * advantage of machines that use a register save mask on subroutine entry.
2287 * This function used to be heavily recursive, but since this had the
2288 * effect of blowing the CPU stack on complex regexes, it has been
2289 * restructured to be iterative, and to save state onto the heap rather
2290 * than the stack. Essentially whereever regmatch() used to be called, it
2291 * pushes the current state, notes where to return, then jumps back into
2294 * Originally the structure of this function used to look something like
2299 while (scan != NULL) {
2300 a++; // do stuff with a and b
2306 if (regmatch(...)) // recurse
2316 * Now it looks something like this:
2324 regmatch_state *st = new();
2326 st->a++; // do stuff with a and b
2328 while (scan != NULL) {
2336 st->resume_state = resume_FOO;
2337 goto start_recurse; // recurse
2346 st = new(); push a new state
2347 st->a = 1; st->b = 2;
2354 switch (resume_state) {
2356 goto resume_point_FOO;
2363 * WARNING: this means that any line in this function that contains a
2364 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2365 * regmatch() using gotos instead. Thus the values of any local variables
2366 * not saved in the regmatch_state structure will have been lost when
2367 * execution resumes on the next line .
2369 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2370 * PL_regmatch_state always points to the currently active state, and
2371 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2372 * The first time regmatch is called, the first slab is allocated, and is
2373 * never freed until interpreter desctruction. When the slab is full,
2374 * a new one is allocated chained to the end. At exit from regmatch, slabs
2375 * allocated since entry are freed.
2379 #define DEBUG_STATE_pp(pp) \
2381 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2382 PerlIO_printf(Perl_debug_log, \
2385 reg_name[st->resume_state] ); \
2389 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2394 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2395 const char *start, const char *end, const char *blurb)
2397 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2401 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2402 prog->precomp, prog->prelen, 60);
2404 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2405 start, end - start, 60);
2407 PerlIO_printf(Perl_debug_log,
2408 "%s%s REx%s %s against %s\n",
2409 PL_colors[4], blurb, PL_colors[5], s0, s1);
2411 if (do_utf8||utf8_pat)
2412 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2413 utf8_pat ? "pattern" : "",
2414 utf8_pat && do_utf8 ? " and " : "",
2415 do_utf8 ? "string" : ""
2421 S_dump_exec_pos(pTHX_ const char *locinput,
2422 const regnode *scan,
2423 const char *loc_regeol,
2424 const char *loc_bostr,
2425 const char *loc_reg_starttry,
2428 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2429 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2430 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2431 /* The part of the string before starttry has one color
2432 (pref0_len chars), between starttry and current
2433 position another one (pref_len - pref0_len chars),
2434 after the current position the third one.
2435 We assume that pref0_len <= pref_len, otherwise we
2436 decrease pref0_len. */
2437 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2438 ? (5 + taill) - l : locinput - loc_bostr;
2441 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2443 pref0_len = pref_len - (locinput - loc_reg_starttry);
2444 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2445 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2446 ? (5 + taill) - pref_len : loc_regeol - locinput);
2447 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2451 if (pref0_len > pref_len)
2452 pref0_len = pref_len;
2454 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2456 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2457 (locinput - pref_len),pref0_len, 60, 4, 5);
2459 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2460 (locinput - pref_len + pref0_len),
2461 pref_len - pref0_len, 60, 2, 3);
2463 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2464 locinput, loc_regeol - locinput, 10, 0, 1);
2466 const STRLEN tlen=len0+len1+len2;
2467 PerlIO_printf(Perl_debug_log,
2468 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2469 (IV)(locinput - loc_bostr),
2472 (docolor ? "" : "> <"),
2474 (int)(tlen > 19 ? 0 : 19 - tlen),
2481 STATIC I32 /* 0 failure, 1 success */
2482 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2484 #if PERL_VERSION < 9
2488 register const bool do_utf8 = PL_reg_match_utf8;
2489 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2491 regexp *rex = reginfo->prog;
2493 regmatch_slab *orig_slab;
2494 regmatch_state *orig_state;
2496 /* the current state. This is a cached copy of PL_regmatch_state */
2497 register regmatch_state *st;
2499 /* cache heavy used fields of st in registers */
2500 register regnode *scan;
2501 register regnode *next;
2502 register I32 n = 0; /* general value; init to avoid compiler warning */
2503 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2504 register char *locinput = PL_reginput;
2505 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2507 bool result = 0; /* return value of S_regmatch */
2508 int depth = 0; /* depth of backtrack stack */
2509 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
2510 regmatch_state *yes_state = NULL; /* state to pop to on success of
2512 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2513 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2516 /* these three flags are set by various ops to signal information to
2517 * the very next op. They have a useful lifetime of exactly one loop
2518 * iteration, and are not preserved or restored by state pushes/pops
2520 bool sw = 0; /* the condition value in (?(cond)a|b) */
2521 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2522 int logical = 0; /* the following EVAL is:
2526 or the following IFMATCH/UNLESSM is:
2527 false: plain (?=foo)
2528 true: used as a condition: (?(?=foo))
2532 GET_RE_DEBUG_FLAGS_DECL;
2535 /* on first ever call to regmatch, allocate first slab */
2536 if (!PL_regmatch_slab) {
2537 Newx(PL_regmatch_slab, 1, regmatch_slab);
2538 PL_regmatch_slab->prev = NULL;
2539 PL_regmatch_slab->next = NULL;
2540 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2543 /* remember current high-water mark for exit */
2544 /* XXX this should be done with SAVE* instead */
2545 orig_slab = PL_regmatch_slab;
2546 orig_state = PL_regmatch_state;
2548 /* grab next free state slot */
2549 st = ++PL_regmatch_state;
2550 if (st > SLAB_LAST(PL_regmatch_slab))
2551 st = PL_regmatch_state = S_push_slab(aTHX);
2553 /* Note that nextchr is a byte even in UTF */
2554 nextchr = UCHARAT(locinput);
2556 while (scan != NULL) {
2559 SV * const prop = sv_newmortal();
2560 regnode *rnext=regnext(scan);
2561 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2562 regprop(rex, prop, scan);
2564 PerlIO_printf(Perl_debug_log,
2565 "%3"IVdf":%*s%s(%"IVdf")\n",
2566 (IV)(scan - rex->program), depth*2, "",
2568 (PL_regkind[OP(scan)] == END || !rnext) ?
2569 0 : (IV)(rnext - rex->program));
2572 next = scan + NEXT_OFF(scan);
2575 state_num = OP(scan);
2578 switch (state_num) {
2580 if (locinput == PL_bostr)
2582 /* reginfo->till = reginfo->bol; */
2587 if (locinput == PL_bostr ||
2588 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2594 if (locinput == PL_bostr)
2598 if (locinput == reginfo->ganch)
2604 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2609 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2611 if (PL_regeol - locinput > 1)
2615 if (PL_regeol != locinput)
2619 if (!nextchr && locinput >= PL_regeol)
2622 locinput += PL_utf8skip[nextchr];
2623 if (locinput > PL_regeol)
2625 nextchr = UCHARAT(locinput);
2628 nextchr = UCHARAT(++locinput);
2631 if (!nextchr && locinput >= PL_regeol)
2633 nextchr = UCHARAT(++locinput);
2636 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2639 locinput += PL_utf8skip[nextchr];
2640 if (locinput > PL_regeol)
2642 nextchr = UCHARAT(locinput);
2645 nextchr = UCHARAT(++locinput);
2649 #define ST st->u.trie
2651 /* In this case the charclass data is available inline so
2652 we can fail fast without a lot of extra overhead.
2654 if (scan->flags == EXACT || !do_utf8) {
2655 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2657 PerlIO_printf(Perl_debug_log,
2658 "%*s %sfailed to match trie start class...%s\n",
2659 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2668 /* what type of TRIE am I? (utf8 makes this contextual) */
2669 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2670 trie_type = do_utf8 ?
2671 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2674 /* what trie are we using right now */
2675 reg_trie_data * const trie
2676 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2677 U32 state = trie->startstate;
2679 if (trie->bitmap && trie_type != trie_utf8_fold &&
2680 !TRIE_BITMAP_TEST(trie,*locinput)
2682 if (trie->states[ state ].wordnum) {
2684 PerlIO_printf(Perl_debug_log,
2685 "%*s %smatched empty string...%s\n",
2686 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2691 PerlIO_printf(Perl_debug_log,
2692 "%*s %sfailed to match trie start class...%s\n",
2693 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2700 U8 *uc = ( U8* )locinput;
2704 U8 *uscan = (U8*)NULL;
2706 SV *sv_accept_buff = NULL;
2707 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2709 ST.accepted = 0; /* how many accepting states we have seen */
2711 ST.jump = trie->jump;
2720 traverse the TRIE keeping track of all accepting states
2721 we transition through until we get to a failing node.
2724 while ( state && uc <= (U8*)PL_regeol ) {
2725 U32 base = trie->states[ state ].trans.base;
2728 /* We use charid to hold the wordnum as we don't use it
2729 for charid until after we have done the wordnum logic.
2730 We define an alias just so that the wordnum logic reads
2733 #define got_wordnum charid
2734 got_wordnum = trie->states[ state ].wordnum;
2736 if ( got_wordnum ) {
2737 if ( ! ST.accepted ) {
2740 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2741 sv_accept_buff=newSV(bufflen *
2742 sizeof(reg_trie_accepted) - 1);
2743 SvCUR_set(sv_accept_buff, 0);
2744 SvPOK_on(sv_accept_buff);
2745 sv_2mortal(sv_accept_buff);
2748 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2751 if (ST.accepted >= bufflen) {
2753 ST.accept_buff =(reg_trie_accepted*)
2754 SvGROW(sv_accept_buff,
2755 bufflen * sizeof(reg_trie_accepted));
2757 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2758 + sizeof(reg_trie_accepted));
2761 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2762 ST.accept_buff[ST.accepted].endpos = uc;
2764 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2768 DEBUG_TRIE_EXECUTE_r({
2769 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2770 PerlIO_printf( Perl_debug_log,
2771 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2772 2+depth * 2, "", PL_colors[4],
2773 (UV)state, (UV)ST.accepted );
2777 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2778 uvc, charid, foldlen, foldbuf, uniflags);
2781 (base + charid > trie->uniquecharcount )
2782 && (base + charid - 1 - trie->uniquecharcount
2784 && trie->trans[base + charid - 1 -
2785 trie->uniquecharcount].check == state)
2787 state = trie->trans[base + charid - 1 -
2788 trie->uniquecharcount ].next;
2799 DEBUG_TRIE_EXECUTE_r(
2800 PerlIO_printf( Perl_debug_log,
2801 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2802 charid, uvc, (UV)state, PL_colors[5] );
2809 PerlIO_printf( Perl_debug_log,
2810 "%*s %sgot %"IVdf" possible matches%s\n",
2811 REPORT_CODE_OFF + depth * 2, "",
2812 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2818 case TRIE_next_fail: /* we failed - try next alterative */
2820 if ( ST.accepted == 1 ) {
2821 /* only one choice left - just continue */
2823 reg_trie_data * const trie
2824 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2825 SV ** const tmp = RX_DEBUG(reginfo->prog)
2826 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2828 PerlIO_printf( Perl_debug_log,
2829 "%*s %sonly one match left: #%d <%s>%s\n",
2830 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2831 ST.accept_buff[ 0 ].wordnum,
2832 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2835 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2836 /* in this case we free tmps/leave before we call regmatch
2837 as we wont be using accept_buff again. */
2840 locinput = PL_reginput;
2841 nextchr = UCHARAT(locinput);
2846 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2848 continue; /* execute rest of RE */
2851 if (!ST.accepted-- ) {
2858 There are at least two accepting states left. Presumably
2859 the number of accepting states is going to be low,
2860 typically two. So we simply scan through to find the one
2861 with lowest wordnum. Once we find it, we swap the last
2862 state into its place and decrement the size. We then try to
2863 match the rest of the pattern at the point where the word
2864 ends. If we succeed, control just continues along the
2865 regex; if we fail we return here to try the next accepting
2872 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2873 DEBUG_TRIE_EXECUTE_r(
2874 PerlIO_printf( Perl_debug_log,
2875 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2876 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
2877 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2878 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2881 if (ST.accept_buff[cur].wordnum <
2882 ST.accept_buff[best].wordnum)
2887 reg_trie_data * const trie
2888 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2889 SV ** const tmp = RX_DEBUG(reginfo->prog)
2890 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2892 regnode *nextop=!ST.jump ?
2894 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2895 PerlIO_printf( Perl_debug_log,
2896 "%*s %strying alternation #%d <%s> at node #%d %s\n",
2897 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2898 ST.accept_buff[best].wordnum,
2899 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2900 REG_NODE_NUM(nextop),
2904 if ( best<ST.accepted ) {
2905 reg_trie_accepted tmp = ST.accept_buff[ best ];
2906 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2907 ST.accept_buff[ ST.accepted ] = tmp;
2910 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2912 PUSH_STATE_GOTO(TRIE_next, ST.B);
2915 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2925 char *s = STRING(scan);
2927 if (do_utf8 != UTF) {
2928 /* The target and the pattern have differing utf8ness. */
2930 const char * const e = s + ln;
2933 /* The target is utf8, the pattern is not utf8. */
2938 if (NATIVE_TO_UNI(*(U8*)s) !=
2939 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2947 /* The target is not utf8, the pattern is utf8. */
2952 if (NATIVE_TO_UNI(*((U8*)l)) !=
2953 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2961 nextchr = UCHARAT(locinput);
2964 /* The target and the pattern have the same utf8ness. */
2965 /* Inline the first character, for speed. */
2966 if (UCHARAT(s) != nextchr)
2968 if (PL_regeol - locinput < ln)
2970 if (ln > 1 && memNE(s, locinput, ln))
2973 nextchr = UCHARAT(locinput);
2977 PL_reg_flags |= RF_tainted;
2980 char * const s = STRING(scan);
2983 if (do_utf8 || UTF) {
2984 /* Either target or the pattern are utf8. */
2985 const char * const l = locinput;
2986 char *e = PL_regeol;
2988 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2989 l, &e, 0, do_utf8)) {
2990 /* One more case for the sharp s:
2991 * pack("U0U*", 0xDF) =~ /ss/i,
2992 * the 0xC3 0x9F are the UTF-8
2993 * byte sequence for the U+00DF. */
2995 toLOWER(s[0]) == 's' &&
2997 toLOWER(s[1]) == 's' &&
3004 nextchr = UCHARAT(locinput);
3008 /* Neither the target and the pattern are utf8. */
3010 /* Inline the first character, for speed. */
3011 if (UCHARAT(s) != nextchr &&
3012 UCHARAT(s) != ((OP(scan) == EXACTF)
3013 ? PL_fold : PL_fold_locale)[nextchr])
3015 if (PL_regeol - locinput < ln)
3017 if (ln > 1 && (OP(scan) == EXACTF
3018 ? ibcmp(s, locinput, ln)
3019 : ibcmp_locale(s, locinput, ln)))
3022 nextchr = UCHARAT(locinput);
3027 STRLEN inclasslen = PL_regeol - locinput;
3029 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3031 if (locinput >= PL_regeol)
3033 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3034 nextchr = UCHARAT(locinput);
3039 nextchr = UCHARAT(locinput);
3040 if (!REGINCLASS(rex, scan, (U8*)locinput))
3042 if (!nextchr && locinput >= PL_regeol)
3044 nextchr = UCHARAT(++locinput);
3048 /* If we might have the case of the German sharp s
3049 * in a casefolding Unicode character class. */
3051 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3052 locinput += SHARP_S_SKIP;
3053 nextchr = UCHARAT(locinput);
3059 PL_reg_flags |= RF_tainted;
3065 LOAD_UTF8_CHARCLASS_ALNUM();
3066 if (!(OP(scan) == ALNUM
3067 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3068 : isALNUM_LC_utf8((U8*)locinput)))
3072 locinput += PL_utf8skip[nextchr];
3073 nextchr = UCHARAT(locinput);
3076 if (!(OP(scan) == ALNUM
3077 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3079 nextchr = UCHARAT(++locinput);
3082 PL_reg_flags |= RF_tainted;
3085 if (!nextchr && locinput >= PL_regeol)
3088 LOAD_UTF8_CHARCLASS_ALNUM();
3089 if (OP(scan) == NALNUM
3090 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3091 : isALNUM_LC_utf8((U8*)locinput))
3095 locinput += PL_utf8skip[nextchr];
3096 nextchr = UCHARAT(locinput);
3099 if (OP(scan) == NALNUM
3100 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3102 nextchr = UCHARAT(++locinput);
3106 PL_reg_flags |= RF_tainted;
3110 /* was last char in word? */
3112 if (locinput == PL_bostr)
3115 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3117 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3119 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3120 ln = isALNUM_uni(ln);
3121 LOAD_UTF8_CHARCLASS_ALNUM();
3122 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3125 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3126 n = isALNUM_LC_utf8((U8*)locinput);
3130 ln = (locinput != PL_bostr) ?
3131 UCHARAT(locinput - 1) : '\n';
3132 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3134 n = isALNUM(nextchr);
3137 ln = isALNUM_LC(ln);
3138 n = isALNUM_LC(nextchr);
3141 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3142 OP(scan) == BOUNDL))
3146 PL_reg_flags |= RF_tainted;
3152 if (UTF8_IS_CONTINUED(nextchr)) {
3153 LOAD_UTF8_CHARCLASS_SPACE();
3154 if (!(OP(scan) == SPACE
3155 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3156 : isSPACE_LC_utf8((U8*)locinput)))
3160 locinput += PL_utf8skip[nextchr];
3161 nextchr = UCHARAT(locinput);
3164 if (!(OP(scan) == SPACE
3165 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3167 nextchr = UCHARAT(++locinput);
3170 if (!(OP(scan) == SPACE
3171 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3173 nextchr = UCHARAT(++locinput);
3177 PL_reg_flags |= RF_tainted;
3180 if (!nextchr && locinput >= PL_regeol)
3183 LOAD_UTF8_CHARCLASS_SPACE();
3184 if (OP(scan) == NSPACE
3185 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3186 : isSPACE_LC_utf8((U8*)locinput))
3190 locinput += PL_utf8skip[nextchr];
3191 nextchr = UCHARAT(locinput);
3194 if (OP(scan) == NSPACE
3195 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3197 nextchr = UCHARAT(++locinput);
3200 PL_reg_flags |= RF_tainted;
3206 LOAD_UTF8_CHARCLASS_DIGIT();
3207 if (!(OP(scan) == DIGIT
3208 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3209 : isDIGIT_LC_utf8((U8*)locinput)))
3213 locinput += PL_utf8skip[nextchr];
3214 nextchr = UCHARAT(locinput);
3217 if (!(OP(scan) == DIGIT
3218 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3220 nextchr = UCHARAT(++locinput);
3223 PL_reg_flags |= RF_tainted;
3226 if (!nextchr && locinput >= PL_regeol)
3229 LOAD_UTF8_CHARCLASS_DIGIT();
3230 if (OP(scan) == NDIGIT
3231 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3232 : isDIGIT_LC_utf8((U8*)locinput))
3236 locinput += PL_utf8skip[nextchr];
3237 nextchr = UCHARAT(locinput);
3240 if (OP(scan) == NDIGIT
3241 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3243 nextchr = UCHARAT(++locinput);
3246 if (locinput >= PL_regeol)
3249 LOAD_UTF8_CHARCLASS_MARK();
3250 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3252 locinput += PL_utf8skip[nextchr];
3253 while (locinput < PL_regeol &&
3254 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3255 locinput += UTF8SKIP(locinput);
3256 if (locinput > PL_regeol)
3261 nextchr = UCHARAT(locinput);
3264 PL_reg_flags |= RF_tainted;
3269 n = ARG(scan); /* which paren pair */
3270 ln = PL_regstartp[n];
3271 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3272 if ((I32)*PL_reglastparen < n || ln == -1)
3273 sayNO; /* Do not match unless seen CLOSEn. */
3274 if (ln == PL_regendp[n])
3278 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3280 const char *e = PL_bostr + PL_regendp[n];
3282 * Note that we can't do the "other character" lookup trick as
3283 * in the 8-bit case (no pun intended) because in Unicode we
3284 * have to map both upper and title case to lower case.
3286 if (OP(scan) == REFF) {
3288 STRLEN ulen1, ulen2;
3289 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3290 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3294 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3295 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3296 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3303 nextchr = UCHARAT(locinput);
3307 /* Inline the first character, for speed. */
3308 if (UCHARAT(s) != nextchr &&
3310 (UCHARAT(s) != ((OP(scan) == REFF
3311 ? PL_fold : PL_fold_locale)[nextchr]))))
3313 ln = PL_regendp[n] - ln;
3314 if (locinput + ln > PL_regeol)
3316 if (ln > 1 && (OP(scan) == REF
3317 ? memNE(s, locinput, ln)
3319 ? ibcmp(s, locinput, ln)
3320 : ibcmp_locale(s, locinput, ln))))
3323 nextchr = UCHARAT(locinput);
3334 #define ST st->u.eval
3338 regnode *startpoint;
3341 case RECURSE: /* /(...(?1))/ */
3342 if (cur_eval && cur_eval->locinput==locinput) {
3343 if (cur_eval->u.eval.close_paren == ARG(scan))
3344 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3345 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3346 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3351 (void)ReREFCNT_inc(rex);
3352 if (OP(scan)==RECURSE) {
3353 startpoint = scan + ARG2L(scan);
3354 ST.close_paren = ARG(scan);
3356 startpoint = re->program+1;
3359 goto eval_recurse_doit;
3361 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3362 if (cur_eval && cur_eval->locinput==locinput) {
3363 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3364 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3369 /* execute the code in the {...} */
3371 SV ** const before = SP;
3372 OP_4tree * const oop = PL_op;
3373 COP * const ocurcop = PL_curcop;
3377 PL_op = (OP_4tree*)rex->data->data[n];
3378 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3379 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3380 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3382 CALLRUNOPS(aTHX); /* Scalar context. */
3385 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3392 PAD_RESTORE_LOCAL(old_comppad);
3393 PL_curcop = ocurcop;
3396 sv_setsv(save_scalar(PL_replgv), ret);
3400 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3403 /* extract RE object from returned value; compiling if
3408 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3409 mg = mg_find(sv, PERL_MAGIC_qr);
3410 else if (SvSMAGICAL(ret)) {
3411 if (SvGMAGICAL(ret))
3412 sv_unmagic(ret, PERL_MAGIC_qr);
3414 mg = mg_find(ret, PERL_MAGIC_qr);
3418 re = (regexp *)mg->mg_obj;
3419 (void)ReREFCNT_inc(re);
3423 const char * const t = SvPV_const(ret, len);
3425 const I32 osize = PL_regsize;
3428 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3429 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3431 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3433 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3439 debug_start_match(re, do_utf8, locinput, PL_regeol,
3440 "Matching embedded");
3442 startpoint = re->program + 1;
3443 ST.close_paren = 0; /* only used for RECURSE */
3444 /* borrowed from regtry */
3445 if (PL_reg_start_tmpl <= re->nparens) {
3446 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3447 if(PL_reg_start_tmp)
3448 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3450 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3453 eval_recurse_doit: /* Share code with RECURSE below this line */
3454 /* run the pattern returned from (??{...}) */
3455 ST.cp = regcppush(0); /* Save *all* the positions. */
3456 REGCP_SET(ST.lastcp);
3458 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3459 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3461 *PL_reglastparen = 0;
3462 *PL_reglastcloseparen = 0;
3463 PL_reginput = locinput;
3465 /* XXXX This is too dramatic a measure... */
3468 ST.toggle_reg_flags = PL_reg_flags;
3469 if (re->reganch & ROPT_UTF8)
3470 PL_reg_flags |= RF_utf8;
3472 PL_reg_flags &= ~RF_utf8;
3473 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3476 ST.prev_curlyx = cur_curlyx;
3480 ST.prev_eval = cur_eval;
3482 /* now continue from first node in postoned RE */
3483 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3486 /* logical is 1, /(?(?{...})X|Y)/ */
3487 sw = (bool)SvTRUE(ret);
3492 case EVAL_AB: /* cleanup after a successful (??{A})B */
3493 /* note: this is called twice; first after popping B, then A */
3494 PL_reg_flags ^= ST.toggle_reg_flags;
3498 cur_eval = ST.prev_eval;
3499 cur_curlyx = ST.prev_curlyx;
3500 /* XXXX This is too dramatic a measure... */
3505 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3506 /* note: this is called twice; first after popping B, then A */
3507 PL_reg_flags ^= ST.toggle_reg_flags;
3510 PL_reginput = locinput;
3511 REGCP_UNWIND(ST.lastcp);
3513 cur_eval = ST.prev_eval;
3514 cur_curlyx = ST.prev_curlyx;
3515 /* XXXX This is too dramatic a measure... */
3521 n = ARG(scan); /* which paren pair */
3522 PL_reg_start_tmp[n] = locinput;
3527 n = ARG(scan); /* which paren pair */
3528 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3529 PL_regendp[n] = locinput - PL_bostr;
3530 if (n > (I32)*PL_reglastparen)
3531 *PL_reglastparen = n;
3532 *PL_reglastcloseparen = n;
3533 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3538 n = ARG(scan); /* which paren pair */
3539 sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3542 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3544 next = NEXTOPER(NEXTOPER(scan));
3546 next = scan + ARG(scan);
3547 if (OP(next) == IFTHEN) /* Fake one. */
3548 next = NEXTOPER(NEXTOPER(next));
3552 logical = scan->flags;
3555 /*******************************************************************
3557 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3558 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3559 STAR/PLUS/CURLY/CURLYN are used instead.)
3561 A*B is compiled as <CURLYX><A><WHILEM><B>
3563 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3564 state, which contains the current count, initialised to -1. It also sets
3565 cur_curlyx to point to this state, with any previous value saved in the
3568 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3569 since the pattern may possibly match zero times (i.e. it's a while {} loop
3570 rather than a do {} while loop).
3572 Each entry to WHILEM represents a successful match of A. The count in the
3573 CURLYX block is incremented, another WHILEM state is pushed, and execution
3574 passes to A or B depending on greediness and the current count.
3576 For example, if matching against the string a1a2a3b (where the aN are
3577 substrings that match /A/), then the match progresses as follows: (the
3578 pushed states are interspersed with the bits of strings matched so far):
3581 <CURLYX cnt=0><WHILEM>
3582 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3583 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3584 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3585 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3587 (Contrast this with something like CURLYM, which maintains only a single
3591 a1 <CURLYM cnt=1> a2
3592 a1 a2 <CURLYM cnt=2> a3
3593 a1 a2 a3 <CURLYM cnt=3> b
3596 Each WHILEM state block marks a point to backtrack to upon partial failure
3597 of A or B, and also contains some minor state data related to that
3598 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3599 overall state, such as the count, and pointers to the A and B ops.
3601 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3602 must always point to the *current* CURLYX block, the rules are:
3604 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3605 and set cur_curlyx to point the new block.
3607 When popping the CURLYX block after a successful or unsuccessful match,
3608 restore the previous cur_curlyx.
3610 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3611 to the outer one saved in the CURLYX block.
3613 When popping the WHILEM block after a successful or unsuccessful B match,
3614 restore the previous cur_curlyx.
3616 Here's an example for the pattern (AI* BI)*BO
3617 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3620 curlyx backtrack stack
3621 ------ ---------------
3623 CO <CO prev=NULL> <WO>
3624 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3625 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3626 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3628 At this point the pattern succeeds, and we work back down the stack to
3629 clean up, restoring as we go:
3631 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3632 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3633 CO <CO prev=NULL> <WO>
3636 *******************************************************************/
3638 #define ST st->u.curlyx
3640 case CURLYX: /* start of /A*B/ (for complex A) */
3642 /* No need to save/restore up to this paren */
3643 I32 parenfloor = scan->flags;
3645 assert(next); /* keep Coverity happy */
3646 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3649 /* XXXX Probably it is better to teach regpush to support
3650 parenfloor > PL_regsize... */
3651 if (parenfloor > (I32)*PL_reglastparen)
3652 parenfloor = *PL_reglastparen; /* Pessimization... */
3654 ST.prev_curlyx= cur_curlyx;
3656 ST.cp = PL_savestack_ix;
3658 /* these fields contain the state of the current curly.
3659 * they are accessed by subsequent WHILEMs */
3660 ST.parenfloor = parenfloor;
3661 ST.min = ARG1(scan);
3662 ST.max = ARG2(scan);
3663 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3667 ST.count = -1; /* this will be updated by WHILEM */
3668 ST.lastloc = NULL; /* this will be updated by WHILEM */
3670 PL_reginput = locinput;
3671 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3675 case CURLYX_end: /* just finished matching all of A*B */
3677 cur_curlyx = ST.prev_curlyx;
3681 case CURLYX_end_fail: /* just failed to match all of A*B */
3683 cur_curlyx = ST.prev_curlyx;
3689 #define ST st->u.whilem
3691 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3693 /* see the discussion above about CURLYX/WHILEM */
3696 assert(cur_curlyx); /* keep Coverity happy */
3697 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3698 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3699 ST.cache_offset = 0;
3702 PL_reginput = locinput;
3704 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3705 "%*s whilem: matched %ld out of %ld..%ld\n",
3706 REPORT_CODE_OFF+depth*2, "", (long)n,
3707 (long)cur_curlyx->u.curlyx.min,
3708 (long)cur_curlyx->u.curlyx.max)
3711 /* First just match a string of min A's. */
3713 if (n < cur_curlyx->u.curlyx.min) {
3714 cur_curlyx->u.curlyx.lastloc = locinput;
3715 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3719 /* If degenerate A matches "", assume A done. */
3721 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3722 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3723 "%*s whilem: empty match detected, trying continuation...\n",
3724 REPORT_CODE_OFF+depth*2, "")
3726 goto do_whilem_B_max;
3729 /* super-linear cache processing */
3733 if (!PL_reg_maxiter) {
3734 /* start the countdown: Postpone detection until we
3735 * know the match is not *that* much linear. */
3736 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3737 /* possible overflow for long strings and many CURLYX's */
3738 if (PL_reg_maxiter < 0)
3739 PL_reg_maxiter = I32_MAX;
3740 PL_reg_leftiter = PL_reg_maxiter;
3743 if (PL_reg_leftiter-- == 0) {
3744 /* initialise cache */
3745 const I32 size = (PL_reg_maxiter + 7)/8;
3746 if (PL_reg_poscache) {
3747 if ((I32)PL_reg_poscache_size < size) {
3748 Renew(PL_reg_poscache, size, char);
3749 PL_reg_poscache_size = size;
3751 Zero(PL_reg_poscache, size, char);
3754 PL_reg_poscache_size = size;
3755 Newxz(PL_reg_poscache, size, char);
3757 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3758 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3759 PL_colors[4], PL_colors[5])
3763 if (PL_reg_leftiter < 0) {
3764 /* have we already failed at this position? */
3766 offset = (scan->flags & 0xf) - 1
3767 + (locinput - PL_bostr) * (scan->flags>>4);
3768 mask = 1 << (offset % 8);
3770 if (PL_reg_poscache[offset] & mask) {
3771 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3772 "%*s whilem: (cache) already tried at this position...\n",
3773 REPORT_CODE_OFF+depth*2, "")
3775 sayNO; /* cache records failure */
3777 ST.cache_offset = offset;
3778 ST.cache_mask = mask;
3782 /* Prefer B over A for minimal matching. */
3784 if (cur_curlyx->u.curlyx.minmod) {
3785 ST.save_curlyx = cur_curlyx;
3786 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3787 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3788 REGCP_SET(ST.lastcp);
3789 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3793 /* Prefer A over B for maximal matching. */
3795 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3796 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3797 cur_curlyx->u.curlyx.lastloc = locinput;
3798 REGCP_SET(ST.lastcp);
3799 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3802 goto do_whilem_B_max;
3806 case WHILEM_B_min: /* just matched B in a minimal match */
3807 case WHILEM_B_max: /* just matched B in a maximal match */
3808 cur_curlyx = ST.save_curlyx;
3812 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3813 cur_curlyx = ST.save_curlyx;
3814 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3815 cur_curlyx->u.curlyx.count--;
3819 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3820 REGCP_UNWIND(ST.lastcp);
3823 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3824 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3825 cur_curlyx->u.curlyx.count--;
3829 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3830 REGCP_UNWIND(ST.lastcp);
3831 regcppop(rex); /* Restore some previous $<digit>s? */
3832 PL_reginput = locinput;
3833 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3834 "%*s whilem: failed, trying continuation...\n",
3835 REPORT_CODE_OFF+depth*2, "")
3838 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3839 && ckWARN(WARN_REGEXP)
3840 && !(PL_reg_flags & RF_warned))
3842 PL_reg_flags |= RF_warned;
3843 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3844 "Complex regular subexpression recursion",
3849 ST.save_curlyx = cur_curlyx;
3850 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3851 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3854 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3855 cur_curlyx = ST.save_curlyx;
3856 REGCP_UNWIND(ST.lastcp);
3859 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3860 /* Maximum greed exceeded */
3861 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3862 && ckWARN(WARN_REGEXP)
3863 && !(PL_reg_flags & RF_warned))
3865 PL_reg_flags |= RF_warned;
3866 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3867 "%s limit (%d) exceeded",
3868 "Complex regular subexpression recursion",
3871 cur_curlyx->u.curlyx.count--;
3875 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3876 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3878 /* Try grabbing another A and see if it helps. */
3879 PL_reginput = locinput;
3880 cur_curlyx->u.curlyx.lastloc = locinput;
3881 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3882 REGCP_SET(ST.lastcp);
3883 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
3887 #define ST st->u.branch
3889 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3890 next = scan + ARG(scan);
3893 scan = NEXTOPER(scan);
3896 case BRANCH: /* /(...|A|...)/ */
3897 scan = NEXTOPER(scan); /* scan now points to inner node */
3898 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3899 /* last branch; skip state push and jump direct to node */
3901 ST.lastparen = *PL_reglastparen;
3902 ST.next_branch = next;
3904 PL_reginput = locinput;
3906 /* Now go into the branch */
3907 PUSH_STATE_GOTO(BRANCH_next, scan);
3910 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3911 REGCP_UNWIND(ST.cp);
3912 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3914 *PL_reglastparen = n;
3915 scan = ST.next_branch;
3916 /* no more branches? */
3917 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3919 continue; /* execute next BRANCH[J] op */
3927 #define ST st->u.curlym
3929 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3931 /* This is an optimisation of CURLYX that enables us to push
3932 * only a single backtracking state, no matter now many matches
3933 * there are in {m,n}. It relies on the pattern being constant
3934 * length, with no parens to influence future backrefs
3938 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3940 /* if paren positive, emulate an OPEN/CLOSE around A */
3942 I32 paren = ST.me->flags;
3943 if (paren > PL_regsize)
3945 if (paren > (I32)*PL_reglastparen)
3946 *PL_reglastparen = paren;
3947 scan += NEXT_OFF(scan); /* Skip former OPEN. */