5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 8
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHPTR(PL_regstartp);
186 SSPUSHPTR(PL_regendp);
187 SSPUSHINT(PL_regsize);
188 SSPUSHINT(*PL_reglastparen);
189 SSPUSHINT(*PL_reglastcloseparen);
190 SSPUSHPTR(PL_reginput);
191 #define REGCP_FRAME_ELEMS 2
192 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
193 * are needed for the regexp context stack bookkeeping. */
194 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
195 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
200 /* These are needed since we do not localize EVAL nodes: */
201 #define REGCP_SET(cp) \
203 PerlIO_printf(Perl_debug_log, \
204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
205 (IV)PL_savestack_ix)); \
208 #define REGCP_UNWIND(cp) \
210 if (cp != PL_savestack_ix) \
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
213 (IV)(cp), (IV)PL_savestack_ix)); \
217 S_regcppop(pTHX_ const regexp *rex)
223 GET_RE_DEBUG_FLAGS_DECL;
225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
229 input = (char *) SSPOPPTR;
230 *PL_reglastcloseparen = SSPOPINT;
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
233 PL_regendp=(I32 *) SSPOPPTR;
234 PL_regstartp=(I32 *) SSPOPPTR;
237 /* Now restore the parentheses context. */
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
241 U32 paren = (U32)SSPOPINT;
242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
243 PL_regstartp[paren] = SSPOPINT;
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
248 PerlIO_printf(Perl_debug_log,
249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
250 (UV)paren, (IV)PL_regstartp[paren],
251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
252 (IV)PL_regendp[paren],
253 (paren > *PL_reglastparen ? "(no)" : ""));
257 if (*PL_reglastparen + 1 <= rex->nparens) {
258 PerlIO_printf(Perl_debug_log,
259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
260 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
276 PL_regstartp[i] = -1;
283 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
286 * pregexec and friends
289 #ifndef PERL_IN_XSUB_RE
291 - pregexec - match a regexp against a string
294 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
295 char *strbeg, I32 minend, SV *screamer, U32 nosave)
296 /* strend: pointer to null at end of string */
297 /* strbeg: real beginning of string */
298 /* minend: end of match must be >=minend after stringarg. */
299 /* nosave: For optimizations. */
302 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
303 nosave ? 0 : REXEC_COPY_STR);
308 * Need to implement the following flags for reg_anch:
310 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
313 * INTUIT_AUTORITATIVE_ML
314 * INTUIT_ONCE_NOML - Intuit can match in one location only.
317 * Another flag for this function: SECOND_TIME (so that float substrs
318 * with giant delta may be not rechecked).
321 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
324 Otherwise, only SvCUR(sv) is used to get strbeg. */
326 /* XXXX We assume that strpos is strbeg unless sv. */
328 /* XXXX Some places assume that there is a fixed substring.
329 An update may be needed if optimizer marks as "INTUITable"
330 RExen without fixed substrings. Similarly, it is assumed that
331 lengths of all the strings are no more than minlen, thus they
332 cannot come from lookahead.
333 (Or minlen should take into account lookahead.) */
335 /* A failure to find a constant substring means that there is no need to make
336 an expensive call to REx engine, thus we celebrate a failure. Similarly,
337 finding a substring too deep into the string means that less calls to
338 regtry() should be needed.
340 REx compiler's optimizer found 4 possible hints:
341 a) Anchored substring;
343 c) Whether we are anchored (beginning-of-line or \G);
344 d) First node (of those at offset 0) which may distingush positions;
345 We use a)b)d) and multiline-part of c), and try to find a position in the
346 string which does not contradict any of them.
349 /* Most of decisions we do here should have been done at compile time.
350 The nodes of the REx which we used for the search should have been
351 deleted from the finite automaton. */
354 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
355 char *strend, U32 flags, re_scream_pos_data *data)
358 register I32 start_shift = 0;
359 /* Should be nonnegative! */
360 register I32 end_shift = 0;
365 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
367 register char *other_last = NULL; /* other substr checked before this */
368 char *check_at = NULL; /* check substr found at this pos */
369 const I32 multiline = prog->reganch & PMf_MULTILINE;
371 const char * const i_strpos = strpos;
374 GET_RE_DEBUG_FLAGS_DECL;
376 RX_MATCH_UTF8_set(prog,do_utf8);
378 if (prog->reganch & ROPT_UTF8) {
379 PL_reg_flags |= RF_utf8;
382 debug_start_match(prog, do_utf8, strpos, strend,
383 sv ? "Guessing start of match in sv for"
384 : "Guessing start of match in string for");
387 /* CHR_DIST() would be more correct here but it makes things slow. */
388 if (prog->minlen > strend - strpos) {
389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
390 "String too short... [re_intuit_start]\n"));
394 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
397 if (!prog->check_utf8 && prog->check_substr)
398 to_utf8_substr(prog);
399 check = prog->check_utf8;
401 if (!prog->check_substr && prog->check_utf8)
402 to_byte_substr(prog);
403 check = prog->check_substr;
405 if (check == &PL_sv_undef) {
406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
407 "Non-utf8 string cannot match utf8 check string\n"));
410 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
411 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
412 || ( (prog->reganch & ROPT_ANCH_BOL)
413 && !multiline ) ); /* Check after \n? */
416 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
417 | ROPT_IMPLICIT)) /* not a real BOL */
418 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
420 && (strpos != strbeg)) {
421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
424 if (prog->check_offset_min == prog->check_offset_max &&
425 !(prog->reganch & ROPT_CANY_SEEN)) {
426 /* Substring at constant offset from beg-of-str... */
429 s = HOP3c(strpos, prog->check_offset_min, strend);
432 slen = SvCUR(check); /* >= 1 */
434 if ( strend - s > slen || strend - s < slen - 1
435 || (strend - s == slen && strend[-1] != '\n')) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
439 /* Now should match s[0..slen-2] */
441 if (slen && (*SvPVX_const(check) != *s
443 && memNE(SvPVX_const(check), s, slen)))) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
449 else if (*SvPVX_const(check) != *s
450 || ((slen = SvCUR(check)) > 1
451 && memNE(SvPVX_const(check), s, slen)))
454 goto success_at_start;
457 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
459 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
460 end_shift = prog->check_end_shift;
463 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
464 - (SvTAIL(check) != 0);
465 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
467 if (end_shift < eshift)
471 else { /* Can match at random position */
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->check_end_shift;
477 /* end shift should be non negative here */
480 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
482 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
483 (IV)end_shift, prog->precomp);
487 /* Find a possible match in the region s..strend by looking for
488 the "check" substring in the region corrected by start/end_shift. */
491 I32 srch_start_shift = start_shift;
492 I32 srch_end_shift = end_shift;
493 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
494 srch_end_shift -= ((strbeg - s) - srch_start_shift);
495 srch_start_shift = strbeg - s;
497 DEBUG_OPTIMISE_MORE_r({
498 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
499 (IV)prog->check_offset_min,
500 (IV)srch_start_shift,
502 (IV)prog->check_end_shift);
505 if (flags & REXEC_SCREAM) {
506 I32 p = -1; /* Internal iterator of scream. */
507 I32 * const pp = data ? data->scream_pos : &p;
509 if (PL_screamfirst[BmRARE(check)] >= 0
510 || ( BmRARE(check) == '\n'
511 && (BmPREVIOUS(check) == SvCUR(check) - 1)
513 s = screaminstr(sv, check,
514 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
517 /* we may be pointing at the wrong string */
518 if (s && RX_MATCH_COPIED(prog))
519 s = strbeg + (s - SvPVX_const(sv));
521 *data->scream_olds = s;
526 if (prog->reganch & ROPT_CANY_SEEN) {
527 start_point= (U8*)(s + srch_start_shift);
528 end_point= (U8*)(strend - srch_end_shift);
530 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
531 end_point= HOP3(strend, -srch_end_shift, strbeg);
533 DEBUG_OPTIMISE_MORE_r({
534 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
535 (int)(end_point - start_point),
536 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
540 s = fbm_instr( start_point, end_point,
541 check, multiline ? FBMrf_MULTILINE : 0);
544 /* Update the count-of-usability, remove useless subpatterns,
548 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
549 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
550 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
551 (s ? "Found" : "Did not find"),
552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
553 ? "anchored" : "floating"),
556 (s ? " at offset " : "...\n") );
561 /* Finish the diagnostic message */
562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
564 /* XXX dmq: first branch is for positive lookbehind...
565 Our check string is offset from the beginning of the pattern.
566 So we need to do any stclass tests offset forward from that
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
578 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
584 : (prog->float_substr && prog->anchored_substr))
586 /* Take into account the "other" substring. */
587 /* XXXX May be hopelessly wrong for UTF... */
590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
593 char * const last = HOP3c(s, -start_shift, strbeg);
595 char * const saved_s = s;
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
606 t = HOP3c(t, prog->anchored_offset, strend);
607 if (t < other_last) /* These positions already checked */
609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
612 /* XXXX It is not documented what units *_offsets are in.
613 We assume bytes, but this is clearly wrong.
614 Meaning this code needs to be carefully reviewed for errors.
618 /* On end-of-str: see comment below. */
619 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
620 if (must == &PL_sv_undef) {
622 DEBUG_r(must = prog->anchored_utf8); /* for debug */
627 HOP3(HOP3(last1, prog->anchored_offset, strend)
628 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
630 multiline ? FBMrf_MULTILINE : 0
633 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
634 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
635 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
636 (s ? "Found" : "Contradicts"),
637 quoted, RE_SV_TAIL(must));
642 if (last1 >= last2) {
643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
644 ", giving up...\n"));
647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
648 ", trying floating at offset %ld...\n",
649 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
650 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
651 s = HOP3c(last, 1, strend);
655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
656 (long)(s - i_strpos)));
657 t = HOP3c(s, -prog->anchored_offset, strbeg);
658 other_last = HOP3c(s, 1, strend);
666 else { /* Take into account the floating substring. */
668 char * const saved_s = s;
671 t = HOP3c(s, -start_shift, strbeg);
673 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
674 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
675 last = HOP3c(t, prog->float_max_offset, strend);
676 s = HOP3c(t, prog->float_min_offset, strend);
679 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
680 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
681 /* fbm_instr() takes into account exact value of end-of-str
682 if the check is SvTAIL(ed). Since false positives are OK,
683 and end-of-str is not later than strend we are OK. */
684 if (must == &PL_sv_undef) {
686 DEBUG_r(must = prog->float_utf8); /* for debug message */
689 s = fbm_instr((unsigned char*)s,
690 (unsigned char*)last + SvCUR(must)
692 must, multiline ? FBMrf_MULTILINE : 0);
694 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
695 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
696 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
697 (s ? "Found" : "Contradicts"),
698 quoted, RE_SV_TAIL(must));
702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
703 ", giving up...\n"));
706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
707 ", trying anchored starting at offset %ld...\n",
708 (long)(saved_s + 1 - i_strpos)));
710 s = HOP3c(t, 1, strend);
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
715 (long)(s - i_strpos)));
716 other_last = s; /* Fix this later. --Hugo */
726 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
728 DEBUG_OPTIMISE_MORE_r(
729 PerlIO_printf(Perl_debug_log,
730 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
731 (IV)prog->check_offset_min,
732 (IV)prog->check_offset_max,
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
748 if (ml_anch && t[-1] != '\n') {
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
756 while (t < strend - prog->minlen) {
758 if (t < check_at - prog->check_offset_min) {
759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
769 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
770 goto do_other_anchored;
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
776 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
783 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
784 other_last = strpos = s = t + 1;
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
790 PL_colors[0], PL_colors[1]));
794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
795 PL_colors[0], PL_colors[1]));
799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
802 /* The found string does not prohibit matching at strpos,
803 - no optimization of calling REx engine can be performed,
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
810 && (strpos != strbeg) && strpos[-1] != '\n'
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
817 DEBUG_EXECUTE_r( if (ml_anch)
818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
819 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
833 /* If flags & SOMETHING - do not do it many times on the same match */
834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
838 prog->check_substr = prog->check_utf8 = NULL; /* disable */
839 prog->float_substr = prog->float_utf8 = NULL; /* clear */
840 check = NULL; /* abort */
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
845 prog->reganch &= ~RE_USE_INTUIT;
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
853 /* trie stclasses are too expensive to use here, we are better off to
854 leave it to regmatch itself */
855 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
864 const U8* const str = (U8*)STRING(prog->regstclass);
865 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
869 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
871 else if (prog->float_substr || prog->float_utf8)
872 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
877 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
880 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
883 const char *what = NULL;
885 if (endpos == strend) {
886 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
887 "Could not match STCLASS...\n") );
890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
891 "This position contradicts STCLASS...\n") );
892 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
894 /* Contradict one of substrings */
895 if (prog->anchored_substr || prog->anchored_utf8) {
896 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
897 DEBUG_EXECUTE_r( what = "anchored" );
899 s = HOP3c(t, 1, strend);
900 if (s + start_shift + end_shift > strend) {
901 /* XXXX Should be taken into account earlier? */
902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
903 "Could not match STCLASS...\n") );
908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
909 "Looking for %s substr starting at offset %ld...\n",
910 what, (long)(s + start_shift - i_strpos)) );
913 /* Have both, check_string is floating */
914 if (t + start_shift >= check_at) /* Contradicts floating=check */
915 goto retry_floating_check;
916 /* Recheck anchored substring, but not floating... */
920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
921 "Looking for anchored substr starting at offset %ld...\n",
922 (long)(other_last - i_strpos)) );
923 goto do_other_anchored;
925 /* Another way we could have checked stclass at the
926 current position only: */
931 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
932 "Looking for /%s^%s/m starting at offset %ld...\n",
933 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
936 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
938 /* Check is floating subtring. */
939 retry_floating_check:
940 t = check_at - start_shift;
941 DEBUG_EXECUTE_r( what = "floating" );
942 goto hop_and_restart;
945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946 "By STCLASS: moving %ld --> %ld\n",
947 (long)(t - i_strpos), (long)(s - i_strpos))
951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
952 "Does not contradict STCLASS...\n");
957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
958 PL_colors[4], (check ? "Guessed" : "Giving up"),
959 PL_colors[5], (long)(s - i_strpos)) );
962 fail_finish: /* Substring not found */
963 if (prog->check_substr || prog->check_utf8) /* could be removed already */
964 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
967 PL_colors[4], PL_colors[5]));
973 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
974 foldlen, foldbuf, uniflags) STMT_START { \
975 switch (trie_type) { \
976 case trie_utf8_fold: \
978 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
983 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
985 foldlen -= UNISKIP( uvc ); \
986 uscan = foldbuf + UNISKIP( uvc ); \
990 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
998 charid = trie->charmap[ uvc ]; \
1002 if (trie->widecharmap) { \
1003 SV** const svpp = hv_fetch(trie->widecharmap, \
1004 (char*)&uvc, sizeof(UV), 0); \
1006 charid = (U16)SvIV(*svpp); \
1011 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1014 ibcmp_utf8(s, NULL, 0, do_utf8, \
1015 m, NULL, ln, (bool)UTF)) \
1016 && (!reginfo || regtry(reginfo, s)) ) \
1019 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1020 uvchr_to_utf8(tmpbuf, c); \
1021 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1023 && (f == c1 || f == c2) \
1024 && (ln == foldlen || \
1025 !ibcmp_utf8((char *) foldbuf, \
1026 NULL, foldlen, do_utf8, \
1028 NULL, ln, (bool)UTF)) \
1029 && (!reginfo || regtry(reginfo, s)) ) \
1034 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1038 && (ln == 1 || !(OP(c) == EXACTF \
1040 : ibcmp_locale(s, m, ln))) \
1041 && (!reginfo || regtry(reginfo, s)) ) \
1047 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1049 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1055 #define REXEC_FBC_SCAN(CoDe) \
1057 while (s < strend) { \
1063 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1064 REXEC_FBC_UTF8_SCAN( \
1066 if (tmp && (!reginfo || regtry(reginfo, s))) \
1075 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1078 if (tmp && (!reginfo || regtry(reginfo, s))) \
1087 #define REXEC_FBC_TRYIT \
1088 if ((!reginfo || regtry(reginfo, s))) \
1091 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1094 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1097 REXEC_FBC_CLASS_SCAN(CoNd); \
1101 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1102 PL_reg_flags |= RF_tainted; \
1104 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1107 REXEC_FBC_CLASS_SCAN(CoNd); \
1111 #define DUMP_EXEC_POS(li,s,doutf8) \
1112 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1114 /* We know what class REx starts with. Try to find this position... */
1115 /* if reginfo is NULL, its a dryrun */
1116 /* annoyingly all the vars in this routine have different names from their counterparts
1117 in regmatch. /grrr */
1120 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1121 const char *strend, const regmatch_info *reginfo)
1124 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1128 register STRLEN uskip;
1132 register I32 tmp = 1; /* Scratch variable? */
1133 register const bool do_utf8 = PL_reg_match_utf8;
1135 /* We know what class it must start with. */
1139 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1140 !UTF8_IS_INVARIANT((U8)s[0]) ?
1141 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1142 REGINCLASS(prog, c, (U8*)s));
1145 while (s < strend) {
1148 if (REGINCLASS(prog, c, (U8*)s) ||
1149 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1150 /* The assignment of 2 is intentional:
1151 * for the folded sharp s, the skip is 2. */
1152 (skip = SHARP_S_SKIP))) {
1153 if (tmp && (!reginfo || regtry(reginfo, s)))
1166 if (tmp && (!reginfo || regtry(reginfo, s)))
1174 ln = STR_LEN(c); /* length to match in octets/bytes */
1175 lnc = (I32) ln; /* length to match in characters */
1177 STRLEN ulen1, ulen2;
1179 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1180 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1181 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1183 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1184 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1186 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1188 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1191 while (sm < ((U8 *) m + ln)) {
1206 c2 = PL_fold_locale[c1];
1208 e = HOP3c(strend, -((I32)lnc), s);
1210 if (!reginfo && e < s)
1211 e = s; /* Due to minlen logic of intuit() */
1213 /* The idea in the EXACTF* cases is to first find the
1214 * first character of the EXACTF* node and then, if
1215 * necessary, case-insensitively compare the full
1216 * text of the node. The c1 and c2 are the first
1217 * characters (though in Unicode it gets a bit
1218 * more complicated because there are more cases
1219 * than just upper and lower: one needs to use
1220 * the so-called folding case for case-insensitive
1221 * matching (called "loose matching" in Unicode).
1222 * ibcmp_utf8() will do just that. */
1226 U8 tmpbuf [UTF8_MAXBYTES+1];
1227 STRLEN len, foldlen;
1228 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1230 /* Upper and lower of 1st char are equal -
1231 * probably not a "letter". */
1233 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1235 REXEC_FBC_EXACTISH_CHECK(c == c1);
1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1243 /* Handle some of the three Greek sigmas cases.
1244 * Note that not all the possible combinations
1245 * are handled here: some of them are handled
1246 * by the standard folding rules, and some of
1247 * them (the character class or ANYOF cases)
1248 * are handled during compiletime in
1249 * regexec.c:S_regclass(). */
1250 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1251 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1252 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1254 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1260 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1262 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1266 PL_reg_flags |= RF_tainted;
1273 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1274 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1276 tmp = ((OP(c) == BOUND ?
1277 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1278 LOAD_UTF8_CHARCLASS_ALNUM();
1279 REXEC_FBC_UTF8_SCAN(
1280 if (tmp == !(OP(c) == BOUND ?
1281 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1282 isALNUM_LC_utf8((U8*)s)))
1290 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1291 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1294 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1300 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1304 PL_reg_flags |= RF_tainted;
1311 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1312 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1314 tmp = ((OP(c) == NBOUND ?
1315 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1316 LOAD_UTF8_CHARCLASS_ALNUM();
1317 REXEC_FBC_UTF8_SCAN(
1318 if (tmp == !(OP(c) == NBOUND ?
1319 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1320 isALNUM_LC_utf8((U8*)s)))
1322 else REXEC_FBC_TRYIT;
1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1327 tmp = ((OP(c) == NBOUND ?
1328 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1331 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1333 else REXEC_FBC_TRYIT;
1336 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1340 REXEC_FBC_CSCAN_PRELOAD(
1341 LOAD_UTF8_CHARCLASS_ALNUM(),
1342 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1346 REXEC_FBC_CSCAN_TAINT(
1347 isALNUM_LC_utf8((U8*)s),
1351 REXEC_FBC_CSCAN_PRELOAD(
1352 LOAD_UTF8_CHARCLASS_ALNUM(),
1353 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1357 REXEC_FBC_CSCAN_TAINT(
1358 !isALNUM_LC_utf8((U8*)s),
1362 REXEC_FBC_CSCAN_PRELOAD(
1363 LOAD_UTF8_CHARCLASS_SPACE(),
1364 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1368 REXEC_FBC_CSCAN_TAINT(
1369 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1373 REXEC_FBC_CSCAN_PRELOAD(
1374 LOAD_UTF8_CHARCLASS_SPACE(),
1375 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1379 REXEC_FBC_CSCAN_TAINT(
1380 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1384 REXEC_FBC_CSCAN_PRELOAD(
1385 LOAD_UTF8_CHARCLASS_DIGIT(),
1386 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1390 REXEC_FBC_CSCAN_TAINT(
1391 isDIGIT_LC_utf8((U8*)s),
1395 REXEC_FBC_CSCAN_PRELOAD(
1396 LOAD_UTF8_CHARCLASS_DIGIT(),
1397 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1401 REXEC_FBC_CSCAN_TAINT(
1402 !isDIGIT_LC_utf8((U8*)s),
1408 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1409 trie_type = do_utf8 ?
1410 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1412 /* what trie are we using right now */
1414 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1415 reg_trie_data *trie=aho->trie;
1417 const char *last_start = strend - trie->minlen;
1419 const char *real_start = s;
1421 STRLEN maxlen = trie->maxlen;
1423 U8 **points; /* map of where we were in the input string
1424 when reading a given char. For ASCII this
1425 is unnecessary overhead as the relationship
1426 is always 1:1, but for unicode, especially
1427 case folded unicode this is not true. */
1428 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1432 GET_RE_DEBUG_FLAGS_DECL;
1434 /* We can't just allocate points here. We need to wrap it in
1435 * an SV so it gets freed properly if there is a croak while
1436 * running the match */
1439 sv_points=newSV(maxlen * sizeof(U8 *));
1440 SvCUR_set(sv_points,
1441 maxlen * sizeof(U8 *));
1442 SvPOK_on(sv_points);
1443 sv_2mortal(sv_points);
1444 points=(U8**)SvPV_nolen(sv_points );
1445 if ( trie_type != trie_utf8_fold
1446 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1449 bitmap=(U8*)trie->bitmap;
1451 bitmap=(U8*)ANYOF_BITMAP(c);
1453 /* this is the Aho-Corasick algorithm modified a touch
1454 to include special handling for long "unknown char"
1455 sequences. The basic idea being that we use AC as long
1456 as we are dealing with a possible matching char, when
1457 we encounter an unknown char (and we have not encountered
1458 an accepting state) we scan forward until we find a legal
1460 AC matching is basically that of trie matching, except
1461 that when we encounter a failing transition, we fall back
1462 to the current states "fail state", and try the current char
1463 again, a process we repeat until we reach the root state,
1464 state 1, or a legal transition. If we fail on the root state
1465 then we can either terminate if we have reached an accepting
1466 state previously, or restart the entire process from the beginning
1470 while (s <= last_start) {
1471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1479 U8 *uscan = (U8*)NULL;
1480 U8 *leftmost = NULL;
1482 U32 accepted_word= 0;
1486 while ( state && uc <= (U8*)strend ) {
1488 U32 word = aho->states[ state ].wordnum;
1492 DEBUG_TRIE_EXECUTE_r(
1493 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1494 dump_exec_pos( (char *)uc, c, strend, real_start,
1495 (char *)uc, do_utf8 );
1496 PerlIO_printf( Perl_debug_log,
1497 " Scanning for legal start char...\n");
1500 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1505 if (uc >(U8*)last_start) break;
1509 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1510 if (!leftmost || lpos < leftmost) {
1511 DEBUG_r(accepted_word=word);
1517 points[pointpos++ % maxlen]= uc;
1518 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1519 uvc, charid, foldlen, foldbuf, uniflags);
1520 DEBUG_TRIE_EXECUTE_r({
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1523 PerlIO_printf(Perl_debug_log,
1524 " Charid:%3u CP:%4"UVxf" ",
1530 word = aho->states[ state ].wordnum;
1532 base = aho->states[ state ].trans.base;
1534 DEBUG_TRIE_EXECUTE_r({
1536 dump_exec_pos( (char *)uc, c, strend, real_start,
1538 PerlIO_printf( Perl_debug_log,
1539 "%sState: %4"UVxf", word=%"UVxf,
1540 failed ? " Fail transition to " : "",
1541 (UV)state, (UV)word);
1546 (base + charid > trie->uniquecharcount )
1547 && (base + charid - 1 - trie->uniquecharcount
1549 && trie->trans[base + charid - 1 -
1550 trie->uniquecharcount].check == state
1551 && (tmp=trie->trans[base + charid - 1 -
1552 trie->uniquecharcount ].next))
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - legal\n"));
1560 DEBUG_TRIE_EXECUTE_r(
1561 PerlIO_printf( Perl_debug_log," - fail\n"));
1563 state = aho->fail[state];
1567 /* we must be accepting here */
1568 DEBUG_TRIE_EXECUTE_r(
1569 PerlIO_printf( Perl_debug_log," - accepting\n"));
1578 if (!state) state = 1;
1581 if ( aho->states[ state ].wordnum ) {
1582 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1583 if (!leftmost || lpos < leftmost) {
1584 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1589 s = (char*)leftmost;
1590 DEBUG_TRIE_EXECUTE_r({
1592 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1593 (UV)accepted_word, s - real_start
1596 if (!reginfo || regtry(reginfo, s)) {
1602 DEBUG_TRIE_EXECUTE_r({
1603 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1606 DEBUG_TRIE_EXECUTE_r(
1607 PerlIO_printf( Perl_debug_log,"No match.\n"));
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1625 - regexec_flags - match a regexp against a string
1628 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630 /* strend: pointer to null at end of string */
1631 /* strbeg: real beginning of string */
1632 /* minend: end of match must be >=minend after stringarg. */
1633 /* data: May be used for some additional optimizations. */
1634 /* nosave: For optimizations. */
1638 register regnode *c;
1639 register char *startpos = stringarg;
1640 I32 minlen; /* must match at least this many chars */
1641 I32 dontbother = 0; /* how many characters not to try at end */
1642 I32 end_shift = 0; /* Same for the end. */ /* CC */
1643 I32 scream_pos = -1; /* Internal iterator of scream. */
1644 char *scream_olds = NULL;
1645 SV* const oreplsv = GvSV(PL_replgv);
1646 const bool do_utf8 = (bool)DO_UTF8(sv);
1649 regmatch_info reginfo; /* create some info to pass to regtry etc */
1651 GET_RE_DEBUG_FLAGS_DECL;
1653 PERL_UNUSED_ARG(data);
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1661 multiline = prog->reganch & PMf_MULTILINE;
1662 reginfo.prog = prog;
1664 RX_MATCH_UTF8_set(prog, do_utf8);
1666 debug_start_match(prog, do_utf8, startpos, strend,
1670 minlen = prog->minlen;
1672 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1674 "String too short [regexec_flags]...\n"));
1679 /* Check validity of program. */
1680 if (UCHARAT(prog->program) != REG_MAGIC) {
1681 Perl_croak(aTHX_ "corrupted regexp program");
1685 PL_reg_eval_set = 0;
1688 if (prog->reganch & ROPT_UTF8)
1689 PL_reg_flags |= RF_utf8;
1691 /* Mark beginning of line for ^ and lookbehind. */
1692 reginfo.bol = startpos; /* XXX not used ??? */
1696 /* Mark end of line for $ (and such) */
1699 /* see how far we have to get to not match where we matched before */
1700 reginfo.till = startpos+minend;
1702 /* If there is a "must appear" string, look for it. */
1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1709 reginfo.ganch = startpos;
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
1714 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1715 if (prog->reganch & ROPT_ANCH_GPOS) {
1716 if (s > reginfo.ganch)
1721 else /* pos() not defined */
1722 reginfo.ganch = strbeg;
1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1726 re_scream_pos_data d;
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1733 goto phooey; /* not present */
1739 /* Simplest case: anchored match need be tried only once. */
1740 /* [unless only anchor is BOL and multiline is set] */
1741 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1742 if (s == startpos && regtry(®info, startpos))
1744 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1745 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1750 dontbother = minlen - 1;
1751 end = HOP3c(strend, -dontbother, strbeg) - 1;
1752 /* for multiline we only have to try after newlines */
1753 if (prog->check_substr || prog->check_utf8) {
1757 if (regtry(®info, s))
1762 if (prog->reganch & RE_USE_INTUIT) {
1763 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1774 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1775 if (regtry(®info, s))
1782 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1784 /* the warning about reginfo.ganch being used without intialization
1785 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1786 and we only enter this block when the same bit is set. */
1787 if (regtry(®info, reginfo.ganch))
1792 /* Messy cases: unanchored match. */
1793 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1794 /* we have /x+whatever/ */
1795 /* it must be a one character string (XXXX Except UTF?) */
1800 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1801 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1802 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1807 DEBUG_EXECUTE_r( did_match = 1 );
1808 if (regtry(®info, s)) goto got_it;
1810 while (s < strend && *s == ch)
1818 DEBUG_EXECUTE_r( did_match = 1 );
1819 if (regtry(®info, s)) goto got_it;
1821 while (s < strend && *s == ch)
1826 DEBUG_EXECUTE_r(if (!did_match)
1827 PerlIO_printf(Perl_debug_log,
1828 "Did not find anchored character...\n")
1831 else if (prog->anchored_substr != NULL
1832 || prog->anchored_utf8 != NULL
1833 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1834 && prog->float_max_offset < strend - s)) {
1839 char *last1; /* Last position checked before */
1843 if (prog->anchored_substr || prog->anchored_utf8) {
1844 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1847 back_max = back_min = prog->anchored_offset;
1849 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1850 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1851 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1852 back_max = prog->float_max_offset;
1853 back_min = prog->float_min_offset;
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1864 last = HOP3c(strend, /* Cannot start after this */
1865 -(I32)(CHR_SVLEN(must)
1866 - (SvTAIL(must) != 0) + back_min), strbeg);
1869 last1 = HOPc(s, -1);
1871 last1 = s - 1; /* bogus */
1873 /* XXXX check_substr already used to find "s", can optimize if
1874 check_substr==must. */
1876 dontbother = end_shift;
1877 strend = HOPc(strend, -dontbother);
1878 while ( (s <= last) &&
1879 ((flags & REXEC_SCREAM)
1880 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1881 end_shift, &scream_pos, 0))
1882 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1883 (unsigned char*)strend, must,
1884 multiline ? FBMrf_MULTILINE : 0))) ) {
1885 /* we may be pointing at the wrong string */
1886 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1887 s = strbeg + (s - SvPVX_const(sv));
1888 DEBUG_EXECUTE_r( did_match = 1 );
1889 if (HOPc(s, -back_max) > last1) {
1890 last1 = HOPc(s, -back_min);
1891 s = HOPc(s, -back_max);
1894 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1896 last1 = HOPc(s, -back_min);
1900 while (s <= last1) {
1901 if (regtry(®info, s))
1907 while (s <= last1) {
1908 if (regtry(®info, s))
1914 DEBUG_EXECUTE_r(if (!did_match) {
1915 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1916 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1917 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1918 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1919 ? "anchored" : "floating"),
1920 quoted, RE_SV_TAIL(must));
1924 else if ( (c = prog->regstclass) ) {
1926 const OPCODE op = OP(prog->regstclass);
1927 /* don't bother with what can't match */
1928 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1929 strend = HOPc(strend, -(minlen - 1));
1932 SV * const prop = sv_newmortal();
1933 regprop(prog, prop, c);
1935 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1937 PerlIO_printf(Perl_debug_log,
1938 "Matching stclass %.*s against %s (%d chars)\n",
1939 (int)SvCUR(prop), SvPVX_const(prop),
1940 quoted, (int)(strend - s));
1943 if (find_byclass(prog, c, s, strend, ®info))
1945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1949 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1954 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1955 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1956 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1958 if (flags & REXEC_SCREAM) {
1959 last = screaminstr(sv, float_real, s - strbeg,
1960 end_shift, &scream_pos, 1); /* last one */
1962 last = scream_olds; /* Only one occurrence. */
1963 /* we may be pointing at the wrong string */
1964 else if (RX_MATCH_COPIED(prog))
1965 s = strbeg + (s - SvPVX_const(sv));
1969 const char * const little = SvPV_const(float_real, len);
1971 if (SvTAIL(float_real)) {
1972 if (memEQ(strend - len + 1, little, len - 1))
1973 last = strend - len + 1;
1974 else if (!multiline)
1975 last = memEQ(strend - len, little, len)
1976 ? strend - len : NULL;
1982 last = rninstr(s, strend, little, little + len);
1984 last = strend; /* matching "$" */
1989 PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
1992 goto phooey; /* Should not happen! */
1994 dontbother = strend - last + prog->float_min_offset;
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
2002 if (regtry(®info, s))
2011 if (regtry(®info, s))
2013 } while (s++ < strend);
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2029 restore_pos(aTHX_ prog);
2031 if (prog->paren_names)
2032 (void)hv_iterinit(prog->paren_names);
2034 /* make sure $`, $&, $', and $digit will work later */
2035 if ( !(flags & REXEC_NOT_FIRST) ) {
2036 RX_MATCH_COPY_FREE(prog);
2037 if (flags & REXEC_COPY_STR) {
2038 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2039 #ifdef PERL_OLD_COPY_ON_WRITE
2041 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2043 PerlIO_printf(Perl_debug_log,
2044 "Copy on write: regexp capture, type %d\n",
2047 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2048 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2049 assert (SvPOKp(prog->saved_copy));
2053 RX_MATCH_COPIED_on(prog);
2054 s = savepvn(strbeg, i);
2060 prog->subbeg = strbeg;
2061 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2068 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2069 PL_colors[4], PL_colors[5]));
2070 if (PL_reg_eval_set)
2071 restore_pos(aTHX_ prog);
2077 - regtry - try match at specific point
2079 STATIC I32 /* 0 failure, 1 success */
2080 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2086 regexp *prog = reginfo->prog;
2087 GET_RE_DEBUG_FLAGS_DECL;
2089 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2092 PL_reg_eval_set = RS_init;
2093 DEBUG_EXECUTE_r(DEBUG_s(
2094 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2095 (IV)(PL_stack_sp - PL_stack_base));
2098 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2099 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2101 /* Apparently this is not needed, judging by wantarray. */
2102 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2103 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2106 /* Make $_ available to executed code. */
2107 if (reginfo->sv != DEFSV) {
2109 DEFSV = reginfo->sv;
2112 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2113 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2114 /* prepare for quick setting of pos */
2115 #ifdef PERL_OLD_COPY_ON_WRITE
2117 sv_force_normal_flags(sv, 0);
2119 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2120 &PL_vtbl_mglob, NULL, 0);
2124 PL_reg_oldpos = mg->mg_len;
2125 SAVEDESTRUCTOR_X(restore_pos, prog);
2127 if (!PL_reg_curpm) {
2128 Newxz(PL_reg_curpm, 1, PMOP);
2131 SV* const repointer = newSViv(0);
2132 /* so we know which PL_regex_padav element is PL_reg_curpm */
2133 SvFLAGS(repointer) |= SVf_BREAK;
2134 av_push(PL_regex_padav,repointer);
2135 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2136 PL_regex_pad = AvARRAY(PL_regex_padav);
2140 PM_SETRE(PL_reg_curpm, prog);
2141 PL_reg_oldcurpm = PL_curpm;
2142 PL_curpm = PL_reg_curpm;
2143 if (RX_MATCH_COPIED(prog)) {
2144 /* Here is a serious problem: we cannot rewrite subbeg,
2145 since it may be needed if this match fails. Thus
2146 $` inside (?{}) could fail... */
2147 PL_reg_oldsaved = prog->subbeg;
2148 PL_reg_oldsavedlen = prog->sublen;
2149 #ifdef PERL_OLD_COPY_ON_WRITE
2150 PL_nrs = prog->saved_copy;
2152 RX_MATCH_COPIED_off(prog);
2155 PL_reg_oldsaved = NULL;
2156 prog->subbeg = PL_bostr;
2157 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2159 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2160 prog->startp[0] = startpos - PL_bostr;
2161 PL_reginput = startpos;
2162 PL_reglastparen = &prog->lastparen;
2163 PL_reglastcloseparen = &prog->lastcloseparen;
2164 prog->lastparen = 0;
2165 prog->lastcloseparen = 0;
2167 PL_regstartp = prog->startp;
2168 PL_regendp = prog->endp;
2169 if (PL_reg_start_tmpl <= prog->nparens) {
2170 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2171 if(PL_reg_start_tmp)
2172 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2177 /* XXXX What this code is doing here?!!! There should be no need
2178 to do this again and again, PL_reglastparen should take care of
2181 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2182 * Actually, the code in regcppop() (which Ilya may be meaning by
2183 * PL_reglastparen), is not needed at all by the test suite
2184 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2185 * enough, for building DynaLoader, or otherwise this
2186 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2187 * will happen. Meanwhile, this code *is* needed for the
2188 * above-mentioned test suite tests to succeed. The common theme
2189 * on those tests seems to be returning null fields from matches.
2194 if (prog->nparens) {
2196 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2203 if (regmatch(reginfo, prog->program + 1)) {
2204 prog->endp[0] = PL_reginput - PL_bostr;
2207 REGCP_UNWIND(lastcp);
2212 #define sayYES goto yes
2213 #define sayNO goto no
2214 #define sayNO_SILENT goto no_silent
2216 /* we dont use STMT_START/END here because it leads to
2217 "unreachable code" warnings, which are bogus, but distracting. */
2218 #define CACHEsayNO \
2219 if (ST.cache_mask) \
2220 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2223 /* this is used to determine how far from the left messages like
2224 'failed...' are printed. It should be set such that messages
2225 are inline with the regop output that created them.
2227 #define REPORT_CODE_OFF 32
2230 /* Make sure there is a test for this +1 options in re_tests */
2231 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2233 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2234 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2236 #define SLAB_FIRST(s) (&(s)->states[0])
2237 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2239 /* grab a new slab and return the first slot in it */
2241 STATIC regmatch_state *
2244 #if PERL_VERSION < 9
2247 regmatch_slab *s = PL_regmatch_slab->next;
2249 Newx(s, 1, regmatch_slab);
2250 s->prev = PL_regmatch_slab;
2252 PL_regmatch_slab->next = s;
2254 PL_regmatch_slab = s;
2255 return SLAB_FIRST(s);
2259 /* push a new state then goto it */
2261 #define PUSH_STATE_GOTO(state, node) \
2263 st->resume_state = state; \
2266 /* push a new state with success backtracking, then goto it */
2268 #define PUSH_YES_STATE_GOTO(state, node) \
2270 st->resume_state = state; \
2271 goto push_yes_state;
2277 regmatch() - main matching routine
2279 This is basically one big switch statement in a loop. We execute an op,
2280 set 'next' to point the next op, and continue. If we come to a point which
2281 we may need to backtrack to on failure such as (A|B|C), we push a
2282 backtrack state onto the backtrack stack. On failure, we pop the top
2283 state, and re-enter the loop at the state indicated. If there are no more
2284 states to pop, we return failure.
2286 Sometimes we also need to backtrack on success; for example /A+/, where
2287 after successfully matching one A, we need to go back and try to
2288 match another one; similarly for lookahead assertions: if the assertion
2289 completes successfully, we backtrack to the state just before the assertion
2290 and then carry on. In these cases, the pushed state is marked as
2291 'backtrack on success too'. This marking is in fact done by a chain of
2292 pointers, each pointing to the previous 'yes' state. On success, we pop to
2293 the nearest yes state, discarding any intermediate failure-only states.
2294 Sometimes a yes state is pushed just to force some cleanup code to be
2295 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2296 it to free the inner regex.
2298 Note that failure backtracking rewinds the cursor position, while
2299 success backtracking leaves it alone.
2301 A pattern is complete when the END op is executed, while a subpattern
2302 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2303 ops trigger the "pop to last yes state if any, otherwise return true"
2306 A common convention in this function is to use A and B to refer to the two
2307 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2308 the subpattern to be matched possibly multiple times, while B is the entire
2309 rest of the pattern. Variable and state names reflect this convention.
2311 The states in the main switch are the union of ops and failure/success of
2312 substates associated with with that op. For example, IFMATCH is the op
2313 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2314 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2315 successfully matched A and IFMATCH_A_fail is a state saying that we have
2316 just failed to match A. Resume states always come in pairs. The backtrack
2317 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2318 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2319 on success or failure.
2321 The struct that holds a backtracking state is actually a big union, with
2322 one variant for each major type of op. The variable st points to the
2323 top-most backtrack struct. To make the code clearer, within each
2324 block of code we #define ST to alias the relevant union.
2326 Here's a concrete example of a (vastly oversimplified) IFMATCH
2332 #define ST st->u.ifmatch
2334 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2335 ST.foo = ...; // some state we wish to save
2337 // push a yes backtrack state with a resume value of
2338 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2340 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2343 case IFMATCH_A: // we have successfully executed A; now continue with B
2345 bar = ST.foo; // do something with the preserved value
2348 case IFMATCH_A_fail: // A failed, so the assertion failed
2349 ...; // do some housekeeping, then ...
2350 sayNO; // propagate the failure
2357 For any old-timers reading this who are familiar with the old recursive
2358 approach, the code above is equivalent to:
2360 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2369 ...; // do some housekeeping, then ...
2370 sayNO; // propagate the failure
2373 The topmost backtrack state, pointed to by st, is usually free. If you
2374 want to claim it, populate any ST.foo fields in it with values you wish to
2375 save, then do one of
2377 PUSH_STATE_GOTO(resume_state, node);
2378 PUSH_YES_STATE_GOTO(resume_state, node);
2380 which sets that backtrack state's resume value to 'resume_state', pushes a
2381 new free entry to the top of the backtrack stack, then goes to 'node'.
2382 On backtracking, the free slot is popped, and the saved state becomes the
2383 new free state. An ST.foo field in this new top state can be temporarily
2384 accessed to retrieve values, but once the main loop is re-entered, it
2385 becomes available for reuse.
2387 Note that the depth of the backtrack stack constantly increases during the
2388 left-to-right execution of the pattern, rather than going up and down with
2389 the pattern nesting. For example the stack is at its maximum at Z at the
2390 end of the pattern, rather than at X in the following:
2392 /(((X)+)+)+....(Y)+....Z/
2394 The only exceptions to this are lookahead/behind assertions and the cut,
2395 (?>A), which pop all the backtrack states associated with A before
2398 Bascktrack state structs are allocated in slabs of about 4K in size.
2399 PL_regmatch_state and st always point to the currently active state,
2400 and PL_regmatch_slab points to the slab currently containing
2401 PL_regmatch_state. The first time regmatch() is called, the first slab is
2402 allocated, and is never freed until interpreter destruction. When the slab
2403 is full, a new one is allocated and chained to the end. At exit from
2404 regmatch(), slabs allocated since entry are freed.
2409 #define DEBUG_STATE_pp(pp) \
2411 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2412 PerlIO_printf(Perl_debug_log, \
2415 reg_name[st->resume_state] ); \
2419 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2424 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2425 const char *start, const char *end, const char *blurb)
2427 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2431 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2432 prog->precomp, prog->prelen, 60);
2434 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2435 start, end - start, 60);
2437 PerlIO_printf(Perl_debug_log,
2438 "%s%s REx%s %s against %s\n",
2439 PL_colors[4], blurb, PL_colors[5], s0, s1);
2441 if (do_utf8||utf8_pat)
2442 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2443 utf8_pat ? "pattern" : "",
2444 utf8_pat && do_utf8 ? " and " : "",
2445 do_utf8 ? "string" : ""
2451 S_dump_exec_pos(pTHX_ const char *locinput,
2452 const regnode *scan,
2453 const char *loc_regeol,
2454 const char *loc_bostr,
2455 const char *loc_reg_starttry,
2458 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2459 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2460 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2461 /* The part of the string before starttry has one color
2462 (pref0_len chars), between starttry and current
2463 position another one (pref_len - pref0_len chars),
2464 after the current position the third one.
2465 We assume that pref0_len <= pref_len, otherwise we
2466 decrease pref0_len. */
2467 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2468 ? (5 + taill) - l : locinput - loc_bostr;
2471 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2473 pref0_len = pref_len - (locinput - loc_reg_starttry);
2474 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2475 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2476 ? (5 + taill) - pref_len : loc_regeol - locinput);
2477 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2481 if (pref0_len > pref_len)
2482 pref0_len = pref_len;
2484 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2486 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2487 (locinput - pref_len),pref0_len, 60, 4, 5);
2489 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2490 (locinput - pref_len + pref0_len),
2491 pref_len - pref0_len, 60, 2, 3);
2493 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2494 locinput, loc_regeol - locinput, 10, 0, 1);
2496 const STRLEN tlen=len0+len1+len2;
2497 PerlIO_printf(Perl_debug_log,
2498 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2499 (IV)(locinput - loc_bostr),
2502 (docolor ? "" : "> <"),
2504 (int)(tlen > 19 ? 0 : 19 - tlen),
2511 /* reg_check_named_buff_matched()
2512 * Checks to see if a named buffer has matched. The data array of
2513 * buffer numbers corresponding to the buffer is expected to reside
2514 * in the regexp->data->data array in the slot stored in the ARG() of
2515 * node involved. Note that this routine doesn't actually care about the
2516 * name, that information is not preserved from compilation to execution.
2517 * Returns the index of the leftmost defined buffer with the given name
2518 * or 0 if non of the buffers matched.
2521 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2523 SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];
2524 I32 *nums=(I32*)SvPVX(sv_dat);
2525 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2526 if ((I32)*PL_reglastparen >= nums[n] &&
2527 PL_regendp[nums[n]] != -1)
2535 STATIC I32 /* 0 failure, 1 success */
2536 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2538 #if PERL_VERSION < 9
2542 register const bool do_utf8 = PL_reg_match_utf8;
2543 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2545 regexp *rex = reginfo->prog;
2547 regmatch_slab *orig_slab;
2548 regmatch_state *orig_state;
2550 /* the current state. This is a cached copy of PL_regmatch_state */
2551 register regmatch_state *st;
2553 /* cache heavy used fields of st in registers */
2554 register regnode *scan;
2555 register regnode *next;
2556 register I32 n = 0; /* general value; init to avoid compiler warning */
2557 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2558 register char *locinput = PL_reginput;
2559 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2561 bool result = 0; /* return value of S_regmatch */
2562 int depth = 0; /* depth of backtrack stack */
2563 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
2564 regmatch_state *yes_state = NULL; /* state to pop to on success of
2566 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2567 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2570 /* these three flags are set by various ops to signal information to
2571 * the very next op. They have a useful lifetime of exactly one loop
2572 * iteration, and are not preserved or restored by state pushes/pops
2574 bool sw = 0; /* the condition value in (?(cond)a|b) */
2575 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2576 int logical = 0; /* the following EVAL is:
2580 or the following IFMATCH/UNLESSM is:
2581 false: plain (?=foo)
2582 true: used as a condition: (?(?=foo))
2586 GET_RE_DEBUG_FLAGS_DECL;
2589 /* on first ever call to regmatch, allocate first slab */
2590 if (!PL_regmatch_slab) {
2591 Newx(PL_regmatch_slab, 1, regmatch_slab);
2592 PL_regmatch_slab->prev = NULL;
2593 PL_regmatch_slab->next = NULL;
2594 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2597 /* remember current high-water mark for exit */
2598 /* XXX this should be done with SAVE* instead */
2599 orig_slab = PL_regmatch_slab;
2600 orig_state = PL_regmatch_state;
2602 /* grab next free state slot */
2603 st = ++PL_regmatch_state;
2604 if (st > SLAB_LAST(PL_regmatch_slab))
2605 st = PL_regmatch_state = S_push_slab(aTHX);
2607 /* Note that nextchr is a byte even in UTF */
2608 nextchr = UCHARAT(locinput);
2610 while (scan != NULL) {
2613 SV * const prop = sv_newmortal();
2614 regnode *rnext=regnext(scan);
2615 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2616 regprop(rex, prop, scan);
2618 PerlIO_printf(Perl_debug_log,
2619 "%3"IVdf":%*s%s(%"IVdf")\n",
2620 (IV)(scan - rex->program), depth*2, "",
2622 (PL_regkind[OP(scan)] == END || !rnext) ?
2623 0 : (IV)(rnext - rex->program));
2626 next = scan + NEXT_OFF(scan);
2629 state_num = OP(scan);
2632 switch (state_num) {
2634 if (locinput == PL_bostr)
2636 /* reginfo->till = reginfo->bol; */
2641 if (locinput == PL_bostr ||
2642 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2648 if (locinput == PL_bostr)
2652 if (locinput == reginfo->ganch)
2658 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2663 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2665 if (PL_regeol - locinput > 1)
2669 if (PL_regeol != locinput)
2673 if (!nextchr && locinput >= PL_regeol)
2676 locinput += PL_utf8skip[nextchr];
2677 if (locinput > PL_regeol)
2679 nextchr = UCHARAT(locinput);
2682 nextchr = UCHARAT(++locinput);
2685 if (!nextchr && locinput >= PL_regeol)
2687 nextchr = UCHARAT(++locinput);
2690 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2693 locinput += PL_utf8skip[nextchr];
2694 if (locinput > PL_regeol)
2696 nextchr = UCHARAT(locinput);
2699 nextchr = UCHARAT(++locinput);
2703 #define ST st->u.trie
2705 /* In this case the charclass data is available inline so
2706 we can fail fast without a lot of extra overhead.
2708 if (scan->flags == EXACT || !do_utf8) {
2709 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2711 PerlIO_printf(Perl_debug_log,
2712 "%*s %sfailed to match trie start class...%s\n",
2713 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2722 /* what type of TRIE am I? (utf8 makes this contextual) */
2723 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2724 trie_type = do_utf8 ?
2725 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2728 /* what trie are we using right now */
2729 reg_trie_data * const trie
2730 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2731 U32 state = trie->startstate;
2733 if (trie->bitmap && trie_type != trie_utf8_fold &&
2734 !TRIE_BITMAP_TEST(trie,*locinput)
2736 if (trie->states[ state ].wordnum) {
2738 PerlIO_printf(Perl_debug_log,
2739 "%*s %smatched empty string...%s\n",
2740 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2745 PerlIO_printf(Perl_debug_log,
2746 "%*s %sfailed to match trie start class...%s\n",
2747 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2754 U8 *uc = ( U8* )locinput;
2758 U8 *uscan = (U8*)NULL;
2760 SV *sv_accept_buff = NULL;
2761 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2763 ST.accepted = 0; /* how many accepting states we have seen */
2765 ST.jump = trie->jump;
2774 traverse the TRIE keeping track of all accepting states
2775 we transition through until we get to a failing node.
2778 while ( state && uc <= (U8*)PL_regeol ) {
2779 U32 base = trie->states[ state ].trans.base;
2782 /* We use charid to hold the wordnum as we don't use it
2783 for charid until after we have done the wordnum logic.
2784 We define an alias just so that the wordnum logic reads
2787 #define got_wordnum charid
2788 got_wordnum = trie->states[ state ].wordnum;
2790 if ( got_wordnum ) {
2791 if ( ! ST.accepted ) {
2794 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2795 sv_accept_buff=newSV(bufflen *
2796 sizeof(reg_trie_accepted) - 1);
2797 SvCUR_set(sv_accept_buff, 0);
2798 SvPOK_on(sv_accept_buff);
2799 sv_2mortal(sv_accept_buff);
2802 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2805 if (ST.accepted >= bufflen) {
2807 ST.accept_buff =(reg_trie_accepted*)
2808 SvGROW(sv_accept_buff,
2809 bufflen * sizeof(reg_trie_accepted));
2811 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2812 + sizeof(reg_trie_accepted));
2815 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2816 ST.accept_buff[ST.accepted].endpos = uc;
2818 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2822 DEBUG_TRIE_EXECUTE_r({
2823 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2824 PerlIO_printf( Perl_debug_log,
2825 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2826 2+depth * 2, "", PL_colors[4],
2827 (UV)state, (UV)ST.accepted );
2831 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2832 uvc, charid, foldlen, foldbuf, uniflags);
2835 (base + charid > trie->uniquecharcount )
2836 && (base + charid - 1 - trie->uniquecharcount
2838 && trie->trans[base + charid - 1 -
2839 trie->uniquecharcount].check == state)
2841 state = trie->trans[base + charid - 1 -
2842 trie->uniquecharcount ].next;
2853 DEBUG_TRIE_EXECUTE_r(
2854 PerlIO_printf( Perl_debug_log,
2855 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2856 charid, uvc, (UV)state, PL_colors[5] );
2863 PerlIO_printf( Perl_debug_log,
2864 "%*s %sgot %"IVdf" possible matches%s\n",
2865 REPORT_CODE_OFF + depth * 2, "",
2866 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2872 case TRIE_next_fail: /* we failed - try next alterative */
2874 if ( ST.accepted == 1 ) {
2875 /* only one choice left - just continue */
2877 reg_trie_data * const trie
2878 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2879 SV ** const tmp = RX_DEBUG(reginfo->prog)
2880 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2882 PerlIO_printf( Perl_debug_log,
2883 "%*s %sonly one match left: #%d <%s>%s\n",
2884 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2885 ST.accept_buff[ 0 ].wordnum,
2886 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2889 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2890 /* in this case we free tmps/leave before we call regmatch
2891 as we wont be using accept_buff again. */
2894 locinput = PL_reginput;
2895 nextchr = UCHARAT(locinput);
2900 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2902 continue; /* execute rest of RE */
2905 if (!ST.accepted-- ) {
2912 There are at least two accepting states left. Presumably
2913 the number of accepting states is going to be low,
2914 typically two. So we simply scan through to find the one
2915 with lowest wordnum. Once we find it, we swap the last
2916 state into its place and decrement the size. We then try to
2917 match the rest of the pattern at the point where the word
2918 ends. If we succeed, control just continues along the
2919 regex; if we fail we return here to try the next accepting
2926 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2927 DEBUG_TRIE_EXECUTE_r(
2928 PerlIO_printf( Perl_debug_log,
2929 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2930 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
2931 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2932 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2935 if (ST.accept_buff[cur].wordnum <
2936 ST.accept_buff[best].wordnum)
2941 reg_trie_data * const trie
2942 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2943 SV ** const tmp = RX_DEBUG(reginfo->prog)
2944 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2946 regnode *nextop=!ST.jump ?
2948 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2949 PerlIO_printf( Perl_debug_log,
2950 "%*s %strying alternation #%d <%s> at node #%d %s\n",
2951 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2952 ST.accept_buff[best].wordnum,
2953 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2954 REG_NODE_NUM(nextop),
2958 if ( best<ST.accepted ) {
2959 reg_trie_accepted tmp = ST.accept_buff[ best ];
2960 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2961 ST.accept_buff[ ST.accepted ] = tmp;
2964 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2966 PUSH_STATE_GOTO(TRIE_next, ST.B);
2969 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2979 char *s = STRING(scan);
2981 if (do_utf8 != UTF) {
2982 /* The target and the pattern have differing utf8ness. */
2984 const char * const e = s + ln;
2987 /* The target is utf8, the pattern is not utf8. */
2992 if (NATIVE_TO_UNI(*(U8*)s) !=
2993 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3001 /* The target is not utf8, the pattern is utf8. */
3006 if (NATIVE_TO_UNI(*((U8*)l)) !=
3007 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3015 nextchr = UCHARAT(locinput);
3018 /* The target and the pattern have the same utf8ness. */
3019 /* Inline the first character, for speed. */
3020 if (UCHARAT(s) != nextchr)
3022 if (PL_regeol - locinput < ln)
3024 if (ln > 1 && memNE(s, locinput, ln))
3027 nextchr = UCHARAT(locinput);
3031 PL_reg_flags |= RF_tainted;
3034 char * const s = STRING(scan);
3037 if (do_utf8 || UTF) {
3038 /* Either target or the pattern are utf8. */
3039 const char * const l = locinput;
3040 char *e = PL_regeol;
3042 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3043 l, &e, 0, do_utf8)) {
3044 /* One more case for the sharp s:
3045 * pack("U0U*", 0xDF) =~ /ss/i,
3046 * the 0xC3 0x9F are the UTF-8
3047 * byte sequence for the U+00DF. */
3049 toLOWER(s[0]) == 's' &&
3051 toLOWER(s[1]) == 's' &&
3058 nextchr = UCHARAT(locinput);
3062 /* Neither the target and the pattern are utf8. */
3064 /* Inline the first character, for speed. */
3065 if (UCHARAT(s) != nextchr &&
3066 UCHARAT(s) != ((OP(scan) == EXACTF)
3067 ? PL_fold : PL_fold_locale)[nextchr])
3069 if (PL_regeol - locinput < ln)
3071 if (ln > 1 && (OP(scan) == EXACTF
3072 ? ibcmp(s, locinput, ln)
3073 : ibcmp_locale(s, locinput, ln)))
3076 nextchr = UCHARAT(locinput);
3081 STRLEN inclasslen = PL_regeol - locinput;
3083 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3085 if (locinput >= PL_regeol)
3087 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3088 nextchr = UCHARAT(locinput);
3093 nextchr = UCHARAT(locinput);
3094 if (!REGINCLASS(rex, scan, (U8*)locinput))
3096 if (!nextchr && locinput >= PL_regeol)
3098 nextchr = UCHARAT(++locinput);
3102 /* If we might have the case of the German sharp s
3103 * in a casefolding Unicode character class. */
3105 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3106 locinput += SHARP_S_SKIP;
3107 nextchr = UCHARAT(locinput);
3113 PL_reg_flags |= RF_tainted;
3119 LOAD_UTF8_CHARCLASS_ALNUM();
3120 if (!(OP(scan) == ALNUM
3121 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3122 : isALNUM_LC_utf8((U8*)locinput)))
3126 locinput += PL_utf8skip[nextchr];
3127 nextchr = UCHARAT(locinput);
3130 if (!(OP(scan) == ALNUM
3131 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3133 nextchr = UCHARAT(++locinput);
3136 PL_reg_flags |= RF_tainted;
3139 if (!nextchr && locinput >= PL_regeol)
3142 LOAD_UTF8_CHARCLASS_ALNUM();
3143 if (OP(scan) == NALNUM
3144 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3145 : isALNUM_LC_utf8((U8*)locinput))
3149 locinput += PL_utf8skip[nextchr];
3150 nextchr = UCHARAT(locinput);
3153 if (OP(scan) == NALNUM
3154 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3156 nextchr = UCHARAT(++locinput);
3160 PL_reg_flags |= RF_tainted;
3164 /* was last char in word? */
3166 if (locinput == PL_bostr)
3169 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3171 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3173 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3174 ln = isALNUM_uni(ln);
3175 LOAD_UTF8_CHARCLASS_ALNUM();
3176 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3179 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3180 n = isALNUM_LC_utf8((U8*)locinput);
3184 ln = (locinput != PL_bostr) ?
3185 UCHARAT(locinput - 1) : '\n';
3186 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3188 n = isALNUM(nextchr);
3191 ln = isALNUM_LC(ln);
3192 n = isALNUM_LC(nextchr);
3195 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3196 OP(scan) == BOUNDL))
3200 PL_reg_flags |= RF_tainted;
3206 if (UTF8_IS_CONTINUED(nextchr)) {
3207 LOAD_UTF8_CHARCLASS_SPACE();
3208 if (!(OP(scan) == SPACE
3209 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3210 : isSPACE_LC_utf8((U8*)locinput)))
3214 locinput += PL_utf8skip[nextchr];
3215 nextchr = UCHARAT(locinput);
3218 if (!(OP(scan) == SPACE
3219 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3221 nextchr = UCHARAT(++locinput);
3224 if (!(OP(scan) == SPACE
3225 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3227 nextchr = UCHARAT(++locinput);
3231 PL_reg_flags |= RF_tainted;
3234 if (!nextchr && locinput >= PL_regeol)
3237 LOAD_UTF8_CHARCLASS_SPACE();
3238 if (OP(scan) == NSPACE
3239 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3240 : isSPACE_LC_utf8((U8*)locinput))
3244 locinput += PL_utf8skip[nextchr];
3245 nextchr = UCHARAT(locinput);
3248 if (OP(scan) == NSPACE
3249 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3251 nextchr = UCHARAT(++locinput);
3254 PL_reg_flags |= RF_tainted;
3260 LOAD_UTF8_CHARCLASS_DIGIT();
3261 if (!(OP(scan) == DIGIT
3262 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3263 : isDIGIT_LC_utf8((U8*)locinput)))
3267 locinput += PL_utf8skip[nextchr];
3268 nextchr = UCHARAT(locinput);
3271 if (!(OP(scan) == DIGIT
3272 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3274 nextchr = UCHARAT(++locinput);
3277 PL_reg_flags |= RF_tainted;
3280 if (!nextchr && locinput >= PL_regeol)
3283 LOAD_UTF8_CHARCLASS_DIGIT();
3284 if (OP(scan) == NDIGIT
3285 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3286 : isDIGIT_LC_utf8((U8*)locinput))
3290 locinput += PL_utf8skip[nextchr];
3291 nextchr = UCHARAT(locinput);
3294 if (OP(scan) == NDIGIT
3295 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3297 nextchr = UCHARAT(++locinput);
3300 if (locinput >= PL_regeol)
3303 LOAD_UTF8_CHARCLASS_MARK();
3304 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3306 locinput += PL_utf8skip[nextchr];
3307 while (locinput < PL_regeol &&
3308 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3309 locinput += UTF8SKIP(locinput);
3310 if (locinput > PL_regeol)
3315 nextchr = UCHARAT(locinput);
3322 PL_reg_flags |= RF_tainted;
3327 n = reg_check_named_buff_matched(rex,scan);
3330 type = REF + ( type - NREF );
3337 PL_reg_flags |= RF_tainted;
3341 n = ARG(scan); /* which paren pair */
3344 ln = PL_regstartp[n];
3345 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3346 if ((I32)*PL_reglastparen < n || ln == -1)
3347 sayNO; /* Do not match unless seen CLOSEn. */
3348 if (ln == PL_regendp[n])
3352 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3354 const char *e = PL_bostr + PL_regendp[n];
3356 * Note that we can't do the "other character" lookup trick as
3357 * in the 8-bit case (no pun intended) because in Unicode we
3358 * have to map both upper and title case to lower case.
3362 STRLEN ulen1, ulen2;
3363 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3364 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3368 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3369 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3370 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3377 nextchr = UCHARAT(locinput);
3381 /* Inline the first character, for speed. */
3382 if (UCHARAT(s) != nextchr &&
3384 (UCHARAT(s) != (type == REFF
3385 ? PL_fold : PL_fold_locale)[nextchr])))
3387 ln = PL_regendp[n] - ln;
3388 if (locinput + ln > PL_regeol)
3390 if (ln > 1 && (type == REF
3391 ? memNE(s, locinput, ln)
3393 ? ibcmp(s, locinput, ln)
3394 : ibcmp_locale(s, locinput, ln))))
3397 nextchr = UCHARAT(locinput);
3407 #define ST st->u.eval
3411 regnode *startpoint;
3414 case RECURSE: /* /(...(?1))/ */
3415 if (cur_eval && cur_eval->locinput==locinput) {
3416 if (cur_eval->u.eval.close_paren == ARG(scan))
3417 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3418 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3419 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3424 (void)ReREFCNT_inc(rex);
3425 if (OP(scan)==RECURSE) {
3426 startpoint = scan + ARG2L(scan);
3427 ST.close_paren = ARG(scan);
3429 startpoint = re->program+1;
3432 goto eval_recurse_doit;
3434 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3435 if (cur_eval && cur_eval->locinput==locinput) {
3436 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3437 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3442 /* execute the code in the {...} */
3444 SV ** const before = SP;
3445 OP_4tree * const oop = PL_op;
3446 COP * const ocurcop = PL_curcop;
3450 PL_op = (OP_4tree*)rex->data->data[n];
3451 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3452 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3453 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3455 CALLRUNOPS(aTHX); /* Scalar context. */
3458 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3465 PAD_RESTORE_LOCAL(old_comppad);
3466 PL_curcop = ocurcop;
3469 sv_setsv(save_scalar(PL_replgv), ret);
3473 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3476 /* extract RE object from returned value; compiling if
3481 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3482 mg = mg_find(sv, PERL_MAGIC_qr);
3483 else if (SvSMAGICAL(ret)) {
3484 if (SvGMAGICAL(ret))
3485 sv_unmagic(ret, PERL_MAGIC_qr);
3487 mg = mg_find(ret, PERL_MAGIC_qr);
3491 re = (regexp *)mg->mg_obj;
3492 (void)ReREFCNT_inc(re);
3496 const char * const t = SvPV_const(ret, len);
3498 const I32 osize = PL_regsize;
3501 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3502 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3504 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3506 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3512 debug_start_match(re, do_utf8, locinput, PL_regeol,
3513 "Matching embedded");
3515 startpoint = re->program + 1;
3516 ST.close_paren = 0; /* only used for RECURSE */
3517 /* borrowed from regtry */
3518 if (PL_reg_start_tmpl <= re->nparens) {
3519 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3520 if(PL_reg_start_tmp)
3521 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3523 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3526 eval_recurse_doit: /* Share code with RECURSE below this line */
3527 /* run the pattern returned from (??{...}) */
3528 ST.cp = regcppush(0); /* Save *all* the positions. */
3529 REGCP_SET(ST.lastcp);
3531 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3532 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3534 *PL_reglastparen = 0;
3535 *PL_reglastcloseparen = 0;
3536 PL_reginput = locinput;
3539 /* XXXX This is too dramatic a measure... */
3542 ST.toggle_reg_flags = PL_reg_flags;
3543 if (re->reganch & ROPT_UTF8)
3544 PL_reg_flags |= RF_utf8;
3546 PL_reg_flags &= ~RF_utf8;
3547 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3550 ST.prev_curlyx = cur_curlyx;
3554 ST.prev_eval = cur_eval;
3556 /* now continue from first node in postoned RE */
3557 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3560 /* logical is 1, /(?(?{...})X|Y)/ */
3561 sw = (bool)SvTRUE(ret);
3566 case EVAL_AB: /* cleanup after a successful (??{A})B */
3567 /* note: this is called twice; first after popping B, then A */
3568 PL_reg_flags ^= ST.toggle_reg_flags;
3572 cur_eval = ST.prev_eval;
3573 cur_curlyx = ST.prev_curlyx;
3574 /* XXXX This is too dramatic a measure... */
3579 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3580 /* note: this is called twice; first after popping B, then A */
3581 PL_reg_flags ^= ST.toggle_reg_flags;
3584 PL_reginput = locinput;
3585 REGCP_UNWIND(ST.lastcp);
3587 cur_eval = ST.prev_eval;
3588 cur_curlyx = ST.prev_curlyx;
3589 /* XXXX This is too dramatic a measure... */
3595 n = ARG(scan); /* which paren pair */
3596 PL_reg_start_tmp[n] = locinput;
3601 n = ARG(scan); /* which paren pair */
3602 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3603 PL_regendp[n] = locinput - PL_bostr;
3604 if (n > (I32)*PL_reglastparen)
3605 *PL_reglastparen = n;
3606 *PL_reglastcloseparen = n;
3607 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3612 n = ARG(scan); /* which paren pair */
3613 sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3616 /* reg_check_named_buff_matched returns 0 for no match */
3617 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3621 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3627 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3629 next = NEXTOPER(NEXTOPER(scan));
3631 next = scan + ARG(scan);
3632 if (OP(next) == IFTHEN) /* Fake one. */
3633 next = NEXTOPER(NEXTOPER(next));
3637 logical = scan->flags;
3640 /*******************************************************************
3642 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3643 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3644 STAR/PLUS/CURLY/CURLYN are used instead.)
3646 A*B is compiled as <CURLYX><A><WHILEM><B>
3648 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3649 state, which contains the current count, initialised to -1. It also sets
3650 cur_curlyx to point to this state, with any previous value saved in the
3653 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3654 since the pattern may possibly match zero times (i.e. it's a while {} loop
3655 rather than a do {} while loop).
3657 Each entry to WHILEM represents a successful match of A. The count in the
3658 CURLYX block is incremented, another WHILEM state is pushed, and execution
3659 passes to A or B depending on greediness and the current count.
3661 For example, if matching against the string a1a2a3b (where the aN are
3662 substrings that match /A/), then the match progresses as follows: (the
3663 pushed states are interspersed with the bits of strings matched so far):
3666 <CURLYX cnt=0><WHILEM>
3667 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3668 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3669 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3670 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3672 (Contrast this with something like CURLYM, which maintains only a single
3676 a1 <CURLYM cnt=1> a2
3677 a1 a2 <CURLYM cnt=2> a3
3678 a1 a2 a3 <CURLYM cnt=3> b
3681 Each WHILEM state block marks a point to backtrack to upon partial failure
3682 of A or B, and also contains some minor state data related to that
3683 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3684 overall state, such as the count, and pointers to the A and B ops.
3686 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3687 must always point to the *current* CURLYX block, the rules are:
3689 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3690 and set cur_curlyx to point the new block.
3692 When popping the CURLYX block after a successful or unsuccessful match,
3693 restore the previous cur_curlyx.
3695 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3696 to the outer one saved in the CURLYX block.
3698 When popping the WHILEM block after a successful or unsuccessful B match,
3699 restore the previous cur_curlyx.
3701 Here's an example for the pattern (AI* BI)*BO
3702 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3705 curlyx backtrack stack
3706 ------ ---------------
3708 CO <CO prev=NULL> <WO>
3709 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3710 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3711 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3713 At this point the pattern succeeds, and we work back down the stack to
3714 clean up, restoring as we go:
3716 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3717 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3718 CO <CO prev=NULL> <WO>
3721 *******************************************************************/
3723 #define ST st->u.curlyx
3725 case CURLYX: /* start of /A*B/ (for complex A) */
3727 /* No need to save/restore up to this paren */
3728 I32 parenfloor = scan->flags;
3730 assert(next); /* keep Coverity happy */
3731 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3734 /* XXXX Probably it is better to teach regpush to support
3735 parenfloor > PL_regsize... */
3736 if (parenfloor > (I32)*PL_reglastparen)
3737 parenfloor = *PL_reglastparen; /* Pessimization... */
3739 ST.prev_curlyx= cur_curlyx;
3741 ST.cp = PL_savestack_ix;
3743 /* these fields contain the state of the current curly.
3744 * they are accessed by subsequent WHILEMs */
3745 ST.parenfloor = parenfloor;
3746 ST.min = ARG1(scan);
3747 ST.max = ARG2(scan);
3748 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3752 ST.count = -1; /* this will be updated by WHILEM */
3753 ST.lastloc = NULL; /* this will be updated by WHILEM */
3755 PL_reginput = locinput;
3756 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3760 case CURLYX_end: /* just finished matching all of A*B */
3762 cur_curlyx = ST.prev_curlyx;
3766 case CURLYX_end_fail: /* just failed to match all of A*B */
3768 cur_curlyx = ST.prev_curlyx;
3774 #define ST st->u.whilem
3776 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3778 /* see the discussion above about CURLYX/WHILEM */
3780 assert(cur_curlyx); /* keep Coverity happy */
3781 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3782 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3783 ST.cache_offset = 0;
3786 PL_reginput = locinput;
3788 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3789 "%*s whilem: matched %ld out of %ld..%ld\n",
3790 REPORT_CODE_OFF+depth*2, "", (long)n,
3791 (long)cur_curlyx->u.curlyx.min,
3792 (long)cur_curlyx->u.curlyx.max)
3795 /* First just match a string of min A's. */
3797 if (n < cur_curlyx->u.curlyx.min) {
3798 cur_curlyx->u.curlyx.lastloc = locinput;
3799 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3803 /* If degenerate A matches "", assume A done. */
3805 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3806 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3807 "%*s whilem: empty match detected, trying continuation...\n",
3808 REPORT_CODE_OFF+depth*2, "")
3810 goto do_whilem_B_max;
3813 /* super-linear cache processing */
3817 if (!PL_reg_maxiter) {
3818 /* start the countdown: Postpone detection until we
3819 * know the match is not *that* much linear. */
3820 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3821 /* possible overflow for long strings and many CURLYX's */
3822 if (PL_reg_maxiter < 0)
3823 PL_reg_maxiter = I32_MAX;
3824 PL_reg_leftiter = PL_reg_maxiter;
3827 if (PL_reg_leftiter-- == 0) {
3828 /* initialise cache */
3829 const I32 size = (PL_reg_maxiter + 7)/8;
3830 if (PL_reg_poscache) {
3831 if ((I32)PL_reg_poscache_size < size) {
3832 Renew(PL_reg_poscache, size, char);
3833 PL_reg_poscache_size = size;
3835 Zero(PL_reg_poscache, size, char);
3838 PL_reg_poscache_size = size;
3839 Newxz(PL_reg_poscache, size, char);
3841 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3842 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3843 PL_colors[4], PL_colors[5])
3847 if (PL_reg_leftiter < 0) {
3848 /* have we already failed at this position? */
3850 offset = (scan->flags & 0xf) - 1
3851 + (locinput - PL_bostr) * (scan->flags>>4);
3852 mask = 1 << (offset % 8);
3854 if (PL_reg_poscache[offset] & mask) {
3855 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3856 "%*s whilem: (cache) already tried at this position...\n",
3857 REPORT_CODE_OFF+depth*2, "")
3859 sayNO; /* cache records failure */
3861 ST.cache_offset = offset;
3862 ST.cache_mask = mask;
3866 /* Prefer B over A for minimal matching. */
3868 if (cur_curlyx->u.curlyx.minmod) {
3869 ST.save_curlyx = cur_curlyx;
3870 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3871 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3872 REGCP_SET(ST.lastcp);
3873 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3877 /* Prefer A over B for maximal matching. */
3879 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3880 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3881 cur_curlyx->u.curlyx.lastloc = locinput;
3882 REGCP_SET(ST.lastcp);
3883 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3886 goto do_whilem_B_max;
3890 case WHILEM_B_min: /* just matched B in a minimal match */
3891 case WHILEM_B_max: /* just matched B in a maximal match */
3892 cur_curlyx = ST.save_curlyx;
3896 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3897 cur_curlyx = ST.save_curlyx;
3898 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3899 cur_curlyx->u.curlyx.count--;
3903 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3904 REGCP_UNWIND(ST.lastcp);
3907 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3908 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3909 cur_curlyx->u.curlyx.count--;
3913 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3914 REGCP_UNWIND(ST.lastcp);
3915 regcppop(rex); /* Restore some previous $<digit>s? */
3916 PL_reginput = locinput;
3917 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3918 "%*s whilem: failed, trying continuation...\n",
3919 REPORT_CODE_OFF+depth*2, "")
3922 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3923 && ckWARN(WARN_REGEXP)
3924 && !(PL_reg_flags & RF_warned))
3926 PL_reg_flags |= RF_warned;
3927 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3928 "Complex regular subexpression recursion",
3933 ST.save_curlyx = cur_curlyx;
3934 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3935 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3938 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3939 cur_curlyx = ST.save_curlyx;
3940 REGCP_UNWIND(ST.lastcp);
3943 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3944 /* Maximum greed exceeded */
3945 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3946 && ckWARN(WARN_REGEXP)
3947 && !(PL_reg_flags & RF_warned))
3949 PL_reg_flags |= RF_warned;
3950 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3951 "%s limit (%d) exceeded",
3952 "Complex regular subexpression recursion",
3955 cur_curlyx->u.curlyx.count--;
3959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3960 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")