5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 8
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regendp[p]);
193 SSPUSHINT(PL_regstartp[p]);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regstartp[p],
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regstartp);
205 SSPUSHPTR(PL_regendp);
206 SSPUSHINT(PL_regsize);
207 SSPUSHINT(*PL_reglastparen);
208 SSPUSHINT(*PL_reglastcloseparen);
209 SSPUSHPTR(PL_reginput);
210 #define REGCP_FRAME_ELEMS 2
211 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
212 * are needed for the regexp context stack bookkeeping. */
213 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
214 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
219 /* These are needed since we do not localize EVAL nodes: */
220 #define REGCP_SET(cp) \
222 PerlIO_printf(Perl_debug_log, \
223 " Setting an EVAL scope, savestack=%"IVdf"\n", \
224 (IV)PL_savestack_ix)); \
227 #define REGCP_UNWIND(cp) \
229 if (cp != PL_savestack_ix) \
230 PerlIO_printf(Perl_debug_log, \
231 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
232 (IV)(cp), (IV)PL_savestack_ix)); \
236 S_regcppop(pTHX_ const regexp *rex)
242 GET_RE_DEBUG_FLAGS_DECL;
244 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
246 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
247 i = SSPOPINT; /* Parentheses elements to pop. */
248 input = (char *) SSPOPPTR;
249 *PL_reglastcloseparen = SSPOPINT;
250 *PL_reglastparen = SSPOPINT;
251 PL_regsize = SSPOPINT;
252 PL_regendp=(I32 *) SSPOPPTR;
253 PL_regstartp=(I32 *) SSPOPPTR;
256 /* Now restore the parentheses context. */
257 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
258 i > 0; i -= REGCP_PAREN_ELEMS) {
260 U32 paren = (U32)SSPOPINT;
261 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
262 PL_regstartp[paren] = SSPOPINT;
264 if (paren <= *PL_reglastparen)
265 PL_regendp[paren] = tmps;
267 PerlIO_printf(Perl_debug_log,
268 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
269 (UV)paren, (IV)PL_regstartp[paren],
270 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
271 (IV)PL_regendp[paren],
272 (paren > *PL_reglastparen ? "(no)" : ""));
276 if (*PL_reglastparen + 1 <= rex->nparens) {
277 PerlIO_printf(Perl_debug_log,
278 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
279 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
283 /* It would seem that the similar code in regtry()
284 * already takes care of this, and in fact it is in
285 * a better location to since this code can #if 0-ed out
286 * but the code in regtry() is needed or otherwise tests
287 * requiring null fields (pat.t#187 and split.t#{13,14}
288 * (as of patchlevel 7877) will fail. Then again,
289 * this code seems to be necessary or otherwise
290 * building DynaLoader will fail:
291 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
293 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
295 PL_regstartp[i] = -1;
302 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
305 * pregexec and friends
308 #ifndef PERL_IN_XSUB_RE
310 - pregexec - match a regexp against a string
313 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
314 char *strbeg, I32 minend, SV *screamer, U32 nosave)
315 /* strend: pointer to null at end of string */
316 /* strbeg: real beginning of string */
317 /* minend: end of match must be >=minend after stringarg. */
318 /* nosave: For optimizations. */
321 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
322 nosave ? 0 : REXEC_COPY_STR);
327 * Need to implement the following flags for reg_anch:
329 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
331 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
332 * INTUIT_AUTORITATIVE_ML
333 * INTUIT_ONCE_NOML - Intuit can match in one location only.
336 * Another flag for this function: SECOND_TIME (so that float substrs
337 * with giant delta may be not rechecked).
340 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
342 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
343 Otherwise, only SvCUR(sv) is used to get strbeg. */
345 /* XXXX We assume that strpos is strbeg unless sv. */
347 /* XXXX Some places assume that there is a fixed substring.
348 An update may be needed if optimizer marks as "INTUITable"
349 RExen without fixed substrings. Similarly, it is assumed that
350 lengths of all the strings are no more than minlen, thus they
351 cannot come from lookahead.
352 (Or minlen should take into account lookahead.)
353 NOTE: Some of this comment is not correct. minlen does now take account
354 of lookahead/behind. Further research is required. -- demerphq
358 /* A failure to find a constant substring means that there is no need to make
359 an expensive call to REx engine, thus we celebrate a failure. Similarly,
360 finding a substring too deep into the string means that less calls to
361 regtry() should be needed.
363 REx compiler's optimizer found 4 possible hints:
364 a) Anchored substring;
366 c) Whether we are anchored (beginning-of-line or \G);
367 d) First node (of those at offset 0) which may distingush positions;
368 We use a)b)d) and multiline-part of c), and try to find a position in the
369 string which does not contradict any of them.
372 /* Most of decisions we do here should have been done at compile time.
373 The nodes of the REx which we used for the search should have been
374 deleted from the finite automaton. */
377 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
378 char *strend, U32 flags, re_scream_pos_data *data)
381 register I32 start_shift = 0;
382 /* Should be nonnegative! */
383 register I32 end_shift = 0;
388 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
390 register char *other_last = NULL; /* other substr checked before this */
391 char *check_at = NULL; /* check substr found at this pos */
392 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
393 RXi_GET_DECL(prog,progi);
395 const char * const i_strpos = strpos;
398 GET_RE_DEBUG_FLAGS_DECL;
400 RX_MATCH_UTF8_set(prog,do_utf8);
402 if (prog->extflags & RXf_UTF8) {
403 PL_reg_flags |= RF_utf8;
406 debug_start_match(prog, do_utf8, strpos, strend,
407 sv ? "Guessing start of match in sv for"
408 : "Guessing start of match in string for");
411 /* CHR_DIST() would be more correct here but it makes things slow. */
412 if (prog->minlen > strend - strpos) {
413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
414 "String too short... [re_intuit_start]\n"));
418 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
421 if (!prog->check_utf8 && prog->check_substr)
422 to_utf8_substr(prog);
423 check = prog->check_utf8;
425 if (!prog->check_substr && prog->check_utf8)
426 to_byte_substr(prog);
427 check = prog->check_substr;
429 if (check == &PL_sv_undef) {
430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
431 "Non-utf8 string cannot match utf8 check string\n"));
434 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
435 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
436 || ( (prog->extflags & RXf_ANCH_BOL)
437 && !multiline ) ); /* Check after \n? */
440 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
441 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
442 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
444 && (strpos != strbeg)) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
448 if (prog->check_offset_min == prog->check_offset_max &&
449 !(prog->extflags & RXf_CANY_SEEN)) {
450 /* Substring at constant offset from beg-of-str... */
453 s = HOP3c(strpos, prog->check_offset_min, strend);
456 slen = SvCUR(check); /* >= 1 */
458 if ( strend - s > slen || strend - s < slen - 1
459 || (strend - s == slen && strend[-1] != '\n')) {
460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
463 /* Now should match s[0..slen-2] */
465 if (slen && (*SvPVX_const(check) != *s
467 && memNE(SvPVX_const(check), s, slen)))) {
469 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
473 else if (*SvPVX_const(check) != *s
474 || ((slen = SvCUR(check)) > 1
475 && memNE(SvPVX_const(check), s, slen)))
478 goto success_at_start;
481 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
484 end_shift = prog->check_end_shift;
487 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
488 - (SvTAIL(check) != 0);
489 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
491 if (end_shift < eshift)
495 else { /* Can match at random position */
498 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
499 end_shift = prog->check_end_shift;
501 /* end shift should be non negative here */
504 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
506 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
507 (IV)end_shift, prog->precomp);
511 /* Find a possible match in the region s..strend by looking for
512 the "check" substring in the region corrected by start/end_shift. */
515 I32 srch_start_shift = start_shift;
516 I32 srch_end_shift = end_shift;
517 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
518 srch_end_shift -= ((strbeg - s) - srch_start_shift);
519 srch_start_shift = strbeg - s;
521 DEBUG_OPTIMISE_MORE_r({
522 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
523 (IV)prog->check_offset_min,
524 (IV)srch_start_shift,
526 (IV)prog->check_end_shift);
529 if (flags & REXEC_SCREAM) {
530 I32 p = -1; /* Internal iterator of scream. */
531 I32 * const pp = data ? data->scream_pos : &p;
533 if (PL_screamfirst[BmRARE(check)] >= 0
534 || ( BmRARE(check) == '\n'
535 && (BmPREVIOUS(check) == SvCUR(check) - 1)
537 s = screaminstr(sv, check,
538 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
541 /* we may be pointing at the wrong string */
542 if (s && RX_MATCH_COPIED(prog))
543 s = strbeg + (s - SvPVX_const(sv));
545 *data->scream_olds = s;
550 if (prog->extflags & RXf_CANY_SEEN) {
551 start_point= (U8*)(s + srch_start_shift);
552 end_point= (U8*)(strend - srch_end_shift);
554 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
555 end_point= HOP3(strend, -srch_end_shift, strbeg);
557 DEBUG_OPTIMISE_MORE_r({
558 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
559 (int)(end_point - start_point),
560 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
564 s = fbm_instr( start_point, end_point,
565 check, multiline ? FBMrf_MULTILINE : 0);
568 /* Update the count-of-usability, remove useless subpatterns,
572 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
573 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
574 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
575 (s ? "Found" : "Did not find"),
576 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
577 ? "anchored" : "floating"),
580 (s ? " at offset " : "...\n") );
585 /* Finish the diagnostic message */
586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
588 /* XXX dmq: first branch is for positive lookbehind...
589 Our check string is offset from the beginning of the pattern.
590 So we need to do any stclass tests offset forward from that
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
602 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
608 : (prog->float_substr && prog->anchored_substr))
610 /* Take into account the "other" substring. */
611 /* XXXX May be hopelessly wrong for UTF... */
614 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
617 char * const last = HOP3c(s, -start_shift, strbeg);
619 char * const saved_s = s;
622 t = s - prog->check_offset_max;
623 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
625 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
630 t = HOP3c(t, prog->anchored_offset, strend);
631 if (t < other_last) /* These positions already checked */
633 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
636 /* XXXX It is not documented what units *_offsets are in.
637 We assume bytes, but this is clearly wrong.
638 Meaning this code needs to be carefully reviewed for errors.
642 /* On end-of-str: see comment below. */
643 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
644 if (must == &PL_sv_undef) {
646 DEBUG_r(must = prog->anchored_utf8); /* for debug */
651 HOP3(HOP3(last1, prog->anchored_offset, strend)
652 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
654 multiline ? FBMrf_MULTILINE : 0
657 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
658 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
659 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
660 (s ? "Found" : "Contradicts"),
661 quoted, RE_SV_TAIL(must));
666 if (last1 >= last2) {
667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
668 ", giving up...\n"));
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
672 ", trying floating at offset %ld...\n",
673 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
674 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
675 s = HOP3c(last, 1, strend);
679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
680 (long)(s - i_strpos)));
681 t = HOP3c(s, -prog->anchored_offset, strbeg);
682 other_last = HOP3c(s, 1, strend);
690 else { /* Take into account the floating substring. */
692 char * const saved_s = s;
695 t = HOP3c(s, -start_shift, strbeg);
697 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
698 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
699 last = HOP3c(t, prog->float_max_offset, strend);
700 s = HOP3c(t, prog->float_min_offset, strend);
703 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
704 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
705 /* fbm_instr() takes into account exact value of end-of-str
706 if the check is SvTAIL(ed). Since false positives are OK,
707 and end-of-str is not later than strend we are OK. */
708 if (must == &PL_sv_undef) {
710 DEBUG_r(must = prog->float_utf8); /* for debug message */
713 s = fbm_instr((unsigned char*)s,
714 (unsigned char*)last + SvCUR(must)
716 must, multiline ? FBMrf_MULTILINE : 0);
718 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
719 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
720 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
721 (s ? "Found" : "Contradicts"),
722 quoted, RE_SV_TAIL(must));
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
727 ", giving up...\n"));
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
731 ", trying anchored starting at offset %ld...\n",
732 (long)(saved_s + 1 - i_strpos)));
734 s = HOP3c(t, 1, strend);
738 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
739 (long)(s - i_strpos)));
740 other_last = s; /* Fix this later. --Hugo */
750 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
752 DEBUG_OPTIMISE_MORE_r(
753 PerlIO_printf(Perl_debug_log,
754 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
755 (IV)prog->check_offset_min,
756 (IV)prog->check_offset_max,
764 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
766 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
769 /* Fixed substring is found far enough so that the match
770 cannot start at strpos. */
772 if (ml_anch && t[-1] != '\n') {
773 /* Eventually fbm_*() should handle this, but often
774 anchored_offset is not 0, so this check will not be wasted. */
775 /* XXXX In the code below we prefer to look for "^" even in
776 presence of anchored substrings. And we search even
777 beyond the found float position. These pessimizations
778 are historical artefacts only. */
780 while (t < strend - prog->minlen) {
782 if (t < check_at - prog->check_offset_min) {
783 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
784 /* Since we moved from the found position,
785 we definitely contradict the found anchored
786 substr. Due to the above check we do not
787 contradict "check" substr.
788 Thus we can arrive here only if check substr
789 is float. Redo checking for "other"=="fixed".
792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
793 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
794 goto do_other_anchored;
796 /* We don't contradict the found floating substring. */
797 /* XXXX Why not check for STCLASS? */
799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
800 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
803 /* Position contradicts check-string */
804 /* XXXX probably better to look for check-string
805 than for "\n", so one should lower the limit for t? */
806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
807 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
808 other_last = strpos = s = t + 1;
813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
814 PL_colors[0], PL_colors[1]));
818 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
819 PL_colors[0], PL_colors[1]));
823 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
826 /* The found string does not prohibit matching at strpos,
827 - no optimization of calling REx engine can be performed,
828 unless it was an MBOL and we are not after MBOL,
829 or a future STCLASS check will fail this. */
831 /* Even in this situation we may use MBOL flag if strpos is offset
832 wrt the start of the string. */
833 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
834 && (strpos != strbeg) && strpos[-1] != '\n'
835 /* May be due to an implicit anchor of m{.*foo} */
836 && !(prog->intflags & PREGf_IMPLICIT))
841 DEBUG_EXECUTE_r( if (ml_anch)
842 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
843 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
846 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
848 prog->check_utf8 /* Could be deleted already */
849 && --BmUSEFUL(prog->check_utf8) < 0
850 && (prog->check_utf8 == prog->float_utf8)
852 prog->check_substr /* Could be deleted already */
853 && --BmUSEFUL(prog->check_substr) < 0
854 && (prog->check_substr == prog->float_substr)
857 /* If flags & SOMETHING - do not do it many times on the same match */
858 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
859 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
860 if (do_utf8 ? prog->check_substr : prog->check_utf8)
861 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
862 prog->check_substr = prog->check_utf8 = NULL; /* disable */
863 prog->float_substr = prog->float_utf8 = NULL; /* clear */
864 check = NULL; /* abort */
866 /* XXXX This is a remnant of the old implementation. It
867 looks wasteful, since now INTUIT can use many
869 prog->extflags &= ~RXf_USE_INTUIT;
876 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
877 /* trie stclasses are too expensive to use here, we are better off to
878 leave it to regmatch itself */
879 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
880 /* minlen == 0 is possible if regstclass is \b or \B,
881 and the fixed substr is ''$.
882 Since minlen is already taken into account, s+1 is before strend;
883 accidentally, minlen >= 1 guaranties no false positives at s + 1
884 even for \b or \B. But (minlen? 1 : 0) below assumes that
885 regstclass does not come from lookahead... */
886 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
887 This leaves EXACTF only, which is dealt with in find_byclass(). */
888 const U8* const str = (U8*)STRING(progi->regstclass);
889 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
890 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
893 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
894 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
895 else if (prog->float_substr || prog->float_utf8)
896 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
901 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
904 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
907 const char *what = NULL;
909 if (endpos == strend) {
910 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
911 "Could not match STCLASS...\n") );
914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
915 "This position contradicts STCLASS...\n") );
916 if ((prog->extflags & RXf_ANCH) && !ml_anch)
918 /* Contradict one of substrings */
919 if (prog->anchored_substr || prog->anchored_utf8) {
920 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
921 DEBUG_EXECUTE_r( what = "anchored" );
923 s = HOP3c(t, 1, strend);
924 if (s + start_shift + end_shift > strend) {
925 /* XXXX Should be taken into account earlier? */
926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
927 "Could not match STCLASS...\n") );
932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
933 "Looking for %s substr starting at offset %ld...\n",
934 what, (long)(s + start_shift - i_strpos)) );
937 /* Have both, check_string is floating */
938 if (t + start_shift >= check_at) /* Contradicts floating=check */
939 goto retry_floating_check;
940 /* Recheck anchored substring, but not floating... */
944 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
945 "Looking for anchored substr starting at offset %ld...\n",
946 (long)(other_last - i_strpos)) );
947 goto do_other_anchored;
949 /* Another way we could have checked stclass at the
950 current position only: */
955 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
956 "Looking for /%s^%s/m starting at offset %ld...\n",
957 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
960 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
962 /* Check is floating subtring. */
963 retry_floating_check:
964 t = check_at - start_shift;
965 DEBUG_EXECUTE_r( what = "floating" );
966 goto hop_and_restart;
969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
970 "By STCLASS: moving %ld --> %ld\n",
971 (long)(t - i_strpos), (long)(s - i_strpos))
975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
976 "Does not contradict STCLASS...\n");
981 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
982 PL_colors[4], (check ? "Guessed" : "Giving up"),
983 PL_colors[5], (long)(s - i_strpos)) );
986 fail_finish: /* Substring not found */
987 if (prog->check_substr || prog->check_utf8) /* could be removed already */
988 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
991 PL_colors[4], PL_colors[5]));
997 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
998 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
999 switch (trie_type) { \
1000 case trie_utf8_fold: \
1001 if ( foldlen>0 ) { \
1002 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1007 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1008 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1009 foldlen -= UNISKIP( uvc ); \
1010 uscan = foldbuf + UNISKIP( uvc ); \
1014 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1022 charid = trie->charmap[ uvc ]; \
1026 if (widecharmap) { \
1027 SV** const svpp = hv_fetch(widecharmap, \
1028 (char*)&uvc, sizeof(UV), 0); \
1030 charid = (U16)SvIV(*svpp); \
1035 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1038 ibcmp_utf8(s, NULL, 0, do_utf8, \
1039 m, NULL, ln, (bool)UTF)) \
1040 && (!reginfo || regtry(reginfo, &s)) ) \
1043 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1044 uvchr_to_utf8(tmpbuf, c); \
1045 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1047 && (f == c1 || f == c2) \
1048 && (ln == foldlen || \
1049 !ibcmp_utf8((char *) foldbuf, \
1050 NULL, foldlen, do_utf8, \
1052 NULL, ln, (bool)UTF)) \
1053 && (!reginfo || regtry(reginfo, &s)) ) \
1058 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1062 && (ln == 1 || !(OP(c) == EXACTF \
1064 : ibcmp_locale(s, m, ln))) \
1065 && (!reginfo || regtry(reginfo, &s)) ) \
1071 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1073 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1079 #define REXEC_FBC_SCAN(CoDe) \
1081 while (s < strend) { \
1087 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1088 REXEC_FBC_UTF8_SCAN( \
1090 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1099 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1102 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1111 #define REXEC_FBC_TRYIT \
1112 if ((!reginfo || regtry(reginfo, &s))) \
1115 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1118 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1121 REXEC_FBC_CLASS_SCAN(CoNd); \
1125 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1126 PL_reg_flags |= RF_tainted; \
1128 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1131 REXEC_FBC_CLASS_SCAN(CoNd); \
1135 #define DUMP_EXEC_POS(li,s,doutf8) \
1136 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1138 /* We know what class REx starts with. Try to find this position... */
1139 /* if reginfo is NULL, its a dryrun */
1140 /* annoyingly all the vars in this routine have different names from their counterparts
1141 in regmatch. /grrr */
1144 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1145 const char *strend, regmatch_info *reginfo)
1148 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1152 register STRLEN uskip;
1156 register I32 tmp = 1; /* Scratch variable? */
1157 register const bool do_utf8 = PL_reg_match_utf8;
1158 RXi_GET_DECL(prog,progi);
1160 /* We know what class it must start with. */
1164 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1165 !UTF8_IS_INVARIANT((U8)s[0]) ?
1166 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1167 REGINCLASS(prog, c, (U8*)s));
1170 while (s < strend) {
1173 if (REGINCLASS(prog, c, (U8*)s) ||
1174 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1175 /* The assignment of 2 is intentional:
1176 * for the folded sharp s, the skip is 2. */
1177 (skip = SHARP_S_SKIP))) {
1178 if (tmp && (!reginfo || regtry(reginfo, &s)))
1191 if (tmp && (!reginfo || regtry(reginfo, &s)))
1199 ln = STR_LEN(c); /* length to match in octets/bytes */
1200 lnc = (I32) ln; /* length to match in characters */
1202 STRLEN ulen1, ulen2;
1204 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1205 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1208 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1209 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1211 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1213 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1216 while (sm < ((U8 *) m + ln)) {
1231 c2 = PL_fold_locale[c1];
1233 e = HOP3c(strend, -((I32)lnc), s);
1235 if (!reginfo && e < s)
1236 e = s; /* Due to minlen logic of intuit() */
1238 /* The idea in the EXACTF* cases is to first find the
1239 * first character of the EXACTF* node and then, if
1240 * necessary, case-insensitively compare the full
1241 * text of the node. The c1 and c2 are the first
1242 * characters (though in Unicode it gets a bit
1243 * more complicated because there are more cases
1244 * than just upper and lower: one needs to use
1245 * the so-called folding case for case-insensitive
1246 * matching (called "loose matching" in Unicode).
1247 * ibcmp_utf8() will do just that. */
1251 U8 tmpbuf [UTF8_MAXBYTES+1];
1252 STRLEN len, foldlen;
1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1255 /* Upper and lower of 1st char are equal -
1256 * probably not a "letter". */
1258 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1260 REXEC_FBC_EXACTISH_CHECK(c == c1);
1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1268 /* Handle some of the three Greek sigmas cases.
1269 * Note that not all the possible combinations
1270 * are handled here: some of them are handled
1271 * by the standard folding rules, and some of
1272 * them (the character class or ANYOF cases)
1273 * are handled during compiletime in
1274 * regexec.c:S_regclass(). */
1275 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1276 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1277 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1279 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1285 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1287 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1291 PL_reg_flags |= RF_tainted;
1298 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1299 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1301 tmp = ((OP(c) == BOUND ?
1302 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1303 LOAD_UTF8_CHARCLASS_ALNUM();
1304 REXEC_FBC_UTF8_SCAN(
1305 if (tmp == !(OP(c) == BOUND ?
1306 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1307 isALNUM_LC_utf8((U8*)s)))
1315 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1316 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1319 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1325 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1329 PL_reg_flags |= RF_tainted;
1336 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1337 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1339 tmp = ((OP(c) == NBOUND ?
1340 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1341 LOAD_UTF8_CHARCLASS_ALNUM();
1342 REXEC_FBC_UTF8_SCAN(
1343 if (tmp == !(OP(c) == NBOUND ?
1344 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1345 isALNUM_LC_utf8((U8*)s)))
1347 else REXEC_FBC_TRYIT;
1351 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1352 tmp = ((OP(c) == NBOUND ?
1353 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1356 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1358 else REXEC_FBC_TRYIT;
1361 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1365 REXEC_FBC_CSCAN_PRELOAD(
1366 LOAD_UTF8_CHARCLASS_ALNUM(),
1367 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1371 REXEC_FBC_CSCAN_TAINT(
1372 isALNUM_LC_utf8((U8*)s),
1376 REXEC_FBC_CSCAN_PRELOAD(
1377 LOAD_UTF8_CHARCLASS_ALNUM(),
1378 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1382 REXEC_FBC_CSCAN_TAINT(
1383 !isALNUM_LC_utf8((U8*)s),
1387 REXEC_FBC_CSCAN_PRELOAD(
1388 LOAD_UTF8_CHARCLASS_SPACE(),
1389 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1393 REXEC_FBC_CSCAN_TAINT(
1394 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1398 REXEC_FBC_CSCAN_PRELOAD(
1399 LOAD_UTF8_CHARCLASS_SPACE(),
1400 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1404 REXEC_FBC_CSCAN_TAINT(
1405 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1409 REXEC_FBC_CSCAN_PRELOAD(
1410 LOAD_UTF8_CHARCLASS_DIGIT(),
1411 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1415 REXEC_FBC_CSCAN_TAINT(
1416 isDIGIT_LC_utf8((U8*)s),
1420 REXEC_FBC_CSCAN_PRELOAD(
1421 LOAD_UTF8_CHARCLASS_DIGIT(),
1422 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1426 REXEC_FBC_CSCAN_TAINT(
1427 !isDIGIT_LC_utf8((U8*)s),
1433 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1434 trie_type = do_utf8 ?
1435 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1437 /* what trie are we using right now */
1439 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1441 = (reg_trie_data*)progi->data->data[ aho->trie ];
1442 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1444 const char *last_start = strend - trie->minlen;
1446 const char *real_start = s;
1448 STRLEN maxlen = trie->maxlen;
1450 U8 **points; /* map of where we were in the input string
1451 when reading a given char. For ASCII this
1452 is unnecessary overhead as the relationship
1453 is always 1:1, but for unicode, especially
1454 case folded unicode this is not true. */
1455 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459 GET_RE_DEBUG_FLAGS_DECL;
1461 /* We can't just allocate points here. We need to wrap it in
1462 * an SV so it gets freed properly if there is a croak while
1463 * running the match */
1466 sv_points=newSV(maxlen * sizeof(U8 *));
1467 SvCUR_set(sv_points,
1468 maxlen * sizeof(U8 *));
1469 SvPOK_on(sv_points);
1470 sv_2mortal(sv_points);
1471 points=(U8**)SvPV_nolen(sv_points );
1472 if ( trie_type != trie_utf8_fold
1473 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1476 bitmap=(U8*)trie->bitmap;
1478 bitmap=(U8*)ANYOF_BITMAP(c);
1480 /* this is the Aho-Corasick algorithm modified a touch
1481 to include special handling for long "unknown char"
1482 sequences. The basic idea being that we use AC as long
1483 as we are dealing with a possible matching char, when
1484 we encounter an unknown char (and we have not encountered
1485 an accepting state) we scan forward until we find a legal
1487 AC matching is basically that of trie matching, except
1488 that when we encounter a failing transition, we fall back
1489 to the current states "fail state", and try the current char
1490 again, a process we repeat until we reach the root state,
1491 state 1, or a legal transition. If we fail on the root state
1492 then we can either terminate if we have reached an accepting
1493 state previously, or restart the entire process from the beginning
1497 while (s <= last_start) {
1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1506 U8 *uscan = (U8*)NULL;
1507 U8 *leftmost = NULL;
1509 U32 accepted_word= 0;
1513 while ( state && uc <= (U8*)strend ) {
1515 U32 word = aho->states[ state ].wordnum;
1519 DEBUG_TRIE_EXECUTE_r(
1520 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 (char *)uc, do_utf8 );
1523 PerlIO_printf( Perl_debug_log,
1524 " Scanning for legal start char...\n");
1527 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1532 if (uc >(U8*)last_start) break;
1536 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1537 if (!leftmost || lpos < leftmost) {
1538 DEBUG_r(accepted_word=word);
1544 points[pointpos++ % maxlen]= uc;
1545 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1546 uscan, len, uvc, charid, foldlen,
1548 DEBUG_TRIE_EXECUTE_r({
1549 dump_exec_pos( (char *)uc, c, strend, real_start,
1551 PerlIO_printf(Perl_debug_log,
1552 " Charid:%3u CP:%4"UVxf" ",
1558 word = aho->states[ state ].wordnum;
1560 base = aho->states[ state ].trans.base;
1562 DEBUG_TRIE_EXECUTE_r({
1564 dump_exec_pos( (char *)uc, c, strend, real_start,
1566 PerlIO_printf( Perl_debug_log,
1567 "%sState: %4"UVxf", word=%"UVxf,
1568 failed ? " Fail transition to " : "",
1569 (UV)state, (UV)word);
1574 (base + charid > trie->uniquecharcount )
1575 && (base + charid - 1 - trie->uniquecharcount
1577 && trie->trans[base + charid - 1 -
1578 trie->uniquecharcount].check == state
1579 && (tmp=trie->trans[base + charid - 1 -
1580 trie->uniquecharcount ].next))
1582 DEBUG_TRIE_EXECUTE_r(
1583 PerlIO_printf( Perl_debug_log," - legal\n"));
1588 DEBUG_TRIE_EXECUTE_r(
1589 PerlIO_printf( Perl_debug_log," - fail\n"));
1591 state = aho->fail[state];
1595 /* we must be accepting here */
1596 DEBUG_TRIE_EXECUTE_r(
1597 PerlIO_printf( Perl_debug_log," - accepting\n"));
1606 if (!state) state = 1;
1609 if ( aho->states[ state ].wordnum ) {
1610 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1611 if (!leftmost || lpos < leftmost) {
1612 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1617 s = (char*)leftmost;
1618 DEBUG_TRIE_EXECUTE_r({
1620 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1621 (UV)accepted_word, (IV)(s - real_start)
1624 if (!reginfo || regtry(reginfo, &s)) {
1630 DEBUG_TRIE_EXECUTE_r({
1631 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1634 DEBUG_TRIE_EXECUTE_r(
1635 PerlIO_printf( Perl_debug_log,"No match.\n"));
1644 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1653 S_swap_match_buff (pTHX_ regexp *prog) {
1655 RXi_GET_DECL(prog,progi);
1658 /* We have to be careful. If the previous successful match
1659 was from this regex we don't want a subsequent paritally
1660 successful match to clobber the old results.
1661 So when we detect this possibility we add a swap buffer
1662 to the re, and switch the buffer each match. If we fail
1663 we switch it back, otherwise we leave it swapped.
1665 Newxz(progi->swap, 1, regexp_paren_ofs);
1666 /* no need to copy these */
1667 Newxz(progi->swap->startp, prog->nparens + 1, I32);
1668 Newxz(progi->swap->endp, prog->nparens + 1, I32);
1670 t = progi->swap->startp;
1671 progi->swap->startp = prog->startp;
1673 t = progi->swap->endp;
1674 progi->swap->endp = prog->endp;
1680 - regexec_flags - match a regexp against a string
1683 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1684 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1685 /* strend: pointer to null at end of string */
1686 /* strbeg: real beginning of string */
1687 /* minend: end of match must be >=minend after stringarg. */
1688 /* data: May be used for some additional optimizations.
1689 Currently its only used, with a U32 cast, for transmitting
1690 the ganch offset when doing a /g match. This will change */
1691 /* nosave: For optimizations. */
1694 /*register*/ char *s;
1695 register regnode *c;
1696 /*register*/ char *startpos = stringarg;
1697 I32 minlen; /* must match at least this many chars */
1698 I32 dontbother = 0; /* how many characters not to try at end */
1699 I32 end_shift = 0; /* Same for the end. */ /* CC */
1700 I32 scream_pos = -1; /* Internal iterator of scream. */
1701 char *scream_olds = NULL;
1702 SV* const oreplsv = GvSV(PL_replgv);
1703 const bool do_utf8 = (bool)DO_UTF8(sv);
1705 RXi_GET_DECL(prog,progi);
1706 regmatch_info reginfo; /* create some info to pass to regtry etc */
1707 bool swap_on_fail = 0;
1709 GET_RE_DEBUG_FLAGS_DECL;
1711 PERL_UNUSED_ARG(data);
1713 /* Be paranoid... */
1714 if (prog == NULL || startpos == NULL) {
1715 Perl_croak(aTHX_ "NULL regexp parameter");
1719 multiline = prog->extflags & RXf_PMf_MULTILINE;
1720 reginfo.prog = prog;
1722 RX_MATCH_UTF8_set(prog, do_utf8);
1724 debug_start_match(prog, do_utf8, startpos, strend,
1728 minlen = prog->minlen;
1730 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1731 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1732 "String too short [regexec_flags]...\n"));
1737 /* Check validity of program. */
1738 if (UCHARAT(progi->program) != REG_MAGIC) {
1739 Perl_croak(aTHX_ "corrupted regexp program");
1743 PL_reg_eval_set = 0;
1746 if (prog->extflags & RXf_UTF8)
1747 PL_reg_flags |= RF_utf8;
1749 /* Mark beginning of line for ^ and lookbehind. */
1750 reginfo.bol = startpos; /* XXX not used ??? */
1754 /* Mark end of line for $ (and such) */
1757 /* see how far we have to get to not match where we matched before */
1758 reginfo.till = startpos+minend;
1760 /* If there is a "must appear" string, look for it. */
1763 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1766 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1767 reginfo.ganch = startpos + prog->gofs;
1768 else if (sv && SvTYPE(sv) >= SVt_PVMG
1770 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1771 && mg->mg_len >= 0) {
1772 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1773 if (prog->extflags & RXf_ANCH_GPOS) {
1774 if (s > reginfo.ganch)
1776 s = reginfo.ganch - prog->gofs;
1780 reginfo.ganch = strbeg + PTR2UV(data);
1781 } else /* pos() not defined */
1782 reginfo.ganch = strbeg;
1784 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1786 swap_match_buff(prog); /* do we need a save destructor here for
1789 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1790 re_scream_pos_data d;
1792 d.scream_olds = &scream_olds;
1793 d.scream_pos = &scream_pos;
1794 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1797 goto phooey; /* not present */
1803 /* Simplest case: anchored match need be tried only once. */
1804 /* [unless only anchor is BOL and multiline is set] */
1805 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1806 if (s == startpos && regtry(®info, &startpos))
1808 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1809 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1814 dontbother = minlen - 1;
1815 end = HOP3c(strend, -dontbother, strbeg) - 1;
1816 /* for multiline we only have to try after newlines */
1817 if (prog->check_substr || prog->check_utf8) {
1821 if (regtry(®info, &s))
1826 if (prog->extflags & RXf_USE_INTUIT) {
1827 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1838 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1839 if (regtry(®info, &s))
1846 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1848 /* the warning about reginfo.ganch being used without intialization
1849 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1850 and we only enter this block when the same bit is set. */
1851 char *tmp_s = reginfo.ganch - prog->gofs;
1852 if (regtry(®info, &tmp_s))
1857 /* Messy cases: unanchored match. */
1858 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1859 /* we have /x+whatever/ */
1860 /* it must be a one character string (XXXX Except UTF?) */
1865 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1872 DEBUG_EXECUTE_r( did_match = 1 );
1873 if (regtry(®info, &s)) goto got_it;
1875 while (s < strend && *s == ch)
1883 DEBUG_EXECUTE_r( did_match = 1 );
1884 if (regtry(®info, &s)) goto got_it;
1886 while (s < strend && *s == ch)
1891 DEBUG_EXECUTE_r(if (!did_match)
1892 PerlIO_printf(Perl_debug_log,
1893 "Did not find anchored character...\n")
1896 else if (prog->anchored_substr != NULL
1897 || prog->anchored_utf8 != NULL
1898 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1899 && prog->float_max_offset < strend - s)) {
1904 char *last1; /* Last position checked before */
1908 if (prog->anchored_substr || prog->anchored_utf8) {
1909 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1910 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1911 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1912 back_max = back_min = prog->anchored_offset;
1914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1915 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1916 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1917 back_max = prog->float_max_offset;
1918 back_min = prog->float_min_offset;
1922 if (must == &PL_sv_undef)
1923 /* could not downgrade utf8 check substring, so must fail */
1929 last = HOP3c(strend, /* Cannot start after this */
1930 -(I32)(CHR_SVLEN(must)
1931 - (SvTAIL(must) != 0) + back_min), strbeg);
1934 last1 = HOPc(s, -1);
1936 last1 = s - 1; /* bogus */
1938 /* XXXX check_substr already used to find "s", can optimize if
1939 check_substr==must. */
1941 dontbother = end_shift;
1942 strend = HOPc(strend, -dontbother);
1943 while ( (s <= last) &&
1944 ((flags & REXEC_SCREAM)
1945 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1946 end_shift, &scream_pos, 0))
1947 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1948 (unsigned char*)strend, must,
1949 multiline ? FBMrf_MULTILINE : 0))) ) {
1950 /* we may be pointing at the wrong string */
1951 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1952 s = strbeg + (s - SvPVX_const(sv));
1953 DEBUG_EXECUTE_r( did_match = 1 );
1954 if (HOPc(s, -back_max) > last1) {
1955 last1 = HOPc(s, -back_min);
1956 s = HOPc(s, -back_max);
1959 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1961 last1 = HOPc(s, -back_min);
1965 while (s <= last1) {
1966 if (regtry(®info, &s))
1972 while (s <= last1) {
1973 if (regtry(®info, &s))
1979 DEBUG_EXECUTE_r(if (!did_match) {
1980 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1981 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1982 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1983 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1984 ? "anchored" : "floating"),
1985 quoted, RE_SV_TAIL(must));
1989 else if ( (c = progi->regstclass) ) {
1991 const OPCODE op = OP(progi->regstclass);
1992 /* don't bother with what can't match */
1993 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1994 strend = HOPc(strend, -(minlen - 1));
1997 SV * const prop = sv_newmortal();
1998 regprop(prog, prop, c);
2000 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
2002 PerlIO_printf(Perl_debug_log,
2003 "Matching stclass %.*s against %s (%d chars)\n",
2004 (int)SvCUR(prop), SvPVX_const(prop),
2005 quoted, (int)(strend - s));
2008 if (find_byclass(prog, c, s, strend, ®info))
2010 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2014 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2019 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2020 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2021 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2023 if (flags & REXEC_SCREAM) {
2024 last = screaminstr(sv, float_real, s - strbeg,
2025 end_shift, &scream_pos, 1); /* last one */
2027 last = scream_olds; /* Only one occurrence. */
2028 /* we may be pointing at the wrong string */
2029 else if (RX_MATCH_COPIED(prog))
2030 s = strbeg + (s - SvPVX_const(sv));
2034 const char * const little = SvPV_const(float_real, len);
2036 if (SvTAIL(float_real)) {
2037 if (memEQ(strend - len + 1, little, len - 1))
2038 last = strend - len + 1;
2039 else if (!multiline)
2040 last = memEQ(strend - len, little, len)
2041 ? strend - len : NULL;
2047 last = rninstr(s, strend, little, little + len);
2049 last = strend; /* matching "$" */
2054 PerlIO_printf(Perl_debug_log,
2055 "%sCan't trim the tail, match fails (should not happen)%s\n",
2056 PL_colors[4], PL_colors[5]));
2057 goto phooey; /* Should not happen! */
2059 dontbother = strend - last + prog->float_min_offset;
2061 if (minlen && (dontbother < minlen))
2062 dontbother = minlen - 1;
2063 strend -= dontbother; /* this one's always in bytes! */
2064 /* We don't know much -- general case. */
2067 if (regtry(®info, &s))
2076 if (regtry(®info, &s))
2078 } while (s++ < strend);
2086 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2088 if (PL_reg_eval_set) {
2089 /* Preserve the current value of $^R */
2090 if (oreplsv != GvSV(PL_replgv))
2091 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2092 restored, the value remains
2094 restore_pos(aTHX_ prog);
2096 if (prog->paren_names)
2097 (void)hv_iterinit(prog->paren_names);
2099 /* make sure $`, $&, $', and $digit will work later */
2100 if ( !(flags & REXEC_NOT_FIRST) ) {
2101 RX_MATCH_COPY_FREE(prog);
2102 if (flags & REXEC_COPY_STR) {
2103 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2104 #ifdef PERL_OLD_COPY_ON_WRITE
2106 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2108 PerlIO_printf(Perl_debug_log,
2109 "Copy on write: regexp capture, type %d\n",
2112 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2113 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2114 assert (SvPOKp(prog->saved_copy));
2118 RX_MATCH_COPIED_on(prog);
2119 s = savepvn(strbeg, i);
2125 prog->subbeg = strbeg;
2126 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2133 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2134 PL_colors[4], PL_colors[5]));
2135 if (PL_reg_eval_set)
2136 restore_pos(aTHX_ prog);
2138 /* we failed :-( roll it back */
2139 swap_match_buff(prog);
2146 - regtry - try match at specific point
2148 STATIC I32 /* 0 failure, 1 success */
2149 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2155 regexp *prog = reginfo->prog;
2156 RXi_GET_DECL(prog,progi);
2157 GET_RE_DEBUG_FLAGS_DECL;
2158 reginfo->cutpoint=NULL;
2160 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2163 PL_reg_eval_set = RS_init;
2164 DEBUG_EXECUTE_r(DEBUG_s(
2165 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2166 (IV)(PL_stack_sp - PL_stack_base));
2169 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2170 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2172 /* Apparently this is not needed, judging by wantarray. */
2173 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2174 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2177 /* Make $_ available to executed code. */
2178 if (reginfo->sv != DEFSV) {
2180 DEFSV = reginfo->sv;
2183 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2184 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2185 /* prepare for quick setting of pos */
2186 #ifdef PERL_OLD_COPY_ON_WRITE
2187 if (SvIsCOW(reginfo->sv))
2188 sv_force_normal_flags(reginfo->sv, 0);
2190 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2191 &PL_vtbl_mglob, NULL, 0);
2195 PL_reg_oldpos = mg->mg_len;
2196 SAVEDESTRUCTOR_X(restore_pos, prog);
2198 if (!PL_reg_curpm) {
2199 Newxz(PL_reg_curpm, 1, PMOP);
2202 SV* const repointer = newSViv(0);
2203 /* so we know which PL_regex_padav element is PL_reg_curpm */
2204 SvFLAGS(repointer) |= SVf_BREAK;
2205 av_push(PL_regex_padav,repointer);
2206 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2207 PL_regex_pad = AvARRAY(PL_regex_padav);
2211 PM_SETRE(PL_reg_curpm, prog);
2212 PL_reg_oldcurpm = PL_curpm;
2213 PL_curpm = PL_reg_curpm;
2214 if (RX_MATCH_COPIED(prog)) {
2215 /* Here is a serious problem: we cannot rewrite subbeg,
2216 since it may be needed if this match fails. Thus
2217 $` inside (?{}) could fail... */
2218 PL_reg_oldsaved = prog->subbeg;
2219 PL_reg_oldsavedlen = prog->sublen;
2220 #ifdef PERL_OLD_COPY_ON_WRITE
2221 PL_nrs = prog->saved_copy;
2223 RX_MATCH_COPIED_off(prog);
2226 PL_reg_oldsaved = NULL;
2227 prog->subbeg = PL_bostr;
2228 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2230 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2231 prog->startp[0] = *startpos - PL_bostr;
2232 PL_reginput = *startpos;
2233 PL_reglastparen = &prog->lastparen;
2234 PL_reglastcloseparen = &prog->lastcloseparen;
2235 prog->lastparen = 0;
2236 prog->lastcloseparen = 0;
2238 PL_regstartp = prog->startp;
2239 PL_regendp = prog->endp;
2240 if (PL_reg_start_tmpl <= prog->nparens) {
2241 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2242 if(PL_reg_start_tmp)
2243 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2245 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2248 /* XXXX What this code is doing here?!!! There should be no need
2249 to do this again and again, PL_reglastparen should take care of
2252 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2253 * Actually, the code in regcppop() (which Ilya may be meaning by
2254 * PL_reglastparen), is not needed at all by the test suite
2255 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2256 * enough, for building DynaLoader, or otherwise this
2257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2258 * will happen. Meanwhile, this code *is* needed for the
2259 * above-mentioned test suite tests to succeed. The common theme
2260 * on those tests seems to be returning null fields from matches.
2265 if (prog->nparens) {
2267 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2274 if (regmatch(reginfo, progi->program + 1)) {
2275 PL_regendp[0] = PL_reginput - PL_bostr;
2278 if (reginfo->cutpoint)
2279 *startpos= reginfo->cutpoint;
2280 REGCP_UNWIND(lastcp);
2285 #define sayYES goto yes
2286 #define sayNO goto no
2287 #define sayNO_SILENT goto no_silent
2289 /* we dont use STMT_START/END here because it leads to
2290 "unreachable code" warnings, which are bogus, but distracting. */
2291 #define CACHEsayNO \
2292 if (ST.cache_mask) \
2293 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2296 /* this is used to determine how far from the left messages like
2297 'failed...' are printed. It should be set such that messages
2298 are inline with the regop output that created them.
2300 #define REPORT_CODE_OFF 32
2303 /* Make sure there is a test for this +1 options in re_tests */
2304 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2306 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2307 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2309 #define SLAB_FIRST(s) (&(s)->states[0])
2310 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2312 /* grab a new slab and return the first slot in it */
2314 STATIC regmatch_state *
2317 #if PERL_VERSION < 9
2320 regmatch_slab *s = PL_regmatch_slab->next;
2322 Newx(s, 1, regmatch_slab);
2323 s->prev = PL_regmatch_slab;
2325 PL_regmatch_slab->next = s;
2327 PL_regmatch_slab = s;
2328 return SLAB_FIRST(s);
2332 /* push a new state then goto it */
2334 #define PUSH_STATE_GOTO(state, node) \
2336 st->resume_state = state; \
2339 /* push a new state with success backtracking, then goto it */
2341 #define PUSH_YES_STATE_GOTO(state, node) \
2343 st->resume_state = state; \
2344 goto push_yes_state;
2350 regmatch() - main matching routine
2352 This is basically one big switch statement in a loop. We execute an op,
2353 set 'next' to point the next op, and continue. If we come to a point which
2354 we may need to backtrack to on failure such as (A|B|C), we push a
2355 backtrack state onto the backtrack stack. On failure, we pop the top
2356 state, and re-enter the loop at the state indicated. If there are no more
2357 states to pop, we return failure.
2359 Sometimes we also need to backtrack on success; for example /A+/, where
2360 after successfully matching one A, we need to go back and try to
2361 match another one; similarly for lookahead assertions: if the assertion
2362 completes successfully, we backtrack to the state just before the assertion
2363 and then carry on. In these cases, the pushed state is marked as
2364 'backtrack on success too'. This marking is in fact done by a chain of
2365 pointers, each pointing to the previous 'yes' state. On success, we pop to
2366 the nearest yes state, discarding any intermediate failure-only states.
2367 Sometimes a yes state is pushed just to force some cleanup code to be
2368 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2369 it to free the inner regex.
2371 Note that failure backtracking rewinds the cursor position, while
2372 success backtracking leaves it alone.
2374 A pattern is complete when the END op is executed, while a subpattern
2375 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2376 ops trigger the "pop to last yes state if any, otherwise return true"
2379 A common convention in this function is to use A and B to refer to the two
2380 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2381 the subpattern to be matched possibly multiple times, while B is the entire
2382 rest of the pattern. Variable and state names reflect this convention.
2384 The states in the main switch are the union of ops and failure/success of
2385 substates associated with with that op. For example, IFMATCH is the op
2386 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2387 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2388 successfully matched A and IFMATCH_A_fail is a state saying that we have
2389 just failed to match A. Resume states always come in pairs. The backtrack
2390 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2391 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2392 on success or failure.
2394 The struct that holds a backtracking state is actually a big union, with
2395 one variant for each major type of op. The variable st points to the
2396 top-most backtrack struct. To make the code clearer, within each
2397 block of code we #define ST to alias the relevant union.
2399 Here's a concrete example of a (vastly oversimplified) IFMATCH
2405 #define ST st->u.ifmatch
2407 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2408 ST.foo = ...; // some state we wish to save
2410 // push a yes backtrack state with a resume value of
2411 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2413 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2416 case IFMATCH_A: // we have successfully executed A; now continue with B
2418 bar = ST.foo; // do something with the preserved value
2421 case IFMATCH_A_fail: // A failed, so the assertion failed
2422 ...; // do some housekeeping, then ...
2423 sayNO; // propagate the failure
2430 For any old-timers reading this who are familiar with the old recursive
2431 approach, the code above is equivalent to:
2433 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2442 ...; // do some housekeeping, then ...
2443 sayNO; // propagate the failure
2446 The topmost backtrack state, pointed to by st, is usually free. If you
2447 want to claim it, populate any ST.foo fields in it with values you wish to
2448 save, then do one of
2450 PUSH_STATE_GOTO(resume_state, node);
2451 PUSH_YES_STATE_GOTO(resume_state, node);
2453 which sets that backtrack state's resume value to 'resume_state', pushes a
2454 new free entry to the top of the backtrack stack, then goes to 'node'.
2455 On backtracking, the free slot is popped, and the saved state becomes the
2456 new free state. An ST.foo field in this new top state can be temporarily
2457 accessed to retrieve values, but once the main loop is re-entered, it
2458 becomes available for reuse.
2460 Note that the depth of the backtrack stack constantly increases during the
2461 left-to-right execution of the pattern, rather than going up and down with
2462 the pattern nesting. For example the stack is at its maximum at Z at the
2463 end of the pattern, rather than at X in the following:
2465 /(((X)+)+)+....(Y)+....Z/
2467 The only exceptions to this are lookahead/behind assertions and the cut,
2468 (?>A), which pop all the backtrack states associated with A before
2471 Bascktrack state structs are allocated in slabs of about 4K in size.
2472 PL_regmatch_state and st always point to the currently active state,
2473 and PL_regmatch_slab points to the slab currently containing
2474 PL_regmatch_state. The first time regmatch() is called, the first slab is
2475 allocated, and is never freed until interpreter destruction. When the slab
2476 is full, a new one is allocated and chained to the end. At exit from
2477 regmatch(), slabs allocated since entry are freed.
2482 #define DEBUG_STATE_pp(pp) \
2484 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2485 PerlIO_printf(Perl_debug_log, \
2486 " %*s"pp" %s%s%s%s%s\n", \
2488 reg_name[st->resume_state], \
2489 ((st==yes_state||st==mark_state) ? "[" : ""), \
2490 ((st==yes_state) ? "Y" : ""), \
2491 ((st==mark_state) ? "M" : ""), \
2492 ((st==yes_state||st==mark_state) ? "]" : "") \
2497 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2502 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2503 const char *start, const char *end, const char *blurb)
2505 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2509 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2510 prog->precomp, prog->prelen, 60);
2512 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2513 start, end - start, 60);
2515 PerlIO_printf(Perl_debug_log,
2516 "%s%s REx%s %s against %s\n",
2517 PL_colors[4], blurb, PL_colors[5], s0, s1);
2519 if (do_utf8||utf8_pat)
2520 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2521 utf8_pat ? "pattern" : "",
2522 utf8_pat && do_utf8 ? " and " : "",
2523 do_utf8 ? "string" : ""
2529 S_dump_exec_pos(pTHX_ const char *locinput,
2530 const regnode *scan,
2531 const char *loc_regeol,
2532 const char *loc_bostr,
2533 const char *loc_reg_starttry,
2536 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2537 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2538 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2539 /* The part of the string before starttry has one color
2540 (pref0_len chars), between starttry and current
2541 position another one (pref_len - pref0_len chars),
2542 after the current position the third one.
2543 We assume that pref0_len <= pref_len, otherwise we
2544 decrease pref0_len. */
2545 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2546 ? (5 + taill) - l : locinput - loc_bostr;
2549 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2551 pref0_len = pref_len - (locinput - loc_reg_starttry);
2552 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2553 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2554 ? (5 + taill) - pref_len : loc_regeol - locinput);
2555 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2559 if (pref0_len > pref_len)
2560 pref0_len = pref_len;
2562 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2564 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2565 (locinput - pref_len),pref0_len, 60, 4, 5);
2567 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2568 (locinput - pref_len + pref0_len),
2569 pref_len - pref0_len, 60, 2, 3);
2571 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2572 locinput, loc_regeol - locinput, 10, 0, 1);
2574 const STRLEN tlen=len0+len1+len2;
2575 PerlIO_printf(Perl_debug_log,
2576 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2577 (IV)(locinput - loc_bostr),
2580 (docolor ? "" : "> <"),
2582 (int)(tlen > 19 ? 0 : 19 - tlen),
2589 /* reg_check_named_buff_matched()
2590 * Checks to see if a named buffer has matched. The data array of
2591 * buffer numbers corresponding to the buffer is expected to reside
2592 * in the regexp->data->data array in the slot stored in the ARG() of
2593 * node involved. Note that this routine doesn't actually care about the
2594 * name, that information is not preserved from compilation to execution.
2595 * Returns the index of the leftmost defined buffer with the given name
2596 * or 0 if non of the buffers matched.
2599 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2601 RXi_GET_DECL(rex,rexi);
2602 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2603 I32 *nums=(I32*)SvPVX(sv_dat);
2604 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2605 if ((I32)*PL_reglastparen >= nums[n] &&
2606 PL_regendp[nums[n]] != -1)
2614 STATIC I32 /* 0 failure, 1 success */
2615 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2617 #if PERL_VERSION < 9
2621 register const bool do_utf8 = PL_reg_match_utf8;
2622 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2624 regexp *rex = reginfo->prog;
2625 RXi_GET_DECL(rex,rexi);
2627 regmatch_slab *orig_slab;
2628 regmatch_state *orig_state;
2630 /* the current state. This is a cached copy of PL_regmatch_state */
2631 register regmatch_state *st;
2633 /* cache heavy used fields of st in registers */
2634 register regnode *scan;
2635 register regnode *next;
2636 register U32 n = 0; /* general value; init to avoid compiler warning */
2637 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2638 register char *locinput = PL_reginput;
2639 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2641 bool result = 0; /* return value of S_regmatch */
2642 int depth = 0; /* depth of backtrack stack */
2643 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2644 const U32 max_nochange_depth =
2645 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2646 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2648 regmatch_state *yes_state = NULL; /* state to pop to on success of
2650 /* mark_state piggy backs on the yes_state logic so that when we unwind
2651 the stack on success we can update the mark_state as we go */
2652 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2654 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2655 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2657 bool no_final = 0; /* prevent failure from backtracking? */
2658 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2659 char *startpoint = PL_reginput;
2660 SV *popmark = NULL; /* are we looking for a mark? */
2661 SV *sv_commit = NULL; /* last mark name seen in failure */
2662 SV *sv_yes_mark = NULL; /* last mark name we have seen
2663 during a successfull match */
2664 U32 lastopen = 0; /* last open we saw */
2665 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2668 /* these three flags are set by various ops to signal information to
2669 * the very next op. They have a useful lifetime of exactly one loop
2670 * iteration, and are not preserved or restored by state pushes/pops
2672 bool sw = 0; /* the condition value in (?(cond)a|b) */
2673 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2674 int logical = 0; /* the following EVAL is:
2678 or the following IFMATCH/UNLESSM is:
2679 false: plain (?=foo)
2680 true: used as a condition: (?(?=foo))
2684 GET_RE_DEBUG_FLAGS_DECL;
2688 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2690 /* on first ever call to regmatch, allocate first slab */
2691 if (!PL_regmatch_slab) {
2692 Newx(PL_regmatch_slab, 1, regmatch_slab);
2693 PL_regmatch_slab->prev = NULL;
2694 PL_regmatch_slab->next = NULL;
2695 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2698 /* remember current high-water mark for exit */
2699 /* XXX this should be done with SAVE* instead */
2700 orig_slab = PL_regmatch_slab;
2701 orig_state = PL_regmatch_state;
2703 /* grab next free state slot */
2704 st = ++PL_regmatch_state;
2705 if (st > SLAB_LAST(PL_regmatch_slab))
2706 st = PL_regmatch_state = S_push_slab(aTHX);
2708 /* Note that nextchr is a byte even in UTF */
2709 nextchr = UCHARAT(locinput);
2711 while (scan != NULL) {
2714 SV * const prop = sv_newmortal();
2715 regnode *rnext=regnext(scan);
2716 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2717 regprop(rex, prop, scan);
2719 PerlIO_printf(Perl_debug_log,
2720 "%3"IVdf":%*s%s(%"IVdf")\n",
2721 (IV)(scan - rexi->program), depth*2, "",
2723 (PL_regkind[OP(scan)] == END || !rnext) ?
2724 0 : (IV)(rnext - rexi->program));
2727 next = scan + NEXT_OFF(scan);
2730 state_num = OP(scan);
2733 switch (state_num) {
2735 if (locinput == PL_bostr)
2737 /* reginfo->till = reginfo->bol; */
2742 if (locinput == PL_bostr ||
2743 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2749 if (locinput == PL_bostr)
2753 if (locinput == reginfo->ganch)
2758 /* update the startpoint */
2759 st->u.keeper.val = PL_regstartp[0];
2760 PL_reginput = locinput;
2761 PL_regstartp[0] = locinput - PL_bostr;
2762 PUSH_STATE_GOTO(KEEPS_next, next);
2764 case KEEPS_next_fail:
2765 /* rollback the start point change */
2766 PL_regstartp[0] = st->u.keeper.val;
2772 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2777 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2779 if (PL_regeol - locinput > 1)
2783 if (PL_regeol != locinput)
2787 if (!nextchr && locinput >= PL_regeol)
2790 locinput += PL_utf8skip[nextchr];
2791 if (locinput > PL_regeol)
2793 nextchr = UCHARAT(locinput);
2796 nextchr = UCHARAT(++locinput);
2799 if (!nextchr && locinput >= PL_regeol)
2801 nextchr = UCHARAT(++locinput);
2804 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2807 locinput += PL_utf8skip[nextchr];
2808 if (locinput > PL_regeol)
2810 nextchr = UCHARAT(locinput);
2813 nextchr = UCHARAT(++locinput);
2817 #define ST st->u.trie
2819 /* In this case the charclass data is available inline so
2820 we can fail fast without a lot of extra overhead.
2822 if (scan->flags == EXACT || !do_utf8) {
2823 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2825 PerlIO_printf(Perl_debug_log,
2826 "%*s %sfailed to match trie start class...%s\n",
2827 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2836 /* what type of TRIE am I? (utf8 makes this contextual) */
2837 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2838 trie_type = do_utf8 ?
2839 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2842 /* what trie are we using right now */
2843 reg_trie_data * const trie
2844 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2845 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2846 U32 state = trie->startstate;
2848 if (trie->bitmap && trie_type != trie_utf8_fold &&
2849 !TRIE_BITMAP_TEST(trie,*locinput)
2851 if (trie->states[ state ].wordnum) {
2853 PerlIO_printf(Perl_debug_log,
2854 "%*s %smatched empty string...%s\n",
2855 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2860 PerlIO_printf(Perl_debug_log,
2861 "%*s %sfailed to match trie start class...%s\n",
2862 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2869 U8 *uc = ( U8* )locinput;
2873 U8 *uscan = (U8*)NULL;
2875 SV *sv_accept_buff = NULL;
2876 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2878 ST.accepted = 0; /* how many accepting states we have seen */
2880 ST.jump = trie->jump;
2883 traverse the TRIE keeping track of all accepting states
2884 we transition through until we get to a failing node.
2887 while ( state && uc <= (U8*)PL_regeol ) {
2888 U32 base = trie->states[ state ].trans.base;
2891 /* We use charid to hold the wordnum as we don't use it
2892 for charid until after we have done the wordnum logic.
2893 We define an alias just so that the wordnum logic reads
2896 #define got_wordnum charid
2897 got_wordnum = trie->states[ state ].wordnum;
2899 if ( got_wordnum ) {
2900 if ( ! ST.accepted ) {
2903 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2904 sv_accept_buff=newSV(bufflen *
2905 sizeof(reg_trie_accepted) - 1);
2906 SvCUR_set(sv_accept_buff, 0);
2907 SvPOK_on(sv_accept_buff);
2908 sv_2mortal(sv_accept_buff);
2911 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2914 if (ST.accepted >= bufflen) {
2916 ST.accept_buff =(reg_trie_accepted*)
2917 SvGROW(sv_accept_buff,
2918 bufflen * sizeof(reg_trie_accepted));
2920 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2921 + sizeof(reg_trie_accepted));
2924 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2925 ST.accept_buff[ST.accepted].endpos = uc;
2927 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2931 DEBUG_TRIE_EXECUTE_r({
2932 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2933 PerlIO_printf( Perl_debug_log,
2934 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2935 2+depth * 2, "", PL_colors[4],
2936 (UV)state, (UV)ST.accepted );
2940 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2941 uscan, len, uvc, charid, foldlen,
2945 (base + charid > trie->uniquecharcount )
2946 && (base + charid - 1 - trie->uniquecharcount
2948 && trie->trans[base + charid - 1 -
2949 trie->uniquecharcount].check == state)
2951 state = trie->trans[base + charid - 1 -
2952 trie->uniquecharcount ].next;
2963 DEBUG_TRIE_EXECUTE_r(
2964 PerlIO_printf( Perl_debug_log,
2965 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2966 charid, uvc, (UV)state, PL_colors[5] );
2973 PerlIO_printf( Perl_debug_log,
2974 "%*s %sgot %"IVdf" possible matches%s\n",
2975 REPORT_CODE_OFF + depth * 2, "",
2976 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2979 goto trie_first_try; /* jump into the fail handler */
2981 case TRIE_next_fail: /* we failed - try next alterative */
2983 REGCP_UNWIND(ST.cp);
2984 for (n = *PL_reglastparen; n > ST.lastparen; n--)
2986 *PL_reglastparen = n;
2995 ST.lastparen = *PL_reglastparen;
2998 if ( ST.accepted == 1 ) {
2999 /* only one choice left - just continue */
3001 AV *const trie_words
3002 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3003 SV ** const tmp = av_fetch( trie_words,
3004 ST.accept_buff[ 0 ].wordnum-1, 0 );
3005 SV *sv= tmp ? sv_newmortal() : NULL;
3007 PerlIO_printf( Perl_debug_log,
3008 "%*s %sonly one match left: #%d <%s>%s\n",
3009 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3010 ST.accept_buff[ 0 ].wordnum,
3011 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3012 PL_colors[0], PL_colors[1],
3013 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3015 : "not compiled under -Dr",
3018 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3019 /* in this case we free tmps/leave before we call regmatch
3020 as we wont be using accept_buff again. */
3022 locinput = PL_reginput;
3023 nextchr = UCHARAT(locinput);
3024 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3027 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3028 if (!has_cutgroup) {
3033 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3036 continue; /* execute rest of RE */
3039 if ( !ST.accepted-- ) {
3041 PerlIO_printf( Perl_debug_log,
3042 "%*s %sTRIE failed...%s\n",
3043 REPORT_CODE_OFF+depth*2, "",
3054 There are at least two accepting states left. Presumably
3055 the number of accepting states is going to be low,
3056 typically two. So we simply scan through to find the one
3057 with lowest wordnum. Once we find it, we swap the last
3058 state into its place and decrement the size. We then try to
3059 match the rest of the pattern at the point where the word
3060 ends. If we succeed, control just continues along the
3061 regex; if we fail we return here to try the next accepting
3068 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3069 DEBUG_TRIE_EXECUTE_r(
3070 PerlIO_printf( Perl_debug_log,
3071 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3072 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3073 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3074 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3077 if (ST.accept_buff[cur].wordnum <
3078 ST.accept_buff[best].wordnum)
3083 AV *const trie_words
3084 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3085 SV ** const tmp = av_fetch( trie_words,
3086 ST.accept_buff[ best ].wordnum - 1, 0 );
3087 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3089 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3090 SV *sv= tmp ? sv_newmortal() : NULL;
3092 PerlIO_printf( Perl_debug_log,
3093 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3094 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3095 ST.accept_buff[best].wordnum,
3096 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3097 PL_colors[0], PL_colors[1],
3098 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3099 ) : "not compiled under -Dr",
3100 REG_NODE_NUM(nextop),
3104 if ( best<ST.accepted ) {
3105 reg_trie_accepted tmp = ST.accept_buff[ best ];
3106 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3107 ST.accept_buff[ ST.accepted ] = tmp;
3110 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3111 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3115 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3119 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3122 PUSH_STATE_GOTO(TRIE_next, scan);
3135 char *s = STRING(scan);
3137 if (do_utf8 != UTF) {
3138 /* The target and the pattern have differing utf8ness. */
3140 const char * const e = s + ln;
3143 /* The target is utf8, the pattern is not utf8. */
3148 if (NATIVE_TO_UNI(*(U8*)s) !=
3149 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3157 /* The target is not utf8, the pattern is utf8. */
3162 if (NATIVE_TO_UNI(*((U8*)l)) !=
3163 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3171 nextchr = UCHARAT(locinput);
3174 /* The target and the pattern have the same utf8ness. */
3175 /* Inline the first character, for speed. */
3176 if (UCHARAT(s) != nextchr)
3178 if (PL_regeol - locinput < ln)
3180 if (ln > 1 && memNE(s, locinput, ln))
3183 nextchr = UCHARAT(locinput);
3187 PL_reg_flags |= RF_tainted;
3190 char * const s = STRING(scan);
3193 if (do_utf8 || UTF) {
3194 /* Either target or the pattern are utf8. */
3195 const char * const l = locinput;
3196 char *e = PL_regeol;
3198 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3199 l, &e, 0, do_utf8)) {
3200 /* One more case for the sharp s:
3201 * pack("U0U*", 0xDF) =~ /ss/i,
3202 * the 0xC3 0x9F are the UTF-8
3203 * byte sequence for the U+00DF. */
3205 toLOWER(s[0]) == 's' &&
3207 toLOWER(s[1]) == 's' &&
3214 nextchr = UCHARAT(locinput);
3218 /* Neither the target and the pattern are utf8. */
3220 /* Inline the first character, for speed. */
3221 if (UCHARAT(s) != nextchr &&
3222 UCHARAT(s) != ((OP(scan) == EXACTF)
3223 ? PL_fold : PL_fold_locale)[nextchr])
3225 if (PL_regeol - locinput < ln)
3227 if (ln > 1 && (OP(scan) == EXACTF
3228 ? ibcmp(s, locinput, ln)
3229 : ibcmp_locale(s, locinput, ln)))
3232 nextchr = UCHARAT(locinput);
3237 STRLEN inclasslen = PL_regeol - locinput;
3239 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3241 if (locinput >= PL_regeol)
3243 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3244 nextchr = UCHARAT(locinput);
3249 nextchr = UCHARAT(locinput);
3250 if (!REGINCLASS(rex, scan, (U8*)locinput))
3252 if (!nextchr && locinput >= PL_regeol)
3254 nextchr = UCHARAT(++locinput);
3258 /* If we might have the case of the German sharp s
3259 * in a casefolding Unicode character class. */
3261 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3262 locinput += SHARP_S_SKIP;
3263 nextchr = UCHARAT(locinput);
3269 PL_reg_flags |= RF_tainted;
3275 LOAD_UTF8_CHARCLASS_ALNUM();
3276 if (!(OP(scan) == ALNUM
3277 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3278 : isALNUM_LC_utf8((U8*)locinput)))
3282 locinput += PL_utf8skip[nextchr];
3283 nextchr = UCHARAT(locinput);
3286 if (!(OP(scan) == ALNUM
3287 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3289 nextchr = UCHARAT(++locinput);
3292 PL_reg_flags |= RF_tainted;
3295 if (!nextchr && locinput >= PL_regeol)
3298 LOAD_UTF8_CHARCLASS_ALNUM();
3299 if (OP(scan) == NALNUM
3300 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3301 : isALNUM_LC_utf8((U8*)locinput))
3305 locinput += PL_utf8skip[nextchr];
3306 nextchr = UCHARAT(locinput);
3309 if (OP(scan) == NALNUM
3310 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3312 nextchr = UCHARAT(++locinput);
3316 PL_reg_flags |= RF_tainted;
3320 /* was last char in word? */
3322 if (locinput == PL_bostr)
3325 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3327 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3329 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3330 ln = isALNUM_uni(ln);
3331 LOAD_UTF8_CHARCLASS_ALNUM();
3332 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3335 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3336 n = isALNUM_LC_utf8((U8*)locinput);
3340 ln = (locinput != PL_bostr) ?
3341 UCHARAT(locinput - 1) : '\n';
3342 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3344 n = isALNUM(nextchr);
3347 ln = isALNUM_LC(ln);
3348 n = isALNUM_LC(nextchr);
3351 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3352 OP(scan) == BOUNDL))
3356 PL_reg_flags |= RF_tainted;
3362 if (UTF8_IS_CONTINUED(nextchr)) {
3363 LOAD_UTF8_CHARCLASS_SPACE();
3364 if (!(OP(scan) == SPACE
3365 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3366 : isSPACE_LC_utf8((U8*)locinput)))
3370 locinput += PL_utf8skip[nextchr];
3371 nextchr = UCHARAT(locinput);
3374 if (!(OP(scan) == SPACE
3375 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3377 nextchr = UCHARAT(++locinput);
3380 if (!(OP(scan) == SPACE
3381 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3383 nextchr = UCHARAT(++locinput);
3387 PL_reg_flags |= RF_tainted;
3390 if (!nextchr && locinput >= PL_regeol)
3393 LOAD_UTF8_CHARCLASS_SPACE();
3394 if (OP(scan) == NSPACE
3395 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3396 : isSPACE_LC_utf8((U8*)locinput))
3400 locinput += PL_utf8skip[nextchr];
3401 nextchr = UCHARAT(locinput);
3404 if (OP(scan) == NSPACE
3405 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3407 nextchr = UCHARAT(++locinput);
3410 PL_reg_flags |= RF_tainted;
3416 LOAD_UTF8_CHARCLASS_DIGIT();
3417 if (!(OP(scan) == DIGIT
3418 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3419 : isDIGIT_LC_utf8((U8*)locinput)))
3423 locinput += PL_utf8skip[nextchr];
3424 nextchr = UCHARAT(locinput);
3427 if (!(OP(scan) == DIGIT
3428 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3430 nextchr = UCHARAT(++locinput);
3433 PL_reg_flags |= RF_tainted;
3436 if (!nextchr && locinput >= PL_regeol)
3439 LOAD_UTF8_CHARCLASS_DIGIT();
3440 if (OP(scan) == NDIGIT
3441 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3442 : isDIGIT_LC_utf8((U8*)locinput))
3446 locinput += PL_utf8skip[nextchr];
3447 nextchr = UCHARAT(locinput);
3450 if (OP(scan) == NDIGIT
3451 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3453 nextchr = UCHARAT(++locinput);
3456 if (locinput >= PL_regeol)
3459 LOAD_UTF8_CHARCLASS_MARK();
3460 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3462 locinput += PL_utf8skip[nextchr];
3463 while (locinput < PL_regeol &&
3464 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3465 locinput += UTF8SKIP(locinput);
3466 if (locinput > PL_regeol)
3471 nextchr = UCHARAT(locinput);
3478 PL_reg_flags |= RF_tainted;
3483 n = reg_check_named_buff_matched(rex,scan);
3486 type = REF + ( type - NREF );
3493 PL_reg_flags |= RF_tainted;
3497 n = ARG(scan); /* which paren pair */
3500 ln = PL_regstartp[n];
3501 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3502 if (*PL_reglastparen < n || ln == -1)
3503 sayNO; /* Do not match unless seen CLOSEn. */
3504 if (ln == PL_regendp[n])
3508 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3510 const char *e = PL_bostr + PL_regendp[n];
3512 * Note that we can't do the "other character" lookup trick as
3513 * in the 8-bit case (no pun intended) because in Unicode we
3514 * have to map both upper and title case to lower case.
3518 STRLEN ulen1, ulen2;
3519 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3520 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3524 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3525 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3526 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3533 nextchr = UCHARAT(locinput);
3537 /* Inline the first character, for speed. */
3538 if (UCHARAT(s) != nextchr &&
3540 (UCHARAT(s) != (type == REFF
3541 ? PL_fold : PL_fold_locale)[nextchr])))
3543 ln = PL_regendp[n] - ln;
3544 if (locinput + ln > PL_regeol)
3546 if (ln > 1 && (type == REF
3547 ? memNE(s, locinput, ln)
3549 ? ibcmp(s, locinput, ln)
3550 : ibcmp_locale(s, locinput, ln))))
3553 nextchr = UCHARAT(locinput);
3563 #define ST st->u.eval
3567 regexp_internal *rei;
3568 regnode *startpoint;
3571 case GOSUB: /* /(...(?1))/ */
3572 if (cur_eval && cur_eval->locinput==locinput) {
3573 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3574 Perl_croak(aTHX_ "Infinite recursion in regex");
3575 if ( ++nochange_depth > max_nochange_depth )
3577 "Pattern subroutine nesting without pos change"
3578 " exceeded limit in regex");
3584 (void)ReREFCNT_inc(rex);
3585 if (OP(scan)==GOSUB) {
3586 startpoint = scan + ARG2L(scan);
3587 ST.close_paren = ARG(scan);
3589 startpoint = rei->program+1;
3592 goto eval_recurse_doit;
3594 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3595 if (cur_eval && cur_eval->locinput==locinput) {
3596 if ( ++nochange_depth > max_nochange_depth )
3597 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3602 /* execute the code in the {...} */
3604 SV ** const before = SP;
3605 OP_4tree * const oop = PL_op;
3606 COP * const ocurcop = PL_curcop;
3610 PL_op = (OP_4tree*)rexi->data->data[n];
3611 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3612 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3613 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3614 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3617 SV *sv_mrk = get_sv("REGMARK", 1);
3618 sv_setsv(sv_mrk, sv_yes_mark);
3621 CALLRUNOPS(aTHX); /* Scalar context. */
3624 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3631 PAD_RESTORE_LOCAL(old_comppad);
3632 PL_curcop = ocurcop;
3635 sv_setsv(save_scalar(PL_replgv), ret);
3639 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3642 /* extract RE object from returned value; compiling if
3647 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3648 mg = mg_find(sv, PERL_MAGIC_qr);
3649 else if (SvSMAGICAL(ret)) {
3650 if (SvGMAGICAL(ret))
3651 sv_unmagic(ret, PERL_MAGIC_qr);
3653 mg = mg_find(ret, PERL_MAGIC_qr);
3657 re = (regexp *)mg->mg_obj;
3658 (void)ReREFCNT_inc(re);
3662 const char * const t = SvPV_const(ret, len);
3664 const I32 osize = PL_regsize;
3667 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3668 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3670 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3672 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3679 debug_start_match(re, do_utf8, locinput, PL_regeol,
3680 "Matching embedded");
3682 startpoint = rei->program + 1;
3683 ST.close_paren = 0; /* only used for GOSUB */
3684 /* borrowed from regtry */
3685 if (PL_reg_start_tmpl <= re->nparens) {
3686 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3687 if(PL_reg_start_tmp)
3688 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3690 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3693 eval_recurse_doit: /* Share code with GOSUB below this line */
3694 /* run the pattern returned from (??{...}) */
3695 ST.cp = regcppush(0); /* Save *all* the positions. */
3696 REGCP_SET(ST.lastcp);
3698 PL_regstartp = re->startp; /* essentially NOOP on GOSUB */
3699 PL_regendp = re->endp; /* essentially NOOP on GOSUB */
3701 *PL_reglastparen = 0;
3702 *PL_reglastcloseparen = 0;
3703 PL_reginput = locinput;
3706 /* XXXX This is too dramatic a measure... */
3709 ST.toggle_reg_flags = PL_reg_flags;
3710 if (re->extflags & RXf_UTF8)
3711 PL_reg_flags |= RF_utf8;
3713 PL_reg_flags &= ~RF_utf8;
3714 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3717 ST.prev_curlyx = cur_curlyx;
3722 ST.prev_eval = cur_eval;
3724 /* now continue from first node in postoned RE */
3725 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3728 /* logical is 1, /(?(?{...})X|Y)/ */
3729 sw = (bool)SvTRUE(ret);
3734 case EVAL_AB: /* cleanup after a successful (??{A})B */
3735 /* note: this is called twice; first after popping B, then A */
3736 PL_reg_flags ^= ST.toggle_reg_flags;
3739 rexi = RXi_GET(rex);
3741 cur_eval = ST.prev_eval;
3742 cur_curlyx = ST.prev_curlyx;
3743 /* XXXX This is too dramatic a measure... */
3745 if ( nochange_depth > 0 );
3750 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3751 /* note: this is called twice; first after popping B, then A */
3752 PL_reg_flags ^= ST.toggle_reg_flags;
3755 rexi = RXi_GET(rex);
3756 PL_reginput = locinput;
3757 REGCP_UNWIND(ST.lastcp);
3759 cur_eval = ST.prev_eval;
3760 cur_curlyx = ST.prev_curlyx;
3761 /* XXXX This is too dramatic a measure... */
3763 if ( nochange_depth > 0 );
3769 n = ARG(scan); /* which paren pair */
3770 PL_reg_start_tmp[n] = locinput;
3776 n = ARG(scan); /* which paren pair */
3777 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3778 PL_regendp[n] = locinput - PL_bostr;
3779 /*if (n > PL_regsize)
3781 if (n > *PL_reglastparen)
3782 *PL_reglastparen = n;
3783 *PL_reglastcloseparen = n;
3784 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3792 cursor && OP(cursor)!=END;
3793 cursor=regnext(cursor))
3795 if ( OP(cursor)==CLOSE ){
3797 if ( n <= lastopen ) {
3798 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3799 PL_regendp[n] = locinput - PL_bostr;
3800 /*if (n > PL_regsize)
3802 if (n > *PL_reglastparen)
3803 *PL_reglastparen = n;
3804 *PL_reglastcloseparen = n;
3805 if ( n == ARG(scan) || (cur_eval &&
3806 cur_eval->u.eval.close_paren == n))
3815 n = ARG(scan); /* which paren pair */
3816 sw = (bool)(*PL_reglastparen >= n && PL_regendp[n] != -1);
3819 /* reg_check_named_buff_matched returns 0 for no match */
3820 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3824 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3830 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3832 next = NEXTOPER(NEXTOPER(scan));
3834 next = scan + ARG(scan);
3835 if (OP(next) == IFTHEN) /* Fake one. */
3836 next = NEXTOPER(NEXTOPER(next));
3840 logical = scan->flags;
3843 /*******************************************************************
3845 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3846 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3847 STAR/PLUS/CURLY/CURLYN are used instead.)
3849 A*B is compiled as <CURLYX><A><WHILEM><B>
3851 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3852 state, which contains the current count, initialised to -1. It also sets
3853 cur_curlyx to point to this state, with any previous value saved in the
3856 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3857 since the pattern may possibly match zero times (i.e. it's a while {} loop
3858 rather than a do {} while loop).
3860 Each entry to WHILEM represents a successful match of A. The count in the
3861 CURLYX block is incremented, another WHILEM state is pushed, and execution
3862 passes to A or B depending on greediness and the current count.
3864 For example, if matching against the string a1a2a3b (where the aN are
3865 substrings that match /A/), then the match progresses as follows: (the
3866 pushed states are interspersed with the bits of strings matched so far):
3869 <CURLYX cnt=0><WHILEM>
3870 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3871 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3872 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3873 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3875 (Contrast this with something like CURLYM, which maintains only a single
3879 a1 <CURLYM cnt=1> a2
3880 a1 a2 <CURLYM cnt=2> a3
3881 a1 a2 a3 <CURLYM cnt=3> b
3884 Each WHILEM state block marks a point to backtrack to upon partial failure
3885 of A or B, and also contains some minor state data related to that
3886 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3887 overall state, such as the count, and pointers to the A and B ops.
3889 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3890 must always point to the *current* CURLYX block, the rules are:
3892 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3893 and set cur_curlyx to point the new block.
3895 When popping the CURLYX block after a successful or unsuccessful match,
3896 restore the previous cur_curlyx.
3898 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3899 to the outer one saved in the CURLYX block.
3901 When popping the WHILEM block after a successful or unsuccessful B match,
3902 restore the previous cur_curlyx.
3904 Here's an example for the pattern (AI* BI)*BO
3905 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3908 curlyx backtrack stack
3909 ------ ---------------
3911 CO <CO prev=NULL> <WO>
3912 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3913 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3914 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3916 At this point the pattern succeeds, and we work back down the stack to
3917 clean up, restoring as we go:
3919 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3920 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3921 CO <CO prev=NULL> <WO>
3924 *******************************************************************/
3926 #define ST st->u.curlyx
3928 case CURLYX: /* start of /A*B/ (for complex A) */
3930 /* No need to save/restore up to this paren */
3931 I32 parenfloor = scan->flags;
3933 assert(next); /* keep Coverity happy */
3934 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3937 /* XXXX Probably it is better to teach regpush to support
3938 parenfloor > PL_regsize... */
3939 if (parenfloor > (I32)*PL_reglastparen)
3940 parenfloor = *PL_reglastparen; /* Pessimization... */
3942 ST.prev_curlyx= cur_curlyx;
3944 ST.cp = PL_savestack_ix;
3946 /* these fields contain the state of the current curly.
3947 * they are accessed by subsequent WHILEMs */
3948 ST.parenfloor = parenfloor;
3949 ST.min = ARG1(scan);
3950 ST.max = ARG2(scan);
3951 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3955 ST.count = -1; /* this will be updated by WHILEM */
3956 ST.lastloc = NULL; /* this will be updated by WHILEM */