5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
80 #define RF_evaled 4 /* Did an EVAL with setting? */
81 #define RF_utf8 8 /* String contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
159 const int retval = PL_savestack_ix;
160 #define REGCP_PAREN_ELEMS 4
161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 GET_RE_DEBUG_FLAGS_DECL;
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168 #define REGCP_OTHER_ELEMS 6
169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
170 for (p = PL_regsize; p > parenfloor; p--) {
171 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
174 SSPUSHPTR(PL_reg_start_tmp[p]);
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
183 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
186 SSPUSHINT(*PL_reglastcloseparen);
187 SSPUSHPTR(PL_reginput);
188 #define REGCP_FRAME_ELEMS 2
189 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
197 /* These are needed since we do not localize EVAL nodes: */
198 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
202 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
203 PerlIO_printf(Perl_debug_log, \
204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
208 S_regcppop(pTHX_ const regexp *rex)
214 GET_RE_DEBUG_FLAGS_DECL;
216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
220 input = (char *) SSPOPPTR;
221 *PL_reglastcloseparen = SSPOPINT;
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
225 /* Now restore the parentheses context. */
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
229 U32 paren = (U32)SSPOPINT;
230 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
231 PL_regstartp[paren] = SSPOPINT;
233 if (paren <= *PL_reglastparen)
234 PL_regendp[paren] = tmps;
236 PerlIO_printf(Perl_debug_log,
237 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
238 (UV)paren, (IV)PL_regstartp[paren],
239 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
240 (IV)PL_regendp[paren],
241 (paren > *PL_reglastparen ? "(no)" : ""));
245 if (*PL_reglastparen + 1 <= rex->nparens) {
246 PerlIO_printf(Perl_debug_log,
247 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
248 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
252 /* It would seem that the similar code in regtry()
253 * already takes care of this, and in fact it is in
254 * a better location to since this code can #if 0-ed out
255 * but the code in regtry() is needed or otherwise tests
256 * requiring null fields (pat.t#187 and split.t#{13,14}
257 * (as of patchlevel 7877) will fail. Then again,
258 * this code seems to be necessary or otherwise
259 * building DynaLoader will fail:
260 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
264 PL_regstartp[i] = -1;
271 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
274 * pregexec and friends
277 #ifndef PERL_IN_XSUB_RE
279 - pregexec - match a regexp against a string
282 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
283 char *strbeg, I32 minend, SV *screamer, U32 nosave)
284 /* strend: pointer to null at end of string */
285 /* strbeg: real beginning of string */
286 /* minend: end of match must be >=minend after stringarg. */
287 /* nosave: For optimizations. */
290 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
291 nosave ? 0 : REXEC_COPY_STR);
296 * Need to implement the following flags for reg_anch:
298 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
301 * INTUIT_AUTORITATIVE_ML
302 * INTUIT_ONCE_NOML - Intuit can match in one location only.
305 * Another flag for this function: SECOND_TIME (so that float substrs
306 * with giant delta may be not rechecked).
309 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
312 Otherwise, only SvCUR(sv) is used to get strbeg. */
314 /* XXXX We assume that strpos is strbeg unless sv. */
316 /* XXXX Some places assume that there is a fixed substring.
317 An update may be needed if optimizer marks as "INTUITable"
318 RExen without fixed substrings. Similarly, it is assumed that
319 lengths of all the strings are no more than minlen, thus they
320 cannot come from lookahead.
321 (Or minlen should take into account lookahead.) */
323 /* A failure to find a constant substring means that there is no need to make
324 an expensive call to REx engine, thus we celebrate a failure. Similarly,
325 finding a substring too deep into the string means that less calls to
326 regtry() should be needed.
328 REx compiler's optimizer found 4 possible hints:
329 a) Anchored substring;
331 c) Whether we are anchored (beginning-of-line or \G);
332 d) First node (of those at offset 0) which may distingush positions;
333 We use a)b)d) and multiline-part of c), and try to find a position in the
334 string which does not contradict any of them.
337 /* Most of decisions we do here should have been done at compile time.
338 The nodes of the REx which we used for the search should have been
339 deleted from the finite automaton. */
342 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
343 char *strend, U32 flags, re_scream_pos_data *data)
346 register I32 start_shift = 0;
347 /* Should be nonnegative! */
348 register I32 end_shift = 0;
353 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
355 register char *other_last = NULL; /* other substr checked before this */
356 char *check_at = NULL; /* check substr found at this pos */
357 const I32 multiline = prog->reganch & PMf_MULTILINE;
359 const char * const i_strpos = strpos;
362 GET_RE_DEBUG_FLAGS_DECL;
364 RX_MATCH_UTF8_set(prog,do_utf8);
366 if (prog->reganch & ROPT_UTF8) {
367 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
368 "UTF-8 regex...\n"));
369 PL_reg_flags |= RF_utf8;
373 RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
374 PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
378 if (PL_reg_match_utf8)
379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
380 "UTF-8 target...\n"));
381 PerlIO_printf(Perl_debug_log,
382 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
383 PL_colors[4], PL_colors[5], PL_colors[0],
386 (strlen(prog->precomp) > 60 ? "..." : ""),
388 (int)(len > 60 ? 60 : len),
390 (len > 60 ? "..." : "")
394 /* CHR_DIST() would be more correct here but it makes things slow. */
395 if (prog->minlen > strend - strpos) {
396 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
397 "String too short... [re_intuit_start]\n"));
400 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
403 if (!prog->check_utf8 && prog->check_substr)
404 to_utf8_substr(prog);
405 check = prog->check_utf8;
407 if (!prog->check_substr && prog->check_utf8)
408 to_byte_substr(prog);
409 check = prog->check_substr;
411 if (check == &PL_sv_undef) {
412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
413 "Non-utf string cannot match utf check string\n"));
416 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
417 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
418 || ( (prog->reganch & ROPT_ANCH_BOL)
419 && !multiline ) ); /* Check after \n? */
422 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
423 | ROPT_IMPLICIT)) /* not a real BOL */
424 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
426 && (strpos != strbeg)) {
427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
430 if (prog->check_offset_min == prog->check_offset_max &&
431 !(prog->reganch & ROPT_CANY_SEEN)) {
432 /* Substring at constant offset from beg-of-str... */
435 s = HOP3c(strpos, prog->check_offset_min, strend);
437 slen = SvCUR(check); /* >= 1 */
439 if ( strend - s > slen || strend - s < slen - 1
440 || (strend - s == slen && strend[-1] != '\n')) {
441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
444 /* Now should match s[0..slen-2] */
446 if (slen && (*SvPVX_const(check) != *s
448 && memNE(SvPVX_const(check), s, slen)))) {
450 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
454 else if (*SvPVX_const(check) != *s
455 || ((slen = SvCUR(check)) > 1
456 && memNE(SvPVX_const(check), s, slen)))
459 goto success_at_start;
462 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
464 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
465 end_shift = prog->minlen - start_shift -
466 CHR_SVLEN(check) + (SvTAIL(check) != 0);
468 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
469 - (SvTAIL(check) != 0);
470 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
472 if (end_shift < eshift)
476 else { /* Can match at random position */
479 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
480 /* Should be nonnegative! */
481 end_shift = prog->minlen - start_shift -
482 CHR_SVLEN(check) + (SvTAIL(check) != 0);
485 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
487 Perl_croak(aTHX_ "panic: end_shift");
491 /* Find a possible match in the region s..strend by looking for
492 the "check" substring in the region corrected by start/end_shift. */
493 if (flags & REXEC_SCREAM) {
494 I32 p = -1; /* Internal iterator of scream. */
495 I32 * const pp = data ? data->scream_pos : &p;
497 if (PL_screamfirst[BmRARE(check)] >= 0
498 || ( BmRARE(check) == '\n'
499 && (BmPREVIOUS(check) == SvCUR(check) - 1)
501 s = screaminstr(sv, check,
502 start_shift + (s - strbeg), end_shift, pp, 0);
505 /* we may be pointing at the wrong string */
506 if (s && RX_MATCH_COPIED(prog))
507 s = strbeg + (s - SvPVX_const(sv));
509 *data->scream_olds = s;
511 else if (prog->reganch & ROPT_CANY_SEEN)
512 s = fbm_instr((U8*)(s + start_shift),
513 (U8*)(strend - end_shift),
514 check, multiline ? FBMrf_MULTILINE : 0);
516 s = fbm_instr(HOP3(s, start_shift, strend),
517 HOP3(strend, -end_shift, strbeg),
518 check, multiline ? FBMrf_MULTILINE : 0);
520 /* Update the count-of-usability, remove useless subpatterns,
523 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
524 (s ? "Found" : "Did not find"),
525 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
527 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
529 PL_colors[1], (SvTAIL(check) ? "$" : ""),
530 (s ? " at offset " : "...\n") ) );
537 /* Finish the diagnostic message */
538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
540 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
541 Start with the other substr.
542 XXXX no SCREAM optimization yet - and a very coarse implementation
543 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
544 *always* match. Probably should be marked during compile...
545 Probably it is right to do no SCREAM here...
548 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
549 /* Take into account the "other" substring. */
550 /* XXXX May be hopelessly wrong for UTF... */
553 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
556 char * const last = HOP3c(s, -start_shift, strbeg);
558 char * const saved_s = s;
561 t = s - prog->check_offset_max;
562 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
564 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
569 t = HOP3c(t, prog->anchored_offset, strend);
570 if (t < other_last) /* These positions already checked */
572 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
575 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
576 /* On end-of-str: see comment below. */
577 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
578 if (must == &PL_sv_undef) {
580 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
585 HOP3(HOP3(last1, prog->anchored_offset, strend)
586 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
588 multiline ? FBMrf_MULTILINE : 0
590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
591 "%s anchored substr \"%s%.*s%s\"%s",
592 (s ? "Found" : "Contradicts"),
595 - (SvTAIL(must)!=0)),
597 PL_colors[1], (SvTAIL(must) ? "$" : "")));
599 if (last1 >= last2) {
600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
601 ", giving up...\n"));
604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
605 ", trying floating at offset %ld...\n",
606 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
607 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
608 s = HOP3c(last, 1, strend);
612 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
613 (long)(s - i_strpos)));
614 t = HOP3c(s, -prog->anchored_offset, strbeg);
615 other_last = HOP3c(s, 1, strend);
623 else { /* Take into account the floating substring. */
625 char * const saved_s = s;
628 t = HOP3c(s, -start_shift, strbeg);
630 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
631 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
632 last = HOP3c(t, prog->float_max_offset, strend);
633 s = HOP3c(t, prog->float_min_offset, strend);
636 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
637 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
638 /* fbm_instr() takes into account exact value of end-of-str
639 if the check is SvTAIL(ed). Since false positives are OK,
640 and end-of-str is not later than strend we are OK. */
641 if (must == &PL_sv_undef) {
643 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
646 s = fbm_instr((unsigned char*)s,
647 (unsigned char*)last + SvCUR(must)
649 must, multiline ? FBMrf_MULTILINE : 0);
650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
651 (s ? "Found" : "Contradicts"),
653 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
659 ", giving up...\n"));
662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
663 ", trying anchored starting at offset %ld...\n",
664 (long)(saved_s + 1 - i_strpos)));
666 s = HOP3c(t, 1, strend);
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
671 (long)(s - i_strpos)));
672 other_last = s; /* Fix this later. --Hugo */
681 t = s - prog->check_offset_max;
682 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
684 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
686 /* Fixed substring is found far enough so that the match
687 cannot start at strpos. */
689 if (ml_anch && t[-1] != '\n') {
690 /* Eventually fbm_*() should handle this, but often
691 anchored_offset is not 0, so this check will not be wasted. */
692 /* XXXX In the code below we prefer to look for "^" even in
693 presence of anchored substrings. And we search even
694 beyond the found float position. These pessimizations
695 are historical artefacts only. */
697 while (t < strend - prog->minlen) {
699 if (t < check_at - prog->check_offset_min) {
700 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
701 /* Since we moved from the found position,
702 we definitely contradict the found anchored
703 substr. Due to the above check we do not
704 contradict "check" substr.
705 Thus we can arrive here only if check substr
706 is float. Redo checking for "other"=="fixed".
709 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
710 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
711 goto do_other_anchored;
713 /* We don't contradict the found floating substring. */
714 /* XXXX Why not check for STCLASS? */
716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
717 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
720 /* Position contradicts check-string */
721 /* XXXX probably better to look for check-string
722 than for "\n", so one should lower the limit for t? */
723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
724 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
725 other_last = strpos = s = t + 1;
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
731 PL_colors[0], PL_colors[1]));
735 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
736 PL_colors[0], PL_colors[1]));
740 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
743 /* The found string does not prohibit matching at strpos,
744 - no optimization of calling REx engine can be performed,
745 unless it was an MBOL and we are not after MBOL,
746 or a future STCLASS check will fail this. */
748 /* Even in this situation we may use MBOL flag if strpos is offset
749 wrt the start of the string. */
750 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
751 && (strpos != strbeg) && strpos[-1] != '\n'
752 /* May be due to an implicit anchor of m{.*foo} */
753 && !(prog->reganch & ROPT_IMPLICIT))
758 DEBUG_EXECUTE_r( if (ml_anch)
759 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
760 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
763 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
765 prog->check_utf8 /* Could be deleted already */
766 && --BmUSEFUL(prog->check_utf8) < 0
767 && (prog->check_utf8 == prog->float_utf8)
769 prog->check_substr /* Could be deleted already */
770 && --BmUSEFUL(prog->check_substr) < 0
771 && (prog->check_substr == prog->float_substr)
774 /* If flags & SOMETHING - do not do it many times on the same match */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
776 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
777 if (do_utf8 ? prog->check_substr : prog->check_utf8)
778 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
779 prog->check_substr = prog->check_utf8 = NULL; /* disable */
780 prog->float_substr = prog->float_utf8 = NULL; /* clear */
781 check = NULL; /* abort */
783 /* XXXX This is a remnant of the old implementation. It
784 looks wasteful, since now INTUIT can use many
786 prog->reganch &= ~RE_USE_INTUIT;
793 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
794 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
795 /* minlen == 0 is possible if regstclass is \b or \B,
796 and the fixed substr is ''$.
797 Since minlen is already taken into account, s+1 is before strend;
798 accidentally, minlen >= 1 guaranties no false positives at s + 1
799 even for \b or \B. But (minlen? 1 : 0) below assumes that
800 regstclass does not come from lookahead... */
801 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
802 This leaves EXACTF only, which is dealt with in find_byclass(). */
803 const U8* const str = (U8*)STRING(prog->regstclass);
804 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
805 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
807 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
808 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
809 : (prog->float_substr || prog->float_utf8
810 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
813 /*if (OP(prog->regstclass) == TRIE)
816 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
819 const char *what = NULL;
821 if (endpos == strend) {
822 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
823 "Could not match STCLASS...\n") );
826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
827 "This position contradicts STCLASS...\n") );
828 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
830 /* Contradict one of substrings */
831 if (prog->anchored_substr || prog->anchored_utf8) {
832 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
833 DEBUG_EXECUTE_r( what = "anchored" );
835 s = HOP3c(t, 1, strend);
836 if (s + start_shift + end_shift > strend) {
837 /* XXXX Should be taken into account earlier? */
838 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
839 "Could not match STCLASS...\n") );
844 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
845 "Looking for %s substr starting at offset %ld...\n",
846 what, (long)(s + start_shift - i_strpos)) );
849 /* Have both, check_string is floating */
850 if (t + start_shift >= check_at) /* Contradicts floating=check */
851 goto retry_floating_check;
852 /* Recheck anchored substring, but not floating... */
856 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
857 "Looking for anchored substr starting at offset %ld...\n",
858 (long)(other_last - i_strpos)) );
859 goto do_other_anchored;
861 /* Another way we could have checked stclass at the
862 current position only: */
867 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
868 "Looking for /%s^%s/m starting at offset %ld...\n",
869 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
872 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
874 /* Check is floating subtring. */
875 retry_floating_check:
876 t = check_at - start_shift;
877 DEBUG_EXECUTE_r( what = "floating" );
878 goto hop_and_restart;
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
882 "By STCLASS: moving %ld --> %ld\n",
883 (long)(t - i_strpos), (long)(s - i_strpos))
887 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
888 "Does not contradict STCLASS...\n");
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
894 PL_colors[4], (check ? "Guessed" : "Giving up"),
895 PL_colors[5], (long)(s - i_strpos)) );
898 fail_finish: /* Substring not found */
899 if (prog->check_substr || prog->check_utf8) /* could be removed already */
900 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
903 PL_colors[4], PL_colors[5]));
907 /* We know what class REx starts with. Try to find this position... */
908 /* if reginfo is NULL, its a dryrun */
909 /* annoyingly all the vars in this routine have different names from their counterparts
910 in regmatch. /grrr */
912 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
913 foldlen, foldbuf, uniflags) STMT_START { \
914 switch (trie_type) { \
915 case trie_utf8_fold: \
917 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
922 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
923 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
924 foldlen -= UNISKIP( uvc ); \
925 uscan = foldbuf + UNISKIP( uvc ); \
929 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
937 charid = trie->charmap[ uvc ]; \
941 if (trie->widecharmap) { \
942 SV** const svpp = hv_fetch(trie->widecharmap, \
943 (char*)&uvc, sizeof(UV), 0); \
945 charid = (U16)SvIV(*svpp); \
950 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
953 ibcmp_utf8(s, NULL, 0, do_utf8, \
954 m, NULL, ln, (bool)UTF)) \
955 && (!reginfo || regtry(reginfo, s)) ) \
958 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
959 uvchr_to_utf8(tmpbuf, c); \
960 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
962 && (f == c1 || f == c2) \
963 && (ln == foldlen || \
964 !ibcmp_utf8((char *) foldbuf, \
965 NULL, foldlen, do_utf8, \
967 NULL, ln, (bool)UTF)) \
968 && (!reginfo || regtry(reginfo, s)) ) \
973 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
977 && (ln == 1 || !(OP(c) == EXACTF \
979 : ibcmp_locale(s, m, ln))) \
980 && (!reginfo || regtry(reginfo, s)) ) \
986 #define REXEC_FBC_UTF8_SCAN(CoDe) \
988 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
994 #define REXEC_FBC_SCAN(CoDe) \
996 while (s < strend) { \
1002 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1003 REXEC_FBC_UTF8_SCAN( \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1014 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1017 if (tmp && (!reginfo || regtry(reginfo, s))) \
1026 #define REXEC_FBC_TRYIT \
1027 if ((!reginfo || regtry(reginfo, s))) \
1030 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1033 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1036 REXEC_FBC_CLASS_SCAN(CoNd); \
1040 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1041 PL_reg_flags |= RF_tainted; \
1043 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1046 REXEC_FBC_CLASS_SCAN(CoNd); \
1051 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1052 const char *strend, const regmatch_info *reginfo)
1055 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1059 register STRLEN uskip;
1063 register I32 tmp = 1; /* Scratch variable? */
1064 register const bool do_utf8 = PL_reg_match_utf8;
1066 /* We know what class it must start with. */
1070 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1071 !UTF8_IS_INVARIANT((U8)s[0]) ?
1072 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1073 REGINCLASS(prog, c, (U8*)s));
1076 while (s < strend) {
1079 if (REGINCLASS(prog, c, (U8*)s) ||
1080 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1081 /* The assignment of 2 is intentional:
1082 * for the folded sharp s, the skip is 2. */
1083 (skip = SHARP_S_SKIP))) {
1084 if (tmp && (!reginfo || regtry(reginfo, s)))
1097 if (tmp && (!reginfo || regtry(reginfo, s)))
1105 ln = STR_LEN(c); /* length to match in octets/bytes */
1106 lnc = (I32) ln; /* length to match in characters */
1108 STRLEN ulen1, ulen2;
1110 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1111 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1112 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1114 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1115 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1117 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1119 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1122 while (sm < ((U8 *) m + ln)) {
1137 c2 = PL_fold_locale[c1];
1139 e = HOP3c(strend, -((I32)lnc), s);
1141 if (!reginfo && e < s)
1142 e = s; /* Due to minlen logic of intuit() */
1144 /* The idea in the EXACTF* cases is to first find the
1145 * first character of the EXACTF* node and then, if
1146 * necessary, case-insensitively compare the full
1147 * text of the node. The c1 and c2 are the first
1148 * characters (though in Unicode it gets a bit
1149 * more complicated because there are more cases
1150 * than just upper and lower: one needs to use
1151 * the so-called folding case for case-insensitive
1152 * matching (called "loose matching" in Unicode).
1153 * ibcmp_utf8() will do just that. */
1157 U8 tmpbuf [UTF8_MAXBYTES+1];
1158 STRLEN len, foldlen;
1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1161 /* Upper and lower of 1st char are equal -
1162 * probably not a "letter". */
1164 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1166 REXEC_FBC_EXACTISH_CHECK(c == c1);
1171 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1174 /* Handle some of the three Greek sigmas cases.
1175 * Note that not all the possible combinations
1176 * are handled here: some of them are handled
1177 * by the standard folding rules, and some of
1178 * them (the character class or ANYOF cases)
1179 * are handled during compiletime in
1180 * regexec.c:S_regclass(). */
1181 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1182 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1183 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1185 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1191 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1193 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1197 PL_reg_flags |= RF_tainted;
1204 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1205 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1207 tmp = ((OP(c) == BOUND ?
1208 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1209 LOAD_UTF8_CHARCLASS_ALNUM();
1210 REXEC_FBC_UTF8_SCAN(
1211 if (tmp == !(OP(c) == BOUND ?
1212 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1213 isALNUM_LC_utf8((U8*)s)))
1221 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1222 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1225 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1231 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1235 PL_reg_flags |= RF_tainted;
1242 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1243 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1245 tmp = ((OP(c) == NBOUND ?
1246 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1247 LOAD_UTF8_CHARCLASS_ALNUM();
1248 REXEC_FBC_UTF8_SCAN(
1249 if (tmp == !(OP(c) == NBOUND ?
1250 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1251 isALNUM_LC_utf8((U8*)s)))
1253 else REXEC_FBC_TRYIT;
1257 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1258 tmp = ((OP(c) == NBOUND ?
1259 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1262 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1264 else REXEC_FBC_TRYIT;
1267 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1271 REXEC_FBC_CSCAN_PRELOAD(
1272 LOAD_UTF8_CHARCLASS_ALNUM(),
1273 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1277 REXEC_FBC_CSCAN_TAINT(
1278 isALNUM_LC_utf8((U8*)s),
1282 REXEC_FBC_CSCAN_PRELOAD(
1283 LOAD_UTF8_CHARCLASS_ALNUM(),
1284 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1288 REXEC_FBC_CSCAN_TAINT(
1289 !isALNUM_LC_utf8((U8*)s),
1293 REXEC_FBC_CSCAN_PRELOAD(
1294 LOAD_UTF8_CHARCLASS_SPACE(),
1295 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1299 REXEC_FBC_CSCAN_TAINT(
1300 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1304 REXEC_FBC_CSCAN_PRELOAD(
1305 LOAD_UTF8_CHARCLASS_SPACE(),
1306 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1310 REXEC_FBC_CSCAN_TAINT(
1311 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1315 REXEC_FBC_CSCAN_PRELOAD(
1316 LOAD_UTF8_CHARCLASS_DIGIT(),
1317 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1321 REXEC_FBC_CSCAN_TAINT(
1322 isDIGIT_LC_utf8((U8*)s),
1326 REXEC_FBC_CSCAN_PRELOAD(
1327 LOAD_UTF8_CHARCLASS_DIGIT(),
1328 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1332 REXEC_FBC_CSCAN_TAINT(
1333 !isDIGIT_LC_utf8((U8*)s),
1337 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1339 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1340 trie_type = do_utf8 ?
1341 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1343 /* what trie are we using right now */
1345 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1346 reg_trie_data *trie=aho->trie;
1348 const char *last_start = strend - trie->minlen;
1350 const char *real_start = s;
1352 STRLEN maxlen = trie->maxlen;
1354 U8 **points; /* map of where we were in the input string
1355 when reading a given string. For ASCII this
1356 is unnecessary overhead as the relationship
1357 is always 1:1, but for unicode, especially
1358 case folded unicode this is not true. */
1359 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1361 GET_RE_DEBUG_FLAGS_DECL;
1363 /* We can't just allocate points here. We need to wrap it in
1364 * an SV so it gets freed properly if there is a croak while
1365 * running the match */
1368 sv_points=newSV(maxlen * sizeof(U8 *));
1369 SvCUR_set(sv_points,
1370 maxlen * sizeof(U8 *));
1371 SvPOK_on(sv_points);
1372 sv_2mortal(sv_points);
1373 points=(U8**)SvPV_nolen(sv_points );
1375 if (trie->bitmap && trie_type != trie_utf8_fold) {
1376 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1381 while (s <= last_start) {
1382 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1390 U8 *uscan = (U8*)NULL;
1391 U8 *leftmost = NULL;
1395 while ( state && uc <= (U8*)strend ) {
1397 if (aho->states[ state ].wordnum) {
1398 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1399 if (!leftmost || lpos < leftmost)
1403 points[pointpos++ % maxlen]= uc;
1404 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1405 uvc, charid, foldlen, foldbuf, uniflags);
1406 DEBUG_TRIE_EXECUTE_r(
1407 PerlIO_printf(Perl_debug_log,
1408 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1409 (int)((const char*)uc - real_start), charid, uvc)
1415 U32 word = aho->states[ state ].wordnum;
1417 base = aho->states[ state ].trans.base;
1419 DEBUG_TRIE_EXECUTE_r(
1420 PerlIO_printf( Perl_debug_log,
1421 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1422 failed ? "Fail transition to " : "",
1423 state, base, uvc, word)
1428 (base + charid > trie->uniquecharcount )
1429 && (base + charid - 1 - trie->uniquecharcount
1431 && trie->trans[base + charid - 1 -
1432 trie->uniquecharcount].check == state
1433 && (tmp=trie->trans[base + charid - 1 -
1434 trie->uniquecharcount ].next))
1444 state = aho->fail[state];
1448 /* we must be accepting here */
1456 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1457 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1463 if ( aho->states[ state ].wordnum ) {
1464 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1465 if (!leftmost || lpos < leftmost)
1468 DEBUG_TRIE_EXECUTE_r(
1469 PerlIO_printf( Perl_debug_log,
1470 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1475 s = (char*)leftmost;
1476 if (!reginfo || regtry(reginfo, s)) {
1491 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1500 - regexec_flags - match a regexp against a string
1503 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1504 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1505 /* strend: pointer to null at end of string */
1506 /* strbeg: real beginning of string */
1507 /* minend: end of match must be >=minend after stringarg. */
1508 /* data: May be used for some additional optimizations. */
1509 /* nosave: For optimizations. */
1513 register regnode *c;
1514 register char *startpos = stringarg;
1515 I32 minlen; /* must match at least this many chars */
1516 I32 dontbother = 0; /* how many characters not to try at end */
1517 I32 end_shift = 0; /* Same for the end. */ /* CC */
1518 I32 scream_pos = -1; /* Internal iterator of scream. */
1519 char *scream_olds = NULL;
1520 SV* const oreplsv = GvSV(PL_replgv);
1521 const bool do_utf8 = DO_UTF8(sv);
1524 regmatch_info reginfo; /* create some info to pass to regtry etc */
1526 GET_RE_DEBUG_FLAGS_DECL;
1528 PERL_UNUSED_ARG(data);
1530 /* Be paranoid... */
1531 if (prog == NULL || startpos == NULL) {
1532 Perl_croak(aTHX_ "NULL regexp parameter");
1536 multiline = prog->reganch & PMf_MULTILINE;
1537 reginfo.prog = prog;
1539 RX_MATCH_UTF8_set(prog, do_utf8);
1541 minlen = prog->minlen;
1542 if (strend - startpos < minlen) {
1543 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1544 "String too short [regexec_flags]...\n"));
1548 /* Check validity of program. */
1549 if (UCHARAT(prog->program) != REG_MAGIC) {
1550 Perl_croak(aTHX_ "corrupted regexp program");
1554 PL_reg_eval_set = 0;
1557 if (prog->reganch & ROPT_UTF8)
1558 PL_reg_flags |= RF_utf8;
1560 /* Mark beginning of line for ^ and lookbehind. */
1561 reginfo.bol = startpos; /* XXX not used ??? */
1565 /* Mark end of line for $ (and such) */
1568 /* see how far we have to get to not match where we matched before */
1569 reginfo.till = startpos+minend;
1571 /* If there is a "must appear" string, look for it. */
1574 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1577 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1578 reginfo.ganch = startpos;
1579 else if (sv && SvTYPE(sv) >= SVt_PVMG
1581 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1582 && mg->mg_len >= 0) {
1583 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1584 if (prog->reganch & ROPT_ANCH_GPOS) {
1585 if (s > reginfo.ganch)
1590 else /* pos() not defined */
1591 reginfo.ganch = strbeg;
1594 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1595 re_scream_pos_data d;
1597 d.scream_olds = &scream_olds;
1598 d.scream_pos = &scream_pos;
1599 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1602 goto phooey; /* not present */
1607 RE_PV_DISPLAY_DECL(s0, len0, UTF,
1608 PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
1609 RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
1610 PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
1614 PerlIO_printf(Perl_debug_log,
1615 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1616 PL_colors[4], PL_colors[5], PL_colors[0],
1619 len0 > 60 ? "..." : "",
1621 (int)(len1 > 60 ? 60 : len1),
1623 (len1 > 60 ? "..." : "")
1627 /* Simplest case: anchored match need be tried only once. */
1628 /* [unless only anchor is BOL and multiline is set] */
1629 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1630 if (s == startpos && regtry(®info, startpos))
1632 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1633 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1638 dontbother = minlen - 1;
1639 end = HOP3c(strend, -dontbother, strbeg) - 1;
1640 /* for multiline we only have to try after newlines */
1641 if (prog->check_substr || prog->check_utf8) {
1645 if (regtry(®info, s))
1650 if (prog->reganch & RE_USE_INTUIT) {
1651 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1662 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1663 if (regtry(®info, s))
1670 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1671 if (regtry(®info, reginfo.ganch))
1676 /* Messy cases: unanchored match. */
1677 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1678 /* we have /x+whatever/ */
1679 /* it must be a one character string (XXXX Except UTF?) */
1684 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1685 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1686 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1691 DEBUG_EXECUTE_r( did_match = 1 );
1692 if (regtry(®info, s)) goto got_it;
1694 while (s < strend && *s == ch)
1702 DEBUG_EXECUTE_r( did_match = 1 );
1703 if (regtry(®info, s)) goto got_it;
1705 while (s < strend && *s == ch)
1710 DEBUG_EXECUTE_r(if (!did_match)
1711 PerlIO_printf(Perl_debug_log,
1712 "Did not find anchored character...\n")
1715 else if (prog->anchored_substr != NULL
1716 || prog->anchored_utf8 != NULL
1717 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1718 && prog->float_max_offset < strend - s)) {
1723 char *last1; /* Last position checked before */
1727 if (prog->anchored_substr || prog->anchored_utf8) {
1728 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1729 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1730 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1731 back_max = back_min = prog->anchored_offset;
1733 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1734 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1735 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1736 back_max = prog->float_max_offset;
1737 back_min = prog->float_min_offset;
1739 if (must == &PL_sv_undef)
1740 /* could not downgrade utf8 check substring, so must fail */
1743 last = HOP3c(strend, /* Cannot start after this */
1744 -(I32)(CHR_SVLEN(must)
1745 - (SvTAIL(must) != 0) + back_min), strbeg);
1748 last1 = HOPc(s, -1);
1750 last1 = s - 1; /* bogus */
1752 /* XXXX check_substr already used to find "s", can optimize if
1753 check_substr==must. */
1755 dontbother = end_shift;
1756 strend = HOPc(strend, -dontbother);
1757 while ( (s <= last) &&
1758 ((flags & REXEC_SCREAM)
1759 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1760 end_shift, &scream_pos, 0))
1761 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1762 (unsigned char*)strend, must,
1763 multiline ? FBMrf_MULTILINE : 0))) ) {
1764 /* we may be pointing at the wrong string */
1765 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1766 s = strbeg + (s - SvPVX_const(sv));
1767 DEBUG_EXECUTE_r( did_match = 1 );
1768 if (HOPc(s, -back_max) > last1) {
1769 last1 = HOPc(s, -back_min);
1770 s = HOPc(s, -back_max);
1773 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1775 last1 = HOPc(s, -back_min);
1779 while (s <= last1) {
1780 if (regtry(®info, s))
1786 while (s <= last1) {
1787 if (regtry(®info, s))
1793 DEBUG_EXECUTE_r(if (!did_match)
1794 PerlIO_printf(Perl_debug_log,
1795 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1796 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1797 ? "anchored" : "floating"),
1799 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1801 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1805 else if ((c = prog->regstclass)) {
1807 const OPCODE op = OP(prog->regstclass);
1808 /* don't bother with what can't match */
1809 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
1810 strend = HOPc(strend, -(minlen - 1));
1813 SV * const prop = sv_newmortal();
1814 regprop(prog, prop, c);
1816 RE_PV_DISPLAY_DECL(s0,len0,UTF,
1817 PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
1818 RE_PV_DISPLAY_DECL(s1,len1,UTF,
1819 PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
1820 PerlIO_printf(Perl_debug_log,
1821 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
1823 len1, len1, s1, (int)(strend - s));
1826 if (find_byclass(prog, c, s, strend, ®info))
1828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1832 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1837 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1838 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1839 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1841 if (flags & REXEC_SCREAM) {
1842 last = screaminstr(sv, float_real, s - strbeg,
1843 end_shift, &scream_pos, 1); /* last one */
1845 last = scream_olds; /* Only one occurrence. */
1846 /* we may be pointing at the wrong string */
1847 else if (RX_MATCH_COPIED(prog))
1848 s = strbeg + (s - SvPVX_const(sv));
1852 const char * const little = SvPV_const(float_real, len);
1854 if (SvTAIL(float_real)) {
1855 if (memEQ(strend - len + 1, little, len - 1))
1856 last = strend - len + 1;
1857 else if (!multiline)
1858 last = memEQ(strend - len, little, len)
1859 ? strend - len : NULL;
1865 last = rninstr(s, strend, little, little + len);
1867 last = strend; /* matching "$" */
1871 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1872 "%sCan't trim the tail, match fails (should not happen)%s\n",
1873 PL_colors[4], PL_colors[5]));
1874 goto phooey; /* Should not happen! */
1876 dontbother = strend - last + prog->float_min_offset;
1878 if (minlen && (dontbother < minlen))
1879 dontbother = minlen - 1;
1880 strend -= dontbother; /* this one's always in bytes! */
1881 /* We don't know much -- general case. */
1884 if (regtry(®info, s))
1893 if (regtry(®info, s))
1895 } while (s++ < strend);
1903 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1905 if (PL_reg_eval_set) {
1906 /* Preserve the current value of $^R */
1907 if (oreplsv != GvSV(PL_replgv))
1908 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1909 restored, the value remains
1911 restore_pos(aTHX_ prog);
1914 /* make sure $`, $&, $', and $digit will work later */
1915 if ( !(flags & REXEC_NOT_FIRST) ) {
1916 RX_MATCH_COPY_FREE(prog);
1917 if (flags & REXEC_COPY_STR) {
1918 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
1919 #ifdef PERL_OLD_COPY_ON_WRITE
1921 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1923 PerlIO_printf(Perl_debug_log,
1924 "Copy on write: regexp capture, type %d\n",
1927 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
1928 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
1929 assert (SvPOKp(prog->saved_copy));
1933 RX_MATCH_COPIED_on(prog);
1934 s = savepvn(strbeg, i);
1940 prog->subbeg = strbeg;
1941 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1948 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1949 PL_colors[4], PL_colors[5]));
1950 if (PL_reg_eval_set)
1951 restore_pos(aTHX_ prog);
1956 - regtry - try match at specific point
1958 STATIC I32 /* 0 failure, 1 success */
1959 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
1965 regexp *prog = reginfo->prog;
1966 GET_RE_DEBUG_FLAGS_DECL;
1969 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1971 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1974 PL_reg_eval_set = RS_init;
1975 DEBUG_EXECUTE_r(DEBUG_s(
1976 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1977 (IV)(PL_stack_sp - PL_stack_base));
1979 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1980 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1981 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1983 /* Apparently this is not needed, judging by wantarray. */
1984 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1985 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1988 /* Make $_ available to executed code. */
1989 if (reginfo->sv != DEFSV) {
1991 DEFSV = reginfo->sv;
1994 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1995 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
1996 /* prepare for quick setting of pos */
1997 #ifdef PERL_OLD_COPY_ON_WRITE
1999 sv_force_normal_flags(sv, 0);
2001 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2002 &PL_vtbl_mglob, NULL, 0);
2006 PL_reg_oldpos = mg->mg_len;
2007 SAVEDESTRUCTOR_X(restore_pos, prog);
2009 if (!PL_reg_curpm) {
2010 Newxz(PL_reg_curpm, 1, PMOP);
2013 SV* const repointer = newSViv(0);
2014 /* so we know which PL_regex_padav element is PL_reg_curpm */
2015 SvFLAGS(repointer) |= SVf_BREAK;
2016 av_push(PL_regex_padav,repointer);
2017 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2018 PL_regex_pad = AvARRAY(PL_regex_padav);
2022 PM_SETRE(PL_reg_curpm, prog);
2023 PL_reg_oldcurpm = PL_curpm;
2024 PL_curpm = PL_reg_curpm;
2025 if (RX_MATCH_COPIED(prog)) {
2026 /* Here is a serious problem: we cannot rewrite subbeg,
2027 since it may be needed if this match fails. Thus
2028 $` inside (?{}) could fail... */
2029 PL_reg_oldsaved = prog->subbeg;
2030 PL_reg_oldsavedlen = prog->sublen;
2031 #ifdef PERL_OLD_COPY_ON_WRITE
2032 PL_nrs = prog->saved_copy;
2034 RX_MATCH_COPIED_off(prog);
2037 PL_reg_oldsaved = NULL;
2038 prog->subbeg = PL_bostr;
2039 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2041 prog->startp[0] = startpos - PL_bostr;
2042 PL_reginput = startpos;
2043 PL_regstartp = prog->startp;
2044 PL_regendp = prog->endp;
2045 PL_reglastparen = &prog->lastparen;
2046 PL_reglastcloseparen = &prog->lastcloseparen;
2047 prog->lastparen = 0;
2048 prog->lastcloseparen = 0;
2050 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2051 if (PL_reg_start_tmpl <= prog->nparens) {
2052 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2053 if(PL_reg_start_tmp)
2054 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2056 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2059 /* XXXX What this code is doing here?!!! There should be no need
2060 to do this again and again, PL_reglastparen should take care of
2063 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2064 * Actually, the code in regcppop() (which Ilya may be meaning by
2065 * PL_reglastparen), is not needed at all by the test suite
2066 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2067 * enough, for building DynaLoader, or otherwise this
2068 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2069 * will happen. Meanwhile, this code *is* needed for the
2070 * above-mentioned test suite tests to succeed. The common theme
2071 * on those tests seems to be returning null fields from matches.
2076 if (prog->nparens) {
2078 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2085 if (regmatch(reginfo, prog->program + 1)) {
2086 prog->endp[0] = PL_reginput - PL_bostr;
2089 REGCP_UNWIND(lastcp);
2094 #define sayYES goto yes
2095 #define sayNO goto no
2096 #define sayNO_ANYOF goto no_anyof
2097 #define sayYES_FINAL goto yes_final
2098 #define sayNO_FINAL goto no_final
2099 #define sayNO_SILENT goto do_no
2100 #define saySAME(x) if (x) goto yes; else goto no
2102 #define CACHEsayNO STMT_START { \
2103 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2104 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2105 (1<<st->u.whilem.cache_bit); \
2110 /* this is used to determine how far from the left messages like
2111 'failed...' are printed. Currently 29 makes these messages line
2112 up with the opcode they refer to. Earlier perls used 25 which
2113 left these messages outdented making reviewing a debug output
2116 #define REPORT_CODE_OFF 29
2119 /* Make sure there is a test for this +1 options in re_tests */
2120 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2122 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2123 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2125 #define SLAB_FIRST(s) (&(s)->states[0])
2126 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2128 /* grab a new slab and return the first slot in it */
2130 STATIC regmatch_state *
2133 #if PERL_VERSION < 9
2136 regmatch_slab *s = PL_regmatch_slab->next;
2138 Newx(s, 1, regmatch_slab);
2139 s->prev = PL_regmatch_slab;
2141 PL_regmatch_slab->next = s;
2143 PL_regmatch_slab = s;
2144 return SLAB_FIRST(s);
2147 /* simulate a recursive call to regmatch */
2149 #define REGMATCH(ns, where) \
2152 st->resume_state = resume_##where; \
2153 goto start_recurse; \
2154 resume_point_##where:
2156 /* push a new state then goto it */
2158 #define PUSH_STATE_GOTO(state, node) \
2160 st->resume_state = state; \
2163 /* push a new state with success backtracking, then goto it */
2165 #define PUSH_YES_STATE_GOTO(state, node) \
2167 st->resume_state = state; \
2168 goto push_yes_state;
2173 - regmatch - main matching routine
2175 * Conceptually the strategy is simple: check to see whether the current
2176 * node matches, call self recursively to see whether the rest matches,
2177 * and then act accordingly. In practice we make some effort to avoid
2178 * recursion, in particular by going through "ordinary" nodes (that don't
2179 * need to know whether the rest of the match failed) by a loop instead of
2182 /* [lwall] I've hoisted the register declarations to the outer block in order to
2183 * maybe save a little bit of pushing and popping on the stack. It also takes
2184 * advantage of machines that use a register save mask on subroutine entry.
2186 * This function used to be heavily recursive, but since this had the
2187 * effect of blowing the CPU stack on complex regexes, it has been
2188 * restructured to be iterative, and to save state onto the heap rather
2189 * than the stack. Essentially whereever regmatch() used to be called, it
2190 * pushes the current state, notes where to return, then jumps back into
2193 * Originally the structure of this function used to look something like
2198 while (scan != NULL) {
2199 a++; // do stuff with a and b
2205 if (regmatch(...)) // recurse
2215 * Now it looks something like this:
2223 regmatch_state *st = new();
2225 st->a++; // do stuff with a and b
2227 while (scan != NULL) {
2235 st->resume_state = resume_FOO;
2236 goto start_recurse; // recurse
2245 st = new(); push a new state
2246 st->a = 1; st->b = 2;
2253 switch (resume_state) {
2255 goto resume_point_FOO;
2262 * WARNING: this means that any line in this function that contains a
2263 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2264 * regmatch() using gotos instead. Thus the values of any local variables
2265 * not saved in the regmatch_state structure will have been lost when
2266 * execution resumes on the next line .
2268 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2269 * PL_regmatch_state always points to the currently active state, and
2270 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2271 * The first time regmatch is called, the first slab is allocated, and is
2272 * never freed until interpreter desctruction. When the slab is full,
2273 * a new one is allocated chained to the end. At exit from regmatch, slabs
2274 * allocated since entry are freed.
2277 /* *** every FOO_fail should = FOO+1 */
2278 #define TRIE_next (REGNODE_MAX+1)
2279 #define TRIE_next_fail (REGNODE_MAX+2)
2280 #define EVAL_A (REGNODE_MAX+3)
2281 #define EVAL_A_fail (REGNODE_MAX+4)
2282 #define resume_CURLYX (REGNODE_MAX+5)
2283 #define resume_WHILEM1 (REGNODE_MAX+6)
2284 #define resume_WHILEM2 (REGNODE_MAX+7)
2285 #define resume_WHILEM3 (REGNODE_MAX+8)
2286 #define resume_WHILEM4 (REGNODE_MAX+9)
2287 #define resume_WHILEM5 (REGNODE_MAX+10)
2288 #define resume_WHILEM6 (REGNODE_MAX+11)
2289 #define BRANCH_next (REGNODE_MAX+12)
2290 #define BRANCH_next_fail (REGNODE_MAX+13)
2291 #define CURLYM_A (REGNODE_MAX+14)
2292 #define CURLYM_A_fail (REGNODE_MAX+15)
2293 #define CURLYM_B (REGNODE_MAX+16)
2294 #define CURLYM_B_fail (REGNODE_MAX+17)
2295 #define IFMATCH_A (REGNODE_MAX+18)
2296 #define IFMATCH_A_fail (REGNODE_MAX+19)
2297 #define CURLY_B_min_known (REGNODE_MAX+20)
2298 #define CURLY_B_min_known_fail (REGNODE_MAX+21)
2299 #define CURLY_B_min (REGNODE_MAX+22)
2300 #define CURLY_B_min_fail (REGNODE_MAX+23)
2301 #define CURLY_B_max (REGNODE_MAX+24)
2302 #define CURLY_B_max_fail (REGNODE_MAX+25)
2305 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2310 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2312 const int docolor = *PL_colors[0];
2313 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2314 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2315 /* The part of the string before starttry has one color
2316 (pref0_len chars), between starttry and current
2317 position another one (pref_len - pref0_len chars),
2318 after the current position the third one.
2319 We assume that pref0_len <= pref_len, otherwise we
2320 decrease pref0_len. */
2321 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2322 ? (5 + taill) - l : locinput - PL_bostr;
2325 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2327 pref0_len = pref_len - (locinput - PL_reg_starttry);
2328 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2329 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2330 ? (5 + taill) - pref_len : PL_regeol - locinput);
2331 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2335 if (pref0_len > pref_len)
2336 pref0_len = pref_len;
2338 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2340 RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2341 (locinput - pref_len),pref0_len, 60);
2343 RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2344 (locinput - pref_len + pref0_len),
2345 pref_len - pref0_len, 60);
2347 RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2348 locinput, PL_regeol - locinput, 60);
2350 PerlIO_printf(Perl_debug_log,
2351 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2352 (IV)(locinput - PL_bostr),
2359 (docolor ? "" : "> <"),
2363 15 - l - pref_len + 1,
2370 STATIC I32 /* 0 failure, 1 success */
2371 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2373 #if PERL_VERSION < 9
2377 register const bool do_utf8 = PL_reg_match_utf8;
2378 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2380 regexp *rex = reginfo->prog;
2382 regmatch_slab *orig_slab;
2383 regmatch_state *orig_state;
2385 /* the current state. This is a cached copy of PL_regmatch_state */
2386 register regmatch_state *st;
2388 /* cache heavy used fields of st in registers */
2389 register regnode *scan;
2390 register regnode *next;
2391 register I32 n = 0; /* initialize to shut up compiler warning */
2392 register char *locinput = PL_reginput;
2394 /* these variables are NOT saved during a recusive RFEGMATCH: */
2395 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2396 bool result; /* return value of S_regmatch */
2397 int depth = 0; /* depth of recursion */
2398 regmatch_state *yes_state = NULL; /* state to pop to on success of
2403 GET_RE_DEBUG_FLAGS_DECL;
2407 /* on first ever call to regmatch, allocate first slab */
2408 if (!PL_regmatch_slab) {
2409 Newx(PL_regmatch_slab, 1, regmatch_slab);
2410 PL_regmatch_slab->prev = NULL;
2411 PL_regmatch_slab->next = NULL;
2412 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2415 /* remember current high-water mark for exit */
2416 /* XXX this should be done with SAVE* instead */
2417 orig_slab = PL_regmatch_slab;
2418 orig_state = PL_regmatch_state;
2420 /* grab next free state slot */
2421 st = ++PL_regmatch_state;
2422 if (st > SLAB_LAST(PL_regmatch_slab))
2423 st = PL_regmatch_state = S_push_slab(aTHX);
2429 /* Note that nextchr is a byte even in UTF */
2430 nextchr = UCHARAT(locinput);
2432 while (scan != NULL) {
2435 SV * const prop = sv_newmortal();
2436 dump_exec_pos( locinput, scan, do_utf8 );
2437 regprop(rex, prop, scan);
2439 PerlIO_printf(Perl_debug_log,
2440 "%3"IVdf":%*s%s(%"IVdf")\n",
2441 (IV)(scan - rex->program), PL_regindent*2, "",
2443 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2446 next = scan + NEXT_OFF(scan);
2449 state_num = OP(scan);
2452 switch (state_num) {
2454 if (locinput == PL_bostr)
2456 /* reginfo->till = reginfo->bol; */
2461 if (locinput == PL_bostr ||
2462 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2468 if (locinput == PL_bostr)
2472 if (locinput == reginfo->ganch)
2478 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2483 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2485 if (PL_regeol - locinput > 1)
2489 if (PL_regeol != locinput)
2493 if (!nextchr && locinput >= PL_regeol)
2496 locinput += PL_utf8skip[nextchr];
2497 if (locinput > PL_regeol)
2499 nextchr = UCHARAT(locinput);
2502 nextchr = UCHARAT(++locinput);
2505 if (!nextchr && locinput >= PL_regeol)
2507 nextchr = UCHARAT(++locinput);
2510 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2513 locinput += PL_utf8skip[nextchr];
2514 if (locinput > PL_regeol)
2516 nextchr = UCHARAT(locinput);
2519 nextchr = UCHARAT(++locinput);
2523 #define ST st->u.trie
2527 /* what type of TRIE am I? (utf8 makes this contextual) */
2528 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2529 trie_type = do_utf8 ?
2530 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2533 /* what trie are we using right now */
2534 reg_trie_data * const trie
2535 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2536 U32 state = trie->startstate;
2538 U8 *uc = ( U8* )locinput;
2544 U8 *uscan = (U8*)NULL;
2546 SV *sv_accept_buff = NULL;
2547 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2549 ST.accepted = 0; /* how many accepting states we have seen */
2555 if (trie->bitmap && trie_type != trie_utf8_fold &&
2556 !TRIE_BITMAP_TEST(trie,*locinput)
2558 if (trie->states[ state ].wordnum) {
2560 PerlIO_printf(Perl_debug_log,
2561 "%*s %smatched empty string...%s\n",
2562 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2567 PerlIO_printf(Perl_debug_log,
2568 "%*s %sfailed to match start class...%s\n",
2569 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2576 traverse the TRIE keeping track of all accepting states
2577 we transition through until we get to a failing node.
2580 while ( state && uc <= (U8*)PL_regeol ) {
2582 if (trie->states[ state ].wordnum) {
2583 if (!ST.accepted ) {
2586 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2587 sv_accept_buff=newSV(bufflen *
2588 sizeof(reg_trie_accepted) - 1);
2589 SvCUR_set(sv_accept_buff,
2590 sizeof(reg_trie_accepted));
2591 SvPOK_on(sv_accept_buff);
2592 sv_2mortal(sv_accept_buff);
2595 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2598 if (ST.accepted >= bufflen) {
2600 ST.accept_buff =(reg_trie_accepted*)
2601 SvGROW(sv_accept_buff,
2602 bufflen * sizeof(reg_trie_accepted));
2604 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2605 + sizeof(reg_trie_accepted));
2607 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2608 ST.accept_buff[ST.accepted].endpos = uc;
2612 base = trie->states[ state ].trans.base;
2614 DEBUG_TRIE_EXECUTE_r({
2615 dump_exec_pos( (char *)uc, scan, do_utf8 );
2616 PerlIO_printf( Perl_debug_log,
2617 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2618 2+PL_regindent * 2, "", PL_colors[4],
2619 (UV)state, (UV)base, (UV)ST.accepted );
2623 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2624 uvc, charid, foldlen, foldbuf, uniflags);
2627 (base + charid > trie->uniquecharcount )
2628 && (base + charid - 1 - trie->uniquecharcount
2630 && trie->trans[base + charid - 1 -
2631 trie->uniquecharcount].check == state)
2633 state = trie->trans[base + charid - 1 -
2634 trie->uniquecharcount ].next;
2645 DEBUG_TRIE_EXECUTE_r(
2646 PerlIO_printf( Perl_debug_log,
2647 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2648 charid, uvc, (UV)state, PL_colors[5] );
2655 PerlIO_printf( Perl_debug_log,
2656 "%*s %sgot %"IVdf" possible matches%s\n",
2657 REPORT_CODE_OFF + PL_regindent * 2, "",
2658 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2664 case TRIE_next_fail: /* we failed - try next alterative */
2666 if ( ST.accepted == 1 ) {
2667 /* only one choice left - just continue */
2669 reg_trie_data * const trie
2670 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2671 SV ** const tmp = RX_DEBUG(reginfo->prog)
2672 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2674 PerlIO_printf( Perl_debug_log,
2675 "%*s %sonly one match left: #%d <%s>%s\n",
2676 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2677 ST.accept_buff[ 0 ].wordnum,
2678 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2681 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2682 /* in this case we free tmps/leave before we call regmatch
2683 as we wont be using accept_buff again. */
2686 locinput = PL_reginput;
2687 nextchr = UCHARAT(locinput);
2689 continue; /* execute rest of RE */
2692 if (!ST.accepted-- ) {
2699 There are at least two accepting states left. Presumably
2700 the number of accepting states is going to be low,
2701 typically two. So we simply scan through to find the one
2702 with lowest wordnum. Once we find it, we swap the last
2703 state into its place and decrement the size. We then try to
2704 match the rest of the pattern at the point where the word
2705 ends. If we succeed, control just continues along the
2706 regex; if we fail we return here to try the next accepting
2713 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2714 DEBUG_TRIE_EXECUTE_r(
2715 PerlIO_printf( Perl_debug_log,
2716 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2717 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2718 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2719 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2722 if (ST.accept_buff[cur].wordnum <
2723 ST.accept_buff[best].wordnum)
2728 reg_trie_data * const trie
2729 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2730 SV ** const tmp = RX_DEBUG(reginfo->prog)
2731 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2733 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2734 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2735 ST.accept_buff[best].wordnum,
2736 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2740 if ( best<ST.accepted ) {
2741 reg_trie_accepted tmp = ST.accept_buff[ best ];
2742 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2743 ST.accept_buff[ ST.accepted ] = tmp;
2746 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2748 PUSH_STATE_GOTO(TRIE_next, ST.B);
2754 char *s = STRING(scan);
2755 st->ln = STR_LEN(scan);
2756 if (do_utf8 != UTF) {
2757 /* The target and the pattern have differing utf8ness. */
2759 const char * const e = s + st->ln;
2762 /* The target is utf8, the pattern is not utf8. */
2767 if (NATIVE_TO_UNI(*(U8*)s) !=
2768 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2776 /* The target is not utf8, the pattern is utf8. */
2781 if (NATIVE_TO_UNI(*((U8*)l)) !=
2782 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2790 nextchr = UCHARAT(locinput);
2793 /* The target and the pattern have the same utf8ness. */
2794 /* Inline the first character, for speed. */
2795 if (UCHARAT(s) != nextchr)
2797 if (PL_regeol - locinput < st->ln)
2799 if (st->ln > 1 && memNE(s, locinput, st->ln))
2802 nextchr = UCHARAT(locinput);
2806 PL_reg_flags |= RF_tainted;
2809 char * const s = STRING(scan);
2810 st->ln = STR_LEN(scan);
2812 if (do_utf8 || UTF) {
2813 /* Either target or the pattern are utf8. */
2814 const char * const l = locinput;
2815 char *e = PL_regeol;
2817 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2818 l, &e, 0, do_utf8)) {
2819 /* One more case for the sharp s:
2820 * pack("U0U*", 0xDF) =~ /ss/i,
2821 * the 0xC3 0x9F are the UTF-8
2822 * byte sequence for the U+00DF. */
2824 toLOWER(s[0]) == 's' &&
2826 toLOWER(s[1]) == 's' &&
2833 nextchr = UCHARAT(locinput);
2837 /* Neither the target and the pattern are utf8. */
2839 /* Inline the first character, for speed. */
2840 if (UCHARAT(s) != nextchr &&
2841 UCHARAT(s) != ((OP(scan) == EXACTF)
2842 ? PL_fold : PL_fold_locale)[nextchr])
2844 if (PL_regeol - locinput < st->ln)
2846 if (st->ln > 1 && (OP(scan) == EXACTF
2847 ? ibcmp(s, locinput, st->ln)
2848 : ibcmp_locale(s, locinput, st->ln)))
2851 nextchr = UCHARAT(locinput);
2856 STRLEN inclasslen = PL_regeol - locinput;
2858 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
2860 if (locinput >= PL_regeol)
2862 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2863 nextchr = UCHARAT(locinput);
2868 nextchr = UCHARAT(locinput);
2869 if (!REGINCLASS(rex, scan, (U8*)locinput))
2871 if (!nextchr && locinput >= PL_regeol)
2873 nextchr = UCHARAT(++locinput);
2877 /* If we might have the case of the German sharp s
2878 * in a casefolding Unicode character class. */
2880 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2881 locinput += SHARP_S_SKIP;
2882 nextchr = UCHARAT(locinput);
2888 PL_reg_flags |= RF_tainted;
2894 LOAD_UTF8_CHARCLASS_ALNUM();
2895 if (!(OP(scan) == ALNUM
2896 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2897 : isALNUM_LC_utf8((U8*)locinput)))
2901 locinput += PL_utf8skip[nextchr];
2902 nextchr = UCHARAT(locinput);
2905 if (!(OP(scan) == ALNUM
2906 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2908 nextchr = UCHARAT(++locinput);
2911 PL_reg_flags |= RF_tainted;
2914 if (!nextchr && locinput >= PL_regeol)
2917 LOAD_UTF8_CHARCLASS_ALNUM();
2918 if (OP(scan) == NALNUM
2919 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2920 : isALNUM_LC_utf8((U8*)locinput))
2924 locinput += PL_utf8skip[nextchr];
2925 nextchr = UCHARAT(locinput);
2928 if (OP(scan) == NALNUM
2929 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2931 nextchr = UCHARAT(++locinput);
2935 PL_reg_flags |= RF_tainted;
2939 /* was last char in word? */
2941 if (locinput == PL_bostr)
2944 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2946 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
2948 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2949 st->ln = isALNUM_uni(st->ln);
2950 LOAD_UTF8_CHARCLASS_ALNUM();
2951 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2954 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
2955 n = isALNUM_LC_utf8((U8*)locinput);
2959 st->ln = (locinput != PL_bostr) ?
2960 UCHARAT(locinput - 1) : '\n';
2961 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2962 st->ln = isALNUM(st->ln);
2963 n = isALNUM(nextchr);
2966 st->ln = isALNUM_LC(st->ln);
2967 n = isALNUM_LC(nextchr);
2970 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
2971 OP(scan) == BOUNDL))
2975 PL_reg_flags |= RF_tainted;
2981 if (UTF8_IS_CONTINUED(nextchr)) {
2982 LOAD_UTF8_CHARCLASS_SPACE();
2983 if (!(OP(scan) == SPACE
2984 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2985 : isSPACE_LC_utf8((U8*)locinput)))
2989 locinput += PL_utf8skip[nextchr];
2990 nextchr = UCHARAT(locinput);
2993 if (!(OP(scan) == SPACE
2994 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2996 nextchr = UCHARAT(++locinput);
2999 if (!(OP(scan) == SPACE
3000 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3002 nextchr = UCHARAT(++locinput);
3006 PL_reg_flags |= RF_tainted;
3009 if (!nextchr && locinput >= PL_regeol)
3012 LOAD_UTF8_CHARCLASS_SPACE();
3013 if (OP(scan) == NSPACE
3014 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3015 : isSPACE_LC_utf8((U8*)locinput))
3019 locinput += PL_utf8skip[nextchr];
3020 nextchr = UCHARAT(locinput);
3023 if (OP(scan) == NSPACE
3024 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3026 nextchr = UCHARAT(++locinput);
3029 PL_reg_flags |= RF_tainted;
3035 LOAD_UTF8_CHARCLASS_DIGIT();
3036 if (!(OP(scan) == DIGIT
3037 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3038 : isDIGIT_LC_utf8((U8*)locinput)))
3042 locinput += PL_utf8skip[nextchr];
3043 nextchr = UCHARAT(locinput);
3046 if (!(OP(scan) == DIGIT
3047 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3049 nextchr = UCHARAT(++locinput);
3052 PL_reg_flags |= RF_tainted;
3055 if (!nextchr && locinput >= PL_regeol)
3058 LOAD_UTF8_CHARCLASS_DIGIT();
3059 if (OP(scan) == NDIGIT
3060 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3061 : isDIGIT_LC_utf8((U8*)locinput))
3065 locinput += PL_utf8skip[nextchr];
3066 nextchr = UCHARAT(locinput);
3069 if (OP(scan) == NDIGIT
3070 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3072 nextchr = UCHARAT(++locinput);
3075 if (locinput >= PL_regeol)
3078 LOAD_UTF8_CHARCLASS_MARK();
3079 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3081 locinput += PL_utf8skip[nextchr];
3082 while (locinput < PL_regeol &&
3083 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3084 locinput += UTF8SKIP(locinput);
3085 if (locinput > PL_regeol)
3090 nextchr = UCHARAT(locinput);
3093 PL_reg_flags |= RF_tainted;
3098 n = ARG(scan); /* which paren pair */
3099 st->ln = PL_regstartp[n];
3100 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3101 if ((I32)*PL_reglastparen < n || st->ln == -1)
3102 sayNO; /* Do not match unless seen CLOSEn. */
3103 if (st->ln == PL_regendp[n])
3106 s = PL_bostr + st->ln;
3107 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3109 const char *e = PL_bostr + PL_regendp[n];
3111 * Note that we can't do the "other character" lookup trick as
3112 * in the 8-bit case (no pun intended) because in Unicode we
3113 * have to map both upper and title case to lower case.
3115 if (OP(scan) == REFF) {
3117 STRLEN ulen1, ulen2;
3118 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3119 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3123 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3124 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3125 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3132 nextchr = UCHARAT(locinput);
3136 /* Inline the first character, for speed. */
3137 if (UCHARAT(s) != nextchr &&
3139 (UCHARAT(s) != ((OP(scan) == REFF
3140 ? PL_fold : PL_fold_locale)[nextchr]))))
3142 st->ln = PL_regendp[n] - st->ln;
3143 if (locinput + st->ln > PL_regeol)
3145 if (st->ln > 1 && (OP(scan) == REF
3146 ? memNE(s, locinput, st->ln)
3148 ? ibcmp(s, locinput, st->ln)
3149 : ibcmp_locale(s, locinput, st->ln))))
3152 nextchr = UCHARAT(locinput);
3163 #define ST st->u.eval
3165 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3169 /* execute the code in the {...} */
3171 SV ** const before = SP;
3172 OP_4tree * const oop = PL_op;
3173 COP * const ocurcop = PL_curcop;
3177 PL_op = (OP_4tree*)rex->data->data[n];
3178 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3179 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3180 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3182 CALLRUNOPS(aTHX); /* Scalar context. */
3185 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3192 PAD_RESTORE_LOCAL(old_comppad);
3193 PL_curcop = ocurcop;
3196 sv_setsv(save_scalar(PL_replgv), ret);
3200 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3203 /* extract RE object from returned value; compiling if
3208 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3209 mg = mg_find(sv, PERL_MAGIC_qr);
3210 else if (SvSMAGICAL(ret)) {
3211 if (SvGMAGICAL(ret))
3212 sv_unmagic(ret, PERL_MAGIC_qr);
3214 mg = mg_find(ret, PERL_MAGIC_qr);
3218 re = (regexp *)mg->mg_obj;
3219 (void)ReREFCNT_inc(re);
3223 const char * const t = SvPV_const(ret, len);
3225 const I32 osize = PL_regsize;
3228 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3229 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3231 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3233 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3239 /* run the pattern returned from (??{...}) */
3242 PerlIO_printf(Perl_debug_log,
3243 "Entering embedded \"%s%.60s%s%s\"\n",
3247 (strlen(re->precomp) > 60 ? "..." : ""))
3250 ST.cp = regcppush(0); /* Save *all* the positions. */
3251 REGCP_SET(ST.lastcp);
3252 *PL_reglastparen = 0;
3253 *PL_reglastcloseparen = 0;
3254 PL_reginput = locinput;
3256 /* XXXX This is too dramatic a measure... */
3260 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3261 ((re->reganch & ROPT_UTF8) != 0);
3262 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3267 /* now continue from first node in postoned RE */
3268 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3271 /* /(?(?{...})X|Y)/ */
3272 st->sw = SvTRUE(ret);
3277 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3279 PL_reg_flags ^= RF_utf8;
3282 /* XXXX This is too dramatic a measure... */
3284 /* Restore parens of the caller without popping the
3287 const I32 tmp = PL_savestack_ix;
3288 PL_savestack_ix = ST.lastcp;
3290 PL_savestack_ix = tmp;
3292 PL_reginput = locinput;
3293 /* continue at the node following the (??{...}) */
3297 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3298 /* Restore state to the outer re then re-throw the failure */
3300 PL_reg_flags ^= RF_utf8;
3304 /* XXXX This is too dramatic a measure... */
3307 PL_reginput = locinput;
3308 REGCP_UNWIND(ST.lastcp);
3315 n = ARG(scan); /* which paren pair */
3316 PL_reg_start_tmp[n] = locinput;
3321 n = ARG(scan); /* which paren pair */
3322 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3323 PL_regendp[n] = locinput - PL_bostr;
3324 if (n > (I32)*PL_reglastparen)
3325 *PL_reglastparen = n;
3326 *PL_reglastcloseparen = n;
3329 n = ARG(scan); /* which paren pair */
3330 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3333 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3335 next = NEXTOPER(NEXTOPER(scan));
3337 next = scan + ARG(scan);
3338 if (OP(next) == IFTHEN) /* Fake one. */
3339 next = NEXTOPER(NEXTOPER(next));
3343 st->logical = scan->flags;
3345 /*******************************************************************
3346 cc points to the regmatch_state associated with the most recent CURLYX.
3347 This struct contains info about the innermost (...)* loop (an
3348 "infoblock"), and a pointer to the next outer cc.
3350 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3352 1) After matching Y, regnode for CURLYX is processed;
3354 2) This regnode populates cc, and calls regmatch() recursively
3355 with the starting point at WHILEM node;
3357 3) Each hit of WHILEM node tries to match A and Z (in the order
3358 depending on the current iteration, min/max of {min,max} and
3359 greediness). The information about where are nodes for "A"
3360 and "Z" is read from cc, as is info on how many times "A"
3361 was already matched, and greediness.
3363 4) After A matches, the same WHILEM node is hit again.
3365 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3366 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3367 resets cc, since this Y(A)*Z can be a part of some other loop:
3368 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3369 of the external loop.
3371 Currently present infoblocks form a tree with a stem formed by st->cc
3372 and whatever it mentions via ->next, and additional attached trees
3373 corresponding to temporarily unset infoblocks as in "5" above.
3375 In the following picture, infoblocks for outer loop of
3376 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3377 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3378 infoblocks are drawn below the "reset" infoblock.
3380 In fact in the picture below we do not show failed matches for Z and T
3381 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3382 more obvious *why* one needs to *temporary* unset infoblocks.]
3384 Matched REx position InfoBlocks Comment
3388 Y A)*?Z)*?T x <- O <- I
3389 YA )*?Z)*?T x <- O <- I
3390 YA A)*?Z)*?T x <- O <- I
3391 YAA )*?Z)*?T x <- O <- I
3392 YAA Z)*?T x <- O # Temporary unset I
3395 YAAZ Y(A)*?Z)*?T x <- O
3398 YAAZY (A)*?Z)*?T x <- O
3401 YAAZY A)*?Z)*?T x <- O <- I
3404 YAAZYA )*?Z)*?T x <- O <- I
3407 YAAZYA Z)*?T x <- O # Temporary unset I
3413 YAAZYAZ T x # Temporary unset O
3420 *******************************************************************/
3423 /* No need to save/restore up to this paren */
3424 I32 parenfloor = scan->flags;
3428 CURLYX and WHILEM are always paired: they're the moral
3429 equivalent of pp_enteriter anbd pp_iter.
3431 The only time next could be null is if the node tree is
3432 corrupt. This was mentioned on p5p a few days ago.
3434 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3435 So we'll assert that this is true:
3438 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3440 /* XXXX Probably it is better to teach regpush to support
3441 parenfloor > PL_regsize... */
3442 if (parenfloor > (I32)*PL_reglastparen)
3443 parenfloor = *PL_reglastparen; /* Pessimization... */
3445 st->u.curlyx.cp = PL_savestack_ix;
3446 st->u.curlyx.outercc = st->cc;
3448 /* these fields contain the state of the current curly.
3449 * they are accessed by subsequent WHILEMs;
3450 * cur and lastloc are also updated by WHILEM */
3451 st->u.curlyx.parenfloor = parenfloor;
3452 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3453 st->u.curlyx.min = ARG1(scan);
3454 st->u.curlyx.max = ARG2(scan);
3455 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3456 st->u.curlyx.lastloc = 0;
3457 /* st->next and st->minmod are also read by WHILEM */
3459 PL_reginput = locinput;
3460 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3461 /*** all unsaved local vars undefined at this point */
3462 regcpblow(st->u.curlyx.cp);
3463 st->cc = st->u.curlyx.outercc;
3469 * This is really hard to understand, because after we match
3470 * what we're trying to match, we must make sure the rest of
3471 * the REx is going to match for sure, and to do that we have
3472 * to go back UP the parse tree by recursing ever deeper. And
3473 * if it fails, we have to reset our parent's current state
3474 * that we can try again after backing off.
3479 st->cc gets initialised by CURLYX ready for use by WHILEM.
3480 So again, unless somethings been corrupted, st->cc cannot
3481 be null at that point in WHILEM.
3483 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3484 So we'll assert that this is true:
3487 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3488 st->u.whilem.cache_offset = 0;
3489 st->u.whilem.cache_bit = 0;
3491 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3492 PL_reginput = locinput;
3495 PerlIO_printf(Perl_debug_log,
3496 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3497 REPORT_CODE_OFF+PL_regindent*2, "",
3498 (long)n, (long)st->cc->u.curlyx.min,
3499 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3502 /* If degenerate scan matches "", assume scan done. */
3504 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3505 st->u.whilem.savecc = st->cc;
3506 st->cc = st->cc->u.curlyx.outercc;
3508 st->ln = st->cc->u.curlyx.cur;
3510 PerlIO_printf(Perl_debug_log,
3511 "%*s empty match detected, try continuation...\n",
3512 REPORT_CODE_OFF+PL_regindent*2, "")
3514 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3515 /*** all unsaved local vars undefined at this point */
3516 st->cc = st->u.whilem.savecc;
3519 if (st->cc->u.curlyx.outercc)
3520 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3524 /* First just match a string of min scans. */
3526 if (n < st->cc->u.curlyx.min) {
3527 st->cc->u.curlyx.cur = n;
3528 st->cc->u.curlyx.lastloc = locinput;
3529 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3530 /*** all unsaved local vars undefined at this point */
3533 st->cc->u.curlyx.cur = n - 1;
3534 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3539 /* Check whether we already were at this position.
3540 Postpone detection until we know the match is not
3541 *that* much linear. */
3542 if (!PL_reg_maxiter) {
3543 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3544 /* possible overflow for long strings and many CURLYX's */
3545 if (PL_reg_maxiter < 0)
3546 PL_reg_maxiter = I32_MAX;
3547 PL_reg_leftiter = PL_reg_maxiter;
3549 if (PL_reg_leftiter-- == 0) {
3550 const I32 size = (PL_reg_maxiter + 7)/8;
3551 if (PL_reg_poscache) {
3552 if ((I32)PL_reg_poscache_size < size) {
3553 Renew(PL_reg_poscache, size, char);
3554 PL_reg_poscache_size = size;
3556 Zero(PL_reg_poscache, size, char);
3559 PL_reg_poscache_size = size;
3560 Newxz(PL_reg_poscache, size, char);
3563 PerlIO_printf(Perl_debug_log,
3564 "%sDetected a super-linear match, switching on caching%s...\n",
3565 PL_colors[4], PL_colors[5])
3568 if (PL_reg_leftiter < 0) {
3569 st->u.whilem.cache_offset = locinput - PL_bostr;
3571 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
3572 + st->u.whilem.cache_offset * (scan->flags>>4);
3573 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3574 st->u.whilem.cache_offset /= 8;
3575 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3577 PerlIO_printf(Perl_debug_log,
3578 "%*s already tried at this position...\n",
3579 REPORT_CODE_OFF+PL_regindent*2, "")
3581 sayNO; /* cache records failure */
3586 /* Prefer next over scan for minimal matching. */
3588 if (st->cc->minmod) {
3589 st->u.whilem.savecc = st->cc;
3590 st->cc = st->cc->u.curlyx.outercc;
3592 st->ln = st->cc->u.curlyx.cur;
3593 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3594 REGCP_SET(st->u.whilem.lastcp);
3595 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3596 /*** all unsaved local vars undefined at this point */
3597 st->cc = st->u.whilem.savecc;
3599 regcpblow(st->u.whilem.cp);
3600 sayYES; /* All done. */
3602 REGCP_UNWIND(st->u.whilem.lastcp);
3604 if (st->cc->u.curlyx.outercc)
3605 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3607 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3608 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3609 && !(PL_reg_flags & RF_warned)) {
3610 PL_reg_flags |= RF_warned;
3611 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3612 "Complex regular subexpression recursion",
3619 PerlIO_printf(Perl_debug_log,
3620 "%*s trying longer...\n",
3621 REPORT_CODE_OFF+PL_regindent*2, "")
3623 /* Try scanning more and see if it helps. */
3624 PL_reginput = locinput;
3625 st->cc->u.curlyx.cur = n;
3626 st->cc->u.curlyx.lastloc = locinput;
3627 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3628 REGCP_SET(st->u.whilem.lastcp);
3629 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3630 /*** all unsaved local vars undefined at this point */
3632 regcpblow(st->u.whilem.cp);
3635 REGCP_UNWIND(st->u.whilem.lastcp);
3637 st->cc->u.curlyx.cur = n - 1;
3638 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3642 /* Prefer scan over next for maximal matching. */
3644 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3645 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3646 st->cc->u.curlyx.cur = n;
3647 st->cc->u.curlyx.lastloc = locinput;
3648 REGCP_SET(st->u.whilem.lastcp);
3649 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3650 /*** all unsaved local vars undefined at this point */
3652 regcpblow(st->u.whilem.cp);
3655 REGCP_UNWIND(st->u.whilem.lastcp);
3656 regcppop(rex); /* Restore some previous $<digit>s? */
3657 PL_reginput = locinput;
3659 PerlIO_printf(Perl_debug_log,
3660 "%*s failed, try continuation...\n",
3661 REPORT_CODE_OFF+PL_regindent*2, "")
3664 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3665 && !(PL_reg_flags & RF_warned)) {
3666 PL_reg_flags |= RF_warned;
3667 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3668 "Complex regular subexpression recursion",
3672 /* Failed deeper matches of scan, so see if this one works. */
3673 st->u.whilem.savecc = st->cc;
3674 st->cc = st->cc->u.curlyx.outercc;
3676 st->ln = st->cc->u.curlyx.cur;
3677 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3678 /*** all unsaved local vars undefined at this point */
3679 st->cc = st->u.whilem.savecc;
3682 if (st->cc->u.curlyx.outercc)
3683 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3684 st->cc->u.curlyx.cur = n - 1;
3685 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3691 #define ST st->u.branch
3693 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3694 next = scan + ARG(scan);
3697 scan = NEXTOPER(scan);
3700 case BRANCH: /* /(...|A|...)/ */
3701 scan = NEXTOPER(scan); /* scan now points to inner node */
3702 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3703 /* last branch; skip state push and jump direct to node */
3705 ST.lastparen = *PL_reglastparen;
3706 ST.next_branch = next;
3708 PL_reginput = locinput;
3710 /* Now go into the branch */
3711 PUSH_STATE_GOTO(BRANCH_next, scan);
3714 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3715 REGCP_UNWIND(ST.cp);
3716 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3718 *PL_reglastparen = n;
3719 scan = ST.next_branch;
3720 /* no more branches? */
3721 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3723 continue; /* execute next BRANCH[J] op */
3731 #define ST st->u.curlym
3733 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3735 /* This is an optimisation of CURLYX that enables us to push
3736 * only a single backtracking state, no matter now many matches
3737 * there are in {m,n}. It relies on the pattern being constant
3738 * length, with no parens to influence future backrefs
3742 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3744 /* if paren positive, emulate an OPEN/CLOSE around A */
3746 I32 paren = ST.me->flags;
3747 if (paren > PL_regsize)
3749 if (paren > (I32)*PL_reglastparen)
3750 *PL_reglastparen = paren;
3751 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3757 ST.minmod = st->minmod;
3759 ST.c1 = CHRTEST_UNINIT;
3762 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3765 curlym_do_A: /* execute the A in /A{m,n}B/ */
3766 PL_reginput = locinput;
3767 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3770 case CURLYM_A: /* we've just matched an A */
3771 locinput = st->locinput;
3772 nextchr = UCHARAT(locinput);
3775 /* after first match, determine A's length: u.curlym.alen */
3776 if (ST.count == 1) {
3777 if (PL_reg_match_utf8) {
3779 while (s < PL_reginput) {
3785 ST.alen = PL_reginput - locinput;
3788 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3791 PerlIO_printf(Perl_debug_log,
3792 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3793 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3794 (IV) ST.count, (IV)ST.alen)
3797 locinput = PL_reginput;
3798 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3799 goto curlym_do_A; /* try to match another A */
3800 goto curlym_do_B; /* try to match B */
3802 case CURLYM_A_fail: /* just failed to match an A */
3803 REGCP_UNWIND(ST.cp);
3804 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3807 curlym_do_B: /* execute the B in /A{m,n}B/ */
3808 PL_reginput = locinput;
3809 if (ST.c1 == CHRTEST_UNINIT) {
3810 /* calculate c1 and c2 for possible match of 1st char
3811 * following curly */
3812 ST.c1 = ST.c2 = CHRTEST_VOID;
3813 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3814 regnode *text_node = ST.B;
3815 if (! HAS_TEXT(text_node))
3816 FIND_NEXT_IMPT(text_node);
3817 if (HAS_TEXT(text_node)
3818 && PL_regkind[OP(text_node)] != REF)
3820 ST.c1 = (U8)*STRING(text_node);
3822 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3824 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3825 ? PL_fold_locale[ST.c1]
3832 PerlIO_printf(Perl_debug_log,
3833 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
3834 (int)(REPORT_CODE_OFF+PL_regindent*2),
3837 if (ST.c1 != CHRTEST_VOID
3838 && UCHARAT(PL_reginput) != ST.c1
3839 && UCHARAT(PL_reginput) != ST.c2)
3841 /* simulate B failing */
3842 state_num = CURLYM_B_fail;
3843 goto reenter_switch;
3847 /* mark current A as captured */
3848 I32 paren = ST.me->flags;
3851 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3852 PL_regendp[paren] = PL_reginput - PL_bostr;
3855 PL_regendp[paren] = -1;
3857 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
3860 case CURLYM_B_fail: /* just failed to match a B */
3861 REGCP_UNWIND(ST.cp);
3863 if (ST.count == ARG2(ST.me) /* max */)
3865 goto curlym_do_A; /* try to match a further A */
3867 /* backtrack one A */
3868 if (ST.count == ARG1(ST.me) /* min */)
3871 locinput = HOPc(locinput, -ST.alen);
3872 goto curlym_do_B; /* try to match B */
3875 #define ST st->u.curly
3877 #define CURLY_SETPAREN(paren, success) \
3880 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3881 PL_regendp[paren] = locinput - PL_bostr; \
3884 PL_regendp[paren] = -1; \
3887 case STAR: /* /A*B/ where A is width 1 */
3891 scan = NEXTOPER(scan);
3893 case PLUS: /* /A+B/ where A is width 1 */
3897 scan = NEXTOPER(scan);
3899 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3900 ST.paren = scan->flags; /* Which paren to set */
3901 if (ST.paren > PL_regsize)
3902 PL_regsize = ST.paren;
3903 if (ST.paren > (I32)*PL_reglastparen)
3904 *PL_reglastparen = ST.paren;
3905 ST.min = ARG1(scan); /* min to match */
3906 ST.max = ARG2(scan); /* max to match */
3907 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3909 case CURLY: /* /A{m,n}B/ where A is width 1 */
3911 ST.min = ARG1(scan); /* min to match */
3912 ST.max = ARG2(scan); /* max to match */
3913 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;