5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
80 #define RF_evaled 4 /* Did an EVAL with setting? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
159 const int retval = PL_savestack_ix;
160 #define REGCP_PAREN_ELEMS 4
161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 GET_RE_DEBUG_FLAGS_DECL;
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168 #define REGCP_OTHER_ELEMS 6
169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
170 for (p = PL_regsize; p > parenfloor; p--) {
171 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
174 SSPUSHPTR(PL_reg_start_tmp[p]);
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
183 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
186 SSPUSHINT(*PL_reglastcloseparen);
187 SSPUSHPTR(PL_reginput);
188 #define REGCP_FRAME_ELEMS 2
189 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
197 /* These are needed since we do not localize EVAL nodes: */
198 #define REGCP_SET(cp) \
200 PerlIO_printf(Perl_debug_log, \
201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
202 (IV)PL_savestack_ix)); \
205 #define REGCP_UNWIND(cp) \
207 if (cp != PL_savestack_ix) \
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
210 (IV)(cp), (IV)PL_savestack_ix)); \
214 S_regcppop(pTHX_ const regexp *rex)
220 GET_RE_DEBUG_FLAGS_DECL;
222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
226 input = (char *) SSPOPPTR;
227 *PL_reglastcloseparen = SSPOPINT;
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
231 /* Now restore the parentheses context. */
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
235 U32 paren = (U32)SSPOPINT;
236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
237 PL_regstartp[paren] = SSPOPINT;
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
242 PerlIO_printf(Perl_debug_log,
243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
244 (UV)paren, (IV)PL_regstartp[paren],
245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
246 (IV)PL_regendp[paren],
247 (paren > *PL_reglastparen ? "(no)" : ""));
251 if (*PL_reglastparen + 1 <= rex->nparens) {
252 PerlIO_printf(Perl_debug_log,
253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
270 PL_regstartp[i] = -1;
277 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
280 * pregexec and friends
283 #ifndef PERL_IN_XSUB_RE
285 - pregexec - match a regexp against a string
288 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
290 /* strend: pointer to null at end of string */
291 /* strbeg: real beginning of string */
292 /* minend: end of match must be >=minend after stringarg. */
293 /* nosave: For optimizations. */
296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
297 nosave ? 0 : REXEC_COPY_STR);
302 * Need to implement the following flags for reg_anch:
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
315 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
317 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
320 /* XXXX We assume that strpos is strbeg unless sv. */
322 /* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
329 /* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
332 regtry() should be needed.
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
339 We use a)b)d) and multiline-part of c), and try to find a position in the
340 string which does not contradict any of them.
343 /* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
348 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
352 register I32 start_shift = 0;
353 /* Should be nonnegative! */
354 register I32 end_shift = 0;
359 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
363 const I32 multiline = prog->reganch & PMf_MULTILINE;
365 const char * const i_strpos = strpos;
368 GET_RE_DEBUG_FLAGS_DECL;
370 RX_MATCH_UTF8_set(prog,do_utf8);
372 if (prog->reganch & ROPT_UTF8) {
373 PL_reg_flags |= RF_utf8;
376 debug_start_match(prog, do_utf8, strpos, strend,
377 "Guessing start of match for");
380 /* CHR_DIST() would be more correct here but it makes things slow. */
381 if (prog->minlen > strend - strpos) {
382 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
383 "String too short... [re_intuit_start]\n"));
386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
389 if (!prog->check_utf8 && prog->check_substr)
390 to_utf8_substr(prog);
391 check = prog->check_utf8;
393 if (!prog->check_substr && prog->check_utf8)
394 to_byte_substr(prog);
395 check = prog->check_substr;
397 if (check == &PL_sv_undef) {
398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
399 "Non-utf string cannot match utf check string\n"));
402 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
403 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
404 || ( (prog->reganch & ROPT_ANCH_BOL)
405 && !multiline ) ); /* Check after \n? */
408 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
409 | ROPT_IMPLICIT)) /* not a real BOL */
410 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
412 && (strpos != strbeg)) {
413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
416 if (prog->check_offset_min == prog->check_offset_max &&
417 !(prog->reganch & ROPT_CANY_SEEN)) {
418 /* Substring at constant offset from beg-of-str... */
421 s = HOP3c(strpos, prog->check_offset_min, strend);
423 slen = SvCUR(check); /* >= 1 */
425 if ( strend - s > slen || strend - s < slen - 1
426 || (strend - s == slen && strend[-1] != '\n')) {
427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
430 /* Now should match s[0..slen-2] */
432 if (slen && (*SvPVX_const(check) != *s
434 && memNE(SvPVX_const(check), s, slen)))) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
440 else if (*SvPVX_const(check) != *s
441 || ((slen = SvCUR(check)) > 1
442 && memNE(SvPVX_const(check), s, slen)))
445 goto success_at_start;
448 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
450 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
451 end_shift = prog->minlen - start_shift -
452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
454 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
455 - (SvTAIL(check) != 0);
456 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
458 if (end_shift < eshift)
462 else { /* Can match at random position */
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 /* Should be nonnegative! */
467 end_shift = prog->minlen - start_shift -
468 CHR_SVLEN(check) + (SvTAIL(check) != 0);
471 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
473 Perl_croak(aTHX_ "panic: end_shift");
477 /* Find a possible match in the region s..strend by looking for
478 the "check" substring in the region corrected by start/end_shift. */
479 if (flags & REXEC_SCREAM) {
480 I32 p = -1; /* Internal iterator of scream. */
481 I32 * const pp = data ? data->scream_pos : &p;
483 if (PL_screamfirst[BmRARE(check)] >= 0
484 || ( BmRARE(check) == '\n'
485 && (BmPREVIOUS(check) == SvCUR(check) - 1)
487 s = screaminstr(sv, check,
488 start_shift + (s - strbeg), end_shift, pp, 0);
491 /* we may be pointing at the wrong string */
492 if (s && RX_MATCH_COPIED(prog))
493 s = strbeg + (s - SvPVX_const(sv));
495 *data->scream_olds = s;
497 else if (prog->reganch & ROPT_CANY_SEEN)
498 s = fbm_instr((U8*)(s + start_shift),
499 (U8*)(strend - end_shift),
500 check, multiline ? FBMrf_MULTILINE : 0);
502 s = fbm_instr(HOP3(s, start_shift, strend),
503 HOP3(strend, -end_shift, strbeg),
504 check, multiline ? FBMrf_MULTILINE : 0);
506 /* Update the count-of-usability, remove useless subpatterns,
510 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
511 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
512 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
513 (s ? "Found" : "Did not find"),
514 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
515 ? "anchored" : "floating"),
518 (s ? " at offset " : "...\n") );
526 /* Finish the diagnostic message */
527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
529 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
530 Start with the other substr.
531 XXXX no SCREAM optimization yet - and a very coarse implementation
532 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
533 *always* match. Probably should be marked during compile...
534 Probably it is right to do no SCREAM here...
537 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
538 /* Take into account the "other" substring. */
539 /* XXXX May be hopelessly wrong for UTF... */
542 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
545 char * const last = HOP3c(s, -start_shift, strbeg);
547 char * const saved_s = s;
550 t = s - prog->check_offset_max;
551 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
553 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
558 t = HOP3c(t, prog->anchored_offset, strend);
559 if (t < other_last) /* These positions already checked */
561 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
564 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
565 /* On end-of-str: see comment below. */
566 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
567 if (must == &PL_sv_undef) {
569 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
574 HOP3(HOP3(last1, prog->anchored_offset, strend)
575 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
577 multiline ? FBMrf_MULTILINE : 0
580 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
581 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
582 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
583 (s ? "Found" : "Contradicts"),
584 quoted, RE_SV_TAIL(must));
589 if (last1 >= last2) {
590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
591 ", giving up...\n"));
594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
595 ", trying floating at offset %ld...\n",
596 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
597 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
598 s = HOP3c(last, 1, strend);
602 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
603 (long)(s - i_strpos)));
604 t = HOP3c(s, -prog->anchored_offset, strbeg);
605 other_last = HOP3c(s, 1, strend);
613 else { /* Take into account the floating substring. */
615 char * const saved_s = s;
618 t = HOP3c(s, -start_shift, strbeg);
620 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
621 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
622 last = HOP3c(t, prog->float_max_offset, strend);
623 s = HOP3c(t, prog->float_min_offset, strend);
626 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
627 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
628 /* fbm_instr() takes into account exact value of end-of-str
629 if the check is SvTAIL(ed). Since false positives are OK,
630 and end-of-str is not later than strend we are OK. */
631 if (must == &PL_sv_undef) {
633 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
636 s = fbm_instr((unsigned char*)s,
637 (unsigned char*)last + SvCUR(must)
639 must, multiline ? FBMrf_MULTILINE : 0);
641 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
642 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
643 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
644 (s ? "Found" : "Contradicts"),
645 quoted, RE_SV_TAIL(must));
649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
650 ", giving up...\n"));
653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
654 ", trying anchored starting at offset %ld...\n",
655 (long)(saved_s + 1 - i_strpos)));
657 s = HOP3c(t, 1, strend);
661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
662 (long)(s - i_strpos)));
663 other_last = s; /* Fix this later. --Hugo */
672 t = s - prog->check_offset_max;
673 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
675 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
677 /* Fixed substring is found far enough so that the match
678 cannot start at strpos. */
680 if (ml_anch && t[-1] != '\n') {
681 /* Eventually fbm_*() should handle this, but often
682 anchored_offset is not 0, so this check will not be wasted. */
683 /* XXXX In the code below we prefer to look for "^" even in
684 presence of anchored substrings. And we search even
685 beyond the found float position. These pessimizations
686 are historical artefacts only. */
688 while (t < strend - prog->minlen) {
690 if (t < check_at - prog->check_offset_min) {
691 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
692 /* Since we moved from the found position,
693 we definitely contradict the found anchored
694 substr. Due to the above check we do not
695 contradict "check" substr.
696 Thus we can arrive here only if check substr
697 is float. Redo checking for "other"=="fixed".
700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
701 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
702 goto do_other_anchored;
704 /* We don't contradict the found floating substring. */
705 /* XXXX Why not check for STCLASS? */
707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
708 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
711 /* Position contradicts check-string */
712 /* XXXX probably better to look for check-string
713 than for "\n", so one should lower the limit for t? */
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
715 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
716 other_last = strpos = s = t + 1;
721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
722 PL_colors[0], PL_colors[1]));
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
727 PL_colors[0], PL_colors[1]));
731 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
734 /* The found string does not prohibit matching at strpos,
735 - no optimization of calling REx engine can be performed,
736 unless it was an MBOL and we are not after MBOL,
737 or a future STCLASS check will fail this. */
739 /* Even in this situation we may use MBOL flag if strpos is offset
740 wrt the start of the string. */
741 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
742 && (strpos != strbeg) && strpos[-1] != '\n'
743 /* May be due to an implicit anchor of m{.*foo} */
744 && !(prog->reganch & ROPT_IMPLICIT))
749 DEBUG_EXECUTE_r( if (ml_anch)
750 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
751 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
754 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
756 prog->check_utf8 /* Could be deleted already */
757 && --BmUSEFUL(prog->check_utf8) < 0
758 && (prog->check_utf8 == prog->float_utf8)
760 prog->check_substr /* Could be deleted already */
761 && --BmUSEFUL(prog->check_substr) < 0
762 && (prog->check_substr == prog->float_substr)
765 /* If flags & SOMETHING - do not do it many times on the same match */
766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
767 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
768 if (do_utf8 ? prog->check_substr : prog->check_utf8)
769 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
770 prog->check_substr = prog->check_utf8 = NULL; /* disable */
771 prog->float_substr = prog->float_utf8 = NULL; /* clear */
772 check = NULL; /* abort */
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
777 prog->reganch &= ~RE_USE_INTUIT;
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
785 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
794 const U8* const str = (U8*)STRING(prog->regstclass);
795 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
798 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
800 : (prog->float_substr || prog->float_utf8
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
804 /*if (OP(prog->regstclass) == TRIE)
807 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
810 const char *what = NULL;
812 if (endpos == strend) {
813 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
814 "Could not match STCLASS...\n") );
817 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
818 "This position contradicts STCLASS...\n") );
819 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
821 /* Contradict one of substrings */
822 if (prog->anchored_substr || prog->anchored_utf8) {
823 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
824 DEBUG_EXECUTE_r( what = "anchored" );
826 s = HOP3c(t, 1, strend);
827 if (s + start_shift + end_shift > strend) {
828 /* XXXX Should be taken into account earlier? */
829 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
830 "Could not match STCLASS...\n") );
835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
836 "Looking for %s substr starting at offset %ld...\n",
837 what, (long)(s + start_shift - i_strpos)) );
840 /* Have both, check_string is floating */
841 if (t + start_shift >= check_at) /* Contradicts floating=check */
842 goto retry_floating_check;
843 /* Recheck anchored substring, but not floating... */
847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
848 "Looking for anchored substr starting at offset %ld...\n",
849 (long)(other_last - i_strpos)) );
850 goto do_other_anchored;
852 /* Another way we could have checked stclass at the
853 current position only: */
858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
859 "Looking for /%s^%s/m starting at offset %ld...\n",
860 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
863 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
865 /* Check is floating subtring. */
866 retry_floating_check:
867 t = check_at - start_shift;
868 DEBUG_EXECUTE_r( what = "floating" );
869 goto hop_and_restart;
872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
873 "By STCLASS: moving %ld --> %ld\n",
874 (long)(t - i_strpos), (long)(s - i_strpos))
878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
879 "Does not contradict STCLASS...\n");
884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
885 PL_colors[4], (check ? "Guessed" : "Giving up"),
886 PL_colors[5], (long)(s - i_strpos)) );
889 fail_finish: /* Substring not found */
890 if (prog->check_substr || prog->check_utf8) /* could be removed already */
891 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
894 PL_colors[4], PL_colors[5]));
900 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
901 foldlen, foldbuf, uniflags) STMT_START { \
902 switch (trie_type) { \
903 case trie_utf8_fold: \
905 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
910 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
911 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
912 foldlen -= UNISKIP( uvc ); \
913 uscan = foldbuf + UNISKIP( uvc ); \
917 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
925 charid = trie->charmap[ uvc ]; \
929 if (trie->widecharmap) { \
930 SV** const svpp = hv_fetch(trie->widecharmap, \
931 (char*)&uvc, sizeof(UV), 0); \
933 charid = (U16)SvIV(*svpp); \
938 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
941 ibcmp_utf8(s, NULL, 0, do_utf8, \
942 m, NULL, ln, (bool)UTF)) \
943 && (!reginfo || regtry(reginfo, s)) ) \
946 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
947 uvchr_to_utf8(tmpbuf, c); \
948 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
950 && (f == c1 || f == c2) \
951 && (ln == foldlen || \
952 !ibcmp_utf8((char *) foldbuf, \
953 NULL, foldlen, do_utf8, \
955 NULL, ln, (bool)UTF)) \
956 && (!reginfo || regtry(reginfo, s)) ) \
961 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
965 && (ln == 1 || !(OP(c) == EXACTF \
967 : ibcmp_locale(s, m, ln))) \
968 && (!reginfo || regtry(reginfo, s)) ) \
974 #define REXEC_FBC_UTF8_SCAN(CoDe) \
976 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
982 #define REXEC_FBC_SCAN(CoDe) \
984 while (s < strend) { \
990 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
991 REXEC_FBC_UTF8_SCAN( \
993 if (tmp && (!reginfo || regtry(reginfo, s))) \
1002 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1014 #define REXEC_FBC_TRYIT \
1015 if ((!reginfo || regtry(reginfo, s))) \
1018 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1021 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1024 REXEC_FBC_CLASS_SCAN(CoNd); \
1028 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1029 PL_reg_flags |= RF_tainted; \
1031 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1034 REXEC_FBC_CLASS_SCAN(CoNd); \
1038 #define DUMP_EXEC_POS(li,s,doutf8) \
1039 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1041 /* We know what class REx starts with. Try to find this position... */
1042 /* if reginfo is NULL, its a dryrun */
1043 /* annoyingly all the vars in this routine have different names from their counterparts
1044 in regmatch. /grrr */
1047 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1048 const char *strend, const regmatch_info *reginfo)
1051 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1055 register STRLEN uskip;
1059 register I32 tmp = 1; /* Scratch variable? */
1060 register const bool do_utf8 = PL_reg_match_utf8;
1062 /* We know what class it must start with. */
1066 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1067 !UTF8_IS_INVARIANT((U8)s[0]) ?
1068 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1069 REGINCLASS(prog, c, (U8*)s));
1072 while (s < strend) {
1075 if (REGINCLASS(prog, c, (U8*)s) ||
1076 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1077 /* The assignment of 2 is intentional:
1078 * for the folded sharp s, the skip is 2. */
1079 (skip = SHARP_S_SKIP))) {
1080 if (tmp && (!reginfo || regtry(reginfo, s)))
1093 if (tmp && (!reginfo || regtry(reginfo, s)))
1101 ln = STR_LEN(c); /* length to match in octets/bytes */
1102 lnc = (I32) ln; /* length to match in characters */
1104 STRLEN ulen1, ulen2;
1106 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1107 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1108 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1110 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1111 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1113 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1115 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1118 while (sm < ((U8 *) m + ln)) {
1133 c2 = PL_fold_locale[c1];
1135 e = HOP3c(strend, -((I32)lnc), s);
1137 if (!reginfo && e < s)
1138 e = s; /* Due to minlen logic of intuit() */
1140 /* The idea in the EXACTF* cases is to first find the
1141 * first character of the EXACTF* node and then, if
1142 * necessary, case-insensitively compare the full
1143 * text of the node. The c1 and c2 are the first
1144 * characters (though in Unicode it gets a bit
1145 * more complicated because there are more cases
1146 * than just upper and lower: one needs to use
1147 * the so-called folding case for case-insensitive
1148 * matching (called "loose matching" in Unicode).
1149 * ibcmp_utf8() will do just that. */
1153 U8 tmpbuf [UTF8_MAXBYTES+1];
1154 STRLEN len, foldlen;
1155 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1157 /* Upper and lower of 1st char are equal -
1158 * probably not a "letter". */
1160 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1162 REXEC_FBC_EXACTISH_CHECK(c == c1);
1167 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1170 /* Handle some of the three Greek sigmas cases.
1171 * Note that not all the possible combinations
1172 * are handled here: some of them are handled
1173 * by the standard folding rules, and some of
1174 * them (the character class or ANYOF cases)
1175 * are handled during compiletime in
1176 * regexec.c:S_regclass(). */
1177 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1178 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1179 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1181 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1187 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1189 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1193 PL_reg_flags |= RF_tainted;
1200 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1201 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1203 tmp = ((OP(c) == BOUND ?
1204 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1205 LOAD_UTF8_CHARCLASS_ALNUM();
1206 REXEC_FBC_UTF8_SCAN(
1207 if (tmp == !(OP(c) == BOUND ?
1208 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1209 isALNUM_LC_utf8((U8*)s)))
1217 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1218 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1221 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1227 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1231 PL_reg_flags |= RF_tainted;
1238 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1239 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1241 tmp = ((OP(c) == NBOUND ?
1242 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1243 LOAD_UTF8_CHARCLASS_ALNUM();
1244 REXEC_FBC_UTF8_SCAN(
1245 if (tmp == !(OP(c) == NBOUND ?
1246 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1247 isALNUM_LC_utf8((U8*)s)))
1249 else REXEC_FBC_TRYIT;
1253 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1254 tmp = ((OP(c) == NBOUND ?
1255 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1258 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1260 else REXEC_FBC_TRYIT;
1263 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1267 REXEC_FBC_CSCAN_PRELOAD(
1268 LOAD_UTF8_CHARCLASS_ALNUM(),
1269 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1273 REXEC_FBC_CSCAN_TAINT(
1274 isALNUM_LC_utf8((U8*)s),
1278 REXEC_FBC_CSCAN_PRELOAD(
1279 LOAD_UTF8_CHARCLASS_ALNUM(),
1280 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1284 REXEC_FBC_CSCAN_TAINT(
1285 !isALNUM_LC_utf8((U8*)s),
1289 REXEC_FBC_CSCAN_PRELOAD(
1290 LOAD_UTF8_CHARCLASS_SPACE(),
1291 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1295 REXEC_FBC_CSCAN_TAINT(
1296 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1300 REXEC_FBC_CSCAN_PRELOAD(
1301 LOAD_UTF8_CHARCLASS_SPACE(),
1302 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1306 REXEC_FBC_CSCAN_TAINT(
1307 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1311 REXEC_FBC_CSCAN_PRELOAD(
1312 LOAD_UTF8_CHARCLASS_DIGIT(),
1313 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1317 REXEC_FBC_CSCAN_TAINT(
1318 isDIGIT_LC_utf8((U8*)s),
1322 REXEC_FBC_CSCAN_PRELOAD(
1323 LOAD_UTF8_CHARCLASS_DIGIT(),
1324 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1328 REXEC_FBC_CSCAN_TAINT(
1329 !isDIGIT_LC_utf8((U8*)s),
1335 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1336 trie_type = do_utf8 ?
1337 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1339 /* what trie are we using right now */
1341 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1342 reg_trie_data *trie=aho->trie;
1344 const char *last_start = strend - trie->minlen;
1346 const char *real_start = s;
1348 STRLEN maxlen = trie->maxlen;
1350 U8 **points; /* map of where we were in the input string
1351 when reading a given char. For ASCII this
1352 is unnecessary overhead as the relationship
1353 is always 1:1, but for unicode, especially
1354 case folded unicode this is not true. */
1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1359 GET_RE_DEBUG_FLAGS_DECL;
1361 /* We can't just allocate points here. We need to wrap it in
1362 * an SV so it gets freed properly if there is a croak while
1363 * running the match */
1366 sv_points=newSV(maxlen * sizeof(U8 *));
1367 SvCUR_set(sv_points,
1368 maxlen * sizeof(U8 *));
1369 SvPOK_on(sv_points);
1370 sv_2mortal(sv_points);
1371 points=(U8**)SvPV_nolen(sv_points );
1372 if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) {
1374 bitmap=(U8*)trie->bitmap;
1376 bitmap=(U8*)ANYOF_BITMAP(c);
1378 /* this is the Aho-Corasick algorithm modified a touch
1379 to include special handling for long "unknown char"
1380 sequences. The basic idea being that we use AC as long
1381 as we are dealing with a possible matching char, when
1382 we encounter an unknown char (and we have not encountered
1383 an accepting state) we scan forward until we find a legal
1385 AC matching is basically that of trie matching, except
1386 that when we encounter a failing transition, we fall back
1387 to the current states "fail state", and try the current char
1388 again, a process we repeat until we reach the root state,
1389 state 1, or a legal transition. If we fail on the root state
1390 then we can either terminate if we have reached an accepting
1391 state previously, or restart the entire process from the beginning
1395 while (s <= last_start) {
1396 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1404 U8 *uscan = (U8*)NULL;
1405 U8 *leftmost = NULL;
1407 U32 accepted_word= 0;
1411 while ( state && uc <= (U8*)strend ) {
1413 U32 word = aho->states[ state ].wordnum;
1415 if( state==1 && bitmap ) {
1416 DEBUG_TRIE_EXECUTE_r(
1417 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1418 dump_exec_pos( (char *)uc, c, strend, real_start,
1419 (char*)uc, do_utf8 );
1420 PerlIO_printf( Perl_debug_log,
1421 " Scanning for legal start char...\n");
1424 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1428 if (uc >(U8*)last_start) break;
1432 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1433 if (!leftmost || lpos < leftmost) {
1434 DEBUG_r(accepted_word=word);
1440 points[pointpos++ % maxlen]= uc;
1441 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1442 uvc, charid, foldlen, foldbuf, uniflags);
1443 DEBUG_TRIE_EXECUTE_r({
1444 dump_exec_pos( (char *)uc, c, strend, real_start,
1446 PerlIO_printf(Perl_debug_log,
1447 " Charid:%3u CP:%4"UVxf" ",
1453 word = aho->states[ state ].wordnum;
1455 base = aho->states[ state ].trans.base;
1457 DEBUG_TRIE_EXECUTE_r({
1459 dump_exec_pos( (char *)uc, c, strend, real_start,
1461 PerlIO_printf( Perl_debug_log,
1462 "%sState: %4"UVxf", word=%"UVxf,
1463 failed ? " Fail transition to " : "",
1464 (UV)state, (UV)word);
1469 (base + charid > trie->uniquecharcount )
1470 && (base + charid - 1 - trie->uniquecharcount
1472 && trie->trans[base + charid - 1 -
1473 trie->uniquecharcount].check == state
1474 && (tmp=trie->trans[base + charid - 1 -
1475 trie->uniquecharcount ].next))
1477 DEBUG_TRIE_EXECUTE_r(
1478 PerlIO_printf( Perl_debug_log," - legal\n"));
1483 DEBUG_TRIE_EXECUTE_r(
1484 PerlIO_printf( Perl_debug_log," - fail\n"));
1486 state = aho->fail[state];
1490 /* we must be accepting here */
1491 DEBUG_TRIE_EXECUTE_r(
1492 PerlIO_printf( Perl_debug_log," - accepting\n"));
1501 if (!state) state = 1;
1504 if ( aho->states[ state ].wordnum ) {
1505 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1506 if (!leftmost || lpos < leftmost) {
1507 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1512 s = (char*)leftmost;
1513 DEBUG_TRIE_EXECUTE_r({
1515 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1516 (UV)accepted_word, s - real_start
1519 if (!reginfo || regtry(reginfo, s)) {
1525 DEBUG_TRIE_EXECUTE_r({
1526 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1529 DEBUG_TRIE_EXECUTE_r(
1530 PerlIO_printf( Perl_debug_log,"No match.\n"));
1539 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1548 - regexec_flags - match a regexp against a string
1551 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1552 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1553 /* strend: pointer to null at end of string */
1554 /* strbeg: real beginning of string */
1555 /* minend: end of match must be >=minend after stringarg. */
1556 /* data: May be used for some additional optimizations. */
1557 /* nosave: For optimizations. */
1561 register regnode *c;
1562 register char *startpos = stringarg;
1563 I32 minlen; /* must match at least this many chars */
1564 I32 dontbother = 0; /* how many characters not to try at end */
1565 I32 end_shift = 0; /* Same for the end. */ /* CC */
1566 I32 scream_pos = -1; /* Internal iterator of scream. */
1567 char *scream_olds = NULL;
1568 SV* const oreplsv = GvSV(PL_replgv);
1569 const bool do_utf8 = DO_UTF8(sv);
1572 regmatch_info reginfo; /* create some info to pass to regtry etc */
1574 GET_RE_DEBUG_FLAGS_DECL;
1576 PERL_UNUSED_ARG(data);
1578 /* Be paranoid... */
1579 if (prog == NULL || startpos == NULL) {
1580 Perl_croak(aTHX_ "NULL regexp parameter");
1584 multiline = prog->reganch & PMf_MULTILINE;
1585 reginfo.prog = prog;
1587 RX_MATCH_UTF8_set(prog, do_utf8);
1589 minlen = prog->minlen;
1590 if (strend - startpos < minlen) {
1591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1592 "String too short [regexec_flags]...\n"));
1596 /* Check validity of program. */
1597 if (UCHARAT(prog->program) != REG_MAGIC) {
1598 Perl_croak(aTHX_ "corrupted regexp program");
1602 PL_reg_eval_set = 0;
1605 if (prog->reganch & ROPT_UTF8)
1606 PL_reg_flags |= RF_utf8;
1608 /* Mark beginning of line for ^ and lookbehind. */
1609 reginfo.bol = startpos; /* XXX not used ??? */
1613 /* Mark end of line for $ (and such) */
1616 /* see how far we have to get to not match where we matched before */
1617 reginfo.till = startpos+minend;
1619 /* If there is a "must appear" string, look for it. */
1622 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1625 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1626 reginfo.ganch = startpos;
1627 else if (sv && SvTYPE(sv) >= SVt_PVMG
1629 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1630 && mg->mg_len >= 0) {
1631 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1632 if (prog->reganch & ROPT_ANCH_GPOS) {
1633 if (s > reginfo.ganch)
1638 else /* pos() not defined */
1639 reginfo.ganch = strbeg;
1642 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1643 re_scream_pos_data d;
1645 d.scream_olds = &scream_olds;
1646 d.scream_pos = &scream_pos;
1647 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1650 goto phooey; /* not present */
1655 debug_start_match(prog, do_utf8, startpos, strend,
1659 /* Simplest case: anchored match need be tried only once. */
1660 /* [unless only anchor is BOL and multiline is set] */
1661 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1662 if (s == startpos && regtry(®info, startpos))
1664 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1665 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1670 dontbother = minlen - 1;
1671 end = HOP3c(strend, -dontbother, strbeg) - 1;
1672 /* for multiline we only have to try after newlines */
1673 if (prog->check_substr || prog->check_utf8) {
1677 if (regtry(®info, s))
1682 if (prog->reganch & RE_USE_INTUIT) {
1683 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1694 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1695 if (regtry(®info, s))
1702 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1703 if (regtry(®info, reginfo.ganch))
1708 /* Messy cases: unanchored match. */
1709 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1710 /* we have /x+whatever/ */
1711 /* it must be a one character string (XXXX Except UTF?) */
1716 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1717 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1718 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1723 DEBUG_EXECUTE_r( did_match = 1 );
1724 if (regtry(®info, s)) goto got_it;
1726 while (s < strend && *s == ch)
1734 DEBUG_EXECUTE_r( did_match = 1 );
1735 if (regtry(®info, s)) goto got_it;
1737 while (s < strend && *s == ch)
1742 DEBUG_EXECUTE_r(if (!did_match)
1743 PerlIO_printf(Perl_debug_log,
1744 "Did not find anchored character...\n")
1747 else if (prog->anchored_substr != NULL
1748 || prog->anchored_utf8 != NULL
1749 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1750 && prog->float_max_offset < strend - s)) {
1755 char *last1; /* Last position checked before */
1759 if (prog->anchored_substr || prog->anchored_utf8) {
1760 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1761 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1762 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1763 back_max = back_min = prog->anchored_offset;
1765 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1766 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1767 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1768 back_max = prog->float_max_offset;
1769 back_min = prog->float_min_offset;
1771 if (must == &PL_sv_undef)
1772 /* could not downgrade utf8 check substring, so must fail */
1775 last = HOP3c(strend, /* Cannot start after this */
1776 -(I32)(CHR_SVLEN(must)
1777 - (SvTAIL(must) != 0) + back_min), strbeg);
1780 last1 = HOPc(s, -1);
1782 last1 = s - 1; /* bogus */
1784 /* XXXX check_substr already used to find "s", can optimize if
1785 check_substr==must. */
1787 dontbother = end_shift;
1788 strend = HOPc(strend, -dontbother);
1789 while ( (s <= last) &&
1790 ((flags & REXEC_SCREAM)
1791 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1792 end_shift, &scream_pos, 0))
1793 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1794 (unsigned char*)strend, must,
1795 multiline ? FBMrf_MULTILINE : 0))) ) {
1796 /* we may be pointing at the wrong string */
1797 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1798 s = strbeg + (s - SvPVX_const(sv));
1799 DEBUG_EXECUTE_r( did_match = 1 );
1800 if (HOPc(s, -back_max) > last1) {
1801 last1 = HOPc(s, -back_min);
1802 s = HOPc(s, -back_max);
1805 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1807 last1 = HOPc(s, -back_min);
1811 while (s <= last1) {
1812 if (regtry(®info, s))
1818 while (s <= last1) {
1819 if (regtry(®info, s))
1825 DEBUG_EXECUTE_r(if (!did_match) {
1826 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1827 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1828 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1829 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1830 ? "anchored" : "floating"),
1831 quoted, RE_SV_TAIL(must));
1835 else if ( (c = prog->regstclass) ) {
1837 const OPCODE op = OP(prog->regstclass);
1838 /* don't bother with what can't match */
1839 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1840 strend = HOPc(strend, -(minlen - 1));
1843 SV * const prop = sv_newmortal();
1844 regprop(prog, prop, c);
1846 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1848 PerlIO_printf(Perl_debug_log,
1849 "Matching stclass %.*s against %s (%d chars)\n",
1850 (int)SvCUR(prop), SvPVX_const(prop),
1851 quoted, (int)(strend - s));
1854 if (find_byclass(prog, c, s, strend, ®info))
1856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1860 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1865 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1869 if (flags & REXEC_SCREAM) {
1870 last = screaminstr(sv, float_real, s - strbeg,
1871 end_shift, &scream_pos, 1); /* last one */
1873 last = scream_olds; /* Only one occurrence. */
1874 /* we may be pointing at the wrong string */
1875 else if (RX_MATCH_COPIED(prog))
1876 s = strbeg + (s - SvPVX_const(sv));
1880 const char * const little = SvPV_const(float_real, len);
1882 if (SvTAIL(float_real)) {
1883 if (memEQ(strend - len + 1, little, len - 1))
1884 last = strend - len + 1;
1885 else if (!multiline)
1886 last = memEQ(strend - len, little, len)
1887 ? strend - len : NULL;
1893 last = rninstr(s, strend, little, little + len);
1895 last = strend; /* matching "$" */
1899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1900 "%sCan't trim the tail, match fails (should not happen)%s\n",
1901 PL_colors[4], PL_colors[5]));
1902 goto phooey; /* Should not happen! */
1904 dontbother = strend - last + prog->float_min_offset;
1906 if (minlen && (dontbother < minlen))
1907 dontbother = minlen - 1;
1908 strend -= dontbother; /* this one's always in bytes! */
1909 /* We don't know much -- general case. */
1912 if (regtry(®info, s))
1921 if (regtry(®info, s))
1923 } while (s++ < strend);
1931 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1933 if (PL_reg_eval_set) {
1934 /* Preserve the current value of $^R */
1935 if (oreplsv != GvSV(PL_replgv))
1936 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1937 restored, the value remains
1939 restore_pos(aTHX_ prog);
1942 /* make sure $`, $&, $', and $digit will work later */
1943 if ( !(flags & REXEC_NOT_FIRST) ) {
1944 RX_MATCH_COPY_FREE(prog);
1945 if (flags & REXEC_COPY_STR) {
1946 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
1947 #ifdef PERL_OLD_COPY_ON_WRITE
1949 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1951 PerlIO_printf(Perl_debug_log,
1952 "Copy on write: regexp capture, type %d\n",
1955 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
1956 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
1957 assert (SvPOKp(prog->saved_copy));
1961 RX_MATCH_COPIED_on(prog);
1962 s = savepvn(strbeg, i);
1968 prog->subbeg = strbeg;
1969 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1977 PL_colors[4], PL_colors[5]));
1978 if (PL_reg_eval_set)
1979 restore_pos(aTHX_ prog);
1984 - regtry - try match at specific point
1986 STATIC I32 /* 0 failure, 1 success */
1987 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
1993 regexp *prog = reginfo->prog;
1994 GET_RE_DEBUG_FLAGS_DECL;
1997 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1999 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2002 PL_reg_eval_set = RS_init;
2003 DEBUG_EXECUTE_r(DEBUG_s(
2004 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2005 (IV)(PL_stack_sp - PL_stack_base));
2007 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2008 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2009 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2011 /* Apparently this is not needed, judging by wantarray. */
2012 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2013 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2016 /* Make $_ available to executed code. */
2017 if (reginfo->sv != DEFSV) {
2019 DEFSV = reginfo->sv;
2022 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2023 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2024 /* prepare for quick setting of pos */
2025 #ifdef PERL_OLD_COPY_ON_WRITE
2027 sv_force_normal_flags(sv, 0);
2029 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2030 &PL_vtbl_mglob, NULL, 0);
2034 PL_reg_oldpos = mg->mg_len;
2035 SAVEDESTRUCTOR_X(restore_pos, prog);
2037 if (!PL_reg_curpm) {
2038 Newxz(PL_reg_curpm, 1, PMOP);
2041 SV* const repointer = newSViv(0);
2042 /* so we know which PL_regex_padav element is PL_reg_curpm */
2043 SvFLAGS(repointer) |= SVf_BREAK;
2044 av_push(PL_regex_padav,repointer);
2045 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2046 PL_regex_pad = AvARRAY(PL_regex_padav);
2050 PM_SETRE(PL_reg_curpm, prog);
2051 PL_reg_oldcurpm = PL_curpm;
2052 PL_curpm = PL_reg_curpm;
2053 if (RX_MATCH_COPIED(prog)) {
2054 /* Here is a serious problem: we cannot rewrite subbeg,
2055 since it may be needed if this match fails. Thus
2056 $` inside (?{}) could fail... */
2057 PL_reg_oldsaved = prog->subbeg;
2058 PL_reg_oldsavedlen = prog->sublen;
2059 #ifdef PERL_OLD_COPY_ON_WRITE
2060 PL_nrs = prog->saved_copy;
2062 RX_MATCH_COPIED_off(prog);
2065 PL_reg_oldsaved = NULL;
2066 prog->subbeg = PL_bostr;
2067 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2069 prog->startp[0] = startpos - PL_bostr;
2070 PL_reginput = startpos;
2071 PL_regstartp = prog->startp;
2072 PL_regendp = prog->endp;
2073 PL_reglastparen = &prog->lastparen;
2074 PL_reglastcloseparen = &prog->lastcloseparen;
2075 prog->lastparen = 0;
2076 prog->lastcloseparen = 0;
2078 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2079 if (PL_reg_start_tmpl <= prog->nparens) {
2080 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2081 if(PL_reg_start_tmp)
2082 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2084 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2087 /* XXXX What this code is doing here?!!! There should be no need
2088 to do this again and again, PL_reglastparen should take care of
2091 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2092 * Actually, the code in regcppop() (which Ilya may be meaning by
2093 * PL_reglastparen), is not needed at all by the test suite
2094 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2095 * enough, for building DynaLoader, or otherwise this
2096 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2097 * will happen. Meanwhile, this code *is* needed for the
2098 * above-mentioned test suite tests to succeed. The common theme
2099 * on those tests seems to be returning null fields from matches.
2104 if (prog->nparens) {
2106 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2113 if (regmatch(reginfo, prog->program + 1)) {
2114 prog->endp[0] = PL_reginput - PL_bostr;
2117 REGCP_UNWIND(lastcp);
2122 #define sayYES goto yes
2123 #define sayNO goto no
2124 #define sayNO_ANYOF goto no_anyof
2125 #define sayYES_FINAL goto yes_final
2126 #define sayNO_FINAL goto no_final
2127 #define sayNO_SILENT goto do_no
2128 #define saySAME(x) if (x) goto yes; else goto no
2130 #define CACHEsayNO STMT_START { \
2131 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2132 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2133 (1<<st->u.whilem.cache_bit); \
2138 /* this is used to determine how far from the left messages like
2139 'failed...' are printed. Currently 29 makes these messages line
2140 up with the opcode they refer to. Earlier perls used 25 which
2141 left these messages outdented making reviewing a debug output
2144 #define REPORT_CODE_OFF 29
2147 /* Make sure there is a test for this +1 options in re_tests */
2148 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2150 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2151 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2153 #define SLAB_FIRST(s) (&(s)->states[0])
2154 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2156 /* grab a new slab and return the first slot in it */
2158 STATIC regmatch_state *
2161 #if PERL_VERSION < 9
2164 regmatch_slab *s = PL_regmatch_slab->next;
2166 Newx(s, 1, regmatch_slab);
2167 s->prev = PL_regmatch_slab;
2169 PL_regmatch_slab->next = s;
2171 PL_regmatch_slab = s;
2172 return SLAB_FIRST(s);
2175 /* simulate a recursive call to regmatch */
2177 #define REGMATCH(ns, where) \
2180 st->resume_state = resume_##where; \
2181 goto start_recurse; \
2182 resume_point_##where:
2184 /* push a new state then goto it */
2186 #define PUSH_STATE_GOTO(state, node) \
2188 st->resume_state = state; \
2191 /* push a new state with success backtracking, then goto it */
2193 #define PUSH_YES_STATE_GOTO(state, node) \
2195 st->resume_state = state; \
2196 goto push_yes_state;
2201 - regmatch - main matching routine
2203 * Conceptually the strategy is simple: check to see whether the current
2204 * node matches, call self recursively to see whether the rest matches,
2205 * and then act accordingly. In practice we make some effort to avoid
2206 * recursion, in particular by going through "ordinary" nodes (that don't
2207 * need to know whether the rest of the match failed) by a loop instead of
2210 /* [lwall] I've hoisted the register declarations to the outer block in order to
2211 * maybe save a little bit of pushing and popping on the stack. It also takes
2212 * advantage of machines that use a register save mask on subroutine entry.
2214 * This function used to be heavily recursive, but since this had the
2215 * effect of blowing the CPU stack on complex regexes, it has been
2216 * restructured to be iterative, and to save state onto the heap rather
2217 * than the stack. Essentially whereever regmatch() used to be called, it
2218 * pushes the current state, notes where to return, then jumps back into
2221 * Originally the structure of this function used to look something like
2226 while (scan != NULL) {
2227 a++; // do stuff with a and b
2233 if (regmatch(...)) // recurse
2243 * Now it looks something like this:
2251 regmatch_state *st = new();
2253 st->a++; // do stuff with a and b
2255 while (scan != NULL) {
2263 st->resume_state = resume_FOO;
2264 goto start_recurse; // recurse
2273 st = new(); push a new state
2274 st->a = 1; st->b = 2;
2281 switch (resume_state) {
2283 goto resume_point_FOO;
2290 * WARNING: this means that any line in this function that contains a
2291 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2292 * regmatch() using gotos instead. Thus the values of any local variables
2293 * not saved in the regmatch_state structure will have been lost when
2294 * execution resumes on the next line .
2296 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2297 * PL_regmatch_state always points to the currently active state, and
2298 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2299 * The first time regmatch is called, the first slab is allocated, and is
2300 * never freed until interpreter desctruction. When the slab is full,
2301 * a new one is allocated chained to the end. At exit from regmatch, slabs
2302 * allocated since entry are freed.
2305 /* *** every FOO_fail should = FOO+1 */
2306 #define TRIE_next (REGNODE_MAX+1)
2307 #define TRIE_next_fail (REGNODE_MAX+2)
2308 #define EVAL_A (REGNODE_MAX+3)
2309 #define EVAL_A_fail (REGNODE_MAX+4)
2310 #define resume_CURLYX (REGNODE_MAX+5)
2311 #define resume_WHILEM1 (REGNODE_MAX+6)
2312 #define resume_WHILEM2 (REGNODE_MAX+7)
2313 #define resume_WHILEM3 (REGNODE_MAX+8)
2314 #define resume_WHILEM4 (REGNODE_MAX+9)
2315 #define resume_WHILEM5 (REGNODE_MAX+10)
2316 #define resume_WHILEM6 (REGNODE_MAX+11)
2317 #define BRANCH_next (REGNODE_MAX+12)
2318 #define BRANCH_next_fail (REGNODE_MAX+13)
2319 #define CURLYM_A (REGNODE_MAX+14)
2320 #define CURLYM_A_fail (REGNODE_MAX+15)
2321 #define CURLYM_B (REGNODE_MAX+16)
2322 #define CURLYM_B_fail (REGNODE_MAX+17)
2323 #define IFMATCH_A (REGNODE_MAX+18)
2324 #define IFMATCH_A_fail (REGNODE_MAX+19)
2325 #define CURLY_B_min_known (REGNODE_MAX+20)
2326 #define CURLY_B_min_known_fail (REGNODE_MAX+21)
2327 #define CURLY_B_min (REGNODE_MAX+22)
2328 #define CURLY_B_min_fail (REGNODE_MAX+23)
2329 #define CURLY_B_max (REGNODE_MAX+24)
2330 #define CURLY_B_max_fail (REGNODE_MAX+25)
2333 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2337 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2338 const char *start, const char *end, const char *blurb)
2340 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2344 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2345 prog->precomp, prog->prelen, 60);
2347 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2348 start, end - start, 60);
2350 PerlIO_printf(Perl_debug_log,
2351 "%s%s REx%s %s against %s\n",
2352 PL_colors[4], blurb, PL_colors[5], s0, s1);
2354 if (do_utf8||utf8_pat)
2355 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2356 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2357 "pattern and string"
2363 S_dump_exec_pos(pTHX_ const char *locinput,
2364 const regnode *scan,
2365 const char *loc_regeol,
2366 const char *loc_bostr,
2367 const char *loc_reg_starttry,
2370 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2371 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2372 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2373 /* The part of the string before starttry has one color
2374 (pref0_len chars), between starttry and current
2375 position another one (pref_len - pref0_len chars),
2376 after the current position the third one.
2377 We assume that pref0_len <= pref_len, otherwise we
2378 decrease pref0_len. */
2379 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2380 ? (5 + taill) - l : locinput - loc_bostr;
2383 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2385 pref0_len = pref_len - (locinput - loc_reg_starttry);
2386 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2387 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2388 ? (5 + taill) - pref_len : loc_regeol - locinput);
2389 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2393 if (pref0_len > pref_len)
2394 pref0_len = pref_len;
2396 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2398 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2399 (locinput - pref_len),pref0_len, pref0_len, 4, 5);
2401 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2402 (locinput - pref_len + pref0_len),
2403 pref_len - pref0_len, pref_len - pref0_len, 2, 3);
2405 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2406 locinput, loc_regeol - locinput, l, 0, 1);
2408 PerlIO_printf(Perl_debug_log,
2409 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2410 (IV)(locinput - loc_bostr),
2413 (docolor ? "" : "> <"),
2415 15 - l - pref_len + 1,
2422 STATIC I32 /* 0 failure, 1 success */
2423 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2425 #if PERL_VERSION < 9
2429 register const bool do_utf8 = PL_reg_match_utf8;
2430 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2432 regexp *rex = reginfo->prog;
2434 regmatch_slab *orig_slab;
2435 regmatch_state *orig_state;
2437 /* the current state. This is a cached copy of PL_regmatch_state */
2438 register regmatch_state *st;
2440 /* cache heavy used fields of st in registers */
2441 register regnode *scan;
2442 register regnode *next;
2443 register I32 n = 0; /* initialize to shut up compiler warning */
2444 register char *locinput = PL_reginput;
2446 /* these variables are NOT saved during a recusive RFEGMATCH: */
2447 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2448 bool result = 0; /* return value of S_regmatch */
2449 int depth = 0; /* depth of recursion */
2450 regmatch_state *yes_state = NULL; /* state to pop to on success of
2457 GET_RE_DEBUG_FLAGS_DECL;
2461 /* on first ever call to regmatch, allocate first slab */
2462 if (!PL_regmatch_slab) {
2463 Newx(PL_regmatch_slab, 1, regmatch_slab);
2464 PL_regmatch_slab->prev = NULL;
2465 PL_regmatch_slab->next = NULL;
2466 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2469 /* remember current high-water mark for exit */
2470 /* XXX this should be done with SAVE* instead */
2471 orig_slab = PL_regmatch_slab;
2472 orig_state = PL_regmatch_state;
2474 /* grab next free state slot */
2475 st = ++PL_regmatch_state;
2476 if (st > SLAB_LAST(PL_regmatch_slab))
2477 st = PL_regmatch_state = S_push_slab(aTHX);
2484 /* Note that nextchr is a byte even in UTF */
2485 nextchr = UCHARAT(locinput);
2487 while (scan != NULL) {
2490 SV * const prop = sv_newmortal();
2491 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2492 regprop(rex, prop, scan);
2494 PerlIO_printf(Perl_debug_log,
2495 "%3"IVdf":%*s%s(%"IVdf")\n",
2496 (IV)(scan - rex->program), PL_regindent*2, "",
2498 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2501 next = scan + NEXT_OFF(scan);
2504 state_num = OP(scan);
2507 switch (state_num) {
2509 if (locinput == PL_bostr)
2511 /* reginfo->till = reginfo->bol; */
2516 if (locinput == PL_bostr ||
2517 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2523 if (locinput == PL_bostr)
2527 if (locinput == reginfo->ganch)
2533 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2538 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2540 if (PL_regeol - locinput > 1)
2544 if (PL_regeol != locinput)
2548 if (!nextchr && locinput >= PL_regeol)
2551 locinput += PL_utf8skip[nextchr];
2552 if (locinput > PL_regeol)
2554 nextchr = UCHARAT(locinput);
2557 nextchr = UCHARAT(++locinput);
2560 if (!nextchr && locinput >= PL_regeol)
2562 nextchr = UCHARAT(++locinput);
2565 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2568 locinput += PL_utf8skip[nextchr];
2569 if (locinput > PL_regeol)
2571 nextchr = UCHARAT(locinput);
2574 nextchr = UCHARAT(++locinput);
2578 #define ST st->u.trie
2580 /* In this case the charclass data is available inline so
2581 we can fail fast without a lot of extra overhead.
2583 if (scan->flags == EXACT || !do_utf8) {
2584 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2586 PerlIO_printf(Perl_debug_log,
2587 "%*s %sfailed to match trie start class...%s\n",
2588 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2597 /* what type of TRIE am I? (utf8 makes this contextual) */
2598 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2599 trie_type = do_utf8 ?
2600 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2603 /* what trie are we using right now */
2604 reg_trie_data * const trie
2605 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2606 U32 state = trie->startstate;
2608 if (trie->bitmap && trie_type != trie_utf8_fold &&
2609 !TRIE_BITMAP_TEST(trie,*locinput)
2611 if (trie->states[ state ].wordnum) {
2613 PerlIO_printf(Perl_debug_log,
2614 "%*s %smatched empty string...%s\n",
2615 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2620 PerlIO_printf(Perl_debug_log,
2621 "%*s %sfailed to match trie start class...%s\n",
2622 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2629 U8 *uc = ( U8* )locinput;
2633 U8 *uscan = (U8*)NULL;
2635 SV *sv_accept_buff = NULL;
2636 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2638 ST.accepted = 0; /* how many accepting states we have seen */
2640 ST.jump = trie->jump;
2649 traverse the TRIE keeping track of all accepting states
2650 we transition through until we get to a failing node.
2653 while ( state && uc <= (U8*)PL_regeol ) {
2654 U32 base = trie->states[ state ].trans.base;
2657 /* We use charid to hold the wordnum as we don't use it
2658 for charid until after we have done the wordnum logic.
2659 We define an alias just so that the wordnum logic reads
2662 #define got_wordnum charid
2663 got_wordnum = trie->states[ state ].wordnum;
2665 if ( got_wordnum ) {
2666 if ( ! ST.accepted ) {
2669 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2670 sv_accept_buff=newSV(bufflen *
2671 sizeof(reg_trie_accepted) - 1);
2672 SvCUR_set(sv_accept_buff, 0);
2673 SvPOK_on(sv_accept_buff);
2674 sv_2mortal(sv_accept_buff);
2677 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2680 if (ST.accepted >= bufflen) {
2682 ST.accept_buff =(reg_trie_accepted*)
2683 SvGROW(sv_accept_buff,
2684 bufflen * sizeof(reg_trie_accepted));
2686 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2687 + sizeof(reg_trie_accepted));
2690 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2691 ST.accept_buff[ST.accepted].endpos = uc;
2693 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2697 DEBUG_TRIE_EXECUTE_r({
2698 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2699 PerlIO_printf( Perl_debug_log,
2700 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2701 2+PL_regindent * 2, "", PL_colors[4],
2702 (UV)state, (UV)ST.accepted );
2706 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2707 uvc, charid, foldlen, foldbuf, uniflags);
2710 (base + charid > trie->uniquecharcount )
2711 && (base + charid - 1 - trie->uniquecharcount
2713 && trie->trans[base + charid - 1 -
2714 trie->uniquecharcount].check == state)
2716 state = trie->trans[base + charid - 1 -
2717 trie->uniquecharcount ].next;
2728 DEBUG_TRIE_EXECUTE_r(
2729 PerlIO_printf( Perl_debug_log,
2730 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2731 charid, uvc, (UV)state, PL_colors[5] );
2738 PerlIO_printf( Perl_debug_log,
2739 "%*s %sgot %"IVdf" possible matches%s\n",
2740 REPORT_CODE_OFF + PL_regindent * 2, "",
2741 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2747 case TRIE_next_fail: /* we failed - try next alterative */
2749 if ( ST.accepted == 1 ) {
2750 /* only one choice left - just continue */
2752 reg_trie_data * const trie
2753 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2754 SV ** const tmp = RX_DEBUG(reginfo->prog)
2755 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2757 PerlIO_printf( Perl_debug_log,
2758 "%*s %sonly one match left: #%d <%s>%s\n",
2759 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2760 ST.accept_buff[ 0 ].wordnum,
2761 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2764 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2765 /* in this case we free tmps/leave before we call regmatch
2766 as we wont be using accept_buff again. */
2769 locinput = PL_reginput;
2770 nextchr = UCHARAT(locinput);
2775 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2777 continue; /* execute rest of RE */
2780 if (!ST.accepted-- ) {
2787 There are at least two accepting states left. Presumably
2788 the number of accepting states is going to be low,
2789 typically two. So we simply scan through to find the one
2790 with lowest wordnum. Once we find it, we swap the last
2791 state into its place and decrement the size. We then try to
2792 match the rest of the pattern at the point where the word
2793 ends. If we succeed, control just continues along the
2794 regex; if we fail we return here to try the next accepting
2801 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2802 DEBUG_TRIE_EXECUTE_r(
2803 PerlIO_printf( Perl_debug_log,
2804 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2805 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2806 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2807 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2810 if (ST.accept_buff[cur].wordnum <
2811 ST.accept_buff[best].wordnum)
2816 reg_trie_data * const trie
2817 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2818 SV ** const tmp = RX_DEBUG(reginfo->prog)
2819 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2821 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2822 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2823 ST.accept_buff[best].wordnum,
2824 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2828 if ( best<ST.accepted ) {
2829 reg_trie_accepted tmp = ST.accept_buff[ best ];
2830 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2831 ST.accept_buff[ ST.accepted ] = tmp;
2834 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2836 PUSH_STATE_GOTO(TRIE_next, ST.B);
2839 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2849 char *s = STRING(scan);
2850 st->ln = STR_LEN(scan);
2851 if (do_utf8 != UTF) {
2852 /* The target and the pattern have differing utf8ness. */
2854 const char * const e = s + st->ln;
2857 /* The target is utf8, the pattern is not utf8. */
2862 if (NATIVE_TO_UNI(*(U8*)s) !=
2863 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2871 /* The target is not utf8, the pattern is utf8. */
2876 if (NATIVE_TO_UNI(*((U8*)l)) !=
2877 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2885 nextchr = UCHARAT(locinput);
2888 /* The target and the pattern have the same utf8ness. */
2889 /* Inline the first character, for speed. */
2890 if (UCHARAT(s) != nextchr)
2892 if (PL_regeol - locinput < st->ln)
2894 if (st->ln > 1 && memNE(s, locinput, st->ln))
2897 nextchr = UCHARAT(locinput);
2901 PL_reg_flags |= RF_tainted;
2904 char * const s = STRING(scan);
2905 st->ln = STR_LEN(scan);
2907 if (do_utf8 || UTF) {
2908 /* Either target or the pattern are utf8. */
2909 const char * const l = locinput;
2910 char *e = PL_regeol;
2912 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2913 l, &e, 0, do_utf8)) {
2914 /* One more case for the sharp s:
2915 * pack("U0U*", 0xDF) =~ /ss/i,
2916 * the 0xC3 0x9F are the UTF-8
2917 * byte sequence for the U+00DF. */
2919 toLOWER(s[0]) == 's' &&
2921 toLOWER(s[1]) == 's' &&
2928 nextchr = UCHARAT(locinput);
2932 /* Neither the target and the pattern are utf8. */
2934 /* Inline the first character, for speed. */
2935 if (UCHARAT(s) != nextchr &&
2936 UCHARAT(s) != ((OP(scan) == EXACTF)
2937 ? PL_fold : PL_fold_locale)[nextchr])
2939 if (PL_regeol - locinput < st->ln)
2941 if (st->ln > 1 && (OP(scan) == EXACTF
2942 ? ibcmp(s, locinput, st->ln)
2943 : ibcmp_locale(s, locinput, st->ln)))
2946 nextchr = UCHARAT(locinput);
2951 STRLEN inclasslen = PL_regeol - locinput;
2953 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
2955 if (locinput >= PL_regeol)
2957 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2958 nextchr = UCHARAT(locinput);
2963 nextchr = UCHARAT(locinput);
2964 if (!REGINCLASS(rex, scan, (U8*)locinput))
2966 if (!nextchr && locinput >= PL_regeol)
2968 nextchr = UCHARAT(++locinput);
2972 /* If we might have the case of the German sharp s
2973 * in a casefolding Unicode character class. */
2975 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2976 locinput += SHARP_S_SKIP;
2977 nextchr = UCHARAT(locinput);
2983 PL_reg_flags |= RF_tainted;
2989 LOAD_UTF8_CHARCLASS_ALNUM();
2990 if (!(OP(scan) == ALNUM
2991 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2992 : isALNUM_LC_utf8((U8*)locinput)))
2996 locinput += PL_utf8skip[nextchr];
2997 nextchr = UCHARAT(locinput);
3000 if (!(OP(scan) == ALNUM
3001 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3003 nextchr = UCHARAT(++locinput);
3006 PL_reg_flags |= RF_tainted;
3009 if (!nextchr && locinput >= PL_regeol)
3012 LOAD_UTF8_CHARCLASS_ALNUM();
3013 if (OP(scan) == NALNUM
3014 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3015 : isALNUM_LC_utf8((U8*)locinput))
3019 locinput += PL_utf8skip[nextchr];
3020 nextchr = UCHARAT(locinput);
3023 if (OP(scan) == NALNUM
3024 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3026 nextchr = UCHARAT(++locinput);
3030 PL_reg_flags |= RF_tainted;
3034 /* was last char in word? */
3036 if (locinput == PL_bostr)
3039 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3041 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3043 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3044 st->ln = isALNUM_uni(st->ln);
3045 LOAD_UTF8_CHARCLASS_ALNUM();
3046 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3049 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3050 n = isALNUM_LC_utf8((U8*)locinput);
3054 st->ln = (locinput != PL_bostr) ?
3055 UCHARAT(locinput - 1) : '\n';
3056 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3057 st->ln = isALNUM(st->ln);
3058 n = isALNUM(nextchr);
3061 st->ln = isALNUM_LC(st->ln);
3062 n = isALNUM_LC(nextchr);
3065 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3066 OP(scan) == BOUNDL))
3070 PL_reg_flags |= RF_tainted;
3076 if (UTF8_IS_CONTINUED(nextchr)) {
3077 LOAD_UTF8_CHARCLASS_SPACE();
3078 if (!(OP(scan) == SPACE
3079 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3080 : isSPACE_LC_utf8((U8*)locinput)))
3084 locinput += PL_utf8skip[nextchr];
3085 nextchr = UCHARAT(locinput);
3088 if (!(OP(scan) == SPACE
3089 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3091 nextchr = UCHARAT(++locinput);
3094 if (!(OP(scan) == SPACE
3095 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3097 nextchr = UCHARAT(++locinput);
3101 PL_reg_flags |= RF_tainted;
3104 if (!nextchr && locinput >= PL_regeol)
3107 LOAD_UTF8_CHARCLASS_SPACE();
3108 if (OP(scan) == NSPACE
3109 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3110 : isSPACE_LC_utf8((U8*)locinput))
3114 locinput += PL_utf8skip[nextchr];
3115 nextchr = UCHARAT(locinput);
3118 if (OP(scan) == NSPACE
3119 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3121 nextchr = UCHARAT(++locinput);
3124 PL_reg_flags |= RF_tainted;
3130 LOAD_UTF8_CHARCLASS_DIGIT();
3131 if (!(OP(scan) == DIGIT
3132 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3133 : isDIGIT_LC_utf8((U8*)locinput)))
3137 locinput += PL_utf8skip[nextchr];
3138 nextchr = UCHARAT(locinput);
3141 if (!(OP(scan) == DIGIT
3142 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3144 nextchr = UCHARAT(++locinput);
3147 PL_reg_flags |= RF_tainted;
3150 if (!nextchr && locinput >= PL_regeol)
3153 LOAD_UTF8_CHARCLASS_DIGIT();
3154 if (OP(scan) == NDIGIT
3155 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3156 : isDIGIT_LC_utf8((U8*)locinput))
3160 locinput += PL_utf8skip[nextchr];
3161 nextchr = UCHARAT(locinput);
3164 if (OP(scan) == NDIGIT
3165 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3167 nextchr = UCHARAT(++locinput);
3170 if (locinput >= PL_regeol)
3173 LOAD_UTF8_CHARCLASS_MARK();
3174 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3176 locinput += PL_utf8skip[nextchr];
3177 while (locinput < PL_regeol &&
3178 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3179 locinput += UTF8SKIP(locinput);
3180 if (locinput > PL_regeol)
3185 nextchr = UCHARAT(locinput);
3188 PL_reg_flags |= RF_tainted;
3193 n = ARG(scan); /* which paren pair */
3194 st->ln = PL_regstartp[n];
3195 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3196 if ((I32)*PL_reglastparen < n || st->ln == -1)
3197 sayNO; /* Do not match unless seen CLOSEn. */
3198 if (st->ln == PL_regendp[n])
3201 s = PL_bostr + st->ln;
3202 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3204 const char *e = PL_bostr + PL_regendp[n];
3206 * Note that we can't do the "other character" lookup trick as
3207 * in the 8-bit case (no pun intended) because in Unicode we
3208 * have to map both upper and title case to lower case.
3210 if (OP(scan) == REFF) {
3212 STRLEN ulen1, ulen2;
3213 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3214 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3218 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3219 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3220 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3227 nextchr = UCHARAT(locinput);
3231 /* Inline the first character, for speed. */
3232 if (UCHARAT(s) != nextchr &&
3234 (UCHARAT(s) != ((OP(scan) == REFF
3235 ? PL_fold : PL_fold_locale)[nextchr]))))
3237 st->ln = PL_regendp[n] - st->ln;
3238 if (locinput + st->ln > PL_regeol)
3240 if (st->ln > 1 && (OP(scan) == REF
3241 ? memNE(s, locinput, st->ln)
3243 ? ibcmp(s, locinput, st->ln)
3244 : ibcmp_locale(s, locinput, st->ln))))
3247 nextchr = UCHARAT(locinput);
3258 #define ST st->u.eval
3260 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3264 /* execute the code in the {...} */
3266 SV ** const before = SP;
3267 OP_4tree * const oop = PL_op;
3268 COP * const ocurcop = PL_curcop;
3272 PL_op = (OP_4tree*)rex->data->data[n];
3273 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3274 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3275 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3277 CALLRUNOPS(aTHX); /* Scalar context. */
3280 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3287 PAD_RESTORE_LOCAL(old_comppad);
3288 PL_curcop = ocurcop;
3291 sv_setsv(save_scalar(PL_replgv), ret);
3295 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3298 /* extract RE object from returned value; compiling if
3303 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3304 mg = mg_find(sv, PERL_MAGIC_qr);
3305 else if (SvSMAGICAL(ret)) {
3306 if (SvGMAGICAL(ret))
3307 sv_unmagic(ret, PERL_MAGIC_qr);
3309 mg = mg_find(ret, PERL_MAGIC_qr);
3313 re = (regexp *)mg->mg_obj;
3314 (void)ReREFCNT_inc(re);
3318 const char * const t = SvPV_const(ret, len);
3320 const I32 osize = PL_regsize;
3323 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3324 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3326 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3328 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3334 /* run the pattern returned from (??{...}) */
3336 debug_start_match(re, do_utf8, locinput, PL_regeol,
3337 "Matching embedded");
3340 ST.cp = regcppush(0); /* Save *all* the positions. */
3341 REGCP_SET(ST.lastcp);
3342 *PL_reglastparen = 0;
3343 *PL_reglastcloseparen = 0;
3344 PL_reginput = locinput;
3346 /* XXXX This is too dramatic a measure... */
3350 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3351 ((re->reganch & ROPT_UTF8) != 0);
3352 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3357 /* now continue from first node in postoned RE */
3358 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3361 /* /(?(?{...})X|Y)/ */
3362 st->sw = SvTRUE(ret);
3367 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3369 PL_reg_flags ^= RF_utf8;
3372 /* XXXX This is too dramatic a measure... */
3374 /* Restore parens of the caller without popping the
3377 const I32 tmp = PL_savestack_ix;
3378 PL_savestack_ix = ST.lastcp;
3380 PL_savestack_ix = tmp;
3382 PL_reginput = locinput;
3383 /* continue at the node following the (??{...}) */
3387 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3388 /* Restore state to the outer re then re-throw the failure */
3390 PL_reg_flags ^= RF_utf8;
3394 /* XXXX This is too dramatic a measure... */
3397 PL_reginput = locinput;
3398 REGCP_UNWIND(ST.lastcp);
3405 n = ARG(scan); /* which paren pair */
3406 PL_reg_start_tmp[n] = locinput;
3411 n = ARG(scan); /* which paren pair */
3412 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3413 PL_regendp[n] = locinput - PL_bostr;
3414 if (n > (I32)*PL_reglastparen)
3415 *PL_reglastparen = n;
3416 *PL_reglastcloseparen = n;
3419 n = ARG(scan); /* which paren pair */
3420 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3423 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3425 next = NEXTOPER(NEXTOPER(scan));
3427 next = scan + ARG(scan);
3428 if (OP(next) == IFTHEN) /* Fake one. */
3429 next = NEXTOPER(NEXTOPER(next));
3433 st->logical = scan->flags;
3435 /*******************************************************************
3436 cc points to the regmatch_state associated with the most recent CURLYX.
3437 This struct contains info about the innermost (...)* loop (an
3438 "infoblock"), and a pointer to the next outer cc.
3440 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3442 1) After matching Y, regnode for CURLYX is processed;
3444 2) This regnode populates cc, and calls regmatch() recursively
3445 with the starting point at WHILEM node;
3447 3) Each hit of WHILEM node tries to match A and Z (in the order
3448 depending on the current iteration, min/max of {min,max} and
3449 greediness). The information about where are nodes for "A"
3450 and "Z" is read from cc, as is info on how many times "A"
3451 was already matched, and greediness.
3453 4) After A matches, the same WHILEM node is hit again.
3455 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3456 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3457 resets cc, since this Y(A)*Z can be a part of some other loop:
3458 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3459 of the external loop.
3461 Currently present infoblocks form a tree with a stem formed by st->cc
3462 and whatever it mentions via ->next, and additional attached trees
3463 corresponding to temporarily unset infoblocks as in "5" above.
3465 In the following picture, infoblocks for outer loop of
3466 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3467 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3468 infoblocks are drawn below the "reset" infoblock.
3470 In fact in the picture below we do not show failed matches for Z and T
3471 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3472 more obvious *why* one needs to *temporary* unset infoblocks.]
3474 Matched REx position InfoBlocks Comment
3478 Y A)*?Z)*?T x <- O <- I
3479 YA )*?Z)*?T x <- O <- I
3480 YA A)*?Z)*?T x <- O <- I
3481 YAA )*?Z)*?T x <- O <- I
3482 YAA Z)*?T x <- O # Temporary unset I
3485 YAAZ Y(A)*?Z)*?T x <- O
3488 YAAZY (A)*?Z)*?T x <- O
3491 YAAZY A)*?Z)*?T x <- O <- I
3494 YAAZYA )*?Z)*?T x <- O <- I
3497 YAAZYA Z)*?T x <- O # Temporary unset I
3503 YAAZYAZ T x # Temporary unset O
3510 *******************************************************************/
3513 /* No need to save/restore up to this paren */
3514 parenfloor = scan->flags;
3518 CURLYX and WHILEM are always paired: they're the moral
3519 equivalent of pp_enteriter anbd pp_iter.
3521 The only time next could be null is if the node tree is
3522 corrupt. This was mentioned on p5p a few days ago.
3524 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3525 So we'll assert that this is true:
3528 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3530 /* XXXX Probably it is better to teach regpush to support
3531 parenfloor > PL_regsize... */
3532 if (parenfloor > (I32)*PL_reglastparen)
3533 parenfloor = *PL_reglastparen; /* Pessimization... */
3535 st->u.curlyx.cp = PL_savestack_ix;
3536 st->u.curlyx.outercc = st->cc;
3538 /* these fields contain the state of the current curly.
3539 * they are accessed by subsequent WHILEMs;
3540 * cur and lastloc are also updated by WHILEM */
3541 st->u.curlyx.parenfloor = parenfloor;
3542 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3543 st->u.curlyx.min = ARG1(scan);
3544 st->u.curlyx.max = ARG2(scan);
3545 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3546 st->u.curlyx.lastloc = 0;
3547 /* st->next and st->minmod are also read by WHILEM */
3549 PL_reginput = locinput;
3550 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3551 /*** all unsaved local vars undefined at this point */
3552 regcpblow(st->u.curlyx.cp);
3553 st->cc = st->u.curlyx.outercc;
3559 * This is really hard to understand, because after we match
3560 * what we're trying to match, we must make sure the rest of
3561 * the REx is going to match for sure, and to do that we have
3562 * to go back UP the parse tree by recursing ever deeper. And
3563 * if it fails, we have to reset our parent's current state
3564 * that we can try again after backing off.
3569 st->cc gets initialised by CURLYX ready for use by WHILEM.
3570 So again, unless somethings been corrupted, st->cc cannot
3571 be null at that point in WHILEM.
3573 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3574 So we'll assert that this is true:
3577 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3578 st->u.whilem.cache_offset = 0;
3579 st->u.whilem.cache_bit = 0;
3581 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3582 PL_reginput = locinput;
3585 PerlIO_printf(Perl_debug_log,
3586 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3587 REPORT_CODE_OFF+PL_regindent*2, "",
3588 (long)n, (long)st->cc->u.curlyx.min,
3589 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3592 /* If degenerate scan matches "", assume scan done. */
3594 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3595 st->u.whilem.savecc = st->cc;
3596 st->cc = st->cc->u.curlyx.outercc;
3598 st->ln = st->cc->u.curlyx.cur;
3600 PerlIO_printf(Perl_debug_log,
3601 "%*s empty match detected, try continuation...\n",
3602 REPORT_CODE_OFF+PL_regindent*2, "")
3604 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3605 /*** all unsaved local vars undefined at this point */
3606 st->cc = st->u.whilem.savecc;
3609 if (st->cc->u.curlyx.outercc)
3610 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3614 /* First just match a string of min scans. */
3616 if (n < st->cc->u.curlyx.min) {
3617 st->cc->u.curlyx.cur = n;
3618 st->cc->u.curlyx.lastloc = locinput;
3619 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3620 /*** all unsaved local vars undefined at this point */
3623 st->cc->u.curlyx.cur = n - 1;
3624 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3629 /* Check whether we already were at this position.
3630 Postpone detection until we know the match is not
3631 *that* much linear. */
3632 if (!PL_reg_maxiter) {
3633 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3634 /* possible overflow for long strings and many CURLYX's */
3635 if (PL_reg_maxiter < 0)
3636 PL_reg_maxiter = I32_MAX;
3637 PL_reg_leftiter = PL_reg_maxiter;
3639 if (PL_reg_leftiter-- == 0) {
3640 const I32 size = (PL_reg_maxiter + 7)/8;
3641 if (PL_reg_poscache) {
3642 if ((I32)PL_reg_poscache_size < size) {
3643 Renew(PL_reg_poscache, size, char);
3644 PL_reg_poscache_size = size;
3646 Zero(PL_reg_poscache, size, char);
3649 PL_reg_poscache_size = size;
3650 Newxz(PL_reg_poscache, size, char);
3653 PerlIO_printf(Perl_debug_log,
3654 "%sDetected a super-linear match, switching on caching%s...\n",
3655 PL_colors[4], PL_colors[5])
3658 if (PL_reg_leftiter < 0) {
3659 st->u.whilem.cache_offset = locinput - PL_bostr;
3661 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
3662 + st->u.whilem.cache_offset * (scan->flags>>4);
3663 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3664 st->u.whilem.cache_offset /= 8;
3665 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3667 PerlIO_printf(Perl_debug_log,
3668 "%*s already tried at this position...\n",
3669 REPORT_CODE_OFF+PL_regindent*2, "")
3671 sayNO; /* cache records failure */
3676 /* Prefer next over scan for minimal matching. */
3678 if (st->cc->minmod) {
3679 st->u.whilem.savecc = st->cc;
3680 st->cc = st->cc->u.curlyx.outercc;
3682 st->ln = st->cc->u.curlyx.cur;
3683 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3684 REGCP_SET(st->u.whilem.lastcp);
3685 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3686 /*** all unsaved local vars undefined at this point */
3687 st->cc = st->u.whilem.savecc;
3689 regcpblow(st->u.whilem.cp);
3690 sayYES; /* All done. */
3692 REGCP_UNWIND(st->u.whilem.lastcp);
3694 if (st->cc->u.curlyx.outercc)
3695 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3697 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3698 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3699 && !(PL_reg_flags & RF_warned)) {
3700 PL_reg_flags |= RF_warned;
3701 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3702 "Complex regular subexpression recursion",
3709 PerlIO_printf(Perl_debug_log,
3710 "%*s trying longer...\n",
3711 REPORT_CODE_OFF+PL_regindent*2, "")
3713 /* Try scanning more and see if it helps. */
3714 PL_reginput = locinput;
3715 st->cc->u.curlyx.cur = n;
3716 st->cc->u.curlyx.lastloc = locinput;
3717 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3718 REGCP_SET(st->u.whilem.lastcp);
3719 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3720 /*** all unsaved local vars undefined at this point */
3722 regcpblow(st->u.whilem.cp);
3725 REGCP_UNWIND(st->u.whilem.lastcp);
3727 st->cc->u.curlyx.cur = n - 1;
3728 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3732 /* Prefer scan over next for maximal matching. */
3734 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3735 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3736 st->cc->u.curlyx.cur = n;
3737 st->cc->u.curlyx.lastloc = locinput;
3738 REGCP_SET(st->u.whilem.lastcp);
3739 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3740 /*** all unsaved local vars undefined at this point */
3742 regcpblow(st->u.whilem.cp);
3745 REGCP_UNWIND(st->u.whilem.lastcp);
3746 regcppop(rex); /* Restore some previous $<digit>s? */
3747 PL_reginput = locinput;
3749 PerlIO_printf(Perl_debug_log,
3750 "%*s failed, try continuation...\n",
3751 REPORT_CODE_OFF+PL_regindent*2, "")
3754 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3755 && !(PL_reg_flags & RF_warned)) {
3756 PL_reg_flags |= RF_warned;
3757 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3758 "Complex regular subexpression recursion",
3762 /* Failed deeper matches of scan, so see if this one works. */
3763 st->u.whilem.savecc = st->cc;
3764 st->cc = st->cc->u.curlyx.outercc;
3766 st->ln = st->cc->u.curlyx.cur;
3767 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3768 /*** all unsaved local vars undefined at this point */
3769 st->cc = st->u.whilem.savecc;
3772 if (st->cc->u.curlyx.outercc)
3773 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3774 st->cc->u.curlyx.cur = n - 1;
3775 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3781 #define ST st->u.branch
3783 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3784 next = scan + ARG(scan);
3787 scan = NEXTOPER(scan);
3790 case BRANCH: /* /(...|A|...)/ */
3791 scan = NEXTOPER(scan); /* scan now points to inner node */
3792 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3793 /* last branch; skip state push and jump direct to node */
3795 ST.lastparen = *PL_reglastparen;
3796 ST.next_branch = next;
3798 PL_reginput = locinput;
3800 /* Now go into the branch */
3801 PUSH_STATE_GOTO(BRANCH_next, scan);
3804 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3805 REGCP_UNWIND(ST.cp);
3806 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3808 *PL_reglastparen = n;
3809 scan = ST.next_branch;
3810 /* no more branches? */
3811 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3813 continue; /* execute next BRANCH[J] op */
3821 #define ST st->u.curlym
3823 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3825 /* This is an optimisation of CURLYX that enables us to push
3826 * only a single backtracking state, no matter now many matches
3827 * there are in {m,n}. It relies on the pattern being constant
3828 * length, with no parens to influence future backrefs
3832 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3834 /* if paren positive, emulate an OPEN/CLOSE around A */
3836 I32 paren = ST.me->flags;
3837 if (paren > PL_regsize)
3839 if (paren > (I32)*PL_reglastparen)
3840 *PL_reglastparen = paren;
3841 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3847 ST.minmod = st->minmod;
3849 ST.c1 = CHRTEST_UNINIT;
3852 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3855 curlym_do_A: /* execute the A in /A{m,n}B/ */
3856 PL_reginput = locinput;
3857 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3860 case CURLYM_A: /* we've just matched an A */
3861 locinput = st->locinput;
3862 nextchr = UCHARAT(locinput);
3865 /* after first match, determine A's length: u.curlym.alen */
3866 if (ST.count == 1) {
3867 if (PL_reg_match_utf8) {
3869 while (s < PL_reginput) {
3875 ST.alen = PL_reginput - locinput;
3878 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3881 PerlIO_printf(Perl_debug_log,
3882 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3883 (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
3884 (IV) ST.count, (IV)ST.alen)
3887 locinput = PL_reginput;
3888 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3889 goto curlym_do_A; /* try to match another A */
3890 goto curlym_do_B; /* try to match B */
3892 case CURLYM_A_fail: /* just failed to match an A */
3893 REGCP_UNWIND(ST.cp);
3894 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3897 curlym_do_B: /* execute the B in /A{m,n}B/ */
3898 PL_reginput = locinput;
3899 if (ST.c1 == CHRTEST_UNINIT) {
3900 /* calculate c1 and c2 for possible match of 1st char
3901 * following curly */
3902 ST.c1 = ST.c2 = CHRTEST_VOID;
3903 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3904 regnode *text_node = ST.B;
3905 if (! HAS_TEXT(text_node))
3906 FIND_NEXT_IMPT(text_node);