5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
80 #define RF_evaled 4 /* Did an EVAL with setting? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
159 const int retval = PL_savestack_ix;
160 #define REGCP_PAREN_ELEMS 4
161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 GET_RE_DEBUG_FLAGS_DECL;
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168 #define REGCP_OTHER_ELEMS 6
169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
170 for (p = PL_regsize; p > parenfloor; p--) {
171 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
174 SSPUSHPTR(PL_reg_start_tmp[p]);
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
183 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
186 SSPUSHINT(*PL_reglastcloseparen);
187 SSPUSHPTR(PL_reginput);
188 #define REGCP_FRAME_ELEMS 2
189 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
197 /* These are needed since we do not localize EVAL nodes: */
198 #define REGCP_SET(cp) \
200 PerlIO_printf(Perl_debug_log, \
201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
202 (IV)PL_savestack_ix)); \
205 #define REGCP_UNWIND(cp) \
207 if (cp != PL_savestack_ix) \
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
210 (IV)(cp), (IV)PL_savestack_ix)); \
214 S_regcppop(pTHX_ const regexp *rex)
220 GET_RE_DEBUG_FLAGS_DECL;
222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
226 input = (char *) SSPOPPTR;
227 *PL_reglastcloseparen = SSPOPINT;
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
231 /* Now restore the parentheses context. */
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
235 U32 paren = (U32)SSPOPINT;
236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
237 PL_regstartp[paren] = SSPOPINT;
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
242 PerlIO_printf(Perl_debug_log,
243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
244 (UV)paren, (IV)PL_regstartp[paren],
245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
246 (IV)PL_regendp[paren],
247 (paren > *PL_reglastparen ? "(no)" : ""));
251 if (*PL_reglastparen + 1 <= rex->nparens) {
252 PerlIO_printf(Perl_debug_log,
253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
270 PL_regstartp[i] = -1;
277 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
280 * pregexec and friends
283 #ifndef PERL_IN_XSUB_RE
285 - pregexec - match a regexp against a string
288 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
290 /* strend: pointer to null at end of string */
291 /* strbeg: real beginning of string */
292 /* minend: end of match must be >=minend after stringarg. */
293 /* nosave: For optimizations. */
296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
297 nosave ? 0 : REXEC_COPY_STR);
302 * Need to implement the following flags for reg_anch:
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
315 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
317 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
320 /* XXXX We assume that strpos is strbeg unless sv. */
322 /* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
329 /* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
332 regtry() should be needed.
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
339 We use a)b)d) and multiline-part of c), and try to find a position in the
340 string which does not contradict any of them.
343 /* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
348 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
352 register I32 start_shift = 0;
353 /* Should be nonnegative! */
354 register I32 end_shift = 0;
359 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
363 const I32 multiline = prog->reganch & PMf_MULTILINE;
365 const char * const i_strpos = strpos;
368 GET_RE_DEBUG_FLAGS_DECL;
370 RX_MATCH_UTF8_set(prog,do_utf8);
372 if (prog->reganch & ROPT_UTF8) {
373 PL_reg_flags |= RF_utf8;
376 debug_start_match(prog, do_utf8, strpos, strend,
377 "Guessing start of match for");
380 /* CHR_DIST() would be more correct here but it makes things slow. */
381 if (prog->minlen > strend - strpos) {
382 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
383 "String too short... [re_intuit_start]\n"));
386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
389 if (!prog->check_utf8 && prog->check_substr)
390 to_utf8_substr(prog);
391 check = prog->check_utf8;
393 if (!prog->check_substr && prog->check_utf8)
394 to_byte_substr(prog);
395 check = prog->check_substr;
397 if (check == &PL_sv_undef) {
398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
399 "Non-utf string cannot match utf check string\n"));
402 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
403 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
404 || ( (prog->reganch & ROPT_ANCH_BOL)
405 && !multiline ) ); /* Check after \n? */
408 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
409 | ROPT_IMPLICIT)) /* not a real BOL */
410 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
412 && (strpos != strbeg)) {
413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
416 if (prog->check_offset_min == prog->check_offset_max &&
417 !(prog->reganch & ROPT_CANY_SEEN)) {
418 /* Substring at constant offset from beg-of-str... */
421 s = HOP3c(strpos, prog->check_offset_min, strend);
423 slen = SvCUR(check); /* >= 1 */
425 if ( strend - s > slen || strend - s < slen - 1
426 || (strend - s == slen && strend[-1] != '\n')) {
427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
430 /* Now should match s[0..slen-2] */
432 if (slen && (*SvPVX_const(check) != *s
434 && memNE(SvPVX_const(check), s, slen)))) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
440 else if (*SvPVX_const(check) != *s
441 || ((slen = SvCUR(check)) > 1
442 && memNE(SvPVX_const(check), s, slen)))
445 goto success_at_start;
448 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
450 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
451 end_shift = prog->minlen - start_shift -
452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
454 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
455 - (SvTAIL(check) != 0);
456 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
458 if (end_shift < eshift)
462 else { /* Can match at random position */
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 /* Should be nonnegative! */
467 end_shift = prog->minlen - start_shift -
468 CHR_SVLEN(check) + (SvTAIL(check) != 0);
471 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
473 Perl_croak(aTHX_ "panic: end_shift");
477 /* Find a possible match in the region s..strend by looking for
478 the "check" substring in the region corrected by start/end_shift. */
479 if (flags & REXEC_SCREAM) {
480 I32 p = -1; /* Internal iterator of scream. */
481 I32 * const pp = data ? data->scream_pos : &p;
483 if (PL_screamfirst[BmRARE(check)] >= 0
484 || ( BmRARE(check) == '\n'
485 && (BmPREVIOUS(check) == SvCUR(check) - 1)
487 s = screaminstr(sv, check,
488 start_shift + (s - strbeg), end_shift, pp, 0);
491 /* we may be pointing at the wrong string */
492 if (s && RX_MATCH_COPIED(prog))
493 s = strbeg + (s - SvPVX_const(sv));
495 *data->scream_olds = s;
497 else if (prog->reganch & ROPT_CANY_SEEN)
498 s = fbm_instr((U8*)(s + start_shift),
499 (U8*)(strend - end_shift),
500 check, multiline ? FBMrf_MULTILINE : 0);
502 s = fbm_instr(HOP3(s, start_shift, strend),
503 HOP3(strend, -end_shift, strbeg),
504 check, multiline ? FBMrf_MULTILINE : 0);
506 /* Update the count-of-usability, remove useless subpatterns,
510 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
511 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
512 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
513 (s ? "Found" : "Did not find"),
514 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
515 ? "anchored" : "floating"),
518 (s ? " at offset " : "...\n") );
526 /* Finish the diagnostic message */
527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
529 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
530 Start with the other substr.
531 XXXX no SCREAM optimization yet - and a very coarse implementation
532 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
533 *always* match. Probably should be marked during compile...
534 Probably it is right to do no SCREAM here...
537 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
538 /* Take into account the "other" substring. */
539 /* XXXX May be hopelessly wrong for UTF... */
542 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
545 char * const last = HOP3c(s, -start_shift, strbeg);
547 char * const saved_s = s;
550 t = s - prog->check_offset_max;
551 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
553 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
558 t = HOP3c(t, prog->anchored_offset, strend);
559 if (t < other_last) /* These positions already checked */
561 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
564 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
565 /* On end-of-str: see comment below. */
566 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
567 if (must == &PL_sv_undef) {
569 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
574 HOP3(HOP3(last1, prog->anchored_offset, strend)
575 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
577 multiline ? FBMrf_MULTILINE : 0
580 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
581 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
582 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
583 (s ? "Found" : "Contradicts"),
584 quoted, RE_SV_TAIL(must));
589 if (last1 >= last2) {
590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
591 ", giving up...\n"));
594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
595 ", trying floating at offset %ld...\n",
596 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
597 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
598 s = HOP3c(last, 1, strend);
602 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
603 (long)(s - i_strpos)));
604 t = HOP3c(s, -prog->anchored_offset, strbeg);
605 other_last = HOP3c(s, 1, strend);
613 else { /* Take into account the floating substring. */
615 char * const saved_s = s;
618 t = HOP3c(s, -start_shift, strbeg);
620 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
621 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
622 last = HOP3c(t, prog->float_max_offset, strend);
623 s = HOP3c(t, prog->float_min_offset, strend);
626 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
627 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
628 /* fbm_instr() takes into account exact value of end-of-str
629 if the check is SvTAIL(ed). Since false positives are OK,
630 and end-of-str is not later than strend we are OK. */
631 if (must == &PL_sv_undef) {
633 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
636 s = fbm_instr((unsigned char*)s,
637 (unsigned char*)last + SvCUR(must)
639 must, multiline ? FBMrf_MULTILINE : 0);
641 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
642 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
643 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
644 (s ? "Found" : "Contradicts"),
645 quoted, RE_SV_TAIL(must));
649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
650 ", giving up...\n"));
653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
654 ", trying anchored starting at offset %ld...\n",
655 (long)(saved_s + 1 - i_strpos)));
657 s = HOP3c(t, 1, strend);
661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
662 (long)(s - i_strpos)));
663 other_last = s; /* Fix this later. --Hugo */
672 t = s - prog->check_offset_max;
673 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
675 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
677 /* Fixed substring is found far enough so that the match
678 cannot start at strpos. */
680 if (ml_anch && t[-1] != '\n') {
681 /* Eventually fbm_*() should handle this, but often
682 anchored_offset is not 0, so this check will not be wasted. */
683 /* XXXX In the code below we prefer to look for "^" even in
684 presence of anchored substrings. And we search even
685 beyond the found float position. These pessimizations
686 are historical artefacts only. */
688 while (t < strend - prog->minlen) {
690 if (t < check_at - prog->check_offset_min) {
691 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
692 /* Since we moved from the found position,
693 we definitely contradict the found anchored
694 substr. Due to the above check we do not
695 contradict "check" substr.
696 Thus we can arrive here only if check substr
697 is float. Redo checking for "other"=="fixed".
700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
701 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
702 goto do_other_anchored;
704 /* We don't contradict the found floating substring. */
705 /* XXXX Why not check for STCLASS? */
707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
708 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
711 /* Position contradicts check-string */
712 /* XXXX probably better to look for check-string
713 than for "\n", so one should lower the limit for t? */
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
715 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
716 other_last = strpos = s = t + 1;
721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
722 PL_colors[0], PL_colors[1]));
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
727 PL_colors[0], PL_colors[1]));
731 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
734 /* The found string does not prohibit matching at strpos,
735 - no optimization of calling REx engine can be performed,
736 unless it was an MBOL and we are not after MBOL,
737 or a future STCLASS check will fail this. */
739 /* Even in this situation we may use MBOL flag if strpos is offset
740 wrt the start of the string. */
741 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
742 && (strpos != strbeg) && strpos[-1] != '\n'
743 /* May be due to an implicit anchor of m{.*foo} */
744 && !(prog->reganch & ROPT_IMPLICIT))
749 DEBUG_EXECUTE_r( if (ml_anch)
750 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
751 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
754 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
756 prog->check_utf8 /* Could be deleted already */
757 && --BmUSEFUL(prog->check_utf8) < 0
758 && (prog->check_utf8 == prog->float_utf8)
760 prog->check_substr /* Could be deleted already */
761 && --BmUSEFUL(prog->check_substr) < 0
762 && (prog->check_substr == prog->float_substr)
765 /* If flags & SOMETHING - do not do it many times on the same match */
766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
767 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
768 if (do_utf8 ? prog->check_substr : prog->check_utf8)
769 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
770 prog->check_substr = prog->check_utf8 = NULL; /* disable */
771 prog->float_substr = prog->float_utf8 = NULL; /* clear */
772 check = NULL; /* abort */
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
777 prog->reganch &= ~RE_USE_INTUIT;
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
785 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
794 const U8* const str = (U8*)STRING(prog->regstclass);
795 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
798 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
800 : (prog->float_substr || prog->float_utf8
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
804 /*if (OP(prog->regstclass) == TRIE)
807 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
810 const char *what = NULL;
812 if (endpos == strend) {
813 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
814 "Could not match STCLASS...\n") );
817 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
818 "This position contradicts STCLASS...\n") );
819 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
821 /* Contradict one of substrings */
822 if (prog->anchored_substr || prog->anchored_utf8) {
823 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
824 DEBUG_EXECUTE_r( what = "anchored" );
826 s = HOP3c(t, 1, strend);
827 if (s + start_shift + end_shift > strend) {
828 /* XXXX Should be taken into account earlier? */
829 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
830 "Could not match STCLASS...\n") );
835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
836 "Looking for %s substr starting at offset %ld...\n",
837 what, (long)(s + start_shift - i_strpos)) );
840 /* Have both, check_string is floating */
841 if (t + start_shift >= check_at) /* Contradicts floating=check */
842 goto retry_floating_check;
843 /* Recheck anchored substring, but not floating... */
847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
848 "Looking for anchored substr starting at offset %ld...\n",
849 (long)(other_last - i_strpos)) );
850 goto do_other_anchored;
852 /* Another way we could have checked stclass at the
853 current position only: */
858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
859 "Looking for /%s^%s/m starting at offset %ld...\n",
860 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
863 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
865 /* Check is floating subtring. */
866 retry_floating_check:
867 t = check_at - start_shift;
868 DEBUG_EXECUTE_r( what = "floating" );
869 goto hop_and_restart;
872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
873 "By STCLASS: moving %ld --> %ld\n",
874 (long)(t - i_strpos), (long)(s - i_strpos))
878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
879 "Does not contradict STCLASS...\n");
884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
885 PL_colors[4], (check ? "Guessed" : "Giving up"),
886 PL_colors[5], (long)(s - i_strpos)) );
889 fail_finish: /* Substring not found */
890 if (prog->check_substr || prog->check_utf8) /* could be removed already */
891 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
894 PL_colors[4], PL_colors[5]));
898 /* We know what class REx starts with. Try to find this position... */
899 /* if reginfo is NULL, its a dryrun */
900 /* annoyingly all the vars in this routine have different names from their counterparts
901 in regmatch. /grrr */
903 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
904 foldlen, foldbuf, uniflags) STMT_START { \
905 switch (trie_type) { \
906 case trie_utf8_fold: \
908 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
913 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
914 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
915 foldlen -= UNISKIP( uvc ); \
916 uscan = foldbuf + UNISKIP( uvc ); \
920 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
928 charid = trie->charmap[ uvc ]; \
932 if (trie->widecharmap) { \
933 SV** const svpp = hv_fetch(trie->widecharmap, \
934 (char*)&uvc, sizeof(UV), 0); \
936 charid = (U16)SvIV(*svpp); \
941 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
944 ibcmp_utf8(s, NULL, 0, do_utf8, \
945 m, NULL, ln, (bool)UTF)) \
946 && (!reginfo || regtry(reginfo, s)) ) \
949 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
950 uvchr_to_utf8(tmpbuf, c); \
951 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
953 && (f == c1 || f == c2) \
954 && (ln == foldlen || \
955 !ibcmp_utf8((char *) foldbuf, \
956 NULL, foldlen, do_utf8, \
958 NULL, ln, (bool)UTF)) \
959 && (!reginfo || regtry(reginfo, s)) ) \
964 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
968 && (ln == 1 || !(OP(c) == EXACTF \
970 : ibcmp_locale(s, m, ln))) \
971 && (!reginfo || regtry(reginfo, s)) ) \
977 #define REXEC_FBC_UTF8_SCAN(CoDe) \
979 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
985 #define REXEC_FBC_SCAN(CoDe) \
987 while (s < strend) { \
993 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
994 REXEC_FBC_UTF8_SCAN( \
996 if (tmp && (!reginfo || regtry(reginfo, s))) \
1005 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1008 if (tmp && (!reginfo || regtry(reginfo, s))) \
1017 #define REXEC_FBC_TRYIT \
1018 if ((!reginfo || regtry(reginfo, s))) \
1021 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1024 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1027 REXEC_FBC_CLASS_SCAN(CoNd); \
1031 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1032 PL_reg_flags |= RF_tainted; \
1034 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1037 REXEC_FBC_CLASS_SCAN(CoNd); \
1042 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1043 const char *strend, const regmatch_info *reginfo)
1046 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1050 register STRLEN uskip;
1054 register I32 tmp = 1; /* Scratch variable? */
1055 register const bool do_utf8 = PL_reg_match_utf8;
1057 /* We know what class it must start with. */
1061 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1062 !UTF8_IS_INVARIANT((U8)s[0]) ?
1063 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1064 REGINCLASS(prog, c, (U8*)s));
1067 while (s < strend) {
1070 if (REGINCLASS(prog, c, (U8*)s) ||
1071 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1072 /* The assignment of 2 is intentional:
1073 * for the folded sharp s, the skip is 2. */
1074 (skip = SHARP_S_SKIP))) {
1075 if (tmp && (!reginfo || regtry(reginfo, s)))
1088 if (tmp && (!reginfo || regtry(reginfo, s)))
1096 ln = STR_LEN(c); /* length to match in octets/bytes */
1097 lnc = (I32) ln; /* length to match in characters */
1099 STRLEN ulen1, ulen2;
1101 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1102 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1103 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1105 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1106 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1108 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1110 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1113 while (sm < ((U8 *) m + ln)) {
1128 c2 = PL_fold_locale[c1];
1130 e = HOP3c(strend, -((I32)lnc), s);
1132 if (!reginfo && e < s)
1133 e = s; /* Due to minlen logic of intuit() */
1135 /* The idea in the EXACTF* cases is to first find the
1136 * first character of the EXACTF* node and then, if
1137 * necessary, case-insensitively compare the full
1138 * text of the node. The c1 and c2 are the first
1139 * characters (though in Unicode it gets a bit
1140 * more complicated because there are more cases
1141 * than just upper and lower: one needs to use
1142 * the so-called folding case for case-insensitive
1143 * matching (called "loose matching" in Unicode).
1144 * ibcmp_utf8() will do just that. */
1148 U8 tmpbuf [UTF8_MAXBYTES+1];
1149 STRLEN len, foldlen;
1150 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1152 /* Upper and lower of 1st char are equal -
1153 * probably not a "letter". */
1155 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1157 REXEC_FBC_EXACTISH_CHECK(c == c1);
1162 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1165 /* Handle some of the three Greek sigmas cases.
1166 * Note that not all the possible combinations
1167 * are handled here: some of them are handled
1168 * by the standard folding rules, and some of
1169 * them (the character class or ANYOF cases)
1170 * are handled during compiletime in
1171 * regexec.c:S_regclass(). */
1172 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1173 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1174 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1176 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1182 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1184 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1188 PL_reg_flags |= RF_tainted;
1195 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1196 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1198 tmp = ((OP(c) == BOUND ?
1199 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1200 LOAD_UTF8_CHARCLASS_ALNUM();
1201 REXEC_FBC_UTF8_SCAN(
1202 if (tmp == !(OP(c) == BOUND ?
1203 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1204 isALNUM_LC_utf8((U8*)s)))
1212 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1213 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1222 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1226 PL_reg_flags |= RF_tainted;
1233 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1234 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1236 tmp = ((OP(c) == NBOUND ?
1237 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1238 LOAD_UTF8_CHARCLASS_ALNUM();
1239 REXEC_FBC_UTF8_SCAN(
1240 if (tmp == !(OP(c) == NBOUND ?
1241 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1242 isALNUM_LC_utf8((U8*)s)))
1244 else REXEC_FBC_TRYIT;
1248 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1249 tmp = ((OP(c) == NBOUND ?
1250 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1253 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1255 else REXEC_FBC_TRYIT;
1258 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1262 REXEC_FBC_CSCAN_PRELOAD(
1263 LOAD_UTF8_CHARCLASS_ALNUM(),
1264 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1268 REXEC_FBC_CSCAN_TAINT(
1269 isALNUM_LC_utf8((U8*)s),
1273 REXEC_FBC_CSCAN_PRELOAD(
1274 LOAD_UTF8_CHARCLASS_ALNUM(),
1275 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1279 REXEC_FBC_CSCAN_TAINT(
1280 !isALNUM_LC_utf8((U8*)s),
1284 REXEC_FBC_CSCAN_PRELOAD(
1285 LOAD_UTF8_CHARCLASS_SPACE(),
1286 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1290 REXEC_FBC_CSCAN_TAINT(
1291 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1295 REXEC_FBC_CSCAN_PRELOAD(
1296 LOAD_UTF8_CHARCLASS_SPACE(),
1297 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1301 REXEC_FBC_CSCAN_TAINT(
1302 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1306 REXEC_FBC_CSCAN_PRELOAD(
1307 LOAD_UTF8_CHARCLASS_DIGIT(),
1308 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1312 REXEC_FBC_CSCAN_TAINT(
1313 isDIGIT_LC_utf8((U8*)s),
1317 REXEC_FBC_CSCAN_PRELOAD(
1318 LOAD_UTF8_CHARCLASS_DIGIT(),
1319 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1323 REXEC_FBC_CSCAN_TAINT(
1324 !isDIGIT_LC_utf8((U8*)s),
1328 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1330 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1331 trie_type = do_utf8 ?
1332 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1334 /* what trie are we using right now */
1336 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1337 reg_trie_data *trie=aho->trie;
1339 const char *last_start = strend - trie->minlen;
1341 const char *real_start = s;
1343 STRLEN maxlen = trie->maxlen;
1345 U8 **points; /* map of where we were in the input string
1346 when reading a given string. For ASCII this
1347 is unnecessary overhead as the relationship
1348 is always 1:1, but for unicode, especially
1349 case folded unicode this is not true. */
1350 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1352 GET_RE_DEBUG_FLAGS_DECL;
1354 /* We can't just allocate points here. We need to wrap it in
1355 * an SV so it gets freed properly if there is a croak while
1356 * running the match */
1359 sv_points=newSV(maxlen * sizeof(U8 *));
1360 SvCUR_set(sv_points,
1361 maxlen * sizeof(U8 *));
1362 SvPOK_on(sv_points);
1363 sv_2mortal(sv_points);
1364 points=(U8**)SvPV_nolen(sv_points );
1366 if (trie->bitmap && trie_type != trie_utf8_fold) {
1367 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1372 while (s <= last_start) {
1373 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1381 U8 *uscan = (U8*)NULL;
1382 U8 *leftmost = NULL;
1386 while ( state && uc <= (U8*)strend ) {
1388 if (aho->states[ state ].wordnum) {
1389 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1390 if (!leftmost || lpos < leftmost)
1394 points[pointpos++ % maxlen]= uc;
1395 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1396 uvc, charid, foldlen, foldbuf, uniflags);
1397 DEBUG_TRIE_EXECUTE_r(
1398 PerlIO_printf(Perl_debug_log,
1399 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1400 (int)((const char*)uc - real_start), charid, uvc)
1406 U32 word = aho->states[ state ].wordnum;
1408 base = aho->states[ state ].trans.base;
1410 DEBUG_TRIE_EXECUTE_r(
1411 PerlIO_printf( Perl_debug_log,
1412 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1413 failed ? "Fail transition to " : "",
1414 (UV)state, (UV)base, (UV)uvc, (UV)word)
1419 (base + charid > trie->uniquecharcount )
1420 && (base + charid - 1 - trie->uniquecharcount
1422 && trie->trans[base + charid - 1 -
1423 trie->uniquecharcount].check == state
1424 && (tmp=trie->trans[base + charid - 1 -
1425 trie->uniquecharcount ].next))
1435 state = aho->fail[state];
1439 /* we must be accepting here */
1447 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1448 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1454 if ( aho->states[ state ].wordnum ) {
1455 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1456 if (!leftmost || lpos < leftmost)
1459 DEBUG_TRIE_EXECUTE_r(
1460 PerlIO_printf( Perl_debug_log,
1461 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1463 (UV)state, (UV)base, (UV)uvc)
1466 s = (char*)leftmost;
1467 if (!reginfo || regtry(reginfo, s)) {
1482 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1491 - regexec_flags - match a regexp against a string
1494 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1495 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1496 /* strend: pointer to null at end of string */
1497 /* strbeg: real beginning of string */
1498 /* minend: end of match must be >=minend after stringarg. */
1499 /* data: May be used for some additional optimizations. */
1500 /* nosave: For optimizations. */
1504 register regnode *c;
1505 register char *startpos = stringarg;
1506 I32 minlen; /* must match at least this many chars */
1507 I32 dontbother = 0; /* how many characters not to try at end */
1508 I32 end_shift = 0; /* Same for the end. */ /* CC */
1509 I32 scream_pos = -1; /* Internal iterator of scream. */
1510 char *scream_olds = NULL;
1511 SV* const oreplsv = GvSV(PL_replgv);
1512 const bool do_utf8 = DO_UTF8(sv);
1515 regmatch_info reginfo; /* create some info to pass to regtry etc */
1517 GET_RE_DEBUG_FLAGS_DECL;
1519 PERL_UNUSED_ARG(data);
1521 /* Be paranoid... */
1522 if (prog == NULL || startpos == NULL) {
1523 Perl_croak(aTHX_ "NULL regexp parameter");
1527 multiline = prog->reganch & PMf_MULTILINE;
1528 reginfo.prog = prog;
1530 RX_MATCH_UTF8_set(prog, do_utf8);
1532 minlen = prog->minlen;
1533 if (strend - startpos < minlen) {
1534 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1535 "String too short [regexec_flags]...\n"));
1539 /* Check validity of program. */
1540 if (UCHARAT(prog->program) != REG_MAGIC) {
1541 Perl_croak(aTHX_ "corrupted regexp program");
1545 PL_reg_eval_set = 0;
1548 if (prog->reganch & ROPT_UTF8)
1549 PL_reg_flags |= RF_utf8;
1551 /* Mark beginning of line for ^ and lookbehind. */
1552 reginfo.bol = startpos; /* XXX not used ??? */
1556 /* Mark end of line for $ (and such) */
1559 /* see how far we have to get to not match where we matched before */
1560 reginfo.till = startpos+minend;
1562 /* If there is a "must appear" string, look for it. */
1565 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1568 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1569 reginfo.ganch = startpos;
1570 else if (sv && SvTYPE(sv) >= SVt_PVMG
1572 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1573 && mg->mg_len >= 0) {
1574 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1575 if (prog->reganch & ROPT_ANCH_GPOS) {
1576 if (s > reginfo.ganch)
1581 else /* pos() not defined */
1582 reginfo.ganch = strbeg;
1585 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1586 re_scream_pos_data d;
1588 d.scream_olds = &scream_olds;
1589 d.scream_pos = &scream_pos;
1590 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1592 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1593 goto phooey; /* not present */
1598 debug_start_match(prog, do_utf8, startpos, strend,
1602 /* Simplest case: anchored match need be tried only once. */
1603 /* [unless only anchor is BOL and multiline is set] */
1604 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1605 if (s == startpos && regtry(®info, startpos))
1607 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1608 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1613 dontbother = minlen - 1;
1614 end = HOP3c(strend, -dontbother, strbeg) - 1;
1615 /* for multiline we only have to try after newlines */
1616 if (prog->check_substr || prog->check_utf8) {
1620 if (regtry(®info, s))
1625 if (prog->reganch & RE_USE_INTUIT) {
1626 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1637 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1638 if (regtry(®info, s))
1645 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1646 if (regtry(®info, reginfo.ganch))
1651 /* Messy cases: unanchored match. */
1652 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1653 /* we have /x+whatever/ */
1654 /* it must be a one character string (XXXX Except UTF?) */
1659 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1660 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1661 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1666 DEBUG_EXECUTE_r( did_match = 1 );
1667 if (regtry(®info, s)) goto got_it;
1669 while (s < strend && *s == ch)
1677 DEBUG_EXECUTE_r( did_match = 1 );
1678 if (regtry(®info, s)) goto got_it;
1680 while (s < strend && *s == ch)
1685 DEBUG_EXECUTE_r(if (!did_match)
1686 PerlIO_printf(Perl_debug_log,
1687 "Did not find anchored character...\n")
1690 else if (prog->anchored_substr != NULL
1691 || prog->anchored_utf8 != NULL
1692 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1693 && prog->float_max_offset < strend - s)) {
1698 char *last1; /* Last position checked before */
1702 if (prog->anchored_substr || prog->anchored_utf8) {
1703 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1704 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1705 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1706 back_max = back_min = prog->anchored_offset;
1708 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1709 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1710 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1711 back_max = prog->float_max_offset;
1712 back_min = prog->float_min_offset;
1714 if (must == &PL_sv_undef)
1715 /* could not downgrade utf8 check substring, so must fail */
1718 last = HOP3c(strend, /* Cannot start after this */
1719 -(I32)(CHR_SVLEN(must)
1720 - (SvTAIL(must) != 0) + back_min), strbeg);
1723 last1 = HOPc(s, -1);
1725 last1 = s - 1; /* bogus */
1727 /* XXXX check_substr already used to find "s", can optimize if
1728 check_substr==must. */
1730 dontbother = end_shift;
1731 strend = HOPc(strend, -dontbother);
1732 while ( (s <= last) &&
1733 ((flags & REXEC_SCREAM)
1734 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1735 end_shift, &scream_pos, 0))
1736 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1737 (unsigned char*)strend, must,
1738 multiline ? FBMrf_MULTILINE : 0))) ) {
1739 /* we may be pointing at the wrong string */
1740 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1741 s = strbeg + (s - SvPVX_const(sv));
1742 DEBUG_EXECUTE_r( did_match = 1 );
1743 if (HOPc(s, -back_max) > last1) {
1744 last1 = HOPc(s, -back_min);
1745 s = HOPc(s, -back_max);
1748 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1750 last1 = HOPc(s, -back_min);
1754 while (s <= last1) {
1755 if (regtry(®info, s))
1761 while (s <= last1) {
1762 if (regtry(®info, s))
1768 DEBUG_EXECUTE_r(if (!did_match) {
1769 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1770 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1771 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1772 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1773 ? "anchored" : "floating"),
1774 quoted, RE_SV_TAIL(must));
1778 else if ((c = prog->regstclass)) {
1780 const OPCODE op = OP(prog->regstclass);
1781 /* don't bother with what can't match */
1782 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
1783 strend = HOPc(strend, -(minlen - 1));
1786 SV * const prop = sv_newmortal();
1787 regprop(prog, prop, c);
1789 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1791 PerlIO_printf(Perl_debug_log,
1792 "Matching stclass %.*s against %s (%d chars)\n",
1793 (int)SvCUR(prop), SvPVX_const(prop),
1794 quoted, (int)(strend - s));
1797 if (find_byclass(prog, c, s, strend, ®info))
1799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1803 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1808 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1809 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1810 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1812 if (flags & REXEC_SCREAM) {
1813 last = screaminstr(sv, float_real, s - strbeg,
1814 end_shift, &scream_pos, 1); /* last one */
1816 last = scream_olds; /* Only one occurrence. */
1817 /* we may be pointing at the wrong string */
1818 else if (RX_MATCH_COPIED(prog))
1819 s = strbeg + (s - SvPVX_const(sv));
1823 const char * const little = SvPV_const(float_real, len);
1825 if (SvTAIL(float_real)) {
1826 if (memEQ(strend - len + 1, little, len - 1))
1827 last = strend - len + 1;
1828 else if (!multiline)
1829 last = memEQ(strend - len, little, len)
1830 ? strend - len : NULL;
1836 last = rninstr(s, strend, little, little + len);
1838 last = strend; /* matching "$" */
1842 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1843 "%sCan't trim the tail, match fails (should not happen)%s\n",
1844 PL_colors[4], PL_colors[5]));
1845 goto phooey; /* Should not happen! */
1847 dontbother = strend - last + prog->float_min_offset;
1849 if (minlen && (dontbother < minlen))
1850 dontbother = minlen - 1;
1851 strend -= dontbother; /* this one's always in bytes! */
1852 /* We don't know much -- general case. */
1855 if (regtry(®info, s))
1864 if (regtry(®info, s))
1866 } while (s++ < strend);
1874 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1876 if (PL_reg_eval_set) {
1877 /* Preserve the current value of $^R */
1878 if (oreplsv != GvSV(PL_replgv))
1879 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1880 restored, the value remains
1882 restore_pos(aTHX_ prog);
1885 /* make sure $`, $&, $', and $digit will work later */
1886 if ( !(flags & REXEC_NOT_FIRST) ) {
1887 RX_MATCH_COPY_FREE(prog);
1888 if (flags & REXEC_COPY_STR) {
1889 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
1890 #ifdef PERL_OLD_COPY_ON_WRITE
1892 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1894 PerlIO_printf(Perl_debug_log,
1895 "Copy on write: regexp capture, type %d\n",
1898 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
1899 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
1900 assert (SvPOKp(prog->saved_copy));
1904 RX_MATCH_COPIED_on(prog);
1905 s = savepvn(strbeg, i);
1911 prog->subbeg = strbeg;
1912 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1920 PL_colors[4], PL_colors[5]));
1921 if (PL_reg_eval_set)
1922 restore_pos(aTHX_ prog);
1927 - regtry - try match at specific point
1929 STATIC I32 /* 0 failure, 1 success */
1930 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
1936 regexp *prog = reginfo->prog;
1937 GET_RE_DEBUG_FLAGS_DECL;
1940 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1942 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1945 PL_reg_eval_set = RS_init;
1946 DEBUG_EXECUTE_r(DEBUG_s(
1947 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1948 (IV)(PL_stack_sp - PL_stack_base));
1950 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1951 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1952 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1954 /* Apparently this is not needed, judging by wantarray. */
1955 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1956 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1959 /* Make $_ available to executed code. */
1960 if (reginfo->sv != DEFSV) {
1962 DEFSV = reginfo->sv;
1965 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1966 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
1967 /* prepare for quick setting of pos */
1968 #ifdef PERL_OLD_COPY_ON_WRITE
1970 sv_force_normal_flags(sv, 0);
1972 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
1973 &PL_vtbl_mglob, NULL, 0);
1977 PL_reg_oldpos = mg->mg_len;
1978 SAVEDESTRUCTOR_X(restore_pos, prog);
1980 if (!PL_reg_curpm) {
1981 Newxz(PL_reg_curpm, 1, PMOP);
1984 SV* const repointer = newSViv(0);
1985 /* so we know which PL_regex_padav element is PL_reg_curpm */
1986 SvFLAGS(repointer) |= SVf_BREAK;
1987 av_push(PL_regex_padav,repointer);
1988 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1989 PL_regex_pad = AvARRAY(PL_regex_padav);
1993 PM_SETRE(PL_reg_curpm, prog);
1994 PL_reg_oldcurpm = PL_curpm;
1995 PL_curpm = PL_reg_curpm;
1996 if (RX_MATCH_COPIED(prog)) {
1997 /* Here is a serious problem: we cannot rewrite subbeg,
1998 since it may be needed if this match fails. Thus
1999 $` inside (?{}) could fail... */
2000 PL_reg_oldsaved = prog->subbeg;
2001 PL_reg_oldsavedlen = prog->sublen;
2002 #ifdef PERL_OLD_COPY_ON_WRITE
2003 PL_nrs = prog->saved_copy;
2005 RX_MATCH_COPIED_off(prog);
2008 PL_reg_oldsaved = NULL;
2009 prog->subbeg = PL_bostr;
2010 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2012 prog->startp[0] = startpos - PL_bostr;
2013 PL_reginput = startpos;
2014 PL_regstartp = prog->startp;
2015 PL_regendp = prog->endp;
2016 PL_reglastparen = &prog->lastparen;
2017 PL_reglastcloseparen = &prog->lastcloseparen;
2018 prog->lastparen = 0;
2019 prog->lastcloseparen = 0;
2021 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2022 if (PL_reg_start_tmpl <= prog->nparens) {
2023 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2024 if(PL_reg_start_tmp)
2025 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2027 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2030 /* XXXX What this code is doing here?!!! There should be no need
2031 to do this again and again, PL_reglastparen should take care of
2034 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2035 * Actually, the code in regcppop() (which Ilya may be meaning by
2036 * PL_reglastparen), is not needed at all by the test suite
2037 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2038 * enough, for building DynaLoader, or otherwise this
2039 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2040 * will happen. Meanwhile, this code *is* needed for the
2041 * above-mentioned test suite tests to succeed. The common theme
2042 * on those tests seems to be returning null fields from matches.
2047 if (prog->nparens) {
2049 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2056 if (regmatch(reginfo, prog->program + 1)) {
2057 prog->endp[0] = PL_reginput - PL_bostr;
2060 REGCP_UNWIND(lastcp);
2065 #define sayYES goto yes
2066 #define sayNO goto no
2067 #define sayNO_ANYOF goto no_anyof
2068 #define sayYES_FINAL goto yes_final
2069 #define sayNO_FINAL goto no_final
2070 #define sayNO_SILENT goto do_no
2071 #define saySAME(x) if (x) goto yes; else goto no
2073 #define CACHEsayNO STMT_START { \
2074 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2075 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2076 (1<<st->u.whilem.cache_bit); \
2081 /* this is used to determine how far from the left messages like
2082 'failed...' are printed. Currently 29 makes these messages line
2083 up with the opcode they refer to. Earlier perls used 25 which
2084 left these messages outdented making reviewing a debug output
2087 #define REPORT_CODE_OFF 29
2090 /* Make sure there is a test for this +1 options in re_tests */
2091 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2093 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2094 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2096 #define SLAB_FIRST(s) (&(s)->states[0])
2097 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2099 /* grab a new slab and return the first slot in it */
2101 STATIC regmatch_state *
2104 #if PERL_VERSION < 9
2107 regmatch_slab *s = PL_regmatch_slab->next;
2109 Newx(s, 1, regmatch_slab);
2110 s->prev = PL_regmatch_slab;
2112 PL_regmatch_slab->next = s;
2114 PL_regmatch_slab = s;
2115 return SLAB_FIRST(s);
2118 /* simulate a recursive call to regmatch */
2120 #define REGMATCH(ns, where) \
2123 st->resume_state = resume_##where; \
2124 goto start_recurse; \
2125 resume_point_##where:
2127 /* push a new state then goto it */
2129 #define PUSH_STATE_GOTO(state, node) \
2131 st->resume_state = state; \
2134 /* push a new state with success backtracking, then goto it */
2136 #define PUSH_YES_STATE_GOTO(state, node) \
2138 st->resume_state = state; \
2139 goto push_yes_state;
2144 - regmatch - main matching routine
2146 * Conceptually the strategy is simple: check to see whether the current
2147 * node matches, call self recursively to see whether the rest matches,
2148 * and then act accordingly. In practice we make some effort to avoid
2149 * recursion, in particular by going through "ordinary" nodes (that don't
2150 * need to know whether the rest of the match failed) by a loop instead of
2153 /* [lwall] I've hoisted the register declarations to the outer block in order to
2154 * maybe save a little bit of pushing and popping on the stack. It also takes
2155 * advantage of machines that use a register save mask on subroutine entry.
2157 * This function used to be heavily recursive, but since this had the
2158 * effect of blowing the CPU stack on complex regexes, it has been
2159 * restructured to be iterative, and to save state onto the heap rather
2160 * than the stack. Essentially whereever regmatch() used to be called, it
2161 * pushes the current state, notes where to return, then jumps back into
2164 * Originally the structure of this function used to look something like
2169 while (scan != NULL) {
2170 a++; // do stuff with a and b
2176 if (regmatch(...)) // recurse
2186 * Now it looks something like this:
2194 regmatch_state *st = new();
2196 st->a++; // do stuff with a and b
2198 while (scan != NULL) {
2206 st->resume_state = resume_FOO;
2207 goto start_recurse; // recurse
2216 st = new(); push a new state
2217 st->a = 1; st->b = 2;
2224 switch (resume_state) {
2226 goto resume_point_FOO;
2233 * WARNING: this means that any line in this function that contains a
2234 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2235 * regmatch() using gotos instead. Thus the values of any local variables
2236 * not saved in the regmatch_state structure will have been lost when
2237 * execution resumes on the next line .
2239 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2240 * PL_regmatch_state always points to the currently active state, and
2241 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2242 * The first time regmatch is called, the first slab is allocated, and is
2243 * never freed until interpreter desctruction. When the slab is full,
2244 * a new one is allocated chained to the end. At exit from regmatch, slabs
2245 * allocated since entry are freed.
2248 /* *** every FOO_fail should = FOO+1 */
2249 #define TRIE_next (REGNODE_MAX+1)
2250 #define TRIE_next_fail (REGNODE_MAX+2)
2251 #define EVAL_A (REGNODE_MAX+3)
2252 #define EVAL_A_fail (REGNODE_MAX+4)
2253 #define resume_CURLYX (REGNODE_MAX+5)
2254 #define resume_WHILEM1 (REGNODE_MAX+6)
2255 #define resume_WHILEM2 (REGNODE_MAX+7)
2256 #define resume_WHILEM3 (REGNODE_MAX+8)
2257 #define resume_WHILEM4 (REGNODE_MAX+9)
2258 #define resume_WHILEM5 (REGNODE_MAX+10)
2259 #define resume_WHILEM6 (REGNODE_MAX+11)
2260 #define BRANCH_next (REGNODE_MAX+12)
2261 #define BRANCH_next_fail (REGNODE_MAX+13)
2262 #define CURLYM_A (REGNODE_MAX+14)
2263 #define CURLYM_A_fail (REGNODE_MAX+15)
2264 #define CURLYM_B (REGNODE_MAX+16)
2265 #define CURLYM_B_fail (REGNODE_MAX+17)
2266 #define IFMATCH_A (REGNODE_MAX+18)
2267 #define IFMATCH_A_fail (REGNODE_MAX+19)
2268 #define CURLY_B_min_known (REGNODE_MAX+20)
2269 #define CURLY_B_min_known_fail (REGNODE_MAX+21)
2270 #define CURLY_B_min (REGNODE_MAX+22)
2271 #define CURLY_B_min_fail (REGNODE_MAX+23)
2272 #define CURLY_B_max (REGNODE_MAX+24)
2273 #define CURLY_B_max_fail (REGNODE_MAX+25)
2276 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2280 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2281 const char *start, const char *end, const char *blurb)
2283 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2287 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2288 prog->precomp, prog->prelen, 60);
2290 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2291 start, end - start, 60);
2293 PerlIO_printf(Perl_debug_log,
2294 "%s%s REx%s %s against %s\n",
2295 PL_colors[4], blurb, PL_colors[5], s0, s1);
2297 if (do_utf8||utf8_pat)
2298 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2299 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2300 "pattern and string"
2306 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2308 const int docolor = *PL_colors[0];
2309 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2310 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2311 /* The part of the string before starttry has one color
2312 (pref0_len chars), between starttry and current
2313 position another one (pref_len - pref0_len chars),
2314 after the current position the third one.
2315 We assume that pref0_len <= pref_len, otherwise we
2316 decrease pref0_len. */
2317 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2318 ? (5 + taill) - l : locinput - PL_bostr;
2321 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2323 pref0_len = pref_len - (locinput - PL_reg_starttry);
2324 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2325 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2326 ? (5 + taill) - pref_len : PL_regeol - locinput);
2327 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2331 if (pref0_len > pref_len)
2332 pref0_len = pref_len;
2334 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2336 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2337 (locinput - pref_len),pref0_len, 60, 4, 5);
2339 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2340 (locinput - pref_len + pref0_len),
2341 pref_len - pref0_len, 60, 2, 3);
2343 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2344 locinput, PL_regeol - locinput, 60, 0, 1);
2346 PerlIO_printf(Perl_debug_log,
2347 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2348 (IV)(locinput - PL_bostr),
2351 (docolor ? "" : "> <"),
2353 15 - l - pref_len + 1,
2360 STATIC I32 /* 0 failure, 1 success */
2361 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2363 #if PERL_VERSION < 9
2367 register const bool do_utf8 = PL_reg_match_utf8;
2368 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2370 regexp *rex = reginfo->prog;
2372 regmatch_slab *orig_slab;
2373 regmatch_state *orig_state;
2375 /* the current state. This is a cached copy of PL_regmatch_state */
2376 register regmatch_state *st;
2378 /* cache heavy used fields of st in registers */
2379 register regnode *scan;
2380 register regnode *next;
2381 register I32 n = 0; /* initialize to shut up compiler warning */
2382 register char *locinput = PL_reginput;
2384 /* these variables are NOT saved during a recusive RFEGMATCH: */
2385 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2386 bool result; /* return value of S_regmatch */
2387 int depth = 0; /* depth of recursion */
2388 regmatch_state *yes_state = NULL; /* state to pop to on success of
2393 GET_RE_DEBUG_FLAGS_DECL;
2397 /* on first ever call to regmatch, allocate first slab */
2398 if (!PL_regmatch_slab) {
2399 Newx(PL_regmatch_slab, 1, regmatch_slab);
2400 PL_regmatch_slab->prev = NULL;
2401 PL_regmatch_slab->next = NULL;
2402 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2405 /* remember current high-water mark for exit */
2406 /* XXX this should be done with SAVE* instead */
2407 orig_slab = PL_regmatch_slab;
2408 orig_state = PL_regmatch_state;
2410 /* grab next free state slot */
2411 st = ++PL_regmatch_state;
2412 if (st > SLAB_LAST(PL_regmatch_slab))
2413 st = PL_regmatch_state = S_push_slab(aTHX);
2419 /* Note that nextchr is a byte even in UTF */
2420 nextchr = UCHARAT(locinput);
2422 while (scan != NULL) {
2425 SV * const prop = sv_newmortal();
2426 dump_exec_pos( locinput, scan, do_utf8 );
2427 regprop(rex, prop, scan);
2429 PerlIO_printf(Perl_debug_log,
2430 "%3"IVdf":%*s%s(%"IVdf")\n",
2431 (IV)(scan - rex->program), PL_regindent*2, "",
2433 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2436 next = scan + NEXT_OFF(scan);
2439 state_num = OP(scan);
2442 switch (state_num) {
2444 if (locinput == PL_bostr)
2446 /* reginfo->till = reginfo->bol; */
2451 if (locinput == PL_bostr ||
2452 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2458 if (locinput == PL_bostr)
2462 if (locinput == reginfo->ganch)
2468 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2473 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2475 if (PL_regeol - locinput > 1)
2479 if (PL_regeol != locinput)
2483 if (!nextchr && locinput >= PL_regeol)
2486 locinput += PL_utf8skip[nextchr];
2487 if (locinput > PL_regeol)
2489 nextchr = UCHARAT(locinput);
2492 nextchr = UCHARAT(++locinput);
2495 if (!nextchr && locinput >= PL_regeol)
2497 nextchr = UCHARAT(++locinput);
2500 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2503 locinput += PL_utf8skip[nextchr];
2504 if (locinput > PL_regeol)
2506 nextchr = UCHARAT(locinput);
2509 nextchr = UCHARAT(++locinput);
2513 #define ST st->u.trie
2517 /* what type of TRIE am I? (utf8 makes this contextual) */
2518 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2519 trie_type = do_utf8 ?
2520 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2523 /* what trie are we using right now */
2524 reg_trie_data * const trie
2525 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2526 U32 state = trie->startstate;
2528 U8 *uc = ( U8* )locinput;
2534 U8 *uscan = (U8*)NULL;
2536 SV *sv_accept_buff = NULL;
2537 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2539 ST.accepted = 0; /* how many accepting states we have seen */
2545 if (trie->bitmap && trie_type != trie_utf8_fold &&
2546 !TRIE_BITMAP_TEST(trie,*locinput)
2548 if (trie->states[ state ].wordnum) {
2550 PerlIO_printf(Perl_debug_log,
2551 "%*s %smatched empty string...%s\n",
2552 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2557 PerlIO_printf(Perl_debug_log,
2558 "%*s %sfailed to match start class...%s\n",
2559 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2566 traverse the TRIE keeping track of all accepting states
2567 we transition through until we get to a failing node.
2570 while ( state && uc <= (U8*)PL_regeol ) {
2572 if (trie->states[ state ].wordnum) {
2573 if (!ST.accepted ) {
2576 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2577 sv_accept_buff=newSV(bufflen *
2578 sizeof(reg_trie_accepted) - 1);
2579 SvCUR_set(sv_accept_buff,
2580 sizeof(reg_trie_accepted));
2581 SvPOK_on(sv_accept_buff);
2582 sv_2mortal(sv_accept_buff);
2585 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2588 if (ST.accepted >= bufflen) {
2590 ST.accept_buff =(reg_trie_accepted*)
2591 SvGROW(sv_accept_buff,
2592 bufflen * sizeof(reg_trie_accepted));
2594 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2595 + sizeof(reg_trie_accepted));
2597 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2598 ST.accept_buff[ST.accepted].endpos = uc;
2602 base = trie->states[ state ].trans.base;
2604 DEBUG_TRIE_EXECUTE_r({
2605 dump_exec_pos( (char *)uc, scan, do_utf8 );
2606 PerlIO_printf( Perl_debug_log,
2607 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2608 2+PL_regindent * 2, "", PL_colors[4],
2609 (UV)state, (UV)base, (UV)ST.accepted );
2613 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2614 uvc, charid, foldlen, foldbuf, uniflags);
2617 (base + charid > trie->uniquecharcount )
2618 && (base + charid - 1 - trie->uniquecharcount
2620 && trie->trans[base + charid - 1 -
2621 trie->uniquecharcount].check == state)
2623 state = trie->trans[base + charid - 1 -
2624 trie->uniquecharcount ].next;
2635 DEBUG_TRIE_EXECUTE_r(
2636 PerlIO_printf( Perl_debug_log,
2637 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2638 charid, uvc, (UV)state, PL_colors[5] );
2645 PerlIO_printf( Perl_debug_log,
2646 "%*s %sgot %"IVdf" possible matches%s\n",
2647 REPORT_CODE_OFF + PL_regindent * 2, "",
2648 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2654 case TRIE_next_fail: /* we failed - try next alterative */
2656 if ( ST.accepted == 1 ) {
2657 /* only one choice left - just continue */
2659 reg_trie_data * const trie
2660 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2661 SV ** const tmp = RX_DEBUG(reginfo->prog)
2662 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2664 PerlIO_printf( Perl_debug_log,
2665 "%*s %sonly one match left: #%d <%s>%s\n",
2666 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2667 ST.accept_buff[ 0 ].wordnum,
2668 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2671 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2672 /* in this case we free tmps/leave before we call regmatch
2673 as we wont be using accept_buff again. */
2676 locinput = PL_reginput;
2677 nextchr = UCHARAT(locinput);
2679 continue; /* execute rest of RE */
2682 if (!ST.accepted-- ) {
2689 There are at least two accepting states left. Presumably
2690 the number of accepting states is going to be low,
2691 typically two. So we simply scan through to find the one
2692 with lowest wordnum. Once we find it, we swap the last
2693 state into its place and decrement the size. We then try to
2694 match the rest of the pattern at the point where the word
2695 ends. If we succeed, control just continues along the
2696 regex; if we fail we return here to try the next accepting
2703 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2704 DEBUG_TRIE_EXECUTE_r(
2705 PerlIO_printf( Perl_debug_log,
2706 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2707 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2708 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2709 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2712 if (ST.accept_buff[cur].wordnum <
2713 ST.accept_buff[best].wordnum)
2718 reg_trie_data * const trie
2719 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2720 SV ** const tmp = RX_DEBUG(reginfo->prog)
2721 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2723 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2724 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2725 ST.accept_buff[best].wordnum,
2726 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2730 if ( best<ST.accepted ) {
2731 reg_trie_accepted tmp = ST.accept_buff[ best ];
2732 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2733 ST.accept_buff[ ST.accepted ] = tmp;
2736 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2738 PUSH_STATE_GOTO(TRIE_next, ST.B);
2744 char *s = STRING(scan);
2745 st->ln = STR_LEN(scan);
2746 if (do_utf8 != UTF) {
2747 /* The target and the pattern have differing utf8ness. */
2749 const char * const e = s + st->ln;
2752 /* The target is utf8, the pattern is not utf8. */
2757 if (NATIVE_TO_UNI(*(U8*)s) !=
2758 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2766 /* The target is not utf8, the pattern is utf8. */
2771 if (NATIVE_TO_UNI(*((U8*)l)) !=
2772 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2780 nextchr = UCHARAT(locinput);
2783 /* The target and the pattern have the same utf8ness. */
2784 /* Inline the first character, for speed. */
2785 if (UCHARAT(s) != nextchr)
2787 if (PL_regeol - locinput < st->ln)
2789 if (st->ln > 1 && memNE(s, locinput, st->ln))
2792 nextchr = UCHARAT(locinput);
2796 PL_reg_flags |= RF_tainted;
2799 char * const s = STRING(scan);
2800 st->ln = STR_LEN(scan);
2802 if (do_utf8 || UTF) {
2803 /* Either target or the pattern are utf8. */
2804 const char * const l = locinput;
2805 char *e = PL_regeol;
2807 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2808 l, &e, 0, do_utf8)) {
2809 /* One more case for the sharp s:
2810 * pack("U0U*", 0xDF) =~ /ss/i,
2811 * the 0xC3 0x9F are the UTF-8
2812 * byte sequence for the U+00DF. */
2814 toLOWER(s[0]) == 's' &&
2816 toLOWER(s[1]) == 's' &&
2823 nextchr = UCHARAT(locinput);
2827 /* Neither the target and the pattern are utf8. */
2829 /* Inline the first character, for speed. */
2830 if (UCHARAT(s) != nextchr &&
2831 UCHARAT(s) != ((OP(scan) == EXACTF)
2832 ? PL_fold : PL_fold_locale)[nextchr])
2834 if (PL_regeol - locinput < st->ln)
2836 if (st->ln > 1 && (OP(scan) == EXACTF
2837 ? ibcmp(s, locinput, st->ln)
2838 : ibcmp_locale(s, locinput, st->ln)))
2841 nextchr = UCHARAT(locinput);
2846 STRLEN inclasslen = PL_regeol - locinput;
2848 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
2850 if (locinput >= PL_regeol)
2852 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2853 nextchr = UCHARAT(locinput);
2858 nextchr = UCHARAT(locinput);
2859 if (!REGINCLASS(rex, scan, (U8*)locinput))
2861 if (!nextchr && locinput >= PL_regeol)
2863 nextchr = UCHARAT(++locinput);
2867 /* If we might have the case of the German sharp s
2868 * in a casefolding Unicode character class. */
2870 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2871 locinput += SHARP_S_SKIP;
2872 nextchr = UCHARAT(locinput);
2878 PL_reg_flags |= RF_tainted;
2884 LOAD_UTF8_CHARCLASS_ALNUM();
2885 if (!(OP(scan) == ALNUM
2886 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2887 : isALNUM_LC_utf8((U8*)locinput)))
2891 locinput += PL_utf8skip[nextchr];
2892 nextchr = UCHARAT(locinput);
2895 if (!(OP(scan) == ALNUM
2896 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2898 nextchr = UCHARAT(++locinput);
2901 PL_reg_flags |= RF_tainted;
2904 if (!nextchr && locinput >= PL_regeol)
2907 LOAD_UTF8_CHARCLASS_ALNUM();
2908 if (OP(scan) == NALNUM
2909 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2910 : isALNUM_LC_utf8((U8*)locinput))
2914 locinput += PL_utf8skip[nextchr];
2915 nextchr = UCHARAT(locinput);
2918 if (OP(scan) == NALNUM
2919 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2921 nextchr = UCHARAT(++locinput);
2925 PL_reg_flags |= RF_tainted;
2929 /* was last char in word? */
2931 if (locinput == PL_bostr)
2934 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2936 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
2938 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2939 st->ln = isALNUM_uni(st->ln);
2940 LOAD_UTF8_CHARCLASS_ALNUM();
2941 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2944 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
2945 n = isALNUM_LC_utf8((U8*)locinput);
2949 st->ln = (locinput != PL_bostr) ?
2950 UCHARAT(locinput - 1) : '\n';
2951 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2952 st->ln = isALNUM(st->ln);
2953 n = isALNUM(nextchr);
2956 st->ln = isALNUM_LC(st->ln);
2957 n = isALNUM_LC(nextchr);
2960 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
2961 OP(scan) == BOUNDL))
2965 PL_reg_flags |= RF_tainted;
2971 if (UTF8_IS_CONTINUED(nextchr)) {
2972 LOAD_UTF8_CHARCLASS_SPACE();
2973 if (!(OP(scan) == SPACE
2974 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2975 : isSPACE_LC_utf8((U8*)locinput)))
2979 locinput += PL_utf8skip[nextchr];
2980 nextchr = UCHARAT(locinput);
2983 if (!(OP(scan) == SPACE
2984 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2986 nextchr = UCHARAT(++locinput);
2989 if (!(OP(scan) == SPACE
2990 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2992 nextchr = UCHARAT(++locinput);
2996 PL_reg_flags |= RF_tainted;
2999 if (!nextchr && locinput >= PL_regeol)
3002 LOAD_UTF8_CHARCLASS_SPACE();
3003 if (OP(scan) == NSPACE
3004 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3005 : isSPACE_LC_utf8((U8*)locinput))
3009 locinput += PL_utf8skip[nextchr];
3010 nextchr = UCHARAT(locinput);
3013 if (OP(scan) == NSPACE
3014 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3016 nextchr = UCHARAT(++locinput);
3019 PL_reg_flags |= RF_tainted;
3025 LOAD_UTF8_CHARCLASS_DIGIT();
3026 if (!(OP(scan) == DIGIT
3027 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3028 : isDIGIT_LC_utf8((U8*)locinput)))
3032 locinput += PL_utf8skip[nextchr];
3033 nextchr = UCHARAT(locinput);
3036 if (!(OP(scan) == DIGIT
3037 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3039 nextchr = UCHARAT(++locinput);
3042 PL_reg_flags |= RF_tainted;
3045 if (!nextchr && locinput >= PL_regeol)
3048 LOAD_UTF8_CHARCLASS_DIGIT();
3049 if (OP(scan) == NDIGIT
3050 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3051 : isDIGIT_LC_utf8((U8*)locinput))
3055 locinput += PL_utf8skip[nextchr];
3056 nextchr = UCHARAT(locinput);
3059 if (OP(scan) == NDIGIT
3060 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3062 nextchr = UCHARAT(++locinput);
3065 if (locinput >= PL_regeol)
3068 LOAD_UTF8_CHARCLASS_MARK();
3069 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3071 locinput += PL_utf8skip[nextchr];
3072 while (locinput < PL_regeol &&
3073 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3074 locinput += UTF8SKIP(locinput);
3075 if (locinput > PL_regeol)
3080 nextchr = UCHARAT(locinput);
3083 PL_reg_flags |= RF_tainted;
3088 n = ARG(scan); /* which paren pair */
3089 st->ln = PL_regstartp[n];
3090 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3091 if ((I32)*PL_reglastparen < n || st->ln == -1)
3092 sayNO; /* Do not match unless seen CLOSEn. */
3093 if (st->ln == PL_regendp[n])
3096 s = PL_bostr + st->ln;
3097 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3099 const char *e = PL_bostr + PL_regendp[n];
3101 * Note that we can't do the "other character" lookup trick as
3102 * in the 8-bit case (no pun intended) because in Unicode we
3103 * have to map both upper and title case to lower case.
3105 if (OP(scan) == REFF) {
3107 STRLEN ulen1, ulen2;
3108 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3109 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3113 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3114 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3115 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3122 nextchr = UCHARAT(locinput);
3126 /* Inline the first character, for speed. */
3127 if (UCHARAT(s) != nextchr &&
3129 (UCHARAT(s) != ((OP(scan) == REFF
3130 ? PL_fold : PL_fold_locale)[nextchr]))))
3132 st->ln = PL_regendp[n] - st->ln;
3133 if (locinput + st->ln > PL_regeol)
3135 if (st->ln > 1 && (OP(scan) == REF
3136 ? memNE(s, locinput, st->ln)
3138 ? ibcmp(s, locinput, st->ln)
3139 : ibcmp_locale(s, locinput, st->ln))))
3142 nextchr = UCHARAT(locinput);
3153 #define ST st->u.eval
3155 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3159 /* execute the code in the {...} */
3161 SV ** const before = SP;
3162 OP_4tree * const oop = PL_op;
3163 COP * const ocurcop = PL_curcop;
3167 PL_op = (OP_4tree*)rex->data->data[n];
3168 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3169 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3170 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3172 CALLRUNOPS(aTHX); /* Scalar context. */
3175 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3182 PAD_RESTORE_LOCAL(old_comppad);
3183 PL_curcop = ocurcop;
3186 sv_setsv(save_scalar(PL_replgv), ret);
3190 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3193 /* extract RE object from returned value; compiling if
3198 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3199 mg = mg_find(sv, PERL_MAGIC_qr);
3200 else if (SvSMAGICAL(ret)) {
3201 if (SvGMAGICAL(ret))
3202 sv_unmagic(ret, PERL_MAGIC_qr);
3204 mg = mg_find(ret, PERL_MAGIC_qr);
3208 re = (regexp *)mg->mg_obj;
3209 (void)ReREFCNT_inc(re);
3213 const char * const t = SvPV_const(ret, len);
3215 const I32 osize = PL_regsize;
3218 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3219 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3221 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3223 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3229 /* run the pattern returned from (??{...}) */
3231 debug_start_match(re, do_utf8, locinput, PL_regeol,
3232 "Matching embedded");
3235 ST.cp = regcppush(0); /* Save *all* the positions. */
3236 REGCP_SET(ST.lastcp);
3237 *PL_reglastparen = 0;
3238 *PL_reglastcloseparen = 0;
3239 PL_reginput = locinput;
3241 /* XXXX This is too dramatic a measure... */
3245 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3246 ((re->reganch & ROPT_UTF8) != 0);
3247 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3252 /* now continue from first node in postoned RE */
3253 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3256 /* /(?(?{...})X|Y)/ */
3257 st->sw = SvTRUE(ret);
3262 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3264 PL_reg_flags ^= RF_utf8;
3267 /* XXXX This is too dramatic a measure... */
3269 /* Restore parens of the caller without popping the
3272 const I32 tmp = PL_savestack_ix;
3273 PL_savestack_ix = ST.lastcp;
3275 PL_savestack_ix = tmp;
3277 PL_reginput = locinput;
3278 /* continue at the node following the (??{...}) */
3282 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3283 /* Restore state to the outer re then re-throw the failure */
3285 PL_reg_flags ^= RF_utf8;
3289 /* XXXX This is too dramatic a measure... */
3292 PL_reginput = locinput;
3293 REGCP_UNWIND(ST.lastcp);
3300 n = ARG(scan); /* which paren pair */
3301 PL_reg_start_tmp[n] = locinput;
3306 n = ARG(scan); /* which paren pair */
3307 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3308 PL_regendp[n] = locinput - PL_bostr;
3309 if (n > (I32)*PL_reglastparen)
3310 *PL_reglastparen = n;
3311 *PL_reglastcloseparen = n;
3314 n = ARG(scan); /* which paren pair */
3315 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3318 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3320 next = NEXTOPER(NEXTOPER(scan));
3322 next = scan + ARG(scan);
3323 if (OP(next) == IFTHEN) /* Fake one. */
3324 next = NEXTOPER(NEXTOPER(next));
3328 st->logical = scan->flags;
3330 /*******************************************************************
3331 cc points to the regmatch_state associated with the most recent CURLYX.
3332 This struct contains info about the innermost (...)* loop (an
3333 "infoblock"), and a pointer to the next outer cc.
3335 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3337 1) After matching Y, regnode for CURLYX is processed;
3339 2) This regnode populates cc, and calls regmatch() recursively
3340 with the starting point at WHILEM node;
3342 3) Each hit of WHILEM node tries to match A and Z (in the order
3343 depending on the current iteration, min/max of {min,max} and
3344 greediness). The information about where are nodes for "A"
3345 and "Z" is read from cc, as is info on how many times "A"
3346 was already matched, and greediness.
3348 4) After A matches, the same WHILEM node is hit again.
3350 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3351 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3352 resets cc, since this Y(A)*Z can be a part of some other loop:
3353 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3354 of the external loop.
3356 Currently present infoblocks form a tree with a stem formed by st->cc
3357 and whatever it mentions via ->next, and additional attached trees
3358 corresponding to temporarily unset infoblocks as in "5" above.
3360 In the following picture, infoblocks for outer loop of
3361 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3362 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3363 infoblocks are drawn below the "reset" infoblock.
3365 In fact in the picture below we do not show failed matches for Z and T
3366 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3367 more obvious *why* one needs to *temporary* unset infoblocks.]
3369 Matched REx position InfoBlocks Comment
3373 Y A)*?Z)*?T x <- O <- I
3374 YA )*?Z)*?T x <- O <- I
3375 YA A)*?Z)*?T x <- O <- I
3376 YAA )*?Z)*?T x <- O <- I
3377 YAA Z)*?T x <- O # Temporary unset I
3380 YAAZ Y(A)*?Z)*?T x <- O
3383 YAAZY (A)*?Z)*?T x <- O
3386 YAAZY A)*?Z)*?T x <- O <- I
3389 YAAZYA )*?Z)*?T x <- O <- I
3392 YAAZYA Z)*?T x <- O # Temporary unset I
3398 YAAZYAZ T x # Temporary unset O
3405 *******************************************************************/
3408 /* No need to save/restore up to this paren */
3409 I32 parenfloor = scan->flags;
3413 CURLYX and WHILEM are always paired: they're the moral
3414 equivalent of pp_enteriter anbd pp_iter.
3416 The only time next could be null is if the node tree is
3417 corrupt. This was mentioned on p5p a few days ago.
3419 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3420 So we'll assert that this is true:
3423 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3425 /* XXXX Probably it is better to teach regpush to support
3426 parenfloor > PL_regsize... */
3427 if (parenfloor > (I32)*PL_reglastparen)
3428 parenfloor = *PL_reglastparen; /* Pessimization... */
3430 st->u.curlyx.cp = PL_savestack_ix;
3431 st->u.curlyx.outercc = st->cc;
3433 /* these fields contain the state of the current curly.
3434 * they are accessed by subsequent WHILEMs;
3435 * cur and lastloc are also updated by WHILEM */
3436 st->u.curlyx.parenfloor = parenfloor;
3437 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3438 st->u.curlyx.min = ARG1(scan);
3439 st->u.curlyx.max = ARG2(scan);
3440 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3441 st->u.curlyx.lastloc = 0;
3442 /* st->next and st->minmod are also read by WHILEM */
3444 PL_reginput = locinput;
3445 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3446 /*** all unsaved local vars undefined at this point */
3447 regcpblow(st->u.curlyx.cp);
3448 st->cc = st->u.curlyx.outercc;
3454 * This is really hard to understand, because after we match
3455 * what we're trying to match, we must make sure the rest of
3456 * the REx is going to match for sure, and to do that we have
3457 * to go back UP the parse tree by recursing ever deeper. And
3458 * if it fails, we have to reset our parent's current state
3459 * that we can try again after backing off.
3464 st->cc gets initialised by CURLYX ready for use by WHILEM.
3465 So again, unless somethings been corrupted, st->cc cannot
3466 be null at that point in WHILEM.
3468 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3469 So we'll assert that this is true:
3472 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3473 st->u.whilem.cache_offset = 0;
3474 st->u.whilem.cache_bit = 0;
3476 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3477 PL_reginput = locinput;
3480 PerlIO_printf(Perl_debug_log,
3481 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3482 REPORT_CODE_OFF+PL_regindent*2, "",
3483 (long)n, (long)st->cc->u.curlyx.min,
3484 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3487 /* If degenerate scan matches "", assume scan done. */
3489 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3490 st->u.whilem.savecc = st->cc;
3491 st->cc = st->cc->u.curlyx.outercc;
3493 st->ln = st->cc->u.curlyx.cur;
3495 PerlIO_printf(Perl_debug_log,
3496 "%*s empty match detected, try continuation...\n",
3497 REPORT_CODE_OFF+PL_regindent*2, "")
3499 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3500 /*** all unsaved local vars undefined at this point */
3501 st->cc = st->u.whilem.savecc;
3504 if (st->cc->u.curlyx.outercc)
3505 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3509 /* First just match a string of min scans. */
3511 if (n < st->cc->u.curlyx.min) {
3512 st->cc->u.curlyx.cur = n;
3513 st->cc->u.curlyx.lastloc = locinput;
3514 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3515 /*** all unsaved local vars undefined at this point */
3518 st->cc->u.curlyx.cur = n - 1;
3519 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3524 /* Check whether we already were at this position.
3525 Postpone detection until we know the match is not
3526 *that* much linear. */
3527 if (!PL_reg_maxiter) {
3528 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3529 /* possible overflow for long strings and many CURLYX's */
3530 if (PL_reg_maxiter < 0)
3531 PL_reg_maxiter = I32_MAX;
3532 PL_reg_leftiter = PL_reg_maxiter;
3534 if (PL_reg_leftiter-- == 0) {
3535 const I32 size = (PL_reg_maxiter + 7)/8;
3536 if (PL_reg_poscache) {
3537 if ((I32)PL_reg_poscache_size < size) {
3538 Renew(PL_reg_poscache, size, char);
3539 PL_reg_poscache_size = size;
3541 Zero(PL_reg_poscache, size, char);
3544 PL_reg_poscache_size = size;
3545 Newxz(PL_reg_poscache, size, char);
3548 PerlIO_printf(Perl_debug_log,
3549 "%sDetected a super-linear match, switching on caching%s...\n",
3550 PL_colors[4], PL_colors[5])
3553 if (PL_reg_leftiter < 0) {
3554 st->u.whilem.cache_offset = locinput - PL_bostr;
3556 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
3557 + st->u.whilem.cache_offset * (scan->flags>>4);
3558 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3559 st->u.whilem.cache_offset /= 8;
3560 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3562 PerlIO_printf(Perl_debug_log,
3563 "%*s already tried at this position...\n",
3564 REPORT_CODE_OFF+PL_regindent*2, "")
3566 sayNO; /* cache records failure */
3571 /* Prefer next over scan for minimal matching. */
3573 if (st->cc->minmod) {
3574 st->u.whilem.savecc = st->cc;
3575 st->cc = st->cc->u.curlyx.outercc;
3577 st->ln = st->cc->u.curlyx.cur;
3578 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3579 REGCP_SET(st->u.whilem.lastcp);
3580 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3581 /*** all unsaved local vars undefined at this point */
3582 st->cc = st->u.whilem.savecc;
3584 regcpblow(st->u.whilem.cp);
3585 sayYES; /* All done. */
3587 REGCP_UNWIND(st->u.whilem.lastcp);
3589 if (st->cc->u.curlyx.outercc)
3590 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3592 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3593 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3594 && !(PL_reg_flags & RF_warned)) {
3595 PL_reg_flags |= RF_warned;
3596 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3597 "Complex regular subexpression recursion",
3604 PerlIO_printf(Perl_debug_log,
3605 "%*s trying longer...\n",
3606 REPORT_CODE_OFF+PL_regindent*2, "")
3608 /* Try scanning more and see if it helps. */
3609 PL_reginput = locinput;
3610 st->cc->u.curlyx.cur = n;
3611 st->cc->u.curlyx.lastloc = locinput;
3612 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3613 REGCP_SET(st->u.whilem.lastcp);
3614 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3615 /*** all unsaved local vars undefined at this point */
3617 regcpblow(st->u.whilem.cp);
3620 REGCP_UNWIND(st->u.whilem.lastcp);
3622 st->cc->u.curlyx.cur = n - 1;
3623 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3627 /* Prefer scan over next for maximal matching. */
3629 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3630 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3631 st->cc->u.curlyx.cur = n;
3632 st->cc->u.curlyx.lastloc = locinput;
3633 REGCP_SET(st->u.whilem.lastcp);
3634 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3635 /*** all unsaved local vars undefined at this point */
3637 regcpblow(st->u.whilem.cp);
3640 REGCP_UNWIND(st->u.whilem.lastcp);
3641 regcppop(rex); /* Restore some previous $<digit>s? */
3642 PL_reginput = locinput;
3644 PerlIO_printf(Perl_debug_log,
3645 "%*s failed, try continuation...\n",
3646 REPORT_CODE_OFF+PL_regindent*2, "")
3649 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3650 && !(PL_reg_flags & RF_warned)) {
3651 PL_reg_flags |= RF_warned;
3652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3653 "Complex regular subexpression recursion",
3657 /* Failed deeper matches of scan, so see if this one works. */
3658 st->u.whilem.savecc = st->cc;
3659 st->cc = st->cc->u.curlyx.outercc;
3661 st->ln = st->cc->u.curlyx.cur;
3662 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3663 /*** all unsaved local vars undefined at this point */
3664 st->cc = st->u.whilem.savecc;
3667 if (st->cc->u.curlyx.outercc)
3668 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3669 st->cc->u.curlyx.cur = n - 1;
3670 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3676 #define ST st->u.branch
3678 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3679 next = scan + ARG(scan);
3682 scan = NEXTOPER(scan);
3685 case BRANCH: /* /(...|A|...)/ */
3686 scan = NEXTOPER(scan); /* scan now points to inner node */
3687 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3688 /* last branch; skip state push and jump direct to node */
3690 ST.lastparen = *PL_reglastparen;
3691 ST.next_branch = next;
3693 PL_reginput = locinput;
3695 /* Now go into the branch */
3696 PUSH_STATE_GOTO(BRANCH_next, scan);
3699 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3700 REGCP_UNWIND(ST.cp);
3701 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3703 *PL_reglastparen = n;
3704 scan = ST.next_branch;
3705 /* no more branches? */
3706 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3708 continue; /* execute next BRANCH[J] op */
3716 #define ST st->u.curlym
3718 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3720 /* This is an optimisation of CURLYX that enables us to push
3721 * only a single backtracking state, no matter now many matches
3722 * there are in {m,n}. It relies on the pattern being constant
3723 * length, with no parens to influence future backrefs
3727 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3729 /* if paren positive, emulate an OPEN/CLOSE around A */
3731 I32 paren = ST.me->flags;
3732 if (paren > PL_regsize)
3734 if (paren > (I32)*PL_reglastparen)
3735 *PL_reglastparen = paren;
3736 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3742 ST.minmod = st->minmod;
3744 ST.c1 = CHRTEST_UNINIT;
3747 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3750 curlym_do_A: /* execute the A in /A{m,n}B/ */
3751 PL_reginput = locinput;
3752 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3755 case CURLYM_A: /* we've just matched an A */
3756 locinput = st->locinput;
3757 nextchr = UCHARAT(locinput);
3760 /* after first match, determine A's length: u.curlym.alen */
3761 if (ST.count == 1) {
3762 if (PL_reg_match_utf8) {
3764 while (s < PL_reginput) {
3770 ST.alen = PL_reginput - locinput;
3773 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3776 PerlIO_printf(Perl_debug_log,
3777 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3778 (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
3779 (IV) ST.count, (IV)ST.alen)
3782 locinput = PL_reginput;
3783 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3784 goto curlym_do_A; /* try to match another A */
3785 goto curlym_do_B; /* try to match B */
3787 case CURLYM_A_fail: /* just failed to match an A */
3788 REGCP_UNWIND(ST.cp);
3789 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3792 curlym_do_B: /* execute the B in /A{m,n}B/ */
3793 PL_reginput = locinput;
3794 if (ST.c1 == CHRTEST_UNINIT) {
3795 /* calculate c1 and c2 for possible match of 1st char
3796 * following curly */
3797 ST.c1 = ST.c2 = CHRTEST_VOID;
3798 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3799 regnode *text_node = ST.B;
3800 if (! HAS_TEXT(text_node))
3801 FIND_NEXT_IMPT(text_node);
3802 if (HAS_TEXT(text_node)
3803 && PL_regkind[OP(text_node)] != REF)
3805 ST.c1 = (U8)*STRING(text_node);
3807 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3809 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3810 ? PL_fold_locale[ST.c1]
3817 PerlIO_printf(Perl_debug_log,
3818 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
3819 (int)(REPORT_CODE_OFF+(PL_regindent*2)),
3822 if (ST.c1 != CHRTEST_VOID
3823 && UCHARAT(PL_reginput) != ST.c1
3824 && UCHARAT(PL_reginput) != ST.c2)
3826 /* simulate B failing */
3827 state_num = CURLYM_B_fail;
3828 goto reenter_switch;
3832 /* mark current A as captured */
3833 I32 paren = ST.me->flags;
3836 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3837 PL_regendp[paren] = PL_reginput - PL_bostr;
3840 PL_regendp[paren] = -1;
3842 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
3845 case CURLYM_B_fail: /* just failed to match a B */
3846 REGCP_UNWIND(ST.cp);
3848 if (ST.count == ARG2(ST.me) /* max */)
3850 goto curlym_do_A; /* try to match a further A */
3852 /* backtrack one A */
3853 if (ST.count == ARG1(ST.me) /* min */)
3856 locinput = HOPc(locinput, -ST.alen);
3857 goto curlym_do_B; /* try to match B */
3860 #define ST st->u.curly
3862 #define CURLY_SETPAREN(paren, success) \
3865 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3866 PL_regendp[paren] = locinput - PL_bostr; \
3869 PL_regendp[paren] = -1; \
3872 case STAR: /* /A*B/ where A is width 1 */
3876 scan = NEXTOPER(scan);
3878 case PLUS: /* /A+B/ where A is width 1 */
3882 scan = NEXTOPER(scan);
3884 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3885 ST.paren = scan->flags; /* Which paren to set */
3886 if (ST.paren > PL_regsize)
3887 PL_regsize = ST.paren;
3888 if (ST.paren > (I32)*PL_reglastparen)
3889 *PL_reglastparen = ST.paren;
3890 ST.min = ARG1(scan); /* min to match */
3891 ST.max = ARG2(scan); /* max to match */
3892 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3894 case CURLY: /* /A{m,n}B/ where A is width 1 */
3896 ST.min = ARG1(scan); /* min to match */
3897 ST.max = ARG2(scan); /* max to match */
3898 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3901 * Lookahead to avoid useless match attempts
3902 * when we know what character comes next.
3904 * Used to only do .*x and .*?x, but now it allows
3905 * for )'s, ('s and (?{ ... })'s to be in the way
3906 * of the quantifier and the EXACT-like node. -- japhy
3909 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3911 if (HAS_TEXT(next) || JUMPABLE(next)) {
3913 regnode *text_node = next;
3915 if (! HAS_TEXT(text_node))
3916 FIND_NEXT_IMPT(text_node);
3918 if (! HAS_TEXT(text_node))
3919 ST.c1 = ST.c2 = CHRTEST_VOID;
3921 if (PL_regkind[OP(text_node)] == REF) {
3922 ST.c1 = ST.c2 = CHRTEST_VOID;
3923 goto assume_ok_easy;