5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
38 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
70 #define PERL_IN_REGEXEC_C
73 #ifdef PERL_IN_XSUB_RE
79 #define RF_tainted 1 /* tainted information used? */
80 #define RF_warned 2 /* warned about big count? */
81 #define RF_evaled 4 /* Did an EVAL with setting? */
82 #define RF_utf8 8 /* String contains multibyte chars? */
84 #define UTF ((PL_reg_flags & RF_utf8) != 0)
86 #define RS_init 1 /* eval environment created */
87 #define RS_set 2 /* replsv value is set */
93 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
99 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
100 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
102 #define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) ((char*) \
106 ((PL_reg_match_utf8) \
107 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
108 : (pos - off >= PL_bostr) \
113 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
114 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
115 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
117 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
118 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
119 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
120 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
121 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
122 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) \
142 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
143 rn = NEXTOPER(NEXTOPER(rn)); \
144 else if (OP(rn) == PLUS) \
146 else if (OP(rn) == IFMATCH) \
147 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
148 else rn += NEXT_OFF(rn); \
151 static void restore_pos(pTHX_ void *arg);
154 S_regcppush(pTHX_ I32 parenfloor)
157 const int retval = PL_savestack_ix;
158 #define REGCP_PAREN_ELEMS 4
159 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
162 if (paren_elems_to_push < 0)
163 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
165 #define REGCP_OTHER_ELEMS 6
166 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
167 for (p = PL_regsize; p > parenfloor; p--) {
168 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
169 SSPUSHINT(PL_regendp[p]);
170 SSPUSHINT(PL_regstartp[p]);
171 SSPUSHPTR(PL_reg_start_tmp[p]);
174 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
175 SSPUSHINT(PL_regsize);
176 SSPUSHINT(*PL_reglastparen);
177 SSPUSHINT(*PL_reglastcloseparen);
178 SSPUSHPTR(PL_reginput);
179 #define REGCP_FRAME_ELEMS 2
180 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
181 * are needed for the regexp context stack bookkeeping. */
182 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
183 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
188 /* These are needed since we do not localize EVAL nodes: */
189 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
190 " Setting an EVAL scope, savestack=%"IVdf"\n", \
191 (IV)PL_savestack_ix)); cp = PL_savestack_ix
193 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
194 PerlIO_printf(Perl_debug_log, \
195 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
196 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
199 S_regcppop(pTHX_ const regexp *rex)
205 GET_RE_DEBUG_FLAGS_DECL;
207 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
209 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
210 i = SSPOPINT; /* Parentheses elements to pop. */
211 input = (char *) SSPOPPTR;
212 *PL_reglastcloseparen = SSPOPINT;
213 *PL_reglastparen = SSPOPINT;
214 PL_regsize = SSPOPINT;
216 /* Now restore the parentheses context. */
217 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
218 i > 0; i -= REGCP_PAREN_ELEMS) {
220 U32 paren = (U32)SSPOPINT;
221 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
222 PL_regstartp[paren] = SSPOPINT;
224 if (paren <= *PL_reglastparen)
225 PL_regendp[paren] = tmps;
227 PerlIO_printf(Perl_debug_log,
228 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
229 (UV)paren, (IV)PL_regstartp[paren],
230 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
231 (IV)PL_regendp[paren],
232 (paren > *PL_reglastparen ? "(no)" : ""));
236 if (*PL_reglastparen + 1 <= rex->nparens) {
237 PerlIO_printf(Perl_debug_log,
238 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
239 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
243 /* It would seem that the similar code in regtry()
244 * already takes care of this, and in fact it is in
245 * a better location to since this code can #if 0-ed out
246 * but the code in regtry() is needed or otherwise tests
247 * requiring null fields (pat.t#187 and split.t#{13,14}
248 * (as of patchlevel 7877) will fail. Then again,
249 * this code seems to be necessary or otherwise
250 * building DynaLoader will fail:
251 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
253 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
255 PL_regstartp[i] = -1;
262 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
264 #define TRYPAREN(paren, n, input, where) { \
267 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
268 PL_regendp[paren] = input - PL_bostr; \
271 PL_regendp[paren] = -1; \
273 REGMATCH(next, where); \
277 PL_regendp[paren] = -1; \
282 * pregexec and friends
285 #ifndef PERL_IN_XSUB_RE
287 - pregexec - match a regexp against a string
290 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
291 char *strbeg, I32 minend, SV *screamer, U32 nosave)
292 /* strend: pointer to null at end of string */
293 /* strbeg: real beginning of string */
294 /* minend: end of match must be >=minend after stringarg. */
295 /* nosave: For optimizations. */
298 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
299 nosave ? 0 : REXEC_COPY_STR);
304 * Need to implement the following flags for reg_anch:
306 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
308 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
309 * INTUIT_AUTORITATIVE_ML
310 * INTUIT_ONCE_NOML - Intuit can match in one location only.
313 * Another flag for this function: SECOND_TIME (so that float substrs
314 * with giant delta may be not rechecked).
317 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
319 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
320 Otherwise, only SvCUR(sv) is used to get strbeg. */
322 /* XXXX We assume that strpos is strbeg unless sv. */
324 /* XXXX Some places assume that there is a fixed substring.
325 An update may be needed if optimizer marks as "INTUITable"
326 RExen without fixed substrings. Similarly, it is assumed that
327 lengths of all the strings are no more than minlen, thus they
328 cannot come from lookahead.
329 (Or minlen should take into account lookahead.) */
331 /* A failure to find a constant substring means that there is no need to make
332 an expensive call to REx engine, thus we celebrate a failure. Similarly,
333 finding a substring too deep into the string means that less calls to
334 regtry() should be needed.
336 REx compiler's optimizer found 4 possible hints:
337 a) Anchored substring;
339 c) Whether we are anchored (beginning-of-line or \G);
340 d) First node (of those at offset 0) which may distingush positions;
341 We use a)b)d) and multiline-part of c), and try to find a position in the
342 string which does not contradict any of them.
345 /* Most of decisions we do here should have been done at compile time.
346 The nodes of the REx which we used for the search should have been
347 deleted from the finite automaton. */
350 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
351 char *strend, U32 flags, re_scream_pos_data *data)
354 register I32 start_shift = 0;
355 /* Should be nonnegative! */
356 register I32 end_shift = 0;
361 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
363 register char *other_last = NULL; /* other substr checked before this */
364 char *check_at = NULL; /* check substr found at this pos */
365 const I32 multiline = prog->reganch & PMf_MULTILINE;
367 const char * const i_strpos = strpos;
368 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
371 GET_RE_DEBUG_FLAGS_DECL;
373 RX_MATCH_UTF8_set(prog,do_utf8);
375 if (prog->reganch & ROPT_UTF8) {
376 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
377 "UTF-8 regex...\n"));
378 PL_reg_flags |= RF_utf8;
382 const char *s = PL_reg_match_utf8 ?
383 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
385 const int len = PL_reg_match_utf8 ?
386 (int)strlen(s) : strend - strpos;
389 if (PL_reg_match_utf8)
390 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
391 "UTF-8 target...\n"));
392 PerlIO_printf(Perl_debug_log,
393 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
394 PL_colors[4], PL_colors[5], PL_colors[0],
397 (strlen(prog->precomp) > 60 ? "..." : ""),
399 (int)(len > 60 ? 60 : len),
401 (len > 60 ? "..." : "")
405 /* CHR_DIST() would be more correct here but it makes things slow. */
406 if (prog->minlen > strend - strpos) {
407 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
408 "String too short... [re_intuit_start]\n"));
411 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
414 if (!prog->check_utf8 && prog->check_substr)
415 to_utf8_substr(prog);
416 check = prog->check_utf8;
418 if (!prog->check_substr && prog->check_utf8)
419 to_byte_substr(prog);
420 check = prog->check_substr;
422 if (check == &PL_sv_undef) {
423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
424 "Non-utf string cannot match utf check string\n"));
427 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
428 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
429 || ( (prog->reganch & ROPT_ANCH_BOL)
430 && !multiline ) ); /* Check after \n? */
433 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
434 | ROPT_IMPLICIT)) /* not a real BOL */
435 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
437 && (strpos != strbeg)) {
438 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
441 if (prog->check_offset_min == prog->check_offset_max &&
442 !(prog->reganch & ROPT_CANY_SEEN)) {
443 /* Substring at constant offset from beg-of-str... */
446 s = HOP3c(strpos, prog->check_offset_min, strend);
448 slen = SvCUR(check); /* >= 1 */
450 if ( strend - s > slen || strend - s < slen - 1
451 || (strend - s == slen && strend[-1] != '\n')) {
452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
455 /* Now should match s[0..slen-2] */
457 if (slen && (*SvPVX_const(check) != *s
459 && memNE(SvPVX_const(check), s, slen)))) {
461 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
465 else if (*SvPVX_const(check) != *s
466 || ((slen = SvCUR(check)) > 1
467 && memNE(SvPVX_const(check), s, slen)))
470 goto success_at_start;
473 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
475 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
476 end_shift = prog->minlen - start_shift -
477 CHR_SVLEN(check) + (SvTAIL(check) != 0);
479 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
480 - (SvTAIL(check) != 0);
481 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
483 if (end_shift < eshift)
487 else { /* Can match at random position */
490 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
491 /* Should be nonnegative! */
492 end_shift = prog->minlen - start_shift -
493 CHR_SVLEN(check) + (SvTAIL(check) != 0);
496 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
498 Perl_croak(aTHX_ "panic: end_shift");
502 /* Find a possible match in the region s..strend by looking for
503 the "check" substring in the region corrected by start/end_shift. */
504 if (flags & REXEC_SCREAM) {
505 I32 p = -1; /* Internal iterator of scream. */
506 I32 * const pp = data ? data->scream_pos : &p;
508 if (PL_screamfirst[BmRARE(check)] >= 0
509 || ( BmRARE(check) == '\n'
510 && (BmPREVIOUS(check) == SvCUR(check) - 1)
512 s = screaminstr(sv, check,
513 start_shift + (s - strbeg), end_shift, pp, 0);
516 /* we may be pointing at the wrong string */
517 if (s && RX_MATCH_COPIED(prog))
518 s = strbeg + (s - SvPVX_const(sv));
520 *data->scream_olds = s;
522 else if (prog->reganch & ROPT_CANY_SEEN)
523 s = fbm_instr((U8*)(s + start_shift),
524 (U8*)(strend - end_shift),
525 check, multiline ? FBMrf_MULTILINE : 0);
527 s = fbm_instr(HOP3(s, start_shift, strend),
528 HOP3(strend, -end_shift, strbeg),
529 check, multiline ? FBMrf_MULTILINE : 0);
531 /* Update the count-of-usability, remove useless subpatterns,
534 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
535 (s ? "Found" : "Did not find"),
536 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
538 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
540 PL_colors[1], (SvTAIL(check) ? "$" : ""),
541 (s ? " at offset " : "...\n") ) );
548 /* Finish the diagnostic message */
549 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
551 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
552 Start with the other substr.
553 XXXX no SCREAM optimization yet - and a very coarse implementation
554 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
555 *always* match. Probably should be marked during compile...
556 Probably it is right to do no SCREAM here...
559 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
560 /* Take into account the "other" substring. */
561 /* XXXX May be hopelessly wrong for UTF... */
564 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
567 char * const last = HOP3c(s, -start_shift, strbeg);
572 t = s - prog->check_offset_max;
573 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
575 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
580 t = HOP3c(t, prog->anchored_offset, strend);
581 if (t < other_last) /* These positions already checked */
583 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
586 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
587 /* On end-of-str: see comment below. */
588 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
589 if (must == &PL_sv_undef) {
591 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
596 HOP3(HOP3(last1, prog->anchored_offset, strend)
597 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
599 multiline ? FBMrf_MULTILINE : 0
601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
602 "%s anchored substr \"%s%.*s%s\"%s",
603 (s ? "Found" : "Contradicts"),
606 - (SvTAIL(must)!=0)),
608 PL_colors[1], (SvTAIL(must) ? "$" : "")));
610 if (last1 >= last2) {
611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
612 ", giving up...\n"));
615 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
616 ", trying floating at offset %ld...\n",
617 (long)(HOP3c(s1, 1, strend) - i_strpos)));
618 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
619 s = HOP3c(last, 1, strend);
623 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
624 (long)(s - i_strpos)));
625 t = HOP3c(s, -prog->anchored_offset, strbeg);
626 other_last = HOP3c(s, 1, strend);
634 else { /* Take into account the floating substring. */
639 t = HOP3c(s, -start_shift, strbeg);
641 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
642 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
643 last = HOP3c(t, prog->float_max_offset, strend);
644 s = HOP3c(t, prog->float_min_offset, strend);
647 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
648 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
649 /* fbm_instr() takes into account exact value of end-of-str
650 if the check is SvTAIL(ed). Since false positives are OK,
651 and end-of-str is not later than strend we are OK. */
652 if (must == &PL_sv_undef) {
654 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
657 s = fbm_instr((unsigned char*)s,
658 (unsigned char*)last + SvCUR(must)
660 must, multiline ? FBMrf_MULTILINE : 0);
661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
662 (s ? "Found" : "Contradicts"),
664 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
666 PL_colors[1], (SvTAIL(must) ? "$" : "")));
669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
670 ", giving up...\n"));
673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
674 ", trying anchored starting at offset %ld...\n",
675 (long)(s1 + 1 - i_strpos)));
677 s = HOP3c(t, 1, strend);
681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
682 (long)(s - i_strpos)));
683 other_last = s; /* Fix this later. --Hugo */
692 t = s - prog->check_offset_max;
693 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
695 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
697 /* Fixed substring is found far enough so that the match
698 cannot start at strpos. */
700 if (ml_anch && t[-1] != '\n') {
701 /* Eventually fbm_*() should handle this, but often
702 anchored_offset is not 0, so this check will not be wasted. */
703 /* XXXX In the code below we prefer to look for "^" even in
704 presence of anchored substrings. And we search even
705 beyond the found float position. These pessimizations
706 are historical artefacts only. */
708 while (t < strend - prog->minlen) {
710 if (t < check_at - prog->check_offset_min) {
711 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
712 /* Since we moved from the found position,
713 we definitely contradict the found anchored
714 substr. Due to the above check we do not
715 contradict "check" substr.
716 Thus we can arrive here only if check substr
717 is float. Redo checking for "other"=="fixed".
720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
721 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
722 goto do_other_anchored;
724 /* We don't contradict the found floating substring. */
725 /* XXXX Why not check for STCLASS? */
727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
728 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
731 /* Position contradicts check-string */
732 /* XXXX probably better to look for check-string
733 than for "\n", so one should lower the limit for t? */
734 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
735 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
736 other_last = strpos = s = t + 1;
741 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
742 PL_colors[0], PL_colors[1]));
746 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
747 PL_colors[0], PL_colors[1]));
751 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
754 /* The found string does not prohibit matching at strpos,
755 - no optimization of calling REx engine can be performed,
756 unless it was an MBOL and we are not after MBOL,
757 or a future STCLASS check will fail this. */
759 /* Even in this situation we may use MBOL flag if strpos is offset
760 wrt the start of the string. */
761 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
762 && (strpos != strbeg) && strpos[-1] != '\n'
763 /* May be due to an implicit anchor of m{.*foo} */
764 && !(prog->reganch & ROPT_IMPLICIT))
769 DEBUG_EXECUTE_r( if (ml_anch)
770 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
771 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
774 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
776 prog->check_utf8 /* Could be deleted already */
777 && --BmUSEFUL(prog->check_utf8) < 0
778 && (prog->check_utf8 == prog->float_utf8)
780 prog->check_substr /* Could be deleted already */
781 && --BmUSEFUL(prog->check_substr) < 0
782 && (prog->check_substr == prog->float_substr)
785 /* If flags & SOMETHING - do not do it many times on the same match */
786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
787 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
788 if (do_utf8 ? prog->check_substr : prog->check_utf8)
789 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
790 prog->check_substr = prog->check_utf8 = NULL; /* disable */
791 prog->float_substr = prog->float_utf8 = NULL; /* clear */
792 check = NULL; /* abort */
794 /* XXXX This is a remnant of the old implementation. It
795 looks wasteful, since now INTUIT can use many
797 prog->reganch &= ~RE_USE_INTUIT;
804 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
805 if (prog->regstclass) {
806 /* minlen == 0 is possible if regstclass is \b or \B,
807 and the fixed substr is ''$.
808 Since minlen is already taken into account, s+1 is before strend;
809 accidentally, minlen >= 1 guaranties no false positives at s + 1
810 even for \b or \B. But (minlen? 1 : 0) below assumes that
811 regstclass does not come from lookahead... */
812 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
813 This leaves EXACTF only, which is dealt with in find_byclass(). */
814 const U8* const str = (U8*)STRING(prog->regstclass);
815 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
816 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
818 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
819 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
820 : (prog->float_substr || prog->float_utf8
821 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
826 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
829 const char *what = NULL;
831 if (endpos == strend) {
832 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
833 "Could not match STCLASS...\n") );
836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
837 "This position contradicts STCLASS...\n") );
838 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
840 /* Contradict one of substrings */
841 if (prog->anchored_substr || prog->anchored_utf8) {
842 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
843 DEBUG_EXECUTE_r( what = "anchored" );
845 s = HOP3c(t, 1, strend);
846 if (s + start_shift + end_shift > strend) {
847 /* XXXX Should be taken into account earlier? */
848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
849 "Could not match STCLASS...\n") );
854 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
855 "Looking for %s substr starting at offset %ld...\n",
856 what, (long)(s + start_shift - i_strpos)) );
859 /* Have both, check_string is floating */
860 if (t + start_shift >= check_at) /* Contradicts floating=check */
861 goto retry_floating_check;
862 /* Recheck anchored substring, but not floating... */
866 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
867 "Looking for anchored substr starting at offset %ld...\n",
868 (long)(other_last - i_strpos)) );
869 goto do_other_anchored;
871 /* Another way we could have checked stclass at the
872 current position only: */
877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
878 "Looking for /%s^%s/m starting at offset %ld...\n",
879 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
882 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
884 /* Check is floating subtring. */
885 retry_floating_check:
886 t = check_at - start_shift;
887 DEBUG_EXECUTE_r( what = "floating" );
888 goto hop_and_restart;
891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
892 "By STCLASS: moving %ld --> %ld\n",
893 (long)(t - i_strpos), (long)(s - i_strpos))
897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
898 "Does not contradict STCLASS...\n");
903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
904 PL_colors[4], (check ? "Guessed" : "Giving up"),
905 PL_colors[5], (long)(s - i_strpos)) );
908 fail_finish: /* Substring not found */
909 if (prog->check_substr || prog->check_utf8) /* could be removed already */
910 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
912 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
913 PL_colors[4], PL_colors[5]));
917 /* We know what class REx starts with. Try to find this position... */
918 /* if reginfo is NULL, its a dryrun */
921 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char
922 *strend, const regmatch_info *reginfo)
925 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
929 register STRLEN uskip;
933 register I32 tmp = 1; /* Scratch variable? */
934 register const bool do_utf8 = PL_reg_match_utf8;
936 /* We know what class it must start with. */
940 while (s + (uskip = UTF8SKIP(s)) <= strend) {
941 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
942 !UTF8_IS_INVARIANT((U8)s[0]) ?
943 reginclass(prog, c, (U8*)s, 0, do_utf8) :
944 REGINCLASS(prog, c, (U8*)s)) {
945 if (tmp && (!reginfo || regtry(reginfo, s)))
959 if (REGINCLASS(prog, c, (U8*)s) ||
960 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
961 /* The assignment of 2 is intentional:
962 * for the folded sharp s, the skip is 2. */
963 (skip = SHARP_S_SKIP))) {
964 if (tmp && (!reginfo || regtry(reginfo, s)))
977 if (tmp && (!reginfo || regtry(reginfo, s)))
986 ln = STR_LEN(c); /* length to match in octets/bytes */
987 lnc = (I32) ln; /* length to match in characters */
991 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
992 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
993 const U32 uniflags = UTF8_ALLOW_DEFAULT;
995 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
996 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
998 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1000 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1003 while (sm < ((U8 *) m + ln)) {
1018 c2 = PL_fold_locale[c1];
1020 e = HOP3c(strend, -((I32)lnc), s);
1022 if (!reginfo && e < s)
1023 e = s; /* Due to minlen logic of intuit() */
1025 /* The idea in the EXACTF* cases is to first find the
1026 * first character of the EXACTF* node and then, if
1027 * necessary, case-insensitively compare the full
1028 * text of the node. The c1 and c2 are the first
1029 * characters (though in Unicode it gets a bit
1030 * more complicated because there are more cases
1031 * than just upper and lower: one needs to use
1032 * the so-called folding case for case-insensitive
1033 * matching (called "loose matching" in Unicode).
1034 * ibcmp_utf8() will do just that. */
1038 U8 tmpbuf [UTF8_MAXBYTES+1];
1039 STRLEN len, foldlen;
1040 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1042 /* Upper and lower of 1st char are equal -
1043 * probably not a "letter". */
1045 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1049 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1050 m, (char **)0, ln, (bool)UTF))
1051 && (!reginfo || regtry(reginfo, s)) )
1054 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1055 uvchr_to_utf8(tmpbuf, c);
1056 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1058 && (f == c1 || f == c2)
1059 && (ln == foldlen ||
1060 !ibcmp_utf8((char *) foldbuf,
1061 (char **)0, foldlen, do_utf8,
1063 (char **)0, ln, (bool)UTF))
1064 && (!reginfo || regtry(reginfo, s)) )
1072 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1075 /* Handle some of the three Greek sigmas cases.
1076 * Note that not all the possible combinations
1077 * are handled here: some of them are handled
1078 * by the standard folding rules, and some of
1079 * them (the character class or ANYOF cases)
1080 * are handled during compiletime in
1081 * regexec.c:S_regclass(). */
1082 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1083 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1084 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1086 if ( (c == c1 || c == c2)
1088 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1089 m, (char **)0, ln, (bool)UTF))
1090 && (!reginfo || regtry(reginfo, s)) )
1093 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1094 uvchr_to_utf8(tmpbuf, c);
1095 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1097 && (f == c1 || f == c2)
1098 && (ln == foldlen ||
1099 !ibcmp_utf8((char *) foldbuf,
1100 (char **)0, foldlen, do_utf8,
1102 (char **)0, ln, (bool)UTF))
1103 && (!reginfo || regtry(reginfo, s)) )
1114 && (ln == 1 || !(OP(c) == EXACTF
1116 : ibcmp_locale(s, m, ln)))
1117 && (!reginfo || regtry(reginfo, s)) )
1123 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1124 && (ln == 1 || !(OP(c) == EXACTF
1126 : ibcmp_locale(s, m, ln)))
1127 && (!reginfo || regtry(reginfo, s)) )
1134 PL_reg_flags |= RF_tainted;
1141 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1142 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1144 tmp = ((OP(c) == BOUND ?
1145 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1146 LOAD_UTF8_CHARCLASS_ALNUM();
1147 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1148 if (tmp == !(OP(c) == BOUND ?
1149 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1150 isALNUM_LC_utf8((U8*)s)))
1153 if ((!reginfo || regtry(reginfo, s)))
1160 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1161 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1162 while (s < strend) {
1164 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1166 if ((!reginfo || regtry(reginfo, s)))
1172 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1176 PL_reg_flags |= RF_tainted;
1183 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1184 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1186 tmp = ((OP(c) == NBOUND ?
1187 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1188 LOAD_UTF8_CHARCLASS_ALNUM();
1189 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1190 if (tmp == !(OP(c) == NBOUND ?
1191 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1192 isALNUM_LC_utf8((U8*)s)))
1194 else if ((!reginfo || regtry(reginfo, s)))
1200 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1201 tmp = ((OP(c) == NBOUND ?
1202 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1203 while (s < strend) {
1205 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1207 else if ((!reginfo || regtry(reginfo, s)))
1212 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1217 LOAD_UTF8_CHARCLASS_ALNUM();
1218 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1219 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1220 if (tmp && (!reginfo || regtry(reginfo, s)))
1231 while (s < strend) {
1233 if (tmp && (!reginfo || regtry(reginfo, s)))
1245 PL_reg_flags |= RF_tainted;
1247 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1248 if (isALNUM_LC_utf8((U8*)s)) {
1249 if (tmp && (!reginfo || regtry(reginfo, s)))
1260 while (s < strend) {
1261 if (isALNUM_LC(*s)) {
1262 if (tmp && (!reginfo || regtry(reginfo, s)))
1275 LOAD_UTF8_CHARCLASS_ALNUM();
1276 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1277 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1278 if (tmp && (!reginfo || regtry(reginfo, s)))
1289 while (s < strend) {
1291 if (tmp && (!reginfo || regtry(reginfo, s)))
1303 PL_reg_flags |= RF_tainted;
1305 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1306 if (!isALNUM_LC_utf8((U8*)s)) {
1307 if (tmp && (!reginfo || regtry(reginfo, s)))
1318 while (s < strend) {
1319 if (!isALNUM_LC(*s)) {
1320 if (tmp && (!reginfo || regtry(reginfo, s)))
1333 LOAD_UTF8_CHARCLASS_SPACE();
1334 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1335 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1336 if (tmp && (!reginfo || regtry(reginfo, s)))
1347 while (s < strend) {
1349 if (tmp && (!reginfo || regtry(reginfo, s)))
1361 PL_reg_flags |= RF_tainted;
1363 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1364 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1365 if (tmp && (!reginfo || regtry(reginfo, s)))
1376 while (s < strend) {
1377 if (isSPACE_LC(*s)) {
1378 if (tmp && (!reginfo || regtry(reginfo, s)))
1391 LOAD_UTF8_CHARCLASS_SPACE();
1392 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1393 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1394 if (tmp && (!reginfo || regtry(reginfo, s)))
1405 while (s < strend) {
1407 if (tmp && (!reginfo || regtry(reginfo, s)))
1419 PL_reg_flags |= RF_tainted;
1421 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1422 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1423 if (tmp && (!reginfo || regtry(reginfo, s)))
1434 while (s < strend) {
1435 if (!isSPACE_LC(*s)) {
1436 if (tmp && (!reginfo || regtry(reginfo, s)))
1449 LOAD_UTF8_CHARCLASS_DIGIT();
1450 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1451 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1452 if (tmp && (!reginfo || regtry(reginfo, s)))
1463 while (s < strend) {
1465 if (tmp && (!reginfo || regtry(reginfo, s)))
1477 PL_reg_flags |= RF_tainted;
1479 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1480 if (isDIGIT_LC_utf8((U8*)s)) {
1481 if (tmp && (!reginfo || regtry(reginfo, s)))
1492 while (s < strend) {
1493 if (isDIGIT_LC(*s)) {
1494 if (tmp && (!reginfo || regtry(reginfo, s)))
1507 LOAD_UTF8_CHARCLASS_DIGIT();
1508 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1509 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1510 if (tmp && (!reginfo || regtry(reginfo, s)))
1521 while (s < strend) {
1523 if (tmp && (!reginfo || regtry(reginfo, s)))
1535 PL_reg_flags |= RF_tainted;
1537 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1538 if (!isDIGIT_LC_utf8((U8*)s)) {
1539 if (tmp && (!reginfo || regtry(reginfo, s)))
1550 while (s < strend) {
1551 if (!isDIGIT_LC(*s)) {
1552 if (tmp && (!reginfo || regtry(reginfo, s)))
1564 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1573 - regexec_flags - match a regexp against a string
1576 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1577 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1578 /* strend: pointer to null at end of string */
1579 /* strbeg: real beginning of string */
1580 /* minend: end of match must be >=minend after stringarg. */
1581 /* data: May be used for some additional optimizations. */
1582 /* nosave: For optimizations. */
1586 register regnode *c;
1587 register char *startpos = stringarg;
1588 I32 minlen; /* must match at least this many chars */
1589 I32 dontbother = 0; /* how many characters not to try at end */
1590 I32 end_shift = 0; /* Same for the end. */ /* CC */
1591 I32 scream_pos = -1; /* Internal iterator of scream. */
1592 char *scream_olds = NULL;
1593 SV* oreplsv = GvSV(PL_replgv);
1594 const bool do_utf8 = DO_UTF8(sv);
1600 regmatch_info reginfo; /* create some info to pass to regtry etc */
1602 GET_RE_DEBUG_FLAGS_DECL;
1604 PERL_UNUSED_ARG(data);
1606 /* Be paranoid... */
1607 if (prog == NULL || startpos == NULL) {
1608 Perl_croak(aTHX_ "NULL regexp parameter");
1612 multiline = prog->reganch & PMf_MULTILINE;
1613 reginfo.prog = prog;
1616 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1617 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1620 RX_MATCH_UTF8_set(prog, do_utf8);
1622 minlen = prog->minlen;
1623 if (strend - startpos < minlen) {
1624 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1625 "String too short [regexec_flags]...\n"));
1629 /* Check validity of program. */
1630 if (UCHARAT(prog->program) != REG_MAGIC) {
1631 Perl_croak(aTHX_ "corrupted regexp program");
1635 PL_reg_eval_set = 0;
1638 if (prog->reganch & ROPT_UTF8)
1639 PL_reg_flags |= RF_utf8;
1641 /* Mark beginning of line for ^ and lookbehind. */
1642 reginfo.bol = startpos; /* XXX not used ??? */
1646 /* Mark end of line for $ (and such) */
1649 /* see how far we have to get to not match where we matched before */
1650 reginfo.till = startpos+minend;
1652 /* If there is a "must appear" string, look for it. */
1655 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1658 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1659 reginfo.ganch = startpos;
1660 else if (sv && SvTYPE(sv) >= SVt_PVMG
1662 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1663 && mg->mg_len >= 0) {
1664 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1665 if (prog->reganch & ROPT_ANCH_GPOS) {
1666 if (s > reginfo.ganch)
1671 else /* pos() not defined */
1672 reginfo.ganch = strbeg;
1675 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1676 re_scream_pos_data d;
1678 d.scream_olds = &scream_olds;
1679 d.scream_pos = &scream_pos;
1680 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1683 goto phooey; /* not present */
1688 const char * const s0 = UTF
1689 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1692 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1693 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1694 UNI_DISPLAY_REGEX) : startpos;
1695 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
1698 PerlIO_printf(Perl_debug_log,
1699 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1700 PL_colors[4], PL_colors[5], PL_colors[0],
1703 len0 > 60 ? "..." : "",
1705 (int)(len1 > 60 ? 60 : len1),
1707 (len1 > 60 ? "..." : "")
1711 /* Simplest case: anchored match need be tried only once. */
1712 /* [unless only anchor is BOL and multiline is set] */
1713 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1714 if (s == startpos && regtry(®info, startpos))
1716 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1717 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1722 dontbother = minlen - 1;
1723 end = HOP3c(strend, -dontbother, strbeg) - 1;
1724 /* for multiline we only have to try after newlines */
1725 if (prog->check_substr || prog->check_utf8) {
1729 if (regtry(®info, s))
1734 if (prog->reganch & RE_USE_INTUIT) {
1735 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1746 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1747 if (regtry(®info, s))
1754 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1755 if (regtry(®info, reginfo.ganch))
1760 /* Messy cases: unanchored match. */
1761 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1762 /* we have /x+whatever/ */
1763 /* it must be a one character string (XXXX Except UTF?) */
1768 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1769 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1770 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1773 while (s < strend) {
1775 DEBUG_EXECUTE_r( did_match = 1 );
1776 if (regtry(®info, s)) goto got_it;
1778 while (s < strend && *s == ch)
1785 while (s < strend) {
1787 DEBUG_EXECUTE_r( did_match = 1 );
1788 if (regtry(®info, s)) goto got_it;
1790 while (s < strend && *s == ch)
1796 DEBUG_EXECUTE_r(if (!did_match)
1797 PerlIO_printf(Perl_debug_log,
1798 "Did not find anchored character...\n")
1801 else if (prog->anchored_substr != NULL
1802 || prog->anchored_utf8 != NULL
1803 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1804 && prog->float_max_offset < strend - s)) {
1809 char *last1; /* Last position checked before */
1813 if (prog->anchored_substr || prog->anchored_utf8) {
1814 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1815 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1816 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1817 back_max = back_min = prog->anchored_offset;
1819 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1820 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1821 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1822 back_max = prog->float_max_offset;
1823 back_min = prog->float_min_offset;
1825 if (must == &PL_sv_undef)
1826 /* could not downgrade utf8 check substring, so must fail */
1829 last = HOP3c(strend, /* Cannot start after this */
1830 -(I32)(CHR_SVLEN(must)
1831 - (SvTAIL(must) != 0) + back_min), strbeg);
1834 last1 = HOPc(s, -1);
1836 last1 = s - 1; /* bogus */
1838 /* XXXX check_substr already used to find "s", can optimize if
1839 check_substr==must. */
1841 dontbother = end_shift;
1842 strend = HOPc(strend, -dontbother);
1843 while ( (s <= last) &&
1844 ((flags & REXEC_SCREAM)
1845 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1846 end_shift, &scream_pos, 0))
1847 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1848 (unsigned char*)strend, must,
1849 multiline ? FBMrf_MULTILINE : 0))) ) {
1850 /* we may be pointing at the wrong string */
1851 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1852 s = strbeg + (s - SvPVX_const(sv));
1853 DEBUG_EXECUTE_r( did_match = 1 );
1854 if (HOPc(s, -back_max) > last1) {
1855 last1 = HOPc(s, -back_min);
1856 s = HOPc(s, -back_max);
1859 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1861 last1 = HOPc(s, -back_min);
1865 while (s <= last1) {
1866 if (regtry(®info, s))
1872 while (s <= last1) {
1873 if (regtry(®info, s))
1879 DEBUG_EXECUTE_r(if (!did_match)
1880 PerlIO_printf(Perl_debug_log,
1881 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1882 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1883 ? "anchored" : "floating"),
1885 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1887 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1891 else if ((c = prog->regstclass)) {
1893 I32 op = (U8)OP(prog->regstclass);
1894 /* don't bother with what can't match */
1895 if (PL_regkind[op] != EXACT && op != CANY)
1896 strend = HOPc(strend, -(minlen - 1));
1899 SV *prop = sv_newmortal();
1905 regprop(prog, prop, c);
1907 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1908 UNI_DISPLAY_REGEX) :
1910 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1912 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1913 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
1914 PerlIO_printf(Perl_debug_log,
1915 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1919 if (find_byclass(prog, c, s, strend, ®info))
1921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1925 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1930 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1931 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1932 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1934 if (flags & REXEC_SCREAM) {
1935 last = screaminstr(sv, float_real, s - strbeg,
1936 end_shift, &scream_pos, 1); /* last one */
1938 last = scream_olds; /* Only one occurrence. */
1939 /* we may be pointing at the wrong string */
1940 else if (RX_MATCH_COPIED(prog))
1941 s = strbeg + (s - SvPVX_const(sv));
1945 const char * const little = SvPV_const(float_real, len);
1947 if (SvTAIL(float_real)) {
1948 if (memEQ(strend - len + 1, little, len - 1))
1949 last = strend - len + 1;
1950 else if (!multiline)
1951 last = memEQ(strend - len, little, len)
1952 ? strend - len : NULL;
1958 last = rninstr(s, strend, little, little + len);
1960 last = strend; /* matching "$" */
1964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1965 "%sCan't trim the tail, match fails (should not happen)%s\n",
1966 PL_colors[4], PL_colors[5]));
1967 goto phooey; /* Should not happen! */
1969 dontbother = strend - last + prog->float_min_offset;
1971 if (minlen && (dontbother < minlen))
1972 dontbother = minlen - 1;
1973 strend -= dontbother; /* this one's always in bytes! */
1974 /* We don't know much -- general case. */
1977 if (regtry(®info, s))
1986 if (regtry(®info, s))
1988 } while (s++ < strend);
1996 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1998 if (PL_reg_eval_set) {
1999 /* Preserve the current value of $^R */
2000 if (oreplsv != GvSV(PL_replgv))
2001 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2002 restored, the value remains
2004 restore_pos(aTHX_ prog);
2007 /* make sure $`, $&, $', and $digit will work later */
2008 if ( !(flags & REXEC_NOT_FIRST) ) {
2009 RX_MATCH_COPY_FREE(prog);
2010 if (flags & REXEC_COPY_STR) {
2011 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2012 #ifdef PERL_OLD_COPY_ON_WRITE
2014 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2016 PerlIO_printf(Perl_debug_log,
2017 "Copy on write: regexp capture, type %d\n",
2020 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2021 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2022 assert (SvPOKp(prog->saved_copy));
2026 RX_MATCH_COPIED_on(prog);
2027 s = savepvn(strbeg, i);
2033 prog->subbeg = strbeg;
2034 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2041 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2042 PL_colors[4], PL_colors[5]));
2043 if (PL_reg_eval_set)
2044 restore_pos(aTHX_ prog);
2049 - regtry - try match at specific point
2051 STATIC I32 /* 0 failure, 1 success */
2052 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2058 regexp *prog = reginfo->prog;
2059 GET_RE_DEBUG_FLAGS_DECL;
2062 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2064 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2067 PL_reg_eval_set = RS_init;
2068 DEBUG_EXECUTE_r(DEBUG_s(
2069 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2070 (IV)(PL_stack_sp - PL_stack_base));
2072 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2073 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2074 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2076 /* Apparently this is not needed, judging by wantarray. */
2077 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2078 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2081 /* Make $_ available to executed code. */
2082 if (reginfo->sv != DEFSV) {
2084 DEFSV = reginfo->sv;
2087 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2088 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2089 /* prepare for quick setting of pos */
2090 #ifdef PERL_OLD_COPY_ON_WRITE
2092 sv_force_normal_flags(sv, 0);
2094 mg = sv_magicext(reginfo->sv, (SV*)0, PERL_MAGIC_regex_global,
2095 &PL_vtbl_mglob, NULL, 0);
2099 PL_reg_oldpos = mg->mg_len;
2100 SAVEDESTRUCTOR_X(restore_pos, prog);
2102 if (!PL_reg_curpm) {
2103 Newxz(PL_reg_curpm, 1, PMOP);
2106 SV* repointer = newSViv(0);
2107 /* so we know which PL_regex_padav element is PL_reg_curpm */
2108 SvFLAGS(repointer) |= SVf_BREAK;
2109 av_push(PL_regex_padav,repointer);
2110 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2111 PL_regex_pad = AvARRAY(PL_regex_padav);
2115 PM_SETRE(PL_reg_curpm, prog);
2116 PL_reg_oldcurpm = PL_curpm;
2117 PL_curpm = PL_reg_curpm;
2118 if (RX_MATCH_COPIED(prog)) {
2119 /* Here is a serious problem: we cannot rewrite subbeg,
2120 since it may be needed if this match fails. Thus
2121 $` inside (?{}) could fail... */
2122 PL_reg_oldsaved = prog->subbeg;
2123 PL_reg_oldsavedlen = prog->sublen;
2124 #ifdef PERL_OLD_COPY_ON_WRITE
2125 PL_nrs = prog->saved_copy;
2127 RX_MATCH_COPIED_off(prog);
2130 PL_reg_oldsaved = NULL;
2131 prog->subbeg = PL_bostr;
2132 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2134 prog->startp[0] = startpos - PL_bostr;
2135 PL_reginput = startpos;
2136 PL_regstartp = prog->startp;
2137 PL_regendp = prog->endp;
2138 PL_reglastparen = &prog->lastparen;
2139 PL_reglastcloseparen = &prog->lastcloseparen;
2140 prog->lastparen = 0;
2141 prog->lastcloseparen = 0;
2143 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2144 if (PL_reg_start_tmpl <= prog->nparens) {
2145 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2146 if(PL_reg_start_tmp)
2147 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2149 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2152 /* XXXX What this code is doing here?!!! There should be no need
2153 to do this again and again, PL_reglastparen should take care of
2156 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2157 * Actually, the code in regcppop() (which Ilya may be meaning by
2158 * PL_reglastparen), is not needed at all by the test suite
2159 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2160 * enough, for building DynaLoader, or otherwise this
2161 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2162 * will happen. Meanwhile, this code *is* needed for the
2163 * above-mentioned test suite tests to succeed. The common theme
2164 * on those tests seems to be returning null fields from matches.
2169 if (prog->nparens) {
2171 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2178 if (regmatch(reginfo, prog->program + 1)) {
2179 prog->endp[0] = PL_reginput - PL_bostr;
2182 REGCP_UNWIND(lastcp);
2186 #define RE_UNWIND_BRANCH 1
2187 #define RE_UNWIND_BRANCHJ 2
2191 typedef struct { /* XX: makes sense to enlarge it... */
2195 } re_unwind_generic_t;
2209 } re_unwind_branch_t;
2211 typedef union re_unwind_t {
2213 re_unwind_generic_t generic;
2214 re_unwind_branch_t branch;
2217 #define sayYES goto yes
2218 #define sayNO goto no
2219 #define sayNO_ANYOF goto no_anyof
2220 #define sayYES_FINAL goto yes_final
2221 #define sayNO_FINAL goto no_final
2222 #define sayNO_SILENT goto do_no
2223 #define saySAME(x) if (x) goto yes; else goto no
2225 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2226 #define POSCACHE_SEEN 1 /* we know what we're caching */
2227 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2229 #define CACHEsayYES STMT_START { \
2230 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2231 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2232 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2233 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2235 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2236 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2239 /* cache records failure, but this is success */ \
2241 PerlIO_printf(Perl_debug_log, \
2242 "%*s (remove success from failure cache)\n", \
2243 REPORT_CODE_OFF+PL_regindent*2, "") \
2245 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2251 #define CACHEsayNO STMT_START { \
2252 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2253 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2254 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2255 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2257 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2258 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2261 /* cache records success, but this is failure */ \
2263 PerlIO_printf(Perl_debug_log, \
2264 "%*s (remove failure from success cache)\n", \
2265 REPORT_CODE_OFF+PL_regindent*2, "") \
2267 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2273 /* this is used to determine how far from the left messages like
2274 'failed...' are printed. Currently 29 makes these messages line
2275 up with the opcode they refer to. Earlier perls used 25 which
2276 left these messages outdented making reviewing a debug output
2279 #define REPORT_CODE_OFF 29
2282 /* Make sure there is a test for this +1 options in re_tests */
2283 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2285 /* this value indiciates that the c1/c2 "next char" test should be skipped */
2286 #define CHRTEST_VOID -1000
2288 #define SLAB_FIRST(s) (&(s)->states[0])
2289 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2291 /* grab a new slab and return the first slot in it */
2293 STATIC regmatch_state *
2296 #if PERL_VERSION < 9
2299 regmatch_slab *s = PL_regmatch_slab->next;
2301 Newx(s, 1, regmatch_slab);
2302 s->prev = PL_regmatch_slab;
2304 PL_regmatch_slab->next = s;
2306 PL_regmatch_slab = s;
2307 return SLAB_FIRST(s);
2310 /* simulate a recursive call to regmatch */
2312 #define REGMATCH(ns, where) \
2315 st->resume_state = resume_##where; \
2316 goto start_recurse; \
2317 resume_point_##where:
2320 /* push a new regex state. Set newst to point to it */
2322 #define PUSH_STATE(newst, resume) \
2324 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2328 st->locinput = locinput; \
2329 st->resume_state = resume; \
2331 if (newst > SLAB_LAST(PL_regmatch_slab)) \
2332 newst = S_push_slab(aTHX); \
2333 PL_regmatch_state = newst; \
2335 newst->minmod = 0; \
2337 newst->logical = 0; \
2338 newst->unwind = 0; \
2339 locinput = PL_reginput; \
2340 nextchr = UCHARAT(locinput);
2343 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2346 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
2347 PL_regmatch_slab = PL_regmatch_slab->prev; \
2348 st = SLAB_LAST(PL_regmatch_slab); \
2350 PL_regmatch_state = st; \
2354 locinput = st->locinput; \
2355 nextchr = UCHARAT(locinput);
2358 - regmatch - main matching routine
2360 * Conceptually the strategy is simple: check to see whether the current
2361 * node matches, call self recursively to see whether the rest matches,
2362 * and then act accordingly. In practice we make some effort to avoid
2363 * recursion, in particular by going through "ordinary" nodes (that don't
2364 * need to know whether the rest of the match failed) by a loop instead of
2367 /* [lwall] I've hoisted the register declarations to the outer block in order to
2368 * maybe save a little bit of pushing and popping on the stack. It also takes
2369 * advantage of machines that use a register save mask on subroutine entry.
2371 * This function used to be heavily recursive, but since this had the
2372 * effect of blowing the CPU stack on complex regexes, it has been
2373 * restructured to be iterative, and to save state onto the heap rather
2374 * than the stack. Essentially whereever regmatch() used to be called, it
2375 * pushes the current state, notes where to return, then jumps back into
2378 * Originally the structure of this function used to look something like
2383 while (scan != NULL) {
2384 a++; // do stuff with a and b
2390 if (regmatch(...)) // recurse
2400 * Now it looks something like this:
2408 regmatch_state *st = new();
2410 st->a++; // do stuff with a and b
2412 while (scan != NULL) {
2420 st->resume_state = resume_FOO;
2421 goto start_recurse; // recurse
2430 st = new(); push a new state
2431 st->a = 1; st->b = 2;
2438 switch (resume_state) {
2440 goto resume_point_FOO;
2447 * WARNING: this means that any line in this function that contains a
2448 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2449 * regmatch() using gotos instead. Thus the values of any local variables
2450 * not saved in the regmatch_state structure will have been lost when
2451 * execution resumes on the next line .
2453 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2454 * PL_regmatch_state always points to the currently active state, and
2455 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2456 * The first time regmatch is called, the first slab is allocated, and is
2457 * never freed until interpreter desctruction. When the slab is full,
2458 * a new one is allocated chained to the end. At exit from regmatch, slabs
2459 * allocated since entry are freed.
2463 STATIC I32 /* 0 failure, 1 success */
2464 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2466 #if PERL_VERSION < 9
2470 register const bool do_utf8 = PL_reg_match_utf8;
2471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2473 regexp *rex = reginfo->prog;
2475 regmatch_slab *orig_slab;
2476 regmatch_state *orig_state;
2478 /* the current state. This is a cached copy of PL_regmatch_state */
2479 register regmatch_state *st;
2481 /* cache heavy used fields of st in registers */
2482 register regnode *scan;
2483 register regnode *next;
2484 register I32 n = 0; /* initialize to shut up compiler warning */
2485 register char *locinput = PL_reginput;
2487 /* these variables are NOT saved during a recusive RFEGMATCH: */
2488 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2489 bool result; /* return value of S_regmatch */
2490 regnode *inner; /* Next node in internal branch. */
2491 int depth = 0; /* depth of recursion */
2492 regmatch_state *newst; /* when pushing a state, this is the new one */
2493 regmatch_state *yes_state = NULL; /* state to pop to on success of
2497 SV *re_debug_flags = NULL;
2502 /* on first ever call to regmatch, allocate first slab */
2503 if (!PL_regmatch_slab) {
2504 Newx(PL_regmatch_slab, 1, regmatch_slab);
2505 PL_regmatch_slab->prev = NULL;
2506 PL_regmatch_slab->next = NULL;
2507 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2510 /* remember current high-water mark for exit */
2511 /* XXX this should be done with SAVE* instead */
2512 orig_slab = PL_regmatch_slab;
2513 orig_state = PL_regmatch_state;
2515 /* grab next free state slot */
2516 st = ++PL_regmatch_state;
2517 if (st > SLAB_LAST(PL_regmatch_slab))
2518 st = PL_regmatch_state = S_push_slab(aTHX);
2525 /* Note that nextchr is a byte even in UTF */
2526 nextchr = UCHARAT(locinput);
2528 while (scan != NULL) {
2531 SV * const prop = sv_newmortal();
2532 const int docolor = *PL_colors[0];
2533 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2534 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2535 /* The part of the string before starttry has one color
2536 (pref0_len chars), between starttry and current
2537 position another one (pref_len - pref0_len chars),
2538 after the current position the third one.
2539 We assume that pref0_len <= pref_len, otherwise we
2540 decrease pref0_len. */
2541 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2542 ? (5 + taill) - l : locinput - PL_bostr;
2545 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2547 pref0_len = pref_len - (locinput - PL_reg_starttry);
2548 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2549 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2550 ? (5 + taill) - pref_len : PL_regeol - locinput);
2551 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2555 if (pref0_len > pref_len)
2556 pref0_len = pref_len;
2557 regprop(rex, prop, scan);
2559 const char * const s0 =
2560 do_utf8 && OP(scan) != CANY ?
2561 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2562 pref0_len, 60, UNI_DISPLAY_REGEX) :
2563 locinput - pref_len;
2564 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2565 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2566 pv_uni_display(PERL_DEBUG_PAD(1),
2567 (U8*)(locinput - pref_len + pref0_len),
2568 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2569 locinput - pref_len + pref0_len;
2570 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2571 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2572 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2573 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2575 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2576 PerlIO_printf(Perl_debug_log,
2577 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2578 (IV)(locinput - PL_bostr),
2585 (docolor ? "" : "> <"),
2589 15 - l - pref_len + 1,
2591 (IV)(scan - rex->program), PL_regindent*2, "",
2596 next = scan + NEXT_OFF(scan);
2602 if (locinput == PL_bostr)
2604 /* reginfo->till = reginfo->bol; */
2609 if (locinput == PL_bostr ||
2610 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2616 if (locinput == PL_bostr)
2620 if (locinput == reginfo->ganch)
2626 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2631 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2633 if (PL_regeol - locinput > 1)
2637 if (PL_regeol != locinput)
2641 if (!nextchr && locinput >= PL_regeol)
2644 locinput += PL_utf8skip[nextchr];
2645 if (locinput > PL_regeol)
2647 nextchr = UCHARAT(locinput);
2650 nextchr = UCHARAT(++locinput);
2653 if (!nextchr && locinput >= PL_regeol)
2655 nextchr = UCHARAT(++locinput);
2658 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2661 locinput += PL_utf8skip[nextchr];
2662 if (locinput > PL_regeol)
2664 nextchr = UCHARAT(locinput);
2667 nextchr = UCHARAT(++locinput);
2673 traverse the TRIE keeping track of all accepting states
2674 we transition through until we get to a failing node.
2682 U8 *uc = ( U8* )locinput;
2689 U8 *uscan = (U8*)NULL;
2691 SV *sv_accept_buff = NULL;
2692 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2693 trie_type = do_utf8 ?
2694 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2697 /* what trie are we using right now */
2699 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2700 st->u.trie.accepted = 0; /* how many accepting states we have seen */
2703 while ( state && uc <= (U8*)PL_regeol ) {
2705 if (trie->states[ state ].wordnum) {
2706 if (!st->u.trie.accepted ) {
2709 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2710 sv_accept_buff=newSV(bufflen *
2711 sizeof(reg_trie_accepted) - 1);
2712 SvCUR_set(sv_accept_buff,
2713 sizeof(reg_trie_accepted));
2714 SvPOK_on(sv_accept_buff);
2715 sv_2mortal(sv_accept_buff);
2716 st->u.trie.accept_buff =
2717 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2720 if (st->u.trie.accepted >= bufflen) {
2722 st->u.trie.accept_buff =(reg_trie_accepted*)
2723 SvGROW(sv_accept_buff,
2724 bufflen * sizeof(reg_trie_accepted));
2726 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2727 + sizeof(reg_trie_accepted));
2729 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2730 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2731 ++st->u.trie.accepted;
2734 base = trie->states[ state ].trans.base;
2736 DEBUG_TRIE_EXECUTE_r(
2737 PerlIO_printf( Perl_debug_log,
2738 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2739 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2740 (UV)state, (UV)base, (UV)st->u.trie.accepted );
2744 switch (trie_type) {
2745 case trie_uft8_fold:
2747 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2752 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2753 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2754 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2755 foldlen -= UNISKIP( uvc );
2756 uscan = foldbuf + UNISKIP( uvc );
2760 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2769 charid = trie->charmap[ uvc ];
2773 if (trie->widecharmap) {
2774 SV** svpp = (SV**)NULL;
2775 svpp = hv_fetch(trie->widecharmap,
2776 (char*)&uvc, sizeof(UV), 0);
2778 charid = (U16)SvIV(*svpp);
2783 (base + charid > trie->uniquecharcount )
2784 && (base + charid - 1 - trie->uniquecharcount
2786 && trie->trans[base + charid - 1 -
2787 trie->uniquecharcount].check == state)
2789 state = trie->trans[base + charid - 1 -
2790 trie->uniquecharcount ].next;
2801 DEBUG_TRIE_EXECUTE_r(
2802 PerlIO_printf( Perl_debug_log,
2803 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2804 charid, uvc, (UV)state, PL_colors[5] );
2807 if (!st->u.trie.accepted )
2811 There was at least one accepting state that we
2812 transitioned through. Presumably the number of accepting
2813 states is going to be low, typically one or two. So we
2814 simply scan through to find the one with lowest wordnum.
2815 Once we find it, we swap the last state into its place
2816 and decrement the size. We then try to match the rest of
2817 the pattern at the point where the word ends, if we
2818 succeed then we end the loop, otherwise the loop
2819 eventually terminates once all of the accepting states
2823 if ( st->u.trie.accepted == 1 ) {
2825 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
2826 PerlIO_printf( Perl_debug_log,
2827 "%*s %sonly one match : #%d <%s>%s\n",
2828 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2829 st->u.trie.accept_buff[ 0 ].wordnum,
2830 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2833 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
2834 /* in this case we free tmps/leave before we call regmatch
2835 as we wont be using accept_buff again. */
2838 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2839 /*** all unsaved local vars undefined at this point */
2842 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2843 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
2846 while ( !result && st->u.trie.accepted-- ) {
2849 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
2850 DEBUG_TRIE_EXECUTE_r(
2851 PerlIO_printf( Perl_debug_log,
2852 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2853 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2854 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2855 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
2858 if (st->u.trie.accept_buff[cur].wordnum <
2859 st->u.trie.accept_buff[best].wordnum)
2863 reg_trie_data * const trie = (reg_trie_data*)
2864 rex->data->data[ARG(scan)];
2865 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
2866 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2867 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2868 st->u.trie.accept_buff[best].wordnum,
2869 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", (void*)scan,
2872 if ( best<st->u.trie.accepted ) {
2873 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2874 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2875 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2876 best = st->u.trie.accepted;
2878 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
2881 as far as I can tell we only need the SAVETMPS/FREETMPS
2882 for re's with EVAL in them but I'm leaving them in for
2883 all until I can be sure.
2886 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2887 /*** all unsaved local vars undefined at this point */
2900 /* unreached codepoint */
2902 char *s = STRING(scan);
2903 st->ln = STR_LEN(scan);
2904 if (do_utf8 != UTF) {
2905 /* The target and the pattern have differing utf8ness. */
2907 const char *e = s + st->ln;
2910 /* The target is utf8, the pattern is not utf8. */
2915 if (NATIVE_TO_UNI(*(U8*)s) !=
2916 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2924 /* The target is not utf8, the pattern is utf8. */
2929 if (NATIVE_TO_UNI(*((U8*)l)) !=
2930 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2938 nextchr = UCHARAT(locinput);
2941 /* The target and the pattern have the same utf8ness. */
2942 /* Inline the first character, for speed. */
2943 if (UCHARAT(s) != nextchr)
2945 if (PL_regeol - locinput < st->ln)
2947 if (st->ln > 1 && memNE(s, locinput, st->ln))
2950 nextchr = UCHARAT(locinput);
2954 PL_reg_flags |= RF_tainted;
2957 char *s = STRING(scan);
2958 st->ln = STR_LEN(scan);
2960 if (do_utf8 || UTF) {
2961 /* Either target or the pattern are utf8. */
2963 char *e = PL_regeol;
2965 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2966 l, &e, 0, do_utf8)) {
2967 /* One more case for the sharp s:
2968 * pack("U0U*", 0xDF) =~ /ss/i,
2969 * the 0xC3 0x9F are the UTF-8
2970 * byte sequence for the U+00DF. */
2972 toLOWER(s[0]) == 's' &&
2974 toLOWER(s[1]) == 's' &&
2981 nextchr = UCHARAT(locinput);
2985 /* Neither the target and the pattern are utf8. */
2987 /* Inline the first character, for speed. */
2988 if (UCHARAT(s) != nextchr &&
2989 UCHARAT(s) != ((OP(scan) == EXACTF)
2990 ? PL_fold : PL_fold_locale)[nextchr])
2992 if (PL_regeol - locinput < st->ln)
2994 if (st->ln > 1 && (OP(scan) == EXACTF
2995 ? ibcmp(s, locinput, st->ln)
2996 : ibcmp_locale(s, locinput, st->ln)))
2999 nextchr = UCHARAT(locinput);
3004 STRLEN inclasslen = PL_regeol - locinput;
3006 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3008 if (locinput >= PL_regeol)
3010 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3011 nextchr = UCHARAT(locinput);
3016 nextchr = UCHARAT(locinput);
3017 if (!REGINCLASS(rex, scan, (U8*)locinput))
3019 if (!nextchr && locinput >= PL_regeol)
3021 nextchr = UCHARAT(++locinput);
3025 /* If we might have the case of the German sharp s
3026 * in a casefolding Unicode character class. */
3028 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3029 locinput += SHARP_S_SKIP;
3030 nextchr = UCHARAT(locinput);
3036 PL_reg_flags |= RF_tainted;
3042 LOAD_UTF8_CHARCLASS_ALNUM();
3043 if (!(OP(scan) == ALNUM
3044 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3045 : isALNUM_LC_utf8((U8*)locinput)))
3049 locinput += PL_utf8skip[nextchr];
3050 nextchr = UCHARAT(locinput);
3053 if (!(OP(scan) == ALNUM
3054 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3056 nextchr = UCHARAT(++locinput);
3059 PL_reg_flags |= RF_tainted;
3062 if (!nextchr && locinput >= PL_regeol)
3065 LOAD_UTF8_CHARCLASS_ALNUM();
3066 if (OP(scan) == NALNUM
3067 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3068 : isALNUM_LC_utf8((U8*)locinput))
3072 locinput += PL_utf8skip[nextchr];
3073 nextchr = UCHARAT(locinput);
3076 if (OP(scan) == NALNUM
3077 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3079 nextchr = UCHARAT(++locinput);
3083 PL_reg_flags |= RF_tainted;
3087 /* was last char in word? */
3089 if (locinput == PL_bostr)
3092 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3094 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3096 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3097 st->ln = isALNUM_uni(st->ln);
3098 LOAD_UTF8_CHARCLASS_ALNUM();
3099 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3102 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3103 n = isALNUM_LC_utf8((U8*)locinput);
3107 st->ln = (locinput != PL_bostr) ?
3108 UCHARAT(locinput - 1) : '\n';
3109 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3110 st->ln = isALNUM(st->ln);
3111 n = isALNUM(nextchr);
3114 st->ln = isALNUM_LC(st->ln);
3115 n = isALNUM_LC(nextchr);
3118 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3119 OP(scan) == BOUNDL))
3123 PL_reg_flags |= RF_tainted;
3129 if (UTF8_IS_CONTINUED(nextchr)) {
3130 LOAD_UTF8_CHARCLASS_SPACE();
3131 if (!(OP(scan) == SPACE
3132 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3133 : isSPACE_LC_utf8((U8*)locinput)))
3137 locinput += PL_utf8skip[nextchr];
3138 nextchr = UCHARAT(locinput);
3141 if (!(OP(scan) == SPACE
3142 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3144 nextchr = UCHARAT(++locinput);
3147 if (!(OP(scan) == SPACE
3148 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3150 nextchr = UCHARAT(++locinput);
3154 PL_reg_flags |= RF_tainted;
3157 if (!nextchr && locinput >= PL_regeol)
3160 LOAD_UTF8_CHARCLASS_SPACE();
3161 if (OP(scan) == NSPACE
3162 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3163 : isSPACE_LC_utf8((U8*)locinput))
3167 locinput += PL_utf8skip[nextchr];
3168 nextchr = UCHARAT(locinput);
3171 if (OP(scan) == NSPACE
3172 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3174 nextchr = UCHARAT(++locinput);
3177 PL_reg_flags |= RF_tainted;
3183 LOAD_UTF8_CHARCLASS_DIGIT();
3184 if (!(OP(scan) == DIGIT
3185 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3186 : isDIGIT_LC_utf8((U8*)locinput)))
3190 locinput += PL_utf8skip[nextchr];
3191 nextchr = UCHARAT(locinput);
3194 if (!(OP(scan) == DIGIT
3195 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3197 nextchr = UCHARAT(++locinput);
3200 PL_reg_flags |= RF_tainted;
3203 if (!nextchr && locinput >= PL_regeol)
3206 LOAD_UTF8_CHARCLASS_DIGIT();
3207 if (OP(scan) == NDIGIT
3208 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3209 : isDIGIT_LC_utf8((U8*)locinput))
3213 locinput += PL_utf8skip[nextchr];
3214 nextchr = UCHARAT(locinput);
3217 if (OP(scan) == NDIGIT
3218 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3220 nextchr = UCHARAT(++locinput);
3223 if (locinput >= PL_regeol)
3226 LOAD_UTF8_CHARCLASS_MARK();
3227 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3229 locinput += PL_utf8skip[nextchr];
3230 while (locinput < PL_regeol &&
3231 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3232 locinput += UTF8SKIP(locinput);
3233 if (locinput > PL_regeol)
3238 nextchr = UCHARAT(locinput);
3241 PL_reg_flags |= RF_tainted;
3246 n = ARG(scan); /* which paren pair */
3247 st->ln = PL_regstartp[n];
3248 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3249 if ((I32)*PL_reglastparen < n || st->ln == -1)
3250 sayNO; /* Do not match unless seen CLOSEn. */
3251 if (st->ln == PL_regendp[n])
3254 s = PL_bostr + st->ln;
3255 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3257 const char *e = PL_bostr + PL_regendp[n];
3259 * Note that we can't do the "other character" lookup trick as
3260 * in the 8-bit case (no pun intended) because in Unicode we
3261 * have to map both upper and title case to lower case.
3263 if (OP(scan) == REFF) {
3265 STRLEN ulen1, ulen2;
3266 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3267 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3271 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3272 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3273 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3280 nextchr = UCHARAT(locinput);
3284 /* Inline the first character, for speed. */
3285 if (UCHARAT(s) != nextchr &&
3287 (UCHARAT(s) != ((OP(scan) == REFF
3288 ? PL_fold : PL_fold_locale)[nextchr]))))
3290 st->ln = PL_regendp[n] - st->ln;
3291 if (locinput + st->ln > PL_regeol)
3293 if (st->ln > 1 && (OP(scan) == REF
3294 ? memNE(s, locinput, st->ln)
3296 ? ibcmp(s, locinput, st->ln)
3297 : ibcmp_locale(s, locinput, st->ln))))
3300 nextchr = UCHARAT(locinput);
3313 /* execute the code in the {...} */
3315 SV ** const before = SP;
3316 OP_4tree * const oop = PL_op;
3317 COP * const ocurcop = PL_curcop;
3321 PL_op = (OP_4tree*)rex->data->data[n];
3322 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3323 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3324 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3326 CALLRUNOPS(aTHX); /* Scalar context. */
3329 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3336 PAD_RESTORE_LOCAL(old_comppad);
3337 PL_curcop = ocurcop;
3340 sv_setsv(save_scalar(PL_replgv), ret);
3344 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3347 /* extract RE object from returned value; compiling if
3352 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3353 mg = mg_find(sv, PERL_MAGIC_qr);
3354 else if (SvSMAGICAL(ret)) {
3355 if (SvGMAGICAL(ret))
3356 sv_unmagic(ret, PERL_MAGIC_qr);
3358 mg = mg_find(ret, PERL_MAGIC_qr);
3362 re = (regexp *)mg->mg_obj;
3363 (void)ReREFCNT_inc(re);
3367 const char * const t = SvPV_const(ret, len);
3369 const I32 osize = PL_regsize;
3372 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3373 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3375 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3377 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3383 /* run the pattern returned from (??{...}) */
3386 PerlIO_printf(Perl_debug_log,
3387 "Entering embedded \"%s%.60s%s%s\"\n",
3391 (strlen(re->precomp) > 60 ? "..." : ""))
3394 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3395 REGCP_SET(st->u.eval.lastcp);
3396 *PL_reglastparen = 0;
3397 *PL_reglastcloseparen = 0;
3398 PL_reginput = locinput;
3400 /* XXXX This is too dramatic a measure... */
3404 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3405 ((re->reganch & ROPT_UTF8) != 0);
3406 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3407 st->u.eval.prev_rex = rex;
3410 /* resume to current state on success */
3411 st->u.yes.prev_yes_state = yes_state;
3413 PUSH_STATE(newst, resume_EVAL);
3416 /* now continue from first node in postoned RE */
3417 next = re->program + 1;
3421 /* /(?(?{...})X|Y)/ */
3422 st->sw = SvTRUE(ret);
3427 n = ARG(scan); /* which paren pair */
3428 PL_reg_start_tmp[n] = locinput;
3433 n = ARG(scan); /* which paren pair */
3434 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3435 PL_regendp[n] = locinput - PL_bostr;
3436 if (n > (I32)*PL_reglastparen)
3437 *PL_reglastparen = n;
3438 *PL_reglastcloseparen = n;
3441 n = ARG(scan); /* which paren pair */
3442 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3445 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3447 next = NEXTOPER(NEXTOPER(scan));
3449 next = scan + ARG(scan);
3450 if (OP(next) == IFTHEN) /* Fake one. */
3451 next = NEXTOPER(NEXTOPER(next));
3455 st->logical = scan->flags;
3457 /*******************************************************************
3458 cc points to the regmatch_state associated with the most recent CURLYX.
3459 This struct contains info about the innermost (...)* loop (an
3460 "infoblock"), and a pointer to the next outer cc.
3462 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3464 1) After matching Y, regnode for CURLYX is processed;
3466 2) This regnode populates cc, and calls regmatch() recursively
3467 with the starting point at WHILEM node;
3469 3) Each hit of WHILEM node tries to match A and Z (in the order
3470 depending on the current iteration, min/max of {min,max} and
3471 greediness). The information about where are nodes for "A"
3472 and "Z" is read from cc, as is info on how many times "A"
3473 was already matched, and greediness.
3475 4) After A matches, the same WHILEM node is hit again.
3477 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3478 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3479 resets cc, since this Y(A)*Z can be a part of some other loop:
3480 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3481 of the external loop.
3483 Currently present infoblocks form a tree with a stem formed by st->cc
3484 and whatever it mentions via ->next, and additional attached trees
3485 corresponding to temporarily unset infoblocks as in "5" above.
3487 In the following picture, infoblocks for outer loop of
3488 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3489 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3490 infoblocks are drawn below the "reset" infoblock.
3492 In fact in the picture below we do not show failed matches for Z and T
3493 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3494 more obvious *why* one needs to *temporary* unset infoblocks.]
3496 Matched REx position InfoBlocks Comment
3500 Y A)*?Z)*?T x <- O <- I
3501 YA )*?Z)*?T x <- O <- I
3502 YA A)*?Z)*?T x <- O <- I
3503 YAA )*?Z)*?T x <- O <- I
3504 YAA Z)*?T x <- O # Temporary unset I
3507 YAAZ Y(A)*?Z)*?T x <- O
3510 YAAZY (A)*?Z)*?T x <- O
3513 YAAZY A)*?Z)*?T x <- O <- I
3516 YAAZYA )*?Z)*?T x <- O <- I
3519 YAAZYA Z)*?T x <- O # Temporary unset I
3525 YAAZYAZ T x # Temporary unset O
3532 *******************************************************************/
3535 /* No need to save/restore up to this paren */
3536 I32 parenfloor = scan->flags;
3540 CURLYX and WHILEM are always paired: they're the moral
3541 equivalent of pp_enteriter anbd pp_iter.
3543 The only time next could be null is if the node tree is
3544 corrupt. This was mentioned on p5p a few days ago.
3546 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3547 So we'll assert that this is true:
3550 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3552 /* XXXX Probably it is better to teach regpush to support
3553 parenfloor > PL_regsize... */
3554 if (parenfloor > (I32)*PL_reglastparen)
3555 parenfloor = *PL_reglastparen; /* Pessimization... */
3557 st->u.curlyx.cp = PL_savestack_ix;
3558 st->u.curlyx.outercc = st->cc;
3560 /* these fields contain the state of the current curly.
3561 * they are accessed by subsequent WHILEMs;
3562 * cur and lastloc are also updated by WHILEM */
3563 st->u.curlyx.parenfloor = parenfloor;
3564 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3565 st->u.curlyx.min = ARG1(scan);
3566 st->u.curlyx.max = ARG2(scan);
3567 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3568 st->u.curlyx.lastloc = 0;
3569 /* st->next and st->minmod are also read by WHILEM */
3571 PL_reginput = locinput;
3572 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3573 /*** all unsaved local vars undefined at this point */
3574 regcpblow(st->u.curlyx.cp);
3575 st->cc = st->u.curlyx.outercc;
3581 * This is really hard to understand, because after we match
3582 * what we're trying to match, we must make sure the rest of
3583 * the REx is going to match for sure, and to do that we have
3584 * to go back UP the parse tree by recursing ever deeper. And
3585 * if it fails, we have to reset our parent's current state
3586 * that we can try again after backing off.
3591 st->cc gets initialised by CURLYX ready for use by WHILEM.
3592 So again, unless somethings been corrupted, st->cc cannot
3593 be null at that point in WHILEM.
3595 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3596 So we'll assert that this is true:
3599 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3600 st->u.whilem.cache_offset = 0;
3601 st->u.whilem.cache_bit = 0;
3603 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3604 PL_reginput = locinput;
3607 PerlIO_printf(Perl_debug_log,
3608 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3609 REPORT_CODE_OFF+PL_regindent*2, "",
3610 (long)n, (long)st->cc->u.curlyx.min,
3611 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3614 /* If degenerate scan matches "", assume scan done. */
3616 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3617 st->u.whilem.savecc = st->cc;
3618 st->cc = st->cc->u.curlyx.outercc;
3620 st->ln = st->cc->u.curlyx.cur;
3622 PerlIO_printf(Perl_debug_log,
3623 "%*s empty match detected, try continuation...\n",
3624 REPORT_CODE_OFF+PL_regindent*2, "")
3626 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3627 /*** all unsaved local vars undefined at this point */
3628 st->cc = st->u.whilem.savecc;
3631 if (st->cc->u.curlyx.outercc)
3632 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3636 /* First just match a string of min scans. */
3638 if (n < st->cc->u.curlyx.min) {
3639 st->cc->u.curlyx.cur = n;
3640 st->cc->u.curlyx.lastloc = locinput;
3641 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3642 /*** all unsaved local vars undefined at this point */
3645 st->cc->u.curlyx.cur = n - 1;
3646 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3651 /* Check whether we already were at this position.
3652 Postpone detection until we know the match is not
3653 *that* much linear. */
3654 if (!PL_reg_maxiter) {
3655 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3656 PL_reg_leftiter = PL_reg_maxiter;
3658 if (PL_reg_leftiter-- == 0) {
3659 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3660 if (PL_reg_poscache) {
3661 if ((I32)PL_reg_poscache_size < size) {
3662 Renew(PL_reg_poscache, size, char);
3663 PL_reg_poscache_size = size;
3665 Zero(PL_reg_poscache, size, char);
3668 PL_reg_poscache_size = size;
3669 Newxz(PL_reg_poscache, size, char);
3672 PerlIO_printf(Perl_debug_log,
3673 "%sDetected a super-linear match, switching on caching%s...\n",
3674 PL_colors[4], PL_colors[5])
3677 if (PL_reg_leftiter < 0) {
3678 st->u.whilem.cache_offset = locinput - PL_bostr;
3680 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3681 + st->u.whilem.cache_offset * (scan->flags>>4);
3682 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3683 st->u.whilem.cache_offset /= 8;
3684 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3686 PerlIO_printf(Perl_debug_log,
3687 "%*s already tried at this position...\n",
3688 REPORT_CODE_OFF+PL_regindent*2, "")
3690 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3691 /* cache records success */
3694 /* cache records failure */
3700 /* Prefer next over scan for minimal matching. */
3702 if (st->cc->minmod) {
3703 st->u.whilem.savecc = st->cc;
3704 st->cc = st->cc->u.curlyx.outercc;
3706 st->ln = st->cc->u.curlyx.cur;
3707 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3708 REGCP_SET(st->u.whilem.lastcp);
3709 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3710 /*** all unsaved local vars undefined at this point */
3711 st->cc = st->u.whilem.savecc;
3713 regcpblow(st->u.whilem.cp);
3714 CACHEsayYES; /* All done. */
3716 REGCP_UNWIND(st->u.whilem.lastcp);
3718 if (st->cc->u.curlyx.outercc)
3719 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3721 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3722 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3723 && !(PL_reg_flags & RF_warned)) {
3724 PL_reg_flags |= RF_warned;
3725 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3726 "Complex regular subexpression recursion",
3733 PerlIO_printf(Perl_debug_log,
3734 "%*s trying longer...\n",
3735 REPORT_CODE_OFF+PL_regindent*2, "")
3737 /* Try scanning more and see if it helps. */
3738 PL_reginput = locinput;
3739 st->cc->u.curlyx.cur = n;
3740 st->cc->u.curlyx.lastloc = locinput;
3741 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3742 REGCP_SET(st->u.whilem.lastcp);
3743 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3744 /*** all unsaved local vars undefined at this point */
3746 regcpblow(st->u.whilem.cp);
3749 REGCP_UNWIND(st->u.whilem.lastcp);
3751 st->cc->u.curlyx.cur = n - 1;
3752 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3756 /* Prefer scan over next for maximal matching. */
3758 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3759 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3760 st->cc->u.curlyx.cur = n;
3761 st->cc->u.curlyx.lastloc = locinput;
3762 REGCP_SET(st->u.whilem.lastcp);
3763 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3764 /*** all unsaved local vars undefined at this point */
3766 regcpblow(st->u.whilem.cp);
3769 REGCP_UNWIND(st->u.whilem.lastcp);
3770 regcppop(rex); /* Restore some previous $<digit>s? */
3771 PL_reginput = locinput;
3773 PerlIO_printf(Perl_debug_log,
3774 "%*s failed, try continuation...\n",
3775 REPORT_CODE_OFF+PL_regindent*2, "")
3778 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3779 && !(PL_reg_flags & RF_warned)) {
3780 PL_reg_flags |= RF_warned;
3781 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3782 "Complex regular subexpression recursion",
3786 /* Failed deeper matches of scan, so see if this one works. */
3787 st->u.whilem.savecc = st->cc;
3788 st->cc = st->cc->u.curlyx.outercc;
3790 st->ln = st->cc->u.curlyx.cur;
3791 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3792 /*** all unsaved local vars undefined at this point */
3793 st->cc = st->u.whilem.savecc;
3796 if (st->cc->u.curlyx.outercc)
3797 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3798 st->cc->u.curlyx.cur = n - 1;
3799 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3804 next = scan + ARG(scan);
3807 inner = NEXTOPER(NEXTOPER(scan));
3810 inner = NEXTOPER(scan);
3815 if (!next || OP(next) != type) /* No choice. */
3816 next = inner; /* Avoid recursion. */
3818 const I32 lastparen = *PL_reglastparen;
3819 /* Put unwinding data on stack */
3820 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3821 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3823 uw->prev = st->unwind;
3824 st->unwind = unwind1;
3825 uw->type = ((type == BRANCH)
3827 : RE_UNWIND_BRANCHJ);
3828 uw->lastparen = lastparen;
3830 uw->locinput = locinput;
3831 uw->nextchr = nextchr;
3832 uw->minmod = st->minmod;
3834 uw->regindent = ++PL_regindent;
3837 REGCP_SET(uw->lastcp);
3839 /* Now go into the first branch */
3849 st->u.curlym.l = st->u.curlym.matches = 0;
3851 /* We suppose that the next guy does not need
3852 backtracking: in particular, it is of constant non-zero length,
3853 and has no parenths to influence future backrefs. */
3854 st->ln = ARG1(scan); /* min to match */
3855 n = ARG2(scan); /* max to match */
3856 st->u.curlym.paren = scan->flags;
3857 if (st->u.curlym.paren) {
3858 if (st->u.curlym.paren > PL_regsize)
3859 PL_regsize = st->u.curlym.paren;
3860 if (st->u.curlym.paren > (I32)*PL_reglastparen)
3861 *PL_reglastparen = st->u.curlym.paren;
3863 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3864 if (st->u.curlym.paren)
3865 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3866 PL_reginput = locinput;
3867 st->u.curlym.maxwanted = st->minmod ? st->ln : n;
3868 while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
3869 /* resume to current state on success */
3870 st->u.yes.prev_yes_state = yes_state;
3872 REGMATCH(scan, CURLYM1);
3873 yes_state = st->u.yes.prev_yes_state;
3874 /*** all unsaved local vars undefined at this point */
3877 /* on first match, determine length, u.curlym.l */
3878 if (!st->u.curlym.matches++) {
3879 if (PL_reg_match_utf8) {
3881 while (s < PL_reginput) {
3887 st->u.curlym.l = PL_reginput - locinput;
3889 if (st->u.curlym.l == 0) {
3890 st->u.curlym.matches = st->u.curlym.maxwanted;
3894 locinput = PL_reginput;
3897 PL_reginput = locinput;
3898 if (st->u.curlym.matches < st->ln) {
3904 PerlIO_printf(Perl_debug_log,
3905 "%*s matched %"IVdf" times, len=%"IVdf"...\n",