5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
78 **** Alterations to Henry's code are...
80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
91 #define PERL_IN_REGEXEC_C
96 #define RF_tainted 1 /* tainted information used? */
97 #define RF_warned 2 /* warned about big count? */
98 #define RF_evaled 4 /* Did an EVAL with setting? */
99 #define RF_utf8 8 /* String contains multibyte chars? */
101 #define UTF ((PL_reg_flags & RF_utf8) != 0)
103 #define RS_init 1 /* eval environment created */
104 #define RS_set 2 /* replsv value is set */
107 #define STATIC static
110 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
116 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
117 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
119 #define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
122 #define HOPBACKc(pos, off) ((char*) \
123 ((PL_reg_match_utf8) \
124 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
125 : (pos - off >= PL_bostr) \
130 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
131 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
134 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
136 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
137 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
138 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
139 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
141 /* for use after a quantifier and before an EXACT-like node -- japhy */
142 #define JUMPABLE(rn) ( \
143 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
144 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
145 OP(rn) == PLUS || OP(rn) == MINMOD || \
146 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
149 #define HAS_TEXT(rn) ( \
150 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
154 Search for mandatory following text node; for lookahead, the text must
155 follow but for lookbehind (rn->flags != 0) we skip to the next step.
157 #define FIND_NEXT_IMPT(rn) STMT_START { \
158 while (JUMPABLE(rn)) \
159 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
160 rn = NEXTOPER(NEXTOPER(rn)); \
161 else if (OP(rn) == PLUS) \
163 else if (OP(rn) == IFMATCH) \
164 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
165 else rn += NEXT_OFF(rn); \
168 static void restore_pos(pTHX_ void *arg);
171 S_regcppush(pTHX_ I32 parenfloor)
174 const int retval = PL_savestack_ix;
175 #define REGCP_PAREN_ELEMS 4
176 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
179 if (paren_elems_to_push < 0)
180 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
182 #define REGCP_OTHER_ELEMS 6
183 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
184 for (p = PL_regsize; p > parenfloor; p--) {
185 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
186 SSPUSHINT(PL_regendp[p]);
187 SSPUSHINT(PL_regstartp[p]);
188 SSPUSHPTR(PL_reg_start_tmp[p]);
191 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
192 SSPUSHINT(PL_regsize);
193 SSPUSHINT(*PL_reglastparen);
194 SSPUSHINT(*PL_reglastcloseparen);
195 SSPUSHPTR(PL_reginput);
196 #define REGCP_FRAME_ELEMS 2
197 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
198 * are needed for the regexp context stack bookkeeping. */
199 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
200 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
205 /* These are needed since we do not localize EVAL nodes: */
206 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
207 " Setting an EVAL scope, savestack=%"IVdf"\n", \
208 (IV)PL_savestack_ix)); cp = PL_savestack_ix
210 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
213 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
223 GET_RE_DEBUG_FLAGS_DECL;
225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
229 input = (char *) SSPOPPTR;
230 *PL_reglastcloseparen = SSPOPINT;
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
234 /* Now restore the parentheses context. */
235 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
236 i > 0; i -= REGCP_PAREN_ELEMS) {
238 paren = (U32)SSPOPINT;
239 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
240 PL_regstartp[paren] = SSPOPINT;
242 if (paren <= *PL_reglastparen)
243 PL_regendp[paren] = tmps;
245 PerlIO_printf(Perl_debug_log,
246 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
247 (UV)paren, (IV)PL_regstartp[paren],
248 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
249 (IV)PL_regendp[paren],
250 (paren > *PL_reglastparen ? "(no)" : ""));
254 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
255 PerlIO_printf(Perl_debug_log,
256 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
257 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
261 /* It would seem that the similar code in regtry()
262 * already takes care of this, and in fact it is in
263 * a better location to since this code can #if 0-ed out
264 * but the code in regtry() is needed or otherwise tests
265 * requiring null fields (pat.t#187 and split.t#{13,14}
266 * (as of patchlevel 7877) will fail. Then again,
267 * this code seems to be necessary or otherwise
268 * building DynaLoader will fail:
269 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
271 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
272 if ((I32)paren > PL_regsize)
273 PL_regstartp[paren] = -1;
274 PL_regendp[paren] = -1;
280 typedef struct re_cc_state
284 struct re_cc_state *prev;
289 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
291 #define TRYPAREN(paren, n, input, where) { \
294 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
295 PL_regendp[paren] = input - PL_bostr; \
298 PL_regendp[paren] = -1; \
300 REGMATCH(next, where); \
304 PL_regendp[paren] = -1; \
309 * pregexec and friends
313 - pregexec - match a regexp against a string
316 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
318 /* strend: pointer to null at end of string */
319 /* strbeg: real beginning of string */
320 /* minend: end of match must be >=minend after stringarg. */
321 /* nosave: For optimizations. */
324 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
325 nosave ? 0 : REXEC_COPY_STR);
329 S_cache_re(pTHX_ regexp *prog)
332 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
334 PL_regprogram = prog->program;
336 PL_regnpar = prog->nparens;
337 PL_regdata = prog->data;
342 * Need to implement the following flags for reg_anch:
344 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
346 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
347 * INTUIT_AUTORITATIVE_ML
348 * INTUIT_ONCE_NOML - Intuit can match in one location only.
351 * Another flag for this function: SECOND_TIME (so that float substrs
352 * with giant delta may be not rechecked).
355 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
357 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
358 Otherwise, only SvCUR(sv) is used to get strbeg. */
360 /* XXXX We assume that strpos is strbeg unless sv. */
362 /* XXXX Some places assume that there is a fixed substring.
363 An update may be needed if optimizer marks as "INTUITable"
364 RExen without fixed substrings. Similarly, it is assumed that
365 lengths of all the strings are no more than minlen, thus they
366 cannot come from lookahead.
367 (Or minlen should take into account lookahead.) */
369 /* A failure to find a constant substring means that there is no need to make
370 an expensive call to REx engine, thus we celebrate a failure. Similarly,
371 finding a substring too deep into the string means that less calls to
372 regtry() should be needed.
374 REx compiler's optimizer found 4 possible hints:
375 a) Anchored substring;
377 c) Whether we are anchored (beginning-of-line or \G);
378 d) First node (of those at offset 0) which may distingush positions;
379 We use a)b)d) and multiline-part of c), and try to find a position in the
380 string which does not contradict any of them.
383 /* Most of decisions we do here should have been done at compile time.
384 The nodes of the REx which we used for the search should have been
385 deleted from the finite automaton. */
388 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
389 char *strend, U32 flags, re_scream_pos_data *data)
392 register I32 start_shift = 0;
393 /* Should be nonnegative! */
394 register I32 end_shift = 0;
399 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
401 register char *other_last = NULL; /* other substr checked before this */
402 char *check_at = NULL; /* check substr found at this pos */
403 const I32 multiline = prog->reganch & PMf_MULTILINE;
405 const char * const i_strpos = strpos;
406 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
409 GET_RE_DEBUG_FLAGS_DECL;
411 RX_MATCH_UTF8_set(prog,do_utf8);
413 if (prog->reganch & ROPT_UTF8) {
414 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
415 "UTF-8 regex...\n"));
416 PL_reg_flags |= RF_utf8;
420 const char *s = PL_reg_match_utf8 ?
421 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
423 const int len = PL_reg_match_utf8 ?
424 strlen(s) : strend - strpos;
427 if (PL_reg_match_utf8)
428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
429 "UTF-8 target...\n"));
430 PerlIO_printf(Perl_debug_log,
431 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
432 PL_colors[4], PL_colors[5], PL_colors[0],
435 (strlen(prog->precomp) > 60 ? "..." : ""),
437 (int)(len > 60 ? 60 : len),
439 (len > 60 ? "..." : "")
443 /* CHR_DIST() would be more correct here but it makes things slow. */
444 if (prog->minlen > strend - strpos) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
446 "String too short... [re_intuit_start]\n"));
449 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
452 if (!prog->check_utf8 && prog->check_substr)
453 to_utf8_substr(prog);
454 check = prog->check_utf8;
456 if (!prog->check_substr && prog->check_utf8)
457 to_byte_substr(prog);
458 check = prog->check_substr;
460 if (check == &PL_sv_undef) {
461 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
462 "Non-utf string cannot match utf check string\n"));
465 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
466 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
467 || ( (prog->reganch & ROPT_ANCH_BOL)
468 && !multiline ) ); /* Check after \n? */
471 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
472 | ROPT_IMPLICIT)) /* not a real BOL */
473 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
475 && (strpos != strbeg)) {
476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
479 if (prog->check_offset_min == prog->check_offset_max &&
480 !(prog->reganch & ROPT_CANY_SEEN)) {
481 /* Substring at constant offset from beg-of-str... */
484 s = HOP3c(strpos, prog->check_offset_min, strend);
486 slen = SvCUR(check); /* >= 1 */
488 if ( strend - s > slen || strend - s < slen - 1
489 || (strend - s == slen && strend[-1] != '\n')) {
490 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
493 /* Now should match s[0..slen-2] */
495 if (slen && (*SvPVX_const(check) != *s
497 && memNE(SvPVX_const(check), s, slen)))) {
499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
503 else if (*SvPVX_const(check) != *s
504 || ((slen = SvCUR(check)) > 1
505 && memNE(SvPVX_const(check), s, slen)))
508 goto success_at_start;
511 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
513 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
514 end_shift = prog->minlen - start_shift -
515 CHR_SVLEN(check) + (SvTAIL(check) != 0);
517 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
518 - (SvTAIL(check) != 0);
519 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
521 if (end_shift < eshift)
525 else { /* Can match at random position */
528 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
529 /* Should be nonnegative! */
530 end_shift = prog->minlen - start_shift -
531 CHR_SVLEN(check) + (SvTAIL(check) != 0);
534 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
536 Perl_croak(aTHX_ "panic: end_shift");
540 /* Find a possible match in the region s..strend by looking for
541 the "check" substring in the region corrected by start/end_shift. */
542 if (flags & REXEC_SCREAM) {
543 I32 p = -1; /* Internal iterator of scream. */
544 I32 * const pp = data ? data->scream_pos : &p;
546 if (PL_screamfirst[BmRARE(check)] >= 0
547 || ( BmRARE(check) == '\n'
548 && (BmPREVIOUS(check) == SvCUR(check) - 1)
550 s = screaminstr(sv, check,
551 start_shift + (s - strbeg), end_shift, pp, 0);
554 /* we may be pointing at the wrong string */
555 if (s && RX_MATCH_COPIED(prog))
556 s = strbeg + (s - SvPVX_const(sv));
558 *data->scream_olds = s;
560 else if (prog->reganch & ROPT_CANY_SEEN)
561 s = fbm_instr((U8*)(s + start_shift),
562 (U8*)(strend - end_shift),
563 check, multiline ? FBMrf_MULTILINE : 0);
565 s = fbm_instr(HOP3(s, start_shift, strend),
566 HOP3(strend, -end_shift, strbeg),
567 check, multiline ? FBMrf_MULTILINE : 0);
569 /* Update the count-of-usability, remove useless subpatterns,
572 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
573 (s ? "Found" : "Did not find"),
574 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
576 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
578 PL_colors[1], (SvTAIL(check) ? "$" : ""),
579 (s ? " at offset " : "...\n") ) );
586 /* Finish the diagnostic message */
587 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
589 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
590 Start with the other substr.
591 XXXX no SCREAM optimization yet - and a very coarse implementation
592 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
593 *always* match. Probably should be marked during compile...
594 Probably it is right to do no SCREAM here...
597 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
598 /* Take into account the "other" substring. */
599 /* XXXX May be hopelessly wrong for UTF... */
602 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
605 char * const last = HOP3c(s, -start_shift, strbeg);
610 t = s - prog->check_offset_max;
611 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
613 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
618 t = HOP3c(t, prog->anchored_offset, strend);
619 if (t < other_last) /* These positions already checked */
621 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
624 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
625 /* On end-of-str: see comment below. */
626 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
627 if (must == &PL_sv_undef) {
629 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
634 HOP3(HOP3(last1, prog->anchored_offset, strend)
635 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
637 multiline ? FBMrf_MULTILINE : 0
639 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
640 "%s anchored substr \"%s%.*s%s\"%s",
641 (s ? "Found" : "Contradicts"),
644 - (SvTAIL(must)!=0)),
646 PL_colors[1], (SvTAIL(must) ? "$" : "")));
648 if (last1 >= last2) {
649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
650 ", giving up...\n"));
653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
654 ", trying floating at offset %ld...\n",
655 (long)(HOP3c(s1, 1, strend) - i_strpos)));
656 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
657 s = HOP3c(last, 1, strend);
661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
662 (long)(s - i_strpos)));
663 t = HOP3c(s, -prog->anchored_offset, strbeg);
664 other_last = HOP3c(s, 1, strend);
672 else { /* Take into account the floating substring. */
677 t = HOP3c(s, -start_shift, strbeg);
679 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
680 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
681 last = HOP3c(t, prog->float_max_offset, strend);
682 s = HOP3c(t, prog->float_min_offset, strend);
685 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
686 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
687 /* fbm_instr() takes into account exact value of end-of-str
688 if the check is SvTAIL(ed). Since false positives are OK,
689 and end-of-str is not later than strend we are OK. */
690 if (must == &PL_sv_undef) {
692 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
695 s = fbm_instr((unsigned char*)s,
696 (unsigned char*)last + SvCUR(must)
698 must, multiline ? FBMrf_MULTILINE : 0);
699 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
700 (s ? "Found" : "Contradicts"),
702 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
704 PL_colors[1], (SvTAIL(must) ? "$" : "")));
707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
708 ", giving up...\n"));
711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
712 ", trying anchored starting at offset %ld...\n",
713 (long)(s1 + 1 - i_strpos)));
715 s = HOP3c(t, 1, strend);
719 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
720 (long)(s - i_strpos)));
721 other_last = s; /* Fix this later. --Hugo */
730 t = s - prog->check_offset_max;
731 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
733 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
735 /* Fixed substring is found far enough so that the match
736 cannot start at strpos. */
738 if (ml_anch && t[-1] != '\n') {
739 /* Eventually fbm_*() should handle this, but often
740 anchored_offset is not 0, so this check will not be wasted. */
741 /* XXXX In the code below we prefer to look for "^" even in
742 presence of anchored substrings. And we search even
743 beyond the found float position. These pessimizations
744 are historical artefacts only. */
746 while (t < strend - prog->minlen) {
748 if (t < check_at - prog->check_offset_min) {
749 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
750 /* Since we moved from the found position,
751 we definitely contradict the found anchored
752 substr. Due to the above check we do not
753 contradict "check" substr.
754 Thus we can arrive here only if check substr
755 is float. Redo checking for "other"=="fixed".
758 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
759 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
760 goto do_other_anchored;
762 /* We don't contradict the found floating substring. */
763 /* XXXX Why not check for STCLASS? */
765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
766 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
769 /* Position contradicts check-string */
770 /* XXXX probably better to look for check-string
771 than for "\n", so one should lower the limit for t? */
772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
773 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
774 other_last = strpos = s = t + 1;
779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
780 PL_colors[0], PL_colors[1]));
784 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
785 PL_colors[0], PL_colors[1]));
789 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
792 /* The found string does not prohibit matching at strpos,
793 - no optimization of calling REx engine can be performed,
794 unless it was an MBOL and we are not after MBOL,
795 or a future STCLASS check will fail this. */
797 /* Even in this situation we may use MBOL flag if strpos is offset
798 wrt the start of the string. */
799 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
800 && (strpos != strbeg) && strpos[-1] != '\n'
801 /* May be due to an implicit anchor of m{.*foo} */
802 && !(prog->reganch & ROPT_IMPLICIT))
807 DEBUG_EXECUTE_r( if (ml_anch)
808 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
809 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
812 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
814 prog->check_utf8 /* Could be deleted already */
815 && --BmUSEFUL(prog->check_utf8) < 0
816 && (prog->check_utf8 == prog->float_utf8)
818 prog->check_substr /* Could be deleted already */
819 && --BmUSEFUL(prog->check_substr) < 0
820 && (prog->check_substr == prog->float_substr)
823 /* If flags & SOMETHING - do not do it many times on the same match */
824 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
825 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
826 if (do_utf8 ? prog->check_substr : prog->check_utf8)
827 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
828 prog->check_substr = prog->check_utf8 = NULL; /* disable */
829 prog->float_substr = prog->float_utf8 = NULL; /* clear */
830 check = NULL; /* abort */
832 /* XXXX This is a remnant of the old implementation. It
833 looks wasteful, since now INTUIT can use many
835 prog->reganch &= ~RE_USE_INTUIT;
842 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
843 if (prog->regstclass) {
844 /* minlen == 0 is possible if regstclass is \b or \B,
845 and the fixed substr is ''$.
846 Since minlen is already taken into account, s+1 is before strend;
847 accidentally, minlen >= 1 guaranties no false positives at s + 1
848 even for \b or \B. But (minlen? 1 : 0) below assumes that
849 regstclass does not come from lookahead... */
850 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
851 This leaves EXACTF only, which is dealt with in find_byclass(). */
852 const U8* const str = (U8*)STRING(prog->regstclass);
853 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
854 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
856 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
857 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
858 : (prog->float_substr || prog->float_utf8
859 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
865 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
868 const char *what = NULL;
870 if (endpos == strend) {
871 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
872 "Could not match STCLASS...\n") );
875 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
876 "This position contradicts STCLASS...\n") );
877 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
879 /* Contradict one of substrings */
880 if (prog->anchored_substr || prog->anchored_utf8) {
881 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
882 DEBUG_EXECUTE_r( what = "anchored" );
884 s = HOP3c(t, 1, strend);
885 if (s + start_shift + end_shift > strend) {
886 /* XXXX Should be taken into account earlier? */
887 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
888 "Could not match STCLASS...\n") );
893 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
894 "Looking for %s substr starting at offset %ld...\n",
895 what, (long)(s + start_shift - i_strpos)) );
898 /* Have both, check_string is floating */
899 if (t + start_shift >= check_at) /* Contradicts floating=check */
900 goto retry_floating_check;
901 /* Recheck anchored substring, but not floating... */
905 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
906 "Looking for anchored substr starting at offset %ld...\n",
907 (long)(other_last - i_strpos)) );
908 goto do_other_anchored;
910 /* Another way we could have checked stclass at the
911 current position only: */
916 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
917 "Looking for /%s^%s/m starting at offset %ld...\n",
918 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
921 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
923 /* Check is floating subtring. */
924 retry_floating_check:
925 t = check_at - start_shift;
926 DEBUG_EXECUTE_r( what = "floating" );
927 goto hop_and_restart;
930 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
931 "By STCLASS: moving %ld --> %ld\n",
932 (long)(t - i_strpos), (long)(s - i_strpos))
936 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
937 "Does not contradict STCLASS...\n");
942 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
943 PL_colors[4], (check ? "Guessed" : "Giving up"),
944 PL_colors[5], (long)(s - i_strpos)) );
947 fail_finish: /* Substring not found */
948 if (prog->check_substr || prog->check_utf8) /* could be removed already */
949 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
952 PL_colors[4], PL_colors[5]));
956 /* We know what class REx starts with. Try to find this position... */
958 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
961 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
965 register STRLEN uskip;
969 register I32 tmp = 1; /* Scratch variable? */
970 register const bool do_utf8 = PL_reg_match_utf8;
972 /* We know what class it must start with. */
976 while (s + (uskip = UTF8SKIP(s)) <= strend) {
977 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
978 !UTF8_IS_INVARIANT((U8)s[0]) ?
979 reginclass(c, (U8*)s, 0, do_utf8) :
980 REGINCLASS(c, (U8*)s)) {
981 if (tmp && (norun || regtry(prog, s)))
995 if (REGINCLASS(c, (U8*)s) ||
996 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
997 /* The assignment of 2 is intentional:
998 * for the folded sharp s, the skip is 2. */
999 (skip = SHARP_S_SKIP))) {
1000 if (tmp && (norun || regtry(prog, s)))
1012 while (s < strend) {
1013 if (tmp && (norun || regtry(prog, s)))
1022 ln = STR_LEN(c); /* length to match in octets/bytes */
1023 lnc = (I32) ln; /* length to match in characters */
1025 STRLEN ulen1, ulen2;
1027 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1028 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1029 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1031 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1032 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1034 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1036 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1039 while (sm < ((U8 *) m + ln)) {
1054 c2 = PL_fold_locale[c1];
1056 e = HOP3c(strend, -((I32)lnc), s);
1059 e = s; /* Due to minlen logic of intuit() */
1061 /* The idea in the EXACTF* cases is to first find the
1062 * first character of the EXACTF* node and then, if
1063 * necessary, case-insensitively compare the full
1064 * text of the node. The c1 and c2 are the first
1065 * characters (though in Unicode it gets a bit
1066 * more complicated because there are more cases
1067 * than just upper and lower: one needs to use
1068 * the so-called folding case for case-insensitive
1069 * matching (called "loose matching" in Unicode).
1070 * ibcmp_utf8() will do just that. */
1074 U8 tmpbuf [UTF8_MAXBYTES+1];
1075 STRLEN len, foldlen;
1076 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1078 /* Upper and lower of 1st char are equal -
1079 * probably not a "letter". */
1081 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1085 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1086 m, (char **)0, ln, (bool)UTF))
1087 && (norun || regtry(prog, s)) )
1090 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1091 uvchr_to_utf8(tmpbuf, c);
1092 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1094 && (f == c1 || f == c2)
1095 && (ln == foldlen ||
1096 !ibcmp_utf8((char *) foldbuf,
1097 (char **)0, foldlen, do_utf8,
1099 (char **)0, ln, (bool)UTF))
1100 && (norun || regtry(prog, s)) )
1108 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1111 /* Handle some of the three Greek sigmas cases.
1112 * Note that not all the possible combinations
1113 * are handled here: some of them are handled
1114 * by the standard folding rules, and some of
1115 * them (the character class or ANYOF cases)
1116 * are handled during compiletime in
1117 * regexec.c:S_regclass(). */
1118 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1119 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1120 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1122 if ( (c == c1 || c == c2)
1124 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1125 m, (char **)0, ln, (bool)UTF))
1126 && (norun || regtry(prog, s)) )
1129 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1130 uvchr_to_utf8(tmpbuf, c);
1131 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1133 && (f == c1 || f == c2)
1134 && (ln == foldlen ||
1135 !ibcmp_utf8((char *) foldbuf,
1136 (char **)0, foldlen, do_utf8,
1138 (char **)0, ln, (bool)UTF))
1139 && (norun || regtry(prog, s)) )
1150 && (ln == 1 || !(OP(c) == EXACTF
1152 : ibcmp_locale(s, m, ln)))
1153 && (norun || regtry(prog, s)) )
1159 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1160 && (ln == 1 || !(OP(c) == EXACTF
1162 : ibcmp_locale(s, m, ln)))
1163 && (norun || regtry(prog, s)) )
1170 PL_reg_flags |= RF_tainted;
1177 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1178 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1180 tmp = ((OP(c) == BOUND ?
1181 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1182 LOAD_UTF8_CHARCLASS_ALNUM();
1183 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1184 if (tmp == !(OP(c) == BOUND ?
1185 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1186 isALNUM_LC_utf8((U8*)s)))
1189 if ((norun || regtry(prog, s)))
1196 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1197 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1198 while (s < strend) {
1200 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1202 if ((norun || regtry(prog, s)))
1208 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1212 PL_reg_flags |= RF_tainted;
1219 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1220 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1222 tmp = ((OP(c) == NBOUND ?
1223 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1224 LOAD_UTF8_CHARCLASS_ALNUM();
1225 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1226 if (tmp == !(OP(c) == NBOUND ?
1227 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1228 isALNUM_LC_utf8((U8*)s)))
1230 else if ((norun || regtry(prog, s)))
1236 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1237 tmp = ((OP(c) == NBOUND ?
1238 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1239 while (s < strend) {
1241 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1243 else if ((norun || regtry(prog, s)))
1248 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1253 LOAD_UTF8_CHARCLASS_ALNUM();
1254 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1255 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1256 if (tmp && (norun || regtry(prog, s)))
1267 while (s < strend) {
1269 if (tmp && (norun || regtry(prog, s)))
1281 PL_reg_flags |= RF_tainted;
1283 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1284 if (isALNUM_LC_utf8((U8*)s)) {
1285 if (tmp && (norun || regtry(prog, s)))
1296 while (s < strend) {
1297 if (isALNUM_LC(*s)) {
1298 if (tmp && (norun || regtry(prog, s)))
1311 LOAD_UTF8_CHARCLASS_ALNUM();
1312 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1313 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1314 if (tmp && (norun || regtry(prog, s)))
1325 while (s < strend) {
1327 if (tmp && (norun || regtry(prog, s)))
1339 PL_reg_flags |= RF_tainted;
1341 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1342 if (!isALNUM_LC_utf8((U8*)s)) {
1343 if (tmp && (norun || regtry(prog, s)))
1354 while (s < strend) {
1355 if (!isALNUM_LC(*s)) {
1356 if (tmp && (norun || regtry(prog, s)))
1369 LOAD_UTF8_CHARCLASS_SPACE();
1370 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1371 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1372 if (tmp && (norun || regtry(prog, s)))
1383 while (s < strend) {
1385 if (tmp && (norun || regtry(prog, s)))
1397 PL_reg_flags |= RF_tainted;
1399 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1400 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1401 if (tmp && (norun || regtry(prog, s)))
1412 while (s < strend) {
1413 if (isSPACE_LC(*s)) {
1414 if (tmp && (norun || regtry(prog, s)))
1427 LOAD_UTF8_CHARCLASS_SPACE();
1428 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1429 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1430 if (tmp && (norun || regtry(prog, s)))
1441 while (s < strend) {
1443 if (tmp && (norun || regtry(prog, s)))
1455 PL_reg_flags |= RF_tainted;
1457 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1458 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1459 if (tmp && (norun || regtry(prog, s)))
1470 while (s < strend) {
1471 if (!isSPACE_LC(*s)) {
1472 if (tmp && (norun || regtry(prog, s)))
1485 LOAD_UTF8_CHARCLASS_DIGIT();
1486 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1487 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1488 if (tmp && (norun || regtry(prog, s)))
1499 while (s < strend) {
1501 if (tmp && (norun || regtry(prog, s)))
1513 PL_reg_flags |= RF_tainted;
1515 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1516 if (isDIGIT_LC_utf8((U8*)s)) {
1517 if (tmp && (norun || regtry(prog, s)))
1528 while (s < strend) {
1529 if (isDIGIT_LC(*s)) {
1530 if (tmp && (norun || regtry(prog, s)))
1543 LOAD_UTF8_CHARCLASS_DIGIT();
1544 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1545 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1546 if (tmp && (norun || regtry(prog, s)))
1557 while (s < strend) {
1559 if (tmp && (norun || regtry(prog, s)))
1571 PL_reg_flags |= RF_tainted;
1573 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1574 if (!isDIGIT_LC_utf8((U8*)s)) {
1575 if (tmp && (norun || regtry(prog, s)))
1586 while (s < strend) {
1587 if (!isDIGIT_LC(*s)) {
1588 if (tmp && (norun || regtry(prog, s)))
1600 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1609 - regexec_flags - match a regexp against a string
1612 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1613 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1614 /* strend: pointer to null at end of string */
1615 /* strbeg: real beginning of string */
1616 /* minend: end of match must be >=minend after stringarg. */
1617 /* data: May be used for some additional optimizations. */
1618 /* nosave: For optimizations. */
1622 register regnode *c;
1623 register char *startpos = stringarg;
1624 I32 minlen; /* must match at least this many chars */
1625 I32 dontbother = 0; /* how many characters not to try at end */
1626 I32 end_shift = 0; /* Same for the end. */ /* CC */
1627 I32 scream_pos = -1; /* Internal iterator of scream. */
1628 char *scream_olds = NULL;
1629 SV* oreplsv = GvSV(PL_replgv);
1630 const bool do_utf8 = DO_UTF8(sv);
1631 const I32 multiline = prog->reganch & PMf_MULTILINE;
1633 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1634 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
1637 GET_RE_DEBUG_FLAGS_DECL;
1639 PERL_UNUSED_ARG(data);
1640 RX_MATCH_UTF8_set(prog,do_utf8);
1644 PL_regnarrate = DEBUG_r_TEST;
1647 /* Be paranoid... */
1648 if (prog == NULL || startpos == NULL) {
1649 Perl_croak(aTHX_ "NULL regexp parameter");
1653 minlen = prog->minlen;
1654 if (strend - startpos < minlen) {
1655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1656 "String too short [regexec_flags]...\n"));
1660 /* Check validity of program. */
1661 if (UCHARAT(prog->program) != REG_MAGIC) {
1662 Perl_croak(aTHX_ "corrupted regexp program");
1666 PL_reg_eval_set = 0;
1669 if (prog->reganch & ROPT_UTF8)
1670 PL_reg_flags |= RF_utf8;
1672 /* Mark beginning of line for ^ and lookbehind. */
1673 PL_regbol = startpos;
1677 /* Mark end of line for $ (and such) */
1680 /* see how far we have to get to not match where we matched before */
1681 PL_regtill = startpos+minend;
1683 /* We start without call_cc context. */
1686 /* If there is a "must appear" string, look for it. */
1689 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1692 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1693 PL_reg_ganch = startpos;
1694 else if (sv && SvTYPE(sv) >= SVt_PVMG
1696 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1697 && mg->mg_len >= 0) {
1698 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1699 if (prog->reganch & ROPT_ANCH_GPOS) {
1700 if (s > PL_reg_ganch)
1705 else /* pos() not defined */
1706 PL_reg_ganch = strbeg;
1709 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1710 re_scream_pos_data d;
1712 d.scream_olds = &scream_olds;
1713 d.scream_pos = &scream_pos;
1714 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1717 goto phooey; /* not present */
1722 const char * const s0 = UTF
1723 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1726 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1727 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1728 UNI_DISPLAY_REGEX) : startpos;
1729 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1732 PerlIO_printf(Perl_debug_log,
1733 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1734 PL_colors[4], PL_colors[5], PL_colors[0],
1737 len0 > 60 ? "..." : "",
1739 (int)(len1 > 60 ? 60 : len1),
1741 (len1 > 60 ? "..." : "")
1745 /* Simplest case: anchored match need be tried only once. */
1746 /* [unless only anchor is BOL and multiline is set] */
1747 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1748 if (s == startpos && regtry(prog, startpos))
1750 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1751 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1756 dontbother = minlen - 1;
1757 end = HOP3c(strend, -dontbother, strbeg) - 1;
1758 /* for multiline we only have to try after newlines */
1759 if (prog->check_substr || prog->check_utf8) {
1763 if (regtry(prog, s))
1768 if (prog->reganch & RE_USE_INTUIT) {
1769 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1780 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1781 if (regtry(prog, s))
1788 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1789 if (regtry(prog, PL_reg_ganch))
1794 /* Messy cases: unanchored match. */
1795 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1796 /* we have /x+whatever/ */
1797 /* it must be a one character string (XXXX Except UTF?) */
1802 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1803 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1804 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1807 while (s < strend) {
1809 DEBUG_EXECUTE_r( did_match = 1 );
1810 if (regtry(prog, s)) goto got_it;
1812 while (s < strend && *s == ch)
1819 while (s < strend) {
1821 DEBUG_EXECUTE_r( did_match = 1 );
1822 if (regtry(prog, s)) goto got_it;
1824 while (s < strend && *s == ch)
1830 DEBUG_EXECUTE_r(if (!did_match)
1831 PerlIO_printf(Perl_debug_log,
1832 "Did not find anchored character...\n")
1835 else if (prog->anchored_substr != NULL
1836 || prog->anchored_utf8 != NULL
1837 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1838 && prog->float_max_offset < strend - s)) {
1843 char *last1; /* Last position checked before */
1847 if (prog->anchored_substr || prog->anchored_utf8) {
1848 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1849 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1850 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1851 back_max = back_min = prog->anchored_offset;
1853 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1854 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1855 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1856 back_max = prog->float_max_offset;
1857 back_min = prog->float_min_offset;
1859 if (must == &PL_sv_undef)
1860 /* could not downgrade utf8 check substring, so must fail */
1863 last = HOP3c(strend, /* Cannot start after this */
1864 -(I32)(CHR_SVLEN(must)
1865 - (SvTAIL(must) != 0) + back_min), strbeg);
1868 last1 = HOPc(s, -1);
1870 last1 = s - 1; /* bogus */
1872 /* XXXX check_substr already used to find "s", can optimize if
1873 check_substr==must. */
1875 dontbother = end_shift;
1876 strend = HOPc(strend, -dontbother);
1877 while ( (s <= last) &&
1878 ((flags & REXEC_SCREAM)
1879 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1880 end_shift, &scream_pos, 0))
1881 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1882 (unsigned char*)strend, must,
1883 multiline ? FBMrf_MULTILINE : 0))) ) {
1884 /* we may be pointing at the wrong string */
1885 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1886 s = strbeg + (s - SvPVX_const(sv));
1887 DEBUG_EXECUTE_r( did_match = 1 );
1888 if (HOPc(s, -back_max) > last1) {
1889 last1 = HOPc(s, -back_min);
1890 s = HOPc(s, -back_max);
1893 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1895 last1 = HOPc(s, -back_min);
1899 while (s <= last1) {
1900 if (regtry(prog, s))
1906 while (s <= last1) {
1907 if (regtry(prog, s))
1913 DEBUG_EXECUTE_r(if (!did_match)
1914 PerlIO_printf(Perl_debug_log,
1915 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1916 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1917 ? "anchored" : "floating"),
1919 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1921 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1925 else if ((c = prog->regstclass)) {
1927 I32 op = (U8)OP(prog->regstclass);
1928 /* don't bother with what can't match */
1929 if (PL_regkind[op] != EXACT && op != CANY)
1930 strend = HOPc(strend, -(minlen - 1));
1933 SV *prop = sv_newmortal();
1941 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1942 UNI_DISPLAY_REGEX) :
1944 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1946 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1947 len1 = UTF ? SvCUR(dsv1) : strend - s;
1948 PerlIO_printf(Perl_debug_log,
1949 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1953 if (find_byclass(prog, c, s, strend, 0))
1955 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1959 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1964 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1966 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1968 if (flags & REXEC_SCREAM) {
1969 last = screaminstr(sv, float_real, s - strbeg,
1970 end_shift, &scream_pos, 1); /* last one */
1972 last = scream_olds; /* Only one occurrence. */
1973 /* we may be pointing at the wrong string */
1974 else if (RX_MATCH_COPIED(prog))
1975 s = strbeg + (s - SvPVX_const(sv));
1979 const char * const little = SvPV_const(float_real, len);
1981 if (SvTAIL(float_real)) {
1982 if (memEQ(strend - len + 1, little, len - 1))
1983 last = strend - len + 1;
1984 else if (!multiline)
1985 last = memEQ(strend - len, little, len)
1986 ? strend - len : NULL;
1992 last = rninstr(s, strend, little, little + len);
1994 last = strend; /* matching "$" */
1998 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1999 "%sCan't trim the tail, match fails (should not happen)%s\n",
2000 PL_colors[4], PL_colors[5]));
2001 goto phooey; /* Should not happen! */
2003 dontbother = strend - last + prog->float_min_offset;
2005 if (minlen && (dontbother < minlen))
2006 dontbother = minlen - 1;
2007 strend -= dontbother; /* this one's always in bytes! */
2008 /* We don't know much -- general case. */
2011 if (regtry(prog, s))
2020 if (regtry(prog, s))
2022 } while (s++ < strend);
2030 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2032 if (PL_reg_eval_set) {
2033 /* Preserve the current value of $^R */
2034 if (oreplsv != GvSV(PL_replgv))
2035 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2036 restored, the value remains
2038 restore_pos(aTHX_ 0);
2041 /* make sure $`, $&, $', and $digit will work later */
2042 if ( !(flags & REXEC_NOT_FIRST) ) {
2043 RX_MATCH_COPY_FREE(prog);
2044 if (flags & REXEC_COPY_STR) {
2045 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2046 #ifdef PERL_OLD_COPY_ON_WRITE
2048 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2050 PerlIO_printf(Perl_debug_log,
2051 "Copy on write: regexp capture, type %d\n",
2054 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2055 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2056 assert (SvPOKp(prog->saved_copy));
2060 RX_MATCH_COPIED_on(prog);
2061 s = savepvn(strbeg, i);
2067 prog->subbeg = strbeg;
2068 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2075 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2076 PL_colors[4], PL_colors[5]));
2077 if (PL_reg_eval_set)
2078 restore_pos(aTHX_ 0);
2083 - regtry - try match at specific point
2085 STATIC I32 /* 0 failure, 1 success */
2086 S_regtry(pTHX_ regexp *prog, char *startpos)
2093 GET_RE_DEBUG_FLAGS_DECL;
2096 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2098 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2101 PL_reg_eval_set = RS_init;
2102 DEBUG_EXECUTE_r(DEBUG_s(
2103 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2104 (IV)(PL_stack_sp - PL_stack_base));
2106 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2107 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2108 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2110 /* Apparently this is not needed, judging by wantarray. */
2111 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2112 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2115 /* Make $_ available to executed code. */
2116 if (PL_reg_sv != DEFSV) {
2121 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2122 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2123 /* prepare for quick setting of pos */
2124 #ifdef PERL_OLD_COPY_ON_WRITE
2126 sv_force_normal_flags(sv, 0);
2128 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2129 &PL_vtbl_mglob, NULL, 0);
2133 PL_reg_oldpos = mg->mg_len;
2134 SAVEDESTRUCTOR_X(restore_pos, 0);
2136 if (!PL_reg_curpm) {
2137 Newxz(PL_reg_curpm, 1, PMOP);
2140 SV* repointer = newSViv(0);
2141 /* so we know which PL_regex_padav element is PL_reg_curpm */
2142 SvFLAGS(repointer) |= SVf_BREAK;
2143 av_push(PL_regex_padav,repointer);
2144 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2145 PL_regex_pad = AvARRAY(PL_regex_padav);
2149 PM_SETRE(PL_reg_curpm, prog);
2150 PL_reg_oldcurpm = PL_curpm;
2151 PL_curpm = PL_reg_curpm;
2152 if (RX_MATCH_COPIED(prog)) {
2153 /* Here is a serious problem: we cannot rewrite subbeg,
2154 since it may be needed if this match fails. Thus
2155 $` inside (?{}) could fail... */
2156 PL_reg_oldsaved = prog->subbeg;
2157 PL_reg_oldsavedlen = prog->sublen;
2158 #ifdef PERL_OLD_COPY_ON_WRITE
2159 PL_nrs = prog->saved_copy;
2161 RX_MATCH_COPIED_off(prog);
2164 PL_reg_oldsaved = NULL;
2165 prog->subbeg = PL_bostr;
2166 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2168 prog->startp[0] = startpos - PL_bostr;
2169 PL_reginput = startpos;
2170 PL_regstartp = prog->startp;
2171 PL_regendp = prog->endp;
2172 PL_reglastparen = &prog->lastparen;
2173 PL_reglastcloseparen = &prog->lastcloseparen;
2174 prog->lastparen = 0;
2175 prog->lastcloseparen = 0;
2177 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2178 if (PL_reg_start_tmpl <= prog->nparens) {
2179 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2180 if(PL_reg_start_tmp)
2181 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2183 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2186 /* XXXX What this code is doing here?!!! There should be no need
2187 to do this again and again, PL_reglastparen should take care of
2190 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2191 * Actually, the code in regcppop() (which Ilya may be meaning by
2192 * PL_reglastparen), is not needed at all by the test suite
2193 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2194 * enough, for building DynaLoader, or otherwise this
2195 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2196 * will happen. Meanwhile, this code *is* needed for the
2197 * above-mentioned test suite tests to succeed. The common theme
2198 * on those tests seems to be returning null fields from matches.
2203 if (prog->nparens) {
2204 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2211 if (regmatch(prog->program + 1)) {
2212 prog->endp[0] = PL_reginput - PL_bostr;
2215 REGCP_UNWIND(lastcp);
2219 #define RE_UNWIND_BRANCH 1
2220 #define RE_UNWIND_BRANCHJ 2
2224 typedef struct { /* XX: makes sense to enlarge it... */
2228 } re_unwind_generic_t;
2241 } re_unwind_branch_t;
2243 typedef union re_unwind_t {
2245 re_unwind_generic_t generic;
2246 re_unwind_branch_t branch;
2249 #define sayYES goto yes
2250 #define sayNO goto no
2251 #define sayNO_ANYOF goto no_anyof
2252 #define sayYES_FINAL goto yes_final
2253 #define sayYES_LOUD goto yes_loud
2254 #define sayNO_FINAL goto no_final
2255 #define sayNO_SILENT goto do_no
2256 #define saySAME(x) if (x) goto yes; else goto no
2258 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2259 #define POSCACHE_SEEN 1 /* we know what we're caching */
2260 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2261 #define CACHEsayYES STMT_START { \
2262 if (cache_offset | cache_bit) { \
2263 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2264 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2265 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2266 /* cache records failure, but this is success */ \
2268 PerlIO_printf(Perl_debug_log, \
2269 "%*s (remove success from failure cache)\n", \
2270 REPORT_CODE_OFF+PL_regindent*2, "") \
2272 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2277 #define CACHEsayNO STMT_START { \
2278 if (cache_offset | cache_bit) { \
2279 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2280 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2281 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2282 /* cache records success, but this is failure */ \
2284 PerlIO_printf(Perl_debug_log, \
2285 "%*s (remove failure from success cache)\n", \
2286 REPORT_CODE_OFF+PL_regindent*2, "") \
2288 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2294 /* this is used to determine how far from the left messages like
2295 'failed...' are printed. Currently 29 makes these messages line
2296 up with the opcode they refer to. Earlier perls used 25 which
2297 left these messages outdented making reviewing a debug output
2300 #define REPORT_CODE_OFF 29
2303 /* Make sure there is a test for this +1 options in re_tests */
2304 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2307 /* simulate a recursive call to regmatch */
2309 #define REGMATCH(ns, where) \
2311 resume_state = resume_##where; \
2312 goto start_recurse; \
2313 resume_point_##where:
2338 struct regmatch_state {
2339 struct regmatch_state *prev_state;
2340 resume_states resume_state;
2353 CHECKPOINT cp, lastcp;
2356 I32 cache_offset, cache_bit;
2363 re_cc_state *cur_call_cc;
2365 reg_trie_accepted *accept_buff;
2368 re_cc_state *reg_call_cc; /* saved value of PL_reg_call_cc */
2372 - regmatch - main matching routine
2374 * Conceptually the strategy is simple: check to see whether the current
2375 * node matches, call self recursively to see whether the rest matches,
2376 * and then act accordingly. In practice we make some effort to avoid
2377 * recursion, in particular by going through "ordinary" nodes (that don't
2378 * need to know whether the rest of the match failed) by a loop instead of
2381 /* [lwall] I've hoisted the register declarations to the outer block in order to
2382 * maybe save a little bit of pushing and popping on the stack. It also takes
2383 * advantage of machines that use a register save mask on subroutine entry.
2385 * This function used to be heavily recursive, but since this had the
2386 * effect of blowing the CPU stack on complex regexes, it has been
2387 * restructured to be iterative, and to save state onto the heap rather
2388 * than the stack. Essentially whereever regmatch() used to be called, it
2389 * pushes the current state, notes where to return, then jumps back into
2392 * Originally the structure of this function used to look something like
2397 while (scan != NULL) {
2403 if (regmatch(...)) // recurse
2413 * Now it looks something like this:
2415 struct regmatch_state {
2424 while (scan != NULL) {
2430 resume_state = resume_FOO;
2435 if (result) // recurse
2441 ...push a, b, local onto the heap
2447 if (states pushed on heap) {
2448 ... restore a, b, local from heap
2449 switch (resume_state) {
2451 goto resume_point_FOO;
2458 * WARNING: this means that any line in this function that contains a
2459 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2460 * regmatch() using gotos instead. Thus the values of any local variables
2461 * not saved in the regmatch_state structure will have been lost when
2462 * execution resumes on the next line .
2466 STATIC I32 /* 0 failure, 1 success */
2467 S_regmatch(pTHX_ regnode *prog)
2470 register const bool do_utf8 = PL_reg_match_utf8;
2471 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
2473 /************************************************************
2474 * the following variabes are saved and restored on each fake
2475 * recursive call to regmatch:
2477 * The first ones contain state that needs to be maintained
2478 * across the main while loop: */
2480 struct regmatch_state *prev_state = NULL; /* stack of pushed states */
2481 resume_states resume_state; /* where to jump to on return */
2482 register regnode *scan; /* Current node. */
2483 regnode *next; /* Next node. */
2484 bool minmod = 0; /* the next {n.m} is a {n,m}? */
2485 bool sw = 0; /* the condition value in (?(cond)a|b) */
2487 I32 unwind = 0; /* savestack index of current unwind block */
2488 CURCUR *cc = NULL; /* current innermost curly struct */
2489 register char *locinput = PL_reginput;
2491 /* while the rest of these are local to an individual branch, and
2492 * have only been hoisted into this outer scope to allow for saving and
2493 * restoration - thus they can be safely reused in other branches.
2494 * Note that they are only initialized here to silence compiler
2497 register I32 n = 0; /* no or next */
2498 register I32 ln = 0; /* len or last */
2499 register I32 c1 = 0, c2 = 0, paren = 0; /* case fold search, parenth */
2500 CHECKPOINT cp = 0; /* remember current savestack indexes */
2501 CHECKPOINT lastcp = 0;
2502 CURCUR *oldcc = NULL; /* tmp copy of cc */
2503 char *lastloc = NULL; /* Detection of 0-len. */
2504 I32 cache_offset = 0;
2512 re_cc_state *cur_call_cc = NULL;
2513 regexp *end_re = NULL;
2514 reg_trie_accepted *accept_buff = NULL;
2515 U32 accepted = 0; /* how many accepting states we have seen */
2517 /************************************************************
2518 * these variables are NOT saved: */
2520 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2521 regnode *new_scan; /* node to begin exedcution at when recursing */
2522 bool result; /* return value of S_regmatch */
2524 regnode *inner; /* Next node in internal branch. */
2527 SV *re_debug_flags = NULL;
2532 /* Note that nextchr is a byte even in UTF */
2533 nextchr = UCHARAT(locinput);
2535 while (scan != NULL) {
2538 SV * const prop = sv_newmortal();
2539 const int docolor = *PL_colors[0];
2540 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2541 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2542 /* The part of the string before starttry has one color
2543 (pref0_len chars), between starttry and current
2544 position another one (pref_len - pref0_len chars),
2545 after the current position the third one.
2546 We assume that pref0_len <= pref_len, otherwise we
2547 decrease pref0_len. */
2548 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2549 ? (5 + taill) - l : locinput - PL_bostr;
2552 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2554 pref0_len = pref_len - (locinput - PL_reg_starttry);
2555 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2556 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2557 ? (5 + taill) - pref_len : PL_regeol - locinput);
2558 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2562 if (pref0_len > pref_len)
2563 pref0_len = pref_len;
2564 regprop(prop, scan);
2566 const char * const s0 =
2567 do_utf8 && OP(scan) != CANY ?
2568 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2569 pref0_len, 60, UNI_DISPLAY_REGEX) :
2570 locinput - pref_len;
2571 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2572 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2573 pv_uni_display(PERL_DEBUG_PAD(1),
2574 (U8*)(locinput - pref_len + pref0_len),
2575 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2576 locinput - pref_len + pref0_len;
2577 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2578 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2579 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2580 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2582 const int len2 = do_utf8 ? strlen(s2) : l;
2583 PerlIO_printf(Perl_debug_log,
2584 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2585 (IV)(locinput - PL_bostr),
2592 (docolor ? "" : "> <"),
2596 15 - l - pref_len + 1,
2598 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2603 next = scan + NEXT_OFF(scan);
2609 if (locinput == PL_bostr)
2611 /* regtill = regbol; */
2616 if (locinput == PL_bostr ||
2617 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2623 if (locinput == PL_bostr)
2627 if (locinput == PL_reg_ganch)
2633 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2638 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2640 if (PL_regeol - locinput > 1)
2644 if (PL_regeol != locinput)
2648 if (!nextchr && locinput >= PL_regeol)
2651 locinput += PL_utf8skip[nextchr];
2652 if (locinput > PL_regeol)
2654 nextchr = UCHARAT(locinput);
2657 nextchr = UCHARAT(++locinput);
2660 if (!nextchr && locinput >= PL_regeol)
2662 nextchr = UCHARAT(++locinput);
2665 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2668 locinput += PL_utf8skip[nextchr];
2669 if (locinput > PL_regeol)
2671 nextchr = UCHARAT(locinput);
2674 nextchr = UCHARAT(++locinput);
2680 traverse the TRIE keeping track of all accepting states
2681 we transition through until we get to a failing node.
2689 U8 *uc = ( U8* )locinput;
2696 U8 *uscan = (U8*)NULL;
2698 SV *sv_accept_buff = NULL;
2699 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2700 trie_type = do_utf8 ?
2701 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2704 /* what trie are we using right now */
2706 = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2707 accepted = 0; /* how many accepting states we have seen */
2710 while ( state && uc <= (U8*)PL_regeol ) {
2712 if (trie->states[ state ].wordnum) {
2716 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2717 sv_accept_buff=newSV(bufflen *
2718 sizeof(reg_trie_accepted) - 1);
2719 SvCUR_set(sv_accept_buff,
2720 sizeof(reg_trie_accepted));
2721 SvPOK_on(sv_accept_buff);
2722 sv_2mortal(sv_accept_buff);
2724 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2727 if (accepted >= bufflen) {
2729 accept_buff =(reg_trie_accepted*)
2730 SvGROW(sv_accept_buff,
2731 bufflen * sizeof(reg_trie_accepted));
2733 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2734 + sizeof(reg_trie_accepted));
2736 accept_buff[accepted].wordnum = trie->states[state].wordnum;
2737 accept_buff[accepted].endpos = uc;
2741 base = trie->states[ state ].trans.base;
2743 DEBUG_TRIE_EXECUTE_r(
2744 PerlIO_printf( Perl_debug_log,
2745 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2746 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2747 (UV)state, (UV)base, (UV)accepted );
2751 switch (trie_type) {
2752 case trie_uft8_fold:
2754 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2759 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2760 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2761 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2762 foldlen -= UNISKIP( uvc );
2763 uscan = foldbuf + UNISKIP( uvc );
2767 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2776 charid = trie->charmap[ uvc ];
2780 if (trie->widecharmap) {
2781 SV** svpp = (SV**)NULL;
2782 svpp = hv_fetch(trie->widecharmap,
2783 (char*)&uvc, sizeof(UV), 0);
2785 charid = (U16)SvIV(*svpp);
2790 (base + charid > trie->uniquecharcount )
2791 && (base + charid - 1 - trie->uniquecharcount
2793 && trie->trans[base + charid - 1 -
2794 trie->uniquecharcount].check == state)
2796 state = trie->trans[base + charid - 1 -
2797 trie->uniquecharcount ].next;
2808 DEBUG_TRIE_EXECUTE_r(
2809 PerlIO_printf( Perl_debug_log,
2810 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2811 charid, uvc, (UV)state, PL_colors[5] );
2818 There was at least one accepting state that we
2819 transitioned through. Presumably the number of accepting
2820 states is going to be low, typically one or two. So we
2821 simply scan through to find the one with lowest wordnum.
2822 Once we find it, we swap the last state into its place
2823 and decrement the size. We then try to match the rest of
2824 the pattern at the point where the word ends, if we
2825 succeed then we end the loop, otherwise the loop
2826 eventually terminates once all of the accepting states
2830 if ( accepted == 1 ) {
2832 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2833 PerlIO_printf( Perl_debug_log,
2834 "%*s %sonly one match : #%d <%s>%s\n",
2835 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2836 accept_buff[ 0 ].wordnum,
2837 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2840 PL_reginput = (char *)accept_buff[ 0 ].endpos;
2841 /* in this case we free tmps/leave before we call regmatch
2842 as we wont be using accept_buff again. */
2845 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2846 /*** all unsaved local vars undefined at this point */
2849 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2850 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
2853 while ( !result && accepted-- ) {
2856 for( cur = 1 ; cur <= accepted ; cur++ ) {
2857 DEBUG_TRIE_EXECUTE_r(
2858 PerlIO_printf( Perl_debug_log,
2859 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2860 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2861 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2862 accept_buff[ cur ].wordnum, PL_colors[5] );
2865 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2869 SV ** const tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2870 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2871 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2872 accept_buff[best].wordnum,
2873 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
2876 if ( best<accepted ) {
2877 reg_trie_accepted tmp = accept_buff[ best ];
2878 accept_buff[ best ] = accept_buff[ accepted ];
2879 accept_buff[ accepted ] = tmp;
2882 PL_reginput = (char *)accept_buff[ best ].endpos;
2885 as far as I can tell we only need the SAVETMPS/FREETMPS
2886 for re's with EVAL in them but I'm leaving them in for
2887 all until I can be sure.
2890 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2891 /*** all unsaved local vars undefined at this point */
2904 /* unreached codepoint */
2906 char *s = STRING(scan);
2908 if (do_utf8 != UTF) {
2909 /* The target and the pattern have differing utf8ness. */
2911 const char *e = s + ln;
2914 /* The target is utf8, the pattern is not utf8. */
2919 if (NATIVE_TO_UNI(*(U8*)s) !=
2920 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2928 /* The target is not utf8, the pattern is utf8. */
2933 if (NATIVE_TO_UNI(*((U8*)l)) !=
2934 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2942 nextchr = UCHARAT(locinput);
2945 /* The target and the pattern have the same utf8ness. */
2946 /* Inline the first character, for speed. */
2947 if (UCHARAT(s) != nextchr)
2949 if (PL_regeol - locinput < ln)
2951 if (ln > 1 && memNE(s, locinput, ln))
2954 nextchr = UCHARAT(locinput);
2958 PL_reg_flags |= RF_tainted;
2961 char *s = STRING(scan);
2964 if (do_utf8 || UTF) {
2965 /* Either target or the pattern are utf8. */
2967 char *e = PL_regeol;
2969 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2970 l, &e, 0, do_utf8)) {
2971 /* One more case for the sharp s:
2972 * pack("U0U*", 0xDF) =~ /ss/i,
2973 * the 0xC3 0x9F are the UTF-8
2974 * byte sequence for the U+00DF. */
2976 toLOWER(s[0]) == 's' &&
2978 toLOWER(s[1]) == 's' &&
2985 nextchr = UCHARAT(locinput);
2989 /* Neither the target and the pattern are utf8. */
2991 /* Inline the first character, for speed. */
2992 if (UCHARAT(s) != nextchr &&
2993 UCHARAT(s) != ((OP(scan) == EXACTF)
2994 ? PL_fold : PL_fold_locale)[nextchr])
2996 if (PL_regeol - locinput < ln)
2998 if (ln > 1 && (OP(scan) == EXACTF
2999 ? ibcmp(s, locinput, ln)
3000 : ibcmp_locale(s, locinput, ln)))
3003 nextchr = UCHARAT(locinput);
3008 STRLEN inclasslen = PL_regeol - locinput;
3010 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
3012 if (locinput >= PL_regeol)
3014 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3015 nextchr = UCHARAT(locinput);
3020 nextchr = UCHARAT(locinput);
3021 if (!REGINCLASS(scan, (U8*)locinput))
3023 if (!nextchr && locinput >= PL_regeol)
3025 nextchr = UCHARAT(++locinput);
3029 /* If we might have the case of the German sharp s
3030 * in a casefolding Unicode character class. */
3032 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3033 locinput += SHARP_S_SKIP;
3034 nextchr = UCHARAT(locinput);
3040 PL_reg_flags |= RF_tainted;
3046 LOAD_UTF8_CHARCLASS_ALNUM();
3047 if (!(OP(scan) == ALNUM
3048 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3049 : isALNUM_LC_utf8((U8*)locinput)))
3053 locinput += PL_utf8skip[nextchr];
3054 nextchr = UCHARAT(locinput);
3057 if (!(OP(scan) == ALNUM
3058 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3060 nextchr = UCHARAT(++locinput);
3063 PL_reg_flags |= RF_tainted;
3066 if (!nextchr && locinput >= PL_regeol)
3069 LOAD_UTF8_CHARCLASS_ALNUM();
3070 if (OP(scan) == NALNUM
3071 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3072 : isALNUM_LC_utf8((U8*)locinput))
3076 locinput += PL_utf8skip[nextchr];
3077 nextchr = UCHARAT(locinput);
3080 if (OP(scan) == NALNUM
3081 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3083 nextchr = UCHARAT(++locinput);
3087 PL_reg_flags |= RF_tainted;
3091 /* was last char in word? */
3093 if (locinput == PL_bostr)
3096 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3098 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
3100 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3101 ln = isALNUM_uni(ln);
3102 LOAD_UTF8_CHARCLASS_ALNUM();
3103 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3106 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3107 n = isALNUM_LC_utf8((U8*)locinput);
3111 ln = (locinput != PL_bostr) ?
3112 UCHARAT(locinput - 1) : '\n';
3113 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3115 n = isALNUM(nextchr);
3118 ln = isALNUM_LC(ln);
3119 n = isALNUM_LC(nextchr);
3122 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3123 OP(scan) == BOUNDL))
3127 PL_reg_flags |= RF_tainted;
3133 if (UTF8_IS_CONTINUED(nextchr)) {
3134 LOAD_UTF8_CHARCLASS_SPACE();
3135 if (!(OP(scan) == SPACE
3136 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3137 : isSPACE_LC_utf8((U8*)locinput)))
3141 locinput += PL_utf8skip[nextchr];
3142 nextchr = UCHARAT(locinput);
3145 if (!(OP(scan) == SPACE
3146 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3148 nextchr = UCHARAT(++locinput);
3151 if (!(OP(scan) == SPACE
3152 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3154 nextchr = UCHARAT(++locinput);
3158 PL_reg_flags |= RF_tainted;
3161 if (!nextchr && locinput >= PL_regeol)
3164 LOAD_UTF8_CHARCLASS_SPACE();
3165 if (OP(scan) == NSPACE
3166 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3167 : isSPACE_LC_utf8((U8*)locinput))
3171 locinput += PL_utf8skip[nextchr];
3172 nextchr = UCHARAT(locinput);
3175 if (OP(scan) == NSPACE
3176 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3178 nextchr = UCHARAT(++locinput);
3181 PL_reg_flags |= RF_tainted;
3187 LOAD_UTF8_CHARCLASS_DIGIT();
3188 if (!(OP(scan) == DIGIT
3189 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3190 : isDIGIT_LC_utf8((U8*)locinput)))
3194 locinput += PL_utf8skip[nextchr];
3195 nextchr = UCHARAT(locinput);
3198 if (!(OP(scan) == DIGIT
3199 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3201 nextchr = UCHARAT(++locinput);
3204 PL_reg_flags |= RF_tainted;
3207 if (!nextchr && locinput >= PL_regeol)
3210 LOAD_UTF8_CHARCLASS_DIGIT();
3211 if (OP(scan) == NDIGIT
3212 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3213 : isDIGIT_LC_utf8((U8*)locinput))
3217 locinput += PL_utf8skip[nextchr];
3218 nextchr = UCHARAT(locinput);
3221 if (OP(scan) == NDIGIT
3222 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3224 nextchr = UCHARAT(++locinput);
3227 if (locinput >= PL_regeol)
3230 LOAD_UTF8_CHARCLASS_MARK();
3231 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3233 locinput += PL_utf8skip[nextchr];
3234 while (locinput < PL_regeol &&
3235 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3236 locinput += UTF8SKIP(locinput);
3237 if (locinput > PL_regeol)
3242 nextchr = UCHARAT(locinput);
3245 PL_reg_flags |= RF_tainted;
3250 n = ARG(scan); /* which paren pair */
3251 ln = PL_regstartp[n];
3252 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3253 if ((I32)*PL_reglastparen < n || ln == -1)
3254 sayNO; /* Do not match unless seen CLOSEn. */
3255 if (ln == PL_regendp[n])
3259 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3261 const char *e = PL_bostr + PL_regendp[n];
3263 * Note that we can't do the "other character" lookup trick as
3264 * in the 8-bit case (no pun intended) because in Unicode we
3265 * have to map both upper and title case to lower case.
3267 if (OP(scan) == REFF) {
3269 STRLEN ulen1, ulen2;
3270 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3271 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3275 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3276 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3277 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3284 nextchr = UCHARAT(locinput);
3288 /* Inline the first character, for speed. */
3289 if (UCHARAT(s) != nextchr &&
3291 (UCHARAT(s) != ((OP(scan) == REFF
3292 ? PL_fold : PL_fold_locale)[nextchr]))))
3294 ln = PL_regendp[n] - ln;
3295 if (locinput + ln > PL_regeol)
3297 if (ln > 1 && (OP(scan) == REF
3298 ? memNE(s, locinput, ln)
3300 ? ibcmp(s, locinput, ln)
3301 : ibcmp_locale(s, locinput, ln))))
3304 nextchr = UCHARAT(locinput);
3316 OP_4tree * const oop = PL_op;
3317 COP * const ocurcop = PL_curcop;
3320 struct regexp * const oreg = PL_reg_re;
3323 PL_op = (OP_4tree*)PL_regdata->data[n];
3324 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3325 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
3326 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3329 SV ** const before = SP;
3330 CALLRUNOPS(aTHX); /* Scalar context. */
3333 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3341 PAD_RESTORE_LOCAL(old_comppad);
3342 PL_curcop = ocurcop;
3344 if (logical == 2) { /* Postponed subexpression. */
3351 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3352 mg = mg_find(sv, PERL_MAGIC_qr);
3353 else if (SvSMAGICAL(ret)) {
3354 if (SvGMAGICAL(ret))
3355 sv_unmagic(ret, PERL_MAGIC_qr);
3357 mg = mg_find(ret, PERL_MAGIC_qr);
3361 re = (regexp *)mg->mg_obj;
3362 (void)ReREFCNT_inc(re);
3366 const char * const t = SvPV_const(ret, len);
3368 char * const oprecomp = PL_regprecomp;
3369 const I32 osize = PL_regsize;
3370 const I32 onpar = PL_regnpar;
3373 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3374 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3376 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3378 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3380 PL_regprecomp = oprecomp;
3385 PerlIO_printf(Perl_debug_log,
3386 "Entering embedded \"%s%.60s%s%s\"\n",
3390 (strlen(re->precomp) > 60 ? "..." : ""))
3393 state.prev = PL_reg_call_cc;
3395 state.re = PL_reg_re;
3399 cp = regcppush(0); /* Save *all* the positions. */
3402 state.ss = PL_savestack_ix;
3403 *PL_reglastparen = 0;
3404 *PL_reglastcloseparen = 0;
3405 PL_reg_call_cc = &state;
3406 PL_reginput = locinput;
3407 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3408 ((re->reganch & ROPT_UTF8) != 0);
3409 if (toggleutf) PL_reg_flags ^= RF_utf8;
3411 /* XXXX This is too dramatic a measure... */
3414 /* XXX the only recursion left in regmatch() */
3415 if (regmatch(re->program + 1)) {
3416 /* Even though we succeeded, we need to restore
3417 global variables, since we may be wrapped inside
3418 SUSPEND, thus the match may be not finished yet. */
3420 /* XXXX Do this only if SUSPENDed? */
3421 PL_reg_call_cc = state.prev;
3423 PL_reg_re = state.re;
3424 cache_re(PL_reg_re);
3425 if (toggleutf) PL_reg_flags ^= RF_utf8;
3427 /* XXXX This is too dramatic a measure... */
3430 /* These are needed even if not SUSPEND. */
3436 REGCP_UNWIND(lastcp);
3438 PL_reg_call_cc = state.prev;
3440 PL_reg_re = state.re;
3441 cache_re(PL_reg_re);
3442 if (toggleutf) PL_reg_flags ^= RF_utf8;
3444 /* XXXX This is too dramatic a measure... */
3454 sv_setsv(save_scalar(PL_replgv), ret);
3460 n = ARG(scan); /* which paren pair */
3461 PL_reg_start_tmp[n] = locinput;
3466 n = ARG(scan); /* which paren pair */
3467 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3468 PL_regendp[n] = locinput - PL_bostr;
3469 if (n > (I32)*PL_reglastparen)
3470 *PL_reglastparen = n;
3471 *PL_reglastcloseparen = n;
3474 n = ARG(scan); /* which paren pair */
3475 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3478 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3480 next = NEXTOPER(NEXTOPER(scan));
3482 next = scan + ARG(scan);
3483 if (OP(next) == IFTHEN) /* Fake one. */
3484 next = NEXTOPER(NEXTOPER(next));
3488 logical = scan->flags;
3490 /*******************************************************************
3491 cc contains infoblock about the innermost (...)* loop, and
3492 a pointer to the next outer infoblock.
3494 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3496 1) After matching Y, regnode for CURLYX is processed;
3498 2) This regnode mallocs an infoblock, and calls regmatch() recursively
3499 with the starting point at WHILEM node;
3501 3) Each hit of WHILEM node tries to match A and Z (in the order
3502 depending on the current iteration, min/max of {min,max} and
3503 greediness). The information about where are nodes for "A"
3504 and "Z" is read from the infoblock, as is info on how many times "A"
3505 was already matched, and greediness.
3507 4) After A matches, the same WHILEM node is hit again.
3509 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3510 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3511 resets cc, since this Y(A)*Z can be a part of some other loop:
3512 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3513 of the external loop.
3515 Currently present infoblocks form a tree with a stem formed by PL_curcc
3516 and whatever it mentions via ->next, and additional attached trees
3517 corresponding to temporarily unset infoblocks as in "5" above.
3519 In the following picture, infoblocks for outer loop of
3520 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3521 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3522 infoblocks are drawn below the "reset" infoblock.
3524 In fact in the picture below we do not show failed matches for Z and T
3525 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3526 more obvious *why* one needs to *temporary* unset infoblocks.]
3528 Matched REx position InfoBlocks Comment
3532 Y A)*?Z)*?T x <- O <- I
3533 YA )*?Z)*?T x <- O <- I
3534 YA A)*?Z)*?T x <- O <- I
3535 YAA )*?Z)*?T x <- O <- I
3536 YAA Z)*?T x <- O # Temporary unset I
3539 YAAZ Y(A)*?Z)*?T x <- O
3542 YAAZY (A)*?Z)*?T x <- O
3545 YAAZY A)*?Z)*?T x <- O <- I
3548 YAAZYA )*?Z)*?T x <- O <- I
3551 YAAZYA Z)*?T x <- O # Temporary unset I
3557 YAAZYAZ T x # Temporary unset O
3564 *******************************************************************/
3567 /* No need to save/restore up to this paren */
3568 I32 parenfloor = scan->flags;
3572 Newx(newcc, 1, CURCUR);
3577 cp = PL_savestack_ix;
3578 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3580 /* XXXX Probably it is better to teach regpush to support
3581 parenfloor > PL_regsize... */
3582 if (parenfloor > (I32)*PL_reglastparen)
3583 parenfloor = *PL_reglastparen; /* Pessimization... */
3584 cc->parenfloor = parenfloor;
3586 cc->min = ARG1(scan);
3587 cc->max = ARG2(scan);
3588 cc->scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3590 cc->minmod = minmod;
3592 PL_reginput = locinput;
3593 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3594 /*** all unsaved local vars undefined at this point */
3603 * This is really hard to understand, because after we match
3604 * what we're trying to match, we must make sure the rest of
3605 * the REx is going to match for sure, and to do that we have
3606 * to go back UP the parse tree by recursing ever deeper. And
3607 * if it fails, we have to reset our parent's current state
3608 * that we can try again after backing off.
3611 lastloc = cc->lastloc; /* Detection of 0-len. */
3615 n = cc->cur + 1; /* how many we know we matched */
3616 PL_reginput = locinput;
3619 PerlIO_printf(Perl_debug_log,
3620 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3621 REPORT_CODE_OFF+PL_regindent*2, "",
3622 (long)n, (long)cc->min,
3623 (long)cc->max, PTR2UV(cc))
3626 /* If degenerate scan matches "", assume scan done. */
3628 if (locinput == cc->lastloc && n >= cc->min) {
3634 PerlIO_printf(Perl_debug_log,
3635 "%*s empty match detected, try continuation...\n",
3636 REPORT_CODE_OFF+PL_regindent*2, "")
3638 REGMATCH(oldcc->next, WHILEM1);
3639 /*** all unsaved local vars undefined at this point */
3644 cc->oldcc->cur = ln;
3648 /* First just match a string of min scans. */
3652 cc->lastloc = locinput;
3653 REGMATCH(cc->scan, WHILEM2);
3654 /*** all unsaved local vars undefined at this point */
3658 cc->lastloc = lastloc;
3663 /* Check whether we already were at this position.
3664 Postpone detection until we know the match is not
3665 *that* much linear. */
3666 if (!PL_reg_maxiter) {
3667 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3668 PL_reg_leftiter = PL_reg_maxiter;
3670 if (PL_reg_leftiter-- == 0) {
3671 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3672 if (PL_reg_poscache) {
3673 if ((I32)PL_reg_poscache_size < size) {
3674 Renew(PL_reg_poscache, size, char);
3675 PL_reg_poscache_size = size;
3677 Zero(PL_reg_poscache, size, char);
3680 PL_reg_poscache_size = size;
3681 Newxz(PL_reg_poscache, size, char);
3684 PerlIO_printf(Perl_debug_log,
3685 "%sDetected a super-linear match, switching on caching%s...\n",
3686 PL_colors[4], PL_colors[5])
3689 if (PL_reg_leftiter < 0) {
3690 cache_offset = locinput - PL_bostr;
3692 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3693 + cache_offset * (scan->flags>>4);
3694 cache_bit = cache_offset % 8;
3696 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3698 PerlIO_printf(Perl_debug_log,
3699 "%*s already tried at this position...\n",
3700 REPORT_CODE_OFF+PL_regindent*2, "")
3702 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3703 /* cache records success */
3706 /* cache records failure */
3709 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3713 /* Prefer next over scan for minimal matching. */
3720 cp = regcppush(oldcc->parenfloor);
3722 REGMATCH(oldcc->next, WHILEM3);
3723 /*** all unsaved local vars undefined at this point */
3727 CACHEsayYES; /* All done. */
3729 REGCP_UNWIND(lastcp);
3732 cc->oldcc->cur = ln;
3734 if (n >= cc->max) { /* Maximum greed exceeded? */
3735 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3736 && !(PL_reg_flags & RF_warned)) {
3737 PL_reg_flags |= RF_warned;
3738 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3739 "Complex regular subexpression recursion",
3746 PerlIO_printf(Perl_debug_log,
3747 "%*s trying longer...\n",
3748 REPORT_CODE_OFF+PL_regindent*2, "")
3750 /* Try scanning more and see if it helps. */
3751 PL_reginput = locinput;
3753 cc->lastloc = locinput;
3754 cp = regcppush(cc->parenfloor);
3756 REGMATCH(cc->scan, WHILEM4);
3757 /*** all unsaved local vars undefined at this point */
3762 REGCP_UNWIND(lastcp);
3765 cc->lastloc = lastloc;
3769 /* Prefer scan over next for maximal matching. */
3771 if (n < cc->max) { /* More greed allowed? */
3772 cp = regcppush(cc->parenfloor);
3774 cc->lastloc = locinput;
3776 REGMATCH(cc->scan, WHILEM5);
3777 /*** all unsaved local vars undefined at this point */
3782 REGCP_UNWIND(lastcp);
3783 regcppop(); /* Restore some previous $<digit>s? */
3784 PL_reginput = locinput;
3786 PerlIO_printf(Perl_debug_log,
3787 "%*s failed, try continuation...\n",
3788 REPORT_CODE_OFF+PL_regindent*2, "")
3791 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3792 && !(PL_reg_flags & RF_warned)) {
3793 PL_reg_flags |= RF_warned;
3794 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3795 "Complex regular subexpression recursion",
3799 /* Failed deeper matches of scan, so see if this one works. */
3804 REGMATCH(oldcc->next, WHILEM6);
3805 /*** all unsaved local vars undefined at this point */
3810 cc->oldcc->cur = ln;
3812 cc->lastloc = lastloc;
3817 next = scan + ARG(scan);
3820 inner = NEXTOPER(NEXTOPER(scan));
3823 inner = NEXTOPER(scan);
3827 if (OP(next) != c1) /* No choice. */
3828 next = inner; /* Avoid recursion. */
3830 const I32 lastparen = *PL_reglastparen;
3831 /* Put unwinding data on stack */
3832 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3833 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3837 uw->type = ((c1 == BRANCH)
3839 : RE_UNWIND_BRANCHJ);
3840 uw->lastparen = lastparen;
3842 uw->locinput = locinput;
3843 uw->nextchr = nextchr;
3845 uw->regindent = ++PL_regindent;
3848 REGCP_SET(uw->lastcp);
3850 /* Now go into the first branch */
3860 curlym_l = matches = 0;
3862 /* We suppose that the next guy does not need
3863 backtracking: in particular, it is of constant non-zero length,
3864 and has no parenths to influence future backrefs. */
3865 ln = ARG1(scan); /* min to match */
3866 n = ARG2(scan); /* max to match */
3867 paren = scan->flags;
3869 if (paren > PL_regsize)
3871 if (paren > (I32)*PL_reglastparen)
3872 *PL_reglastparen = paren;
3874 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3876 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3877 PL_reginput = locinput;
3878 maxwanted = minmod ? ln : n;
3880 while (PL_reginput < PL_regeol && matches < maxwanted) {
3881 REGMATCH(scan, CURLYM1);
3882 /*** all unsaved local vars undefined at this point */
3885 /* on first match, determine length, curlym_l */
3887 if (PL_reg_match_utf8) {
3889 while (s < PL_reginput) {
3895 curlym_l = PL_reginput - locinput;
3897 if (curlym_l == 0) {
3898 matches = maxwanted;
3902 locinput = PL_reginput;
3906 PL_reginput = locinput;
3910 if (ln && matches < ln)
3912 if (HAS_TEXT(next) || JUMPABLE(next)) {
3913 regnode *text_node = next;
3915 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3917 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3919 if (PL_regkind[(U8)OP(text_node)] == REF) {