5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
78 **** Alterations to Henry's code are...
80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
91 #define PERL_IN_REGEXEC_C
96 #define RF_tainted 1 /* tainted information used? */
97 #define RF_warned 2 /* warned about big count? */
98 #define RF_evaled 4 /* Did an EVAL with setting? */
99 #define RF_utf8 8 /* String contains multibyte chars? */
101 #define UTF ((PL_reg_flags & RF_utf8) != 0)
103 #define RS_init 1 /* eval environment created */
104 #define RS_set 2 /* replsv value is set */
107 #define STATIC static
110 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
116 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
117 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
119 #define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
122 #define HOPBACKc(pos, off) ((char*) \
123 ((PL_reg_match_utf8) \
124 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
125 : (pos - off >= PL_bostr) \
130 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
131 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
134 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
136 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
137 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
138 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
139 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
141 /* for use after a quantifier and before an EXACT-like node -- japhy */
142 #define JUMPABLE(rn) ( \
143 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
144 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
145 OP(rn) == PLUS || OP(rn) == MINMOD || \
146 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
149 #define HAS_TEXT(rn) ( \
150 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
154 Search for mandatory following text node; for lookahead, the text must
155 follow but for lookbehind (rn->flags != 0) we skip to the next step.
157 #define FIND_NEXT_IMPT(rn) STMT_START { \
158 while (JUMPABLE(rn)) \
159 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
160 rn = NEXTOPER(NEXTOPER(rn)); \
161 else if (OP(rn) == PLUS) \
163 else if (OP(rn) == IFMATCH) \
164 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
165 else rn += NEXT_OFF(rn); \
168 static void restore_pos(pTHX_ void *arg);
171 S_regcppush(pTHX_ I32 parenfloor)
174 const int retval = PL_savestack_ix;
175 #define REGCP_PAREN_ELEMS 4
176 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
179 if (paren_elems_to_push < 0)
180 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
182 #define REGCP_OTHER_ELEMS 6
183 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
184 for (p = PL_regsize; p > parenfloor; p--) {
185 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
186 SSPUSHINT(PL_regendp[p]);
187 SSPUSHINT(PL_regstartp[p]);
188 SSPUSHPTR(PL_reg_start_tmp[p]);
191 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
192 SSPUSHINT(PL_regsize);
193 SSPUSHINT(*PL_reglastparen);
194 SSPUSHINT(*PL_reglastcloseparen);
195 SSPUSHPTR(PL_reginput);
196 #define REGCP_FRAME_ELEMS 2
197 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
198 * are needed for the regexp context stack bookkeeping. */
199 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
200 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
205 /* These are needed since we do not localize EVAL nodes: */
206 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
207 " Setting an EVAL scope, savestack=%"IVdf"\n", \
208 (IV)PL_savestack_ix)); cp = PL_savestack_ix
210 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
213 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
223 GET_RE_DEBUG_FLAGS_DECL;
225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
229 input = (char *) SSPOPPTR;
230 *PL_reglastcloseparen = SSPOPINT;
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
234 /* Now restore the parentheses context. */
235 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
236 i > 0; i -= REGCP_PAREN_ELEMS) {
238 paren = (U32)SSPOPINT;
239 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
240 PL_regstartp[paren] = SSPOPINT;
242 if (paren <= *PL_reglastparen)
243 PL_regendp[paren] = tmps;
245 PerlIO_printf(Perl_debug_log,
246 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
247 (UV)paren, (IV)PL_regstartp[paren],
248 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
249 (IV)PL_regendp[paren],
250 (paren > *PL_reglastparen ? "(no)" : ""));
254 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
255 PerlIO_printf(Perl_debug_log,
256 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
257 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
261 /* It would seem that the similar code in regtry()
262 * already takes care of this, and in fact it is in
263 * a better location to since this code can #if 0-ed out
264 * but the code in regtry() is needed or otherwise tests
265 * requiring null fields (pat.t#187 and split.t#{13,14}
266 * (as of patchlevel 7877) will fail. Then again,
267 * this code seems to be necessary or otherwise
268 * building DynaLoader will fail:
269 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
271 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
272 if ((I32)paren > PL_regsize)
273 PL_regstartp[paren] = -1;
274 PL_regendp[paren] = -1;
280 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
282 #define TRYPAREN(paren, n, input, where) { \
285 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
286 PL_regendp[paren] = input - PL_bostr; \
289 PL_regendp[paren] = -1; \
291 REGMATCH(next, where); \
295 PL_regendp[paren] = -1; \
300 * pregexec and friends
304 - pregexec - match a regexp against a string
307 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
308 char *strbeg, I32 minend, SV *screamer, U32 nosave)
309 /* strend: pointer to null at end of string */
310 /* strbeg: real beginning of string */
311 /* minend: end of match must be >=minend after stringarg. */
312 /* nosave: For optimizations. */
315 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
316 nosave ? 0 : REXEC_COPY_STR);
320 S_cache_re(pTHX_ regexp *prog)
323 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
325 PL_regprogram = prog->program;
327 PL_regnpar = prog->nparens;
328 PL_regdata = prog->data;
333 * Need to implement the following flags for reg_anch:
335 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
337 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
338 * INTUIT_AUTORITATIVE_ML
339 * INTUIT_ONCE_NOML - Intuit can match in one location only.
342 * Another flag for this function: SECOND_TIME (so that float substrs
343 * with giant delta may be not rechecked).
346 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
348 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
349 Otherwise, only SvCUR(sv) is used to get strbeg. */
351 /* XXXX We assume that strpos is strbeg unless sv. */
353 /* XXXX Some places assume that there is a fixed substring.
354 An update may be needed if optimizer marks as "INTUITable"
355 RExen without fixed substrings. Similarly, it is assumed that
356 lengths of all the strings are no more than minlen, thus they
357 cannot come from lookahead.
358 (Or minlen should take into account lookahead.) */
360 /* A failure to find a constant substring means that there is no need to make
361 an expensive call to REx engine, thus we celebrate a failure. Similarly,
362 finding a substring too deep into the string means that less calls to
363 regtry() should be needed.
365 REx compiler's optimizer found 4 possible hints:
366 a) Anchored substring;
368 c) Whether we are anchored (beginning-of-line or \G);
369 d) First node (of those at offset 0) which may distingush positions;
370 We use a)b)d) and multiline-part of c), and try to find a position in the
371 string which does not contradict any of them.
374 /* Most of decisions we do here should have been done at compile time.
375 The nodes of the REx which we used for the search should have been
376 deleted from the finite automaton. */
379 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
380 char *strend, U32 flags, re_scream_pos_data *data)
383 register I32 start_shift = 0;
384 /* Should be nonnegative! */
385 register I32 end_shift = 0;
390 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
392 register char *other_last = NULL; /* other substr checked before this */
393 char *check_at = NULL; /* check substr found at this pos */
394 const I32 multiline = prog->reganch & PMf_MULTILINE;
396 const char * const i_strpos = strpos;
397 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
400 GET_RE_DEBUG_FLAGS_DECL;
402 RX_MATCH_UTF8_set(prog,do_utf8);
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
411 const char *s = PL_reg_match_utf8 ?
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
414 const int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
418 if (PL_reg_match_utf8)
419 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
423 PL_colors[4], PL_colors[5], PL_colors[0],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
428 (int)(len > 60 ? 60 : len),
430 (len > 60 ? "..." : "")
434 /* CHR_DIST() would be more correct here but it makes things slow. */
435 if (prog->minlen > strend - strpos) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
437 "String too short... [re_intuit_start]\n"));
440 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
443 if (!prog->check_utf8 && prog->check_substr)
444 to_utf8_substr(prog);
445 check = prog->check_utf8;
447 if (!prog->check_substr && prog->check_utf8)
448 to_byte_substr(prog);
449 check = prog->check_substr;
451 if (check == &PL_sv_undef) {
452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
453 "Non-utf string cannot match utf check string\n"));
456 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
457 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
458 || ( (prog->reganch & ROPT_ANCH_BOL)
459 && !multiline ) ); /* Check after \n? */
462 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
463 | ROPT_IMPLICIT)) /* not a real BOL */
464 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
466 && (strpos != strbeg)) {
467 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
470 if (prog->check_offset_min == prog->check_offset_max &&
471 !(prog->reganch & ROPT_CANY_SEEN)) {
472 /* Substring at constant offset from beg-of-str... */
475 s = HOP3c(strpos, prog->check_offset_min, strend);
477 slen = SvCUR(check); /* >= 1 */
479 if ( strend - s > slen || strend - s < slen - 1
480 || (strend - s == slen && strend[-1] != '\n')) {
481 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
484 /* Now should match s[0..slen-2] */
486 if (slen && (*SvPVX_const(check) != *s
488 && memNE(SvPVX_const(check), s, slen)))) {
490 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
494 else if (*SvPVX_const(check) != *s
495 || ((slen = SvCUR(check)) > 1
496 && memNE(SvPVX_const(check), s, slen)))
499 goto success_at_start;
502 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
504 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
505 end_shift = prog->minlen - start_shift -
506 CHR_SVLEN(check) + (SvTAIL(check) != 0);
508 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
509 - (SvTAIL(check) != 0);
510 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
512 if (end_shift < eshift)
516 else { /* Can match at random position */
519 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
520 /* Should be nonnegative! */
521 end_shift = prog->minlen - start_shift -
522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
525 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
527 Perl_croak(aTHX_ "panic: end_shift");
531 /* Find a possible match in the region s..strend by looking for
532 the "check" substring in the region corrected by start/end_shift. */
533 if (flags & REXEC_SCREAM) {
534 I32 p = -1; /* Internal iterator of scream. */
535 I32 * const pp = data ? data->scream_pos : &p;
537 if (PL_screamfirst[BmRARE(check)] >= 0
538 || ( BmRARE(check) == '\n'
539 && (BmPREVIOUS(check) == SvCUR(check) - 1)
541 s = screaminstr(sv, check,
542 start_shift + (s - strbeg), end_shift, pp, 0);
545 /* we may be pointing at the wrong string */
546 if (s && RX_MATCH_COPIED(prog))
547 s = strbeg + (s - SvPVX_const(sv));
549 *data->scream_olds = s;
551 else if (prog->reganch & ROPT_CANY_SEEN)
552 s = fbm_instr((U8*)(s + start_shift),
553 (U8*)(strend - end_shift),
554 check, multiline ? FBMrf_MULTILINE : 0);
556 s = fbm_instr(HOP3(s, start_shift, strend),
557 HOP3(strend, -end_shift, strbeg),
558 check, multiline ? FBMrf_MULTILINE : 0);
560 /* Update the count-of-usability, remove useless subpatterns,
563 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
564 (s ? "Found" : "Did not find"),
565 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
567 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
569 PL_colors[1], (SvTAIL(check) ? "$" : ""),
570 (s ? " at offset " : "...\n") ) );
577 /* Finish the diagnostic message */
578 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
580 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
581 Start with the other substr.
582 XXXX no SCREAM optimization yet - and a very coarse implementation
583 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
584 *always* match. Probably should be marked during compile...
585 Probably it is right to do no SCREAM here...
588 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
589 /* Take into account the "other" substring. */
590 /* XXXX May be hopelessly wrong for UTF... */
593 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
596 char * const last = HOP3c(s, -start_shift, strbeg);
601 t = s - prog->check_offset_max;
602 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
604 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
609 t = HOP3c(t, prog->anchored_offset, strend);
610 if (t < other_last) /* These positions already checked */
612 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
615 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
616 /* On end-of-str: see comment below. */
617 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
618 if (must == &PL_sv_undef) {
620 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
625 HOP3(HOP3(last1, prog->anchored_offset, strend)
626 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
628 multiline ? FBMrf_MULTILINE : 0
630 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
631 "%s anchored substr \"%s%.*s%s\"%s",
632 (s ? "Found" : "Contradicts"),
635 - (SvTAIL(must)!=0)),
637 PL_colors[1], (SvTAIL(must) ? "$" : "")));
639 if (last1 >= last2) {
640 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
641 ", giving up...\n"));
644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
645 ", trying floating at offset %ld...\n",
646 (long)(HOP3c(s1, 1, strend) - i_strpos)));
647 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
648 s = HOP3c(last, 1, strend);
652 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
653 (long)(s - i_strpos)));
654 t = HOP3c(s, -prog->anchored_offset, strbeg);
655 other_last = HOP3c(s, 1, strend);
663 else { /* Take into account the floating substring. */
668 t = HOP3c(s, -start_shift, strbeg);
670 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
671 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
672 last = HOP3c(t, prog->float_max_offset, strend);
673 s = HOP3c(t, prog->float_min_offset, strend);
676 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
677 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
678 /* fbm_instr() takes into account exact value of end-of-str
679 if the check is SvTAIL(ed). Since false positives are OK,
680 and end-of-str is not later than strend we are OK. */
681 if (must == &PL_sv_undef) {
683 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
686 s = fbm_instr((unsigned char*)s,
687 (unsigned char*)last + SvCUR(must)
689 must, multiline ? FBMrf_MULTILINE : 0);
690 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
691 (s ? "Found" : "Contradicts"),
693 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
695 PL_colors[1], (SvTAIL(must) ? "$" : "")));
698 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
699 ", giving up...\n"));
702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
703 ", trying anchored starting at offset %ld...\n",
704 (long)(s1 + 1 - i_strpos)));
706 s = HOP3c(t, 1, strend);
710 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
711 (long)(s - i_strpos)));
712 other_last = s; /* Fix this later. --Hugo */
721 t = s - prog->check_offset_max;
722 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
724 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
726 /* Fixed substring is found far enough so that the match
727 cannot start at strpos. */
729 if (ml_anch && t[-1] != '\n') {
730 /* Eventually fbm_*() should handle this, but often
731 anchored_offset is not 0, so this check will not be wasted. */
732 /* XXXX In the code below we prefer to look for "^" even in
733 presence of anchored substrings. And we search even
734 beyond the found float position. These pessimizations
735 are historical artefacts only. */
737 while (t < strend - prog->minlen) {
739 if (t < check_at - prog->check_offset_min) {
740 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
741 /* Since we moved from the found position,
742 we definitely contradict the found anchored
743 substr. Due to the above check we do not
744 contradict "check" substr.
745 Thus we can arrive here only if check substr
746 is float. Redo checking for "other"=="fixed".
749 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
750 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
751 goto do_other_anchored;
753 /* We don't contradict the found floating substring. */
754 /* XXXX Why not check for STCLASS? */
756 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
757 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
760 /* Position contradicts check-string */
761 /* XXXX probably better to look for check-string
762 than for "\n", so one should lower the limit for t? */
763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
764 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
765 other_last = strpos = s = t + 1;
770 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
771 PL_colors[0], PL_colors[1]));
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
776 PL_colors[0], PL_colors[1]));
780 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
783 /* The found string does not prohibit matching at strpos,
784 - no optimization of calling REx engine can be performed,
785 unless it was an MBOL and we are not after MBOL,
786 or a future STCLASS check will fail this. */
788 /* Even in this situation we may use MBOL flag if strpos is offset
789 wrt the start of the string. */
790 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
791 && (strpos != strbeg) && strpos[-1] != '\n'
792 /* May be due to an implicit anchor of m{.*foo} */
793 && !(prog->reganch & ROPT_IMPLICIT))
798 DEBUG_EXECUTE_r( if (ml_anch)
799 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
800 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
803 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
805 prog->check_utf8 /* Could be deleted already */
806 && --BmUSEFUL(prog->check_utf8) < 0
807 && (prog->check_utf8 == prog->float_utf8)
809 prog->check_substr /* Could be deleted already */
810 && --BmUSEFUL(prog->check_substr) < 0
811 && (prog->check_substr == prog->float_substr)
814 /* If flags & SOMETHING - do not do it many times on the same match */
815 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
816 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
817 if (do_utf8 ? prog->check_substr : prog->check_utf8)
818 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
819 prog->check_substr = prog->check_utf8 = NULL; /* disable */
820 prog->float_substr = prog->float_utf8 = NULL; /* clear */
821 check = NULL; /* abort */
823 /* XXXX This is a remnant of the old implementation. It
824 looks wasteful, since now INTUIT can use many
826 prog->reganch &= ~RE_USE_INTUIT;
833 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
834 if (prog->regstclass) {
835 /* minlen == 0 is possible if regstclass is \b or \B,
836 and the fixed substr is ''$.
837 Since minlen is already taken into account, s+1 is before strend;
838 accidentally, minlen >= 1 guaranties no false positives at s + 1
839 even for \b or \B. But (minlen? 1 : 0) below assumes that
840 regstclass does not come from lookahead... */
841 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
842 This leaves EXACTF only, which is dealt with in find_byclass(). */
843 const U8* const str = (U8*)STRING(prog->regstclass);
844 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
845 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
847 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
848 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
849 : (prog->float_substr || prog->float_utf8
850 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
856 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
859 const char *what = NULL;
861 if (endpos == strend) {
862 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
863 "Could not match STCLASS...\n") );
866 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
867 "This position contradicts STCLASS...\n") );
868 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
870 /* Contradict one of substrings */
871 if (prog->anchored_substr || prog->anchored_utf8) {
872 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
873 DEBUG_EXECUTE_r( what = "anchored" );
875 s = HOP3c(t, 1, strend);
876 if (s + start_shift + end_shift > strend) {
877 /* XXXX Should be taken into account earlier? */
878 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
879 "Could not match STCLASS...\n") );
884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
885 "Looking for %s substr starting at offset %ld...\n",
886 what, (long)(s + start_shift - i_strpos)) );
889 /* Have both, check_string is floating */
890 if (t + start_shift >= check_at) /* Contradicts floating=check */
891 goto retry_floating_check;
892 /* Recheck anchored substring, but not floating... */
896 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
897 "Looking for anchored substr starting at offset %ld...\n",
898 (long)(other_last - i_strpos)) );
899 goto do_other_anchored;
901 /* Another way we could have checked stclass at the
902 current position only: */
907 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
908 "Looking for /%s^%s/m starting at offset %ld...\n",
909 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
912 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
914 /* Check is floating subtring. */
915 retry_floating_check:
916 t = check_at - start_shift;
917 DEBUG_EXECUTE_r( what = "floating" );
918 goto hop_and_restart;
921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
922 "By STCLASS: moving %ld --> %ld\n",
923 (long)(t - i_strpos), (long)(s - i_strpos))
927 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
928 "Does not contradict STCLASS...\n");
933 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
934 PL_colors[4], (check ? "Guessed" : "Giving up"),
935 PL_colors[5], (long)(s - i_strpos)) );
938 fail_finish: /* Substring not found */
939 if (prog->check_substr || prog->check_utf8) /* could be removed already */
940 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
942 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
943 PL_colors[4], PL_colors[5]));
947 /* We know what class REx starts with. Try to find this position... */
949 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
952 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
956 register STRLEN uskip;
960 register I32 tmp = 1; /* Scratch variable? */
961 register const bool do_utf8 = PL_reg_match_utf8;
963 /* We know what class it must start with. */
967 while (s + (uskip = UTF8SKIP(s)) <= strend) {
968 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
969 !UTF8_IS_INVARIANT((U8)s[0]) ?
970 reginclass(c, (U8*)s, 0, do_utf8) :
971 REGINCLASS(c, (U8*)s)) {
972 if (tmp && (norun || regtry(prog, s)))
986 if (REGINCLASS(c, (U8*)s) ||
987 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
988 /* The assignment of 2 is intentional:
989 * for the folded sharp s, the skip is 2. */
990 (skip = SHARP_S_SKIP))) {
991 if (tmp && (norun || regtry(prog, s)))
1003 while (s < strend) {
1004 if (tmp && (norun || regtry(prog, s)))
1013 ln = STR_LEN(c); /* length to match in octets/bytes */
1014 lnc = (I32) ln; /* length to match in characters */
1016 STRLEN ulen1, ulen2;
1018 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1019 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1020 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1022 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1023 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1025 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1027 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1030 while (sm < ((U8 *) m + ln)) {
1045 c2 = PL_fold_locale[c1];
1047 e = HOP3c(strend, -((I32)lnc), s);
1050 e = s; /* Due to minlen logic of intuit() */
1052 /* The idea in the EXACTF* cases is to first find the
1053 * first character of the EXACTF* node and then, if
1054 * necessary, case-insensitively compare the full
1055 * text of the node. The c1 and c2 are the first
1056 * characters (though in Unicode it gets a bit
1057 * more complicated because there are more cases
1058 * than just upper and lower: one needs to use
1059 * the so-called folding case for case-insensitive
1060 * matching (called "loose matching" in Unicode).
1061 * ibcmp_utf8() will do just that. */
1065 U8 tmpbuf [UTF8_MAXBYTES+1];
1066 STRLEN len, foldlen;
1067 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1069 /* Upper and lower of 1st char are equal -
1070 * probably not a "letter". */
1072 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1076 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1077 m, (char **)0, ln, (bool)UTF))
1078 && (norun || regtry(prog, s)) )
1081 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1082 uvchr_to_utf8(tmpbuf, c);
1083 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1085 && (f == c1 || f == c2)
1086 && (ln == foldlen ||
1087 !ibcmp_utf8((char *) foldbuf,
1088 (char **)0, foldlen, do_utf8,
1090 (char **)0, ln, (bool)UTF))
1091 && (norun || regtry(prog, s)) )
1099 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1102 /* Handle some of the three Greek sigmas cases.
1103 * Note that not all the possible combinations
1104 * are handled here: some of them are handled
1105 * by the standard folding rules, and some of
1106 * them (the character class or ANYOF cases)
1107 * are handled during compiletime in
1108 * regexec.c:S_regclass(). */
1109 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1110 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1111 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1113 if ( (c == c1 || c == c2)
1115 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1116 m, (char **)0, ln, (bool)UTF))
1117 && (norun || regtry(prog, s)) )
1120 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1121 uvchr_to_utf8(tmpbuf, c);
1122 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1124 && (f == c1 || f == c2)
1125 && (ln == foldlen ||
1126 !ibcmp_utf8((char *) foldbuf,
1127 (char **)0, foldlen, do_utf8,
1129 (char **)0, ln, (bool)UTF))
1130 && (norun || regtry(prog, s)) )
1141 && (ln == 1 || !(OP(c) == EXACTF
1143 : ibcmp_locale(s, m, ln)))
1144 && (norun || regtry(prog, s)) )
1150 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1151 && (ln == 1 || !(OP(c) == EXACTF
1153 : ibcmp_locale(s, m, ln)))
1154 && (norun || regtry(prog, s)) )
1161 PL_reg_flags |= RF_tainted;
1168 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1169 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1171 tmp = ((OP(c) == BOUND ?
1172 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1173 LOAD_UTF8_CHARCLASS_ALNUM();
1174 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1175 if (tmp == !(OP(c) == BOUND ?
1176 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1177 isALNUM_LC_utf8((U8*)s)))
1180 if ((norun || regtry(prog, s)))
1187 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1188 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1189 while (s < strend) {
1191 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1193 if ((norun || regtry(prog, s)))
1199 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1203 PL_reg_flags |= RF_tainted;
1210 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1211 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1213 tmp = ((OP(c) == NBOUND ?
1214 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1215 LOAD_UTF8_CHARCLASS_ALNUM();
1216 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1217 if (tmp == !(OP(c) == NBOUND ?
1218 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1219 isALNUM_LC_utf8((U8*)s)))
1221 else if ((norun || regtry(prog, s)))
1227 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1228 tmp = ((OP(c) == NBOUND ?
1229 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1230 while (s < strend) {
1232 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1234 else if ((norun || regtry(prog, s)))
1239 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1244 LOAD_UTF8_CHARCLASS_ALNUM();
1245 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1246 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1247 if (tmp && (norun || regtry(prog, s)))
1258 while (s < strend) {
1260 if (tmp && (norun || regtry(prog, s)))
1272 PL_reg_flags |= RF_tainted;
1274 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1275 if (isALNUM_LC_utf8((U8*)s)) {
1276 if (tmp && (norun || regtry(prog, s)))
1287 while (s < strend) {
1288 if (isALNUM_LC(*s)) {
1289 if (tmp && (norun || regtry(prog, s)))
1302 LOAD_UTF8_CHARCLASS_ALNUM();
1303 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1304 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1305 if (tmp && (norun || regtry(prog, s)))
1316 while (s < strend) {
1318 if (tmp && (norun || regtry(prog, s)))
1330 PL_reg_flags |= RF_tainted;
1332 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1333 if (!isALNUM_LC_utf8((U8*)s)) {
1334 if (tmp && (norun || regtry(prog, s)))
1345 while (s < strend) {
1346 if (!isALNUM_LC(*s)) {
1347 if (tmp && (norun || regtry(prog, s)))
1360 LOAD_UTF8_CHARCLASS_SPACE();
1361 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1362 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1363 if (tmp && (norun || regtry(prog, s)))
1374 while (s < strend) {
1376 if (tmp && (norun || regtry(prog, s)))
1388 PL_reg_flags |= RF_tainted;
1390 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1391 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1392 if (tmp && (norun || regtry(prog, s)))
1403 while (s < strend) {
1404 if (isSPACE_LC(*s)) {
1405 if (tmp && (norun || regtry(prog, s)))
1418 LOAD_UTF8_CHARCLASS_SPACE();
1419 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1420 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1421 if (tmp && (norun || regtry(prog, s)))
1432 while (s < strend) {
1434 if (tmp && (norun || regtry(prog, s)))
1446 PL_reg_flags |= RF_tainted;
1448 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1449 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1450 if (tmp && (norun || regtry(prog, s)))
1461 while (s < strend) {
1462 if (!isSPACE_LC(*s)) {
1463 if (tmp && (norun || regtry(prog, s)))
1476 LOAD_UTF8_CHARCLASS_DIGIT();
1477 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1478 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1479 if (tmp && (norun || regtry(prog, s)))
1490 while (s < strend) {
1492 if (tmp && (norun || regtry(prog, s)))
1504 PL_reg_flags |= RF_tainted;
1506 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1507 if (isDIGIT_LC_utf8((U8*)s)) {
1508 if (tmp && (norun || regtry(prog, s)))
1519 while (s < strend) {
1520 if (isDIGIT_LC(*s)) {
1521 if (tmp && (norun || regtry(prog, s)))
1534 LOAD_UTF8_CHARCLASS_DIGIT();
1535 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1536 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1537 if (tmp && (norun || regtry(prog, s)))
1548 while (s < strend) {
1550 if (tmp && (norun || regtry(prog, s)))
1562 PL_reg_flags |= RF_tainted;
1564 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1565 if (!isDIGIT_LC_utf8((U8*)s)) {
1566 if (tmp && (norun || regtry(prog, s)))
1577 while (s < strend) {
1578 if (!isDIGIT_LC(*s)) {
1579 if (tmp && (norun || regtry(prog, s)))
1591 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1600 - regexec_flags - match a regexp against a string
1603 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1604 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1605 /* strend: pointer to null at end of string */
1606 /* strbeg: real beginning of string */
1607 /* minend: end of match must be >=minend after stringarg. */
1608 /* data: May be used for some additional optimizations. */
1609 /* nosave: For optimizations. */
1613 register regnode *c;
1614 register char *startpos = stringarg;
1615 I32 minlen; /* must match at least this many chars */
1616 I32 dontbother = 0; /* how many characters not to try at end */
1617 I32 end_shift = 0; /* Same for the end. */ /* CC */
1618 I32 scream_pos = -1; /* Internal iterator of scream. */
1619 char *scream_olds = NULL;
1620 SV* oreplsv = GvSV(PL_replgv);
1621 const bool do_utf8 = DO_UTF8(sv);
1622 const I32 multiline = prog->reganch & PMf_MULTILINE;
1624 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1625 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
1628 GET_RE_DEBUG_FLAGS_DECL;
1630 PERL_UNUSED_ARG(data);
1631 RX_MATCH_UTF8_set(prog,do_utf8);
1635 PL_regnarrate = DEBUG_r_TEST;
1638 /* Be paranoid... */
1639 if (prog == NULL || startpos == NULL) {
1640 Perl_croak(aTHX_ "NULL regexp parameter");
1644 minlen = prog->minlen;
1645 if (strend - startpos < minlen) {
1646 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1647 "String too short [regexec_flags]...\n"));
1651 /* Check validity of program. */
1652 if (UCHARAT(prog->program) != REG_MAGIC) {
1653 Perl_croak(aTHX_ "corrupted regexp program");
1657 PL_reg_eval_set = 0;
1660 if (prog->reganch & ROPT_UTF8)
1661 PL_reg_flags |= RF_utf8;
1663 /* Mark beginning of line for ^ and lookbehind. */
1664 PL_regbol = startpos;
1668 /* Mark end of line for $ (and such) */
1671 /* see how far we have to get to not match where we matched before */
1672 PL_regtill = startpos+minend;
1674 /* We start without call_cc context. */
1677 /* If there is a "must appear" string, look for it. */
1680 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1683 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1684 PL_reg_ganch = startpos;
1685 else if (sv && SvTYPE(sv) >= SVt_PVMG
1687 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1688 && mg->mg_len >= 0) {
1689 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1690 if (prog->reganch & ROPT_ANCH_GPOS) {
1691 if (s > PL_reg_ganch)
1696 else /* pos() not defined */
1697 PL_reg_ganch = strbeg;
1700 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1701 re_scream_pos_data d;
1703 d.scream_olds = &scream_olds;
1704 d.scream_pos = &scream_pos;
1705 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1708 goto phooey; /* not present */
1713 const char * const s0 = UTF
1714 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1717 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1718 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1719 UNI_DISPLAY_REGEX) : startpos;
1720 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1723 PerlIO_printf(Perl_debug_log,
1724 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1725 PL_colors[4], PL_colors[5], PL_colors[0],
1728 len0 > 60 ? "..." : "",
1730 (int)(len1 > 60 ? 60 : len1),
1732 (len1 > 60 ? "..." : "")
1736 /* Simplest case: anchored match need be tried only once. */
1737 /* [unless only anchor is BOL and multiline is set] */
1738 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1739 if (s == startpos && regtry(prog, startpos))
1741 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1742 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1747 dontbother = minlen - 1;
1748 end = HOP3c(strend, -dontbother, strbeg) - 1;
1749 /* for multiline we only have to try after newlines */
1750 if (prog->check_substr || prog->check_utf8) {
1754 if (regtry(prog, s))
1759 if (prog->reganch & RE_USE_INTUIT) {
1760 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1771 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1772 if (regtry(prog, s))
1779 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1780 if (regtry(prog, PL_reg_ganch))
1785 /* Messy cases: unanchored match. */
1786 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1787 /* we have /x+whatever/ */
1788 /* it must be a one character string (XXXX Except UTF?) */
1793 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1794 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1795 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1798 while (s < strend) {
1800 DEBUG_EXECUTE_r( did_match = 1 );
1801 if (regtry(prog, s)) goto got_it;
1803 while (s < strend && *s == ch)
1810 while (s < strend) {
1812 DEBUG_EXECUTE_r( did_match = 1 );
1813 if (regtry(prog, s)) goto got_it;
1815 while (s < strend && *s == ch)
1821 DEBUG_EXECUTE_r(if (!did_match)
1822 PerlIO_printf(Perl_debug_log,
1823 "Did not find anchored character...\n")
1826 else if (prog->anchored_substr != NULL
1827 || prog->anchored_utf8 != NULL
1828 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1829 && prog->float_max_offset < strend - s)) {
1834 char *last1; /* Last position checked before */
1838 if (prog->anchored_substr || prog->anchored_utf8) {
1839 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1840 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1841 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1842 back_max = back_min = prog->anchored_offset;
1844 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1847 back_max = prog->float_max_offset;
1848 back_min = prog->float_min_offset;
1850 if (must == &PL_sv_undef)
1851 /* could not downgrade utf8 check substring, so must fail */
1854 last = HOP3c(strend, /* Cannot start after this */
1855 -(I32)(CHR_SVLEN(must)
1856 - (SvTAIL(must) != 0) + back_min), strbeg);
1859 last1 = HOPc(s, -1);
1861 last1 = s - 1; /* bogus */
1863 /* XXXX check_substr already used to find "s", can optimize if
1864 check_substr==must. */
1866 dontbother = end_shift;
1867 strend = HOPc(strend, -dontbother);
1868 while ( (s <= last) &&
1869 ((flags & REXEC_SCREAM)
1870 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1871 end_shift, &scream_pos, 0))
1872 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1873 (unsigned char*)strend, must,
1874 multiline ? FBMrf_MULTILINE : 0))) ) {
1875 /* we may be pointing at the wrong string */
1876 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1877 s = strbeg + (s - SvPVX_const(sv));
1878 DEBUG_EXECUTE_r( did_match = 1 );
1879 if (HOPc(s, -back_max) > last1) {
1880 last1 = HOPc(s, -back_min);
1881 s = HOPc(s, -back_max);
1884 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1886 last1 = HOPc(s, -back_min);
1890 while (s <= last1) {
1891 if (regtry(prog, s))
1897 while (s <= last1) {
1898 if (regtry(prog, s))
1904 DEBUG_EXECUTE_r(if (!did_match)
1905 PerlIO_printf(Perl_debug_log,
1906 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1907 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1908 ? "anchored" : "floating"),
1910 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1912 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1916 else if ((c = prog->regstclass)) {
1918 I32 op = (U8)OP(prog->regstclass);
1919 /* don't bother with what can't match */
1920 if (PL_regkind[op] != EXACT && op != CANY)
1921 strend = HOPc(strend, -(minlen - 1));
1924 SV *prop = sv_newmortal();
1932 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1933 UNI_DISPLAY_REGEX) :
1935 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1937 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1938 len1 = UTF ? SvCUR(dsv1) : strend - s;
1939 PerlIO_printf(Perl_debug_log,
1940 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1944 if (find_byclass(prog, c, s, strend, 0))
1946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1950 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1955 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1956 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1957 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1959 if (flags & REXEC_SCREAM) {
1960 last = screaminstr(sv, float_real, s - strbeg,
1961 end_shift, &scream_pos, 1); /* last one */
1963 last = scream_olds; /* Only one occurrence. */
1964 /* we may be pointing at the wrong string */
1965 else if (RX_MATCH_COPIED(prog))
1966 s = strbeg + (s - SvPVX_const(sv));
1970 const char * const little = SvPV_const(float_real, len);
1972 if (SvTAIL(float_real)) {
1973 if (memEQ(strend - len + 1, little, len - 1))
1974 last = strend - len + 1;
1975 else if (!multiline)
1976 last = memEQ(strend - len, little, len)
1977 ? strend - len : NULL;
1983 last = rninstr(s, strend, little, little + len);
1985 last = strend; /* matching "$" */
1989 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
1992 goto phooey; /* Should not happen! */
1994 dontbother = strend - last + prog->float_min_offset;
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
2002 if (regtry(prog, s))
2011 if (regtry(prog, s))
2013 } while (s++ < strend);
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2029 restore_pos(aTHX_ 0);
2032 /* make sure $`, $&, $', and $digit will work later */
2033 if ( !(flags & REXEC_NOT_FIRST) ) {
2034 RX_MATCH_COPY_FREE(prog);
2035 if (flags & REXEC_COPY_STR) {
2036 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2037 #ifdef PERL_OLD_COPY_ON_WRITE
2039 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2041 PerlIO_printf(Perl_debug_log,
2042 "Copy on write: regexp capture, type %d\n",
2045 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2046 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2047 assert (SvPOKp(prog->saved_copy));
2051 RX_MATCH_COPIED_on(prog);
2052 s = savepvn(strbeg, i);
2058 prog->subbeg = strbeg;
2059 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2067 PL_colors[4], PL_colors[5]));
2068 if (PL_reg_eval_set)
2069 restore_pos(aTHX_ 0);
2074 - regtry - try match at specific point
2076 STATIC I32 /* 0 failure, 1 success */
2077 S_regtry(pTHX_ regexp *prog, char *startpos)
2084 GET_RE_DEBUG_FLAGS_DECL;
2087 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2089 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2092 PL_reg_eval_set = RS_init;
2093 DEBUG_EXECUTE_r(DEBUG_s(
2094 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2095 (IV)(PL_stack_sp - PL_stack_base));
2097 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2098 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2099 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2101 /* Apparently this is not needed, judging by wantarray. */
2102 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2103 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2106 /* Make $_ available to executed code. */
2107 if (PL_reg_sv != DEFSV) {
2112 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2113 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2114 /* prepare for quick setting of pos */
2115 #ifdef PERL_OLD_COPY_ON_WRITE
2117 sv_force_normal_flags(sv, 0);
2119 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2120 &PL_vtbl_mglob, NULL, 0);
2124 PL_reg_oldpos = mg->mg_len;
2125 SAVEDESTRUCTOR_X(restore_pos, 0);
2127 if (!PL_reg_curpm) {
2128 Newxz(PL_reg_curpm, 1, PMOP);
2131 SV* repointer = newSViv(0);
2132 /* so we know which PL_regex_padav element is PL_reg_curpm */
2133 SvFLAGS(repointer) |= SVf_BREAK;
2134 av_push(PL_regex_padav,repointer);
2135 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2136 PL_regex_pad = AvARRAY(PL_regex_padav);
2140 PM_SETRE(PL_reg_curpm, prog);
2141 PL_reg_oldcurpm = PL_curpm;
2142 PL_curpm = PL_reg_curpm;
2143 if (RX_MATCH_COPIED(prog)) {
2144 /* Here is a serious problem: we cannot rewrite subbeg,
2145 since it may be needed if this match fails. Thus
2146 $` inside (?{}) could fail... */
2147 PL_reg_oldsaved = prog->subbeg;
2148 PL_reg_oldsavedlen = prog->sublen;
2149 #ifdef PERL_OLD_COPY_ON_WRITE
2150 PL_nrs = prog->saved_copy;
2152 RX_MATCH_COPIED_off(prog);
2155 PL_reg_oldsaved = NULL;
2156 prog->subbeg = PL_bostr;
2157 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2159 prog->startp[0] = startpos - PL_bostr;
2160 PL_reginput = startpos;
2161 PL_regstartp = prog->startp;
2162 PL_regendp = prog->endp;
2163 PL_reglastparen = &prog->lastparen;
2164 PL_reglastcloseparen = &prog->lastcloseparen;
2165 prog->lastparen = 0;
2166 prog->lastcloseparen = 0;
2168 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2169 if (PL_reg_start_tmpl <= prog->nparens) {
2170 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2171 if(PL_reg_start_tmp)
2172 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2177 /* XXXX What this code is doing here?!!! There should be no need
2178 to do this again and again, PL_reglastparen should take care of
2181 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2182 * Actually, the code in regcppop() (which Ilya may be meaning by
2183 * PL_reglastparen), is not needed at all by the test suite
2184 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2185 * enough, for building DynaLoader, or otherwise this
2186 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2187 * will happen. Meanwhile, this code *is* needed for the
2188 * above-mentioned test suite tests to succeed. The common theme
2189 * on those tests seems to be returning null fields from matches.
2194 if (prog->nparens) {
2195 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2202 if (regmatch(prog->program + 1)) {
2203 prog->endp[0] = PL_reginput - PL_bostr;
2206 REGCP_UNWIND(lastcp);
2210 #define RE_UNWIND_BRANCH 1
2211 #define RE_UNWIND_BRANCHJ 2
2215 typedef struct { /* XX: makes sense to enlarge it... */
2219 } re_unwind_generic_t;
2232 } re_unwind_branch_t;
2234 typedef union re_unwind_t {
2236 re_unwind_generic_t generic;
2237 re_unwind_branch_t branch;
2240 #define sayYES goto yes
2241 #define sayNO goto no
2242 #define sayNO_ANYOF goto no_anyof
2243 #define sayYES_FINAL goto yes_final
2244 #define sayYES_LOUD goto yes_loud
2245 #define sayNO_FINAL goto no_final
2246 #define sayNO_SILENT goto do_no
2247 #define saySAME(x) if (x) goto yes; else goto no
2249 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2250 #define POSCACHE_SEEN 1 /* we know what we're caching */
2251 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2252 #define CACHEsayYES STMT_START { \
2253 if (st->whilem.cache_offset | st->whilem.cache_bit) { \
2254 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2255 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2256 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2257 /* cache records failure, but this is success */ \
2259 PerlIO_printf(Perl_debug_log, \
2260 "%*s (remove success from failure cache)\n", \
2261 REPORT_CODE_OFF+PL_regindent*2, "") \
2263 PL_reg_poscache[st->whilem.cache_offset] &= ~(1<<st->whilem.cache_bit); \
2268 #define CACHEsayNO STMT_START { \
2269 if (st->whilem.cache_offset | st->whilem.cache_bit) { \
2270 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2271 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2272 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2273 /* cache records success, but this is failure */ \
2275 PerlIO_printf(Perl_debug_log, \
2276 "%*s (remove failure from success cache)\n", \
2277 REPORT_CODE_OFF+PL_regindent*2, "") \
2279 PL_reg_poscache[st->whilem.cache_offset] &= ~(1<<st->whilem.cache_bit); \
2285 /* this is used to determine how far from the left messages like
2286 'failed...' are printed. Currently 29 makes these messages line
2287 up with the opcode they refer to. Earlier perls used 25 which
2288 left these messages outdented making reviewing a debug output
2291 #define REPORT_CODE_OFF 29
2294 /* Make sure there is a test for this +1 options in re_tests */
2295 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2297 /* grab a new slab and return the first slot in it */
2299 STATIC regmatch_state *
2302 regmatch_slab *s = PL_regmatch_slab->next;
2304 Newx(s, 1, regmatch_slab);
2305 s->prev = PL_regmatch_slab;
2307 PL_regmatch_slab->next = s;
2309 PL_regmatch_slab = s;
2310 return &s->states[0];
2313 /* simulate a recursive call to regmatch */
2315 #define REGMATCH(ns, where) \
2318 st->resume_state = resume_##where; \
2319 goto start_recurse; \
2320 resume_point_##where:
2323 - regmatch - main matching routine
2325 * Conceptually the strategy is simple: check to see whether the current
2326 * node matches, call self recursively to see whether the rest matches,
2327 * and then act accordingly. In practice we make some effort to avoid
2328 * recursion, in particular by going through "ordinary" nodes (that don't
2329 * need to know whether the rest of the match failed) by a loop instead of
2332 /* [lwall] I've hoisted the register declarations to the outer block in order to
2333 * maybe save a little bit of pushing and popping on the stack. It also takes
2334 * advantage of machines that use a register save mask on subroutine entry.
2336 * This function used to be heavily recursive, but since this had the
2337 * effect of blowing the CPU stack on complex regexes, it has been
2338 * restructured to be iterative, and to save state onto the heap rather
2339 * than the stack. Essentially whereever regmatch() used to be called, it
2340 * pushes the current state, notes where to return, then jumps back into
2343 * Originally the structure of this function used to look something like
2348 while (scan != NULL) {
2349 a++; // do stuff with a and b
2355 if (regmatch(...)) // recurse
2365 * Now it looks something like this:
2373 regmatch_state *st = new();
2375 st->a++; // do stuff with a and b
2377 while (scan != NULL) {
2385 st->resume_state = resume_FOO;
2386 goto start_recurse; // recurse
2395 st = new(); push a new state
2396 st->a = 1; st->b = 2;
2403 switch (resume_state) {
2405 goto resume_point_FOO;
2412 * WARNING: this means that any line in this function that contains a
2413 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2414 * regmatch() using gotos instead. Thus the values of any local variables
2415 * not saved in the regmatch_state structure will have been lost when
2416 * execution resumes on the next line .
2418 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2419 * PL_regmatch_state always points to the currently active state, and
2420 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2421 * The first time regmatch is called, the first slab is allocated, and is
2422 * never freed until interpreter desctruction. When the slab is full,
2423 * a new one is allocated chained to the end. At exit from regmatch, slabs
2424 * allocated since entry are freed.
2428 STATIC I32 /* 0 failure, 1 success */
2429 S_regmatch(pTHX_ regnode *prog)
2432 register const bool do_utf8 = PL_reg_match_utf8;
2433 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
2435 regmatch_slab *orig_slab;
2436 regmatch_state *orig_state;
2438 /* the current state. This is a cached copy of PL_regmatch_state */
2439 register regmatch_state *st;
2441 /* cache heavy used fields of st in registers */
2442 register regnode *scan;
2443 register regnode *next;
2444 register I32 n = 0; /* initialize to shut up compiler warning */
2445 register char *locinput = PL_reginput;
2447 /* these variables are NOT saved during a recusive RFEGMATCH: */
2448 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2449 bool result; /* return value of S_regmatch */
2450 regnode *inner; /* Next node in internal branch. */
2451 int depth = 0; /* depth of recursion */
2454 SV *re_debug_flags = NULL;
2459 /* on first ever call to regmatch, allocate first slab */
2460 if (!PL_regmatch_slab) {
2461 Newx(PL_regmatch_slab, 1, regmatch_slab);
2462 PL_regmatch_slab->prev = NULL;
2463 PL_regmatch_slab->next = NULL;
2464 PL_regmatch_state = &PL_regmatch_slab->states[0] - 1;
2467 /* remember current high-water mark for exit */
2468 /* XXX this should be done with SAVE* instead */
2469 orig_slab = PL_regmatch_slab;
2470 orig_state = PL_regmatch_state;
2472 /* grab next free state slot */
2473 st = ++PL_regmatch_state;
2474 if (st > &(PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1]))
2475 st = PL_regmatch_state = S_push_slab(aTHX);
2482 /* Note that nextchr is a byte even in UTF */
2483 nextchr = UCHARAT(locinput);
2485 while (scan != NULL) {
2488 SV * const prop = sv_newmortal();
2489 const int docolor = *PL_colors[0];
2490 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2491 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2492 /* The part of the string before starttry has one color
2493 (pref0_len chars), between starttry and current
2494 position another one (pref_len - pref0_len chars),
2495 after the current position the third one.
2496 We assume that pref0_len <= pref_len, otherwise we
2497 decrease pref0_len. */
2498 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2499 ? (5 + taill) - l : locinput - PL_bostr;
2502 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2504 pref0_len = pref_len - (locinput - PL_reg_starttry);
2505 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2506 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2507 ? (5 + taill) - pref_len : PL_regeol - locinput);
2508 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2512 if (pref0_len > pref_len)
2513 pref0_len = pref_len;
2514 regprop(prop, scan);
2516 const char * const s0 =
2517 do_utf8 && OP(scan) != CANY ?
2518 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2519 pref0_len, 60, UNI_DISPLAY_REGEX) :
2520 locinput - pref_len;
2521 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2522 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2523 pv_uni_display(PERL_DEBUG_PAD(1),
2524 (U8*)(locinput - pref_len + pref0_len),
2525 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2526 locinput - pref_len + pref0_len;
2527 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2528 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2529 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2530 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2532 const int len2 = do_utf8 ? strlen(s2) : l;
2533 PerlIO_printf(Perl_debug_log,
2534 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2535 (IV)(locinput - PL_bostr),
2542 (docolor ? "" : "> <"),
2546 15 - l - pref_len + 1,
2548 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2553 next = scan + NEXT_OFF(scan);
2559 if (locinput == PL_bostr)
2561 /* regtill = regbol; */
2566 if (locinput == PL_bostr ||
2567 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2573 if (locinput == PL_bostr)
2577 if (locinput == PL_reg_ganch)
2583 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2588 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2590 if (PL_regeol - locinput > 1)
2594 if (PL_regeol != locinput)
2598 if (!nextchr && locinput >= PL_regeol)
2601 locinput += PL_utf8skip[nextchr];
2602 if (locinput > PL_regeol)
2604 nextchr = UCHARAT(locinput);
2607 nextchr = UCHARAT(++locinput);
2610 if (!nextchr && locinput >= PL_regeol)
2612 nextchr = UCHARAT(++locinput);
2615 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2618 locinput += PL_utf8skip[nextchr];
2619 if (locinput > PL_regeol)
2621 nextchr = UCHARAT(locinput);
2624 nextchr = UCHARAT(++locinput);
2630 traverse the TRIE keeping track of all accepting states
2631 we transition through until we get to a failing node.
2639 U8 *uc = ( U8* )locinput;
2646 U8 *uscan = (U8*)NULL;
2648 SV *sv_accept_buff = NULL;
2649 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2650 trie_type = do_utf8 ?
2651 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2654 /* what trie are we using right now */
2656 = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2657 st->trie.accepted = 0; /* how many accepting states we have seen */
2660 while ( state && uc <= (U8*)PL_regeol ) {
2662 if (trie->states[ state ].wordnum) {
2663 if (!st->trie.accepted ) {
2666 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2667 sv_accept_buff=newSV(bufflen *
2668 sizeof(reg_trie_accepted) - 1);
2669 SvCUR_set(sv_accept_buff,
2670 sizeof(reg_trie_accepted));
2671 SvPOK_on(sv_accept_buff);
2672 sv_2mortal(sv_accept_buff);
2673 st->trie.accept_buff =
2674 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2677 if (st->trie.accepted >= bufflen) {
2679 st->trie.accept_buff =(reg_trie_accepted*)
2680 SvGROW(sv_accept_buff,
2681 bufflen * sizeof(reg_trie_accepted));
2683 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2684 + sizeof(reg_trie_accepted));
2686 st->trie.accept_buff[st->trie.accepted].wordnum = trie->states[state].wordnum;
2687 st->trie.accept_buff[st->trie.accepted].endpos = uc;
2688 ++st->trie.accepted;
2691 base = trie->states[ state ].trans.base;
2693 DEBUG_TRIE_EXECUTE_r(
2694 PerlIO_printf( Perl_debug_log,
2695 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2696 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2697 (UV)state, (UV)base, (UV)st->trie.accepted );
2701 switch (trie_type) {
2702 case trie_uft8_fold:
2704 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2709 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2710 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2711 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2712 foldlen -= UNISKIP( uvc );
2713 uscan = foldbuf + UNISKIP( uvc );
2717 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2726 charid = trie->charmap[ uvc ];
2730 if (trie->widecharmap) {
2731 SV** svpp = (SV**)NULL;
2732 svpp = hv_fetch(trie->widecharmap,
2733 (char*)&uvc, sizeof(UV), 0);
2735 charid = (U16)SvIV(*svpp);
2740 (base + charid > trie->uniquecharcount )
2741 && (base + charid - 1 - trie->uniquecharcount
2743 && trie->trans[base + charid - 1 -
2744 trie->uniquecharcount].check == state)
2746 state = trie->trans[base + charid - 1 -
2747 trie->uniquecharcount ].next;
2758 DEBUG_TRIE_EXECUTE_r(
2759 PerlIO_printf( Perl_debug_log,
2760 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2761 charid, uvc, (UV)state, PL_colors[5] );
2764 if (!st->trie.accepted )
2768 There was at least one accepting state that we
2769 transitioned through. Presumably the number of accepting
2770 states is going to be low, typically one or two. So we
2771 simply scan through to find the one with lowest wordnum.
2772 Once we find it, we swap the last state into its place
2773 and decrement the size. We then try to match the rest of
2774 the pattern at the point where the word ends, if we
2775 succeed then we end the loop, otherwise the loop
2776 eventually terminates once all of the accepting states
2780 if ( st->trie.accepted == 1 ) {
2782 SV **tmp = av_fetch( trie->words, st->trie.accept_buff[ 0 ].wordnum-1, 0 );
2783 PerlIO_printf( Perl_debug_log,
2784 "%*s %sonly one match : #%d <%s>%s\n",
2785 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2786 st->trie.accept_buff[ 0 ].wordnum,
2787 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2790 PL_reginput = (char *)st->trie.accept_buff[ 0 ].endpos;
2791 /* in this case we free tmps/leave before we call regmatch
2792 as we wont be using accept_buff again. */
2795 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2796 /*** all unsaved local vars undefined at this point */
2799 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2800 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->trie.accepted,
2803 while ( !result && st->trie.accepted-- ) {
2806 for( cur = 1 ; cur <= st->trie.accepted ; cur++ ) {
2807 DEBUG_TRIE_EXECUTE_r(
2808 PerlIO_printf( Perl_debug_log,
2809 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2810 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2811 (IV)best, st->trie.accept_buff[ best ].wordnum, (IV)cur,
2812 st->trie.accept_buff[ cur ].wordnum, PL_colors[5] );
2815 if (st->trie.accept_buff[cur].wordnum <
2816 st->trie.accept_buff[best].wordnum)
2820 SV ** const tmp = av_fetch( trie->words, st->trie.accept_buff[ best ].wordnum - 1, 0 );
2821 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2822 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2823 st->trie.accept_buff[best].wordnum,
2824 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
2827 if ( best<st->trie.accepted ) {
2828 reg_trie_accepted tmp = st->trie.accept_buff[ best ];
2829 st->trie.accept_buff[ best ] = st->trie.accept_buff[ st->trie.accepted ];
2830 st->trie.accept_buff[ st->trie.accepted ] = tmp;
2831 best = st->trie.accepted;
2833 PL_reginput = (char *)st->trie.accept_buff[ best ].endpos;
2836 as far as I can tell we only need the SAVETMPS/FREETMPS
2837 for re's with EVAL in them but I'm leaving them in for
2838 all until I can be sure.
2841 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2842 /*** all unsaved local vars undefined at this point */
2855 /* unreached codepoint */
2857 char *s = STRING(scan);
2858 st->ln = STR_LEN(scan);
2859 if (do_utf8 != UTF) {
2860 /* The target and the pattern have differing utf8ness. */
2862 const char *e = s + st->ln;
2865 /* The target is utf8, the pattern is not utf8. */
2870 if (NATIVE_TO_UNI(*(U8*)s) !=
2871 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2879 /* The target is not utf8, the pattern is utf8. */
2884 if (NATIVE_TO_UNI(*((U8*)l)) !=
2885 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2893 nextchr = UCHARAT(locinput);
2896 /* The target and the pattern have the same utf8ness. */
2897 /* Inline the first character, for speed. */
2898 if (UCHARAT(s) != nextchr)
2900 if (PL_regeol - locinput < st->ln)
2902 if (st->ln > 1 && memNE(s, locinput, st->ln))
2905 nextchr = UCHARAT(locinput);
2909 PL_reg_flags |= RF_tainted;
2912 char *s = STRING(scan);
2913 st->ln = STR_LEN(scan);
2915 if (do_utf8 || UTF) {
2916 /* Either target or the pattern are utf8. */
2918 char *e = PL_regeol;
2920 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2921 l, &e, 0, do_utf8)) {
2922 /* One more case for the sharp s:
2923 * pack("U0U*", 0xDF) =~ /ss/i,
2924 * the 0xC3 0x9F are the UTF-8
2925 * byte sequence for the U+00DF. */
2927 toLOWER(s[0]) == 's' &&
2929 toLOWER(s[1]) == 's' &&
2936 nextchr = UCHARAT(locinput);
2940 /* Neither the target and the pattern are utf8. */
2942 /* Inline the first character, for speed. */
2943 if (UCHARAT(s) != nextchr &&
2944 UCHARAT(s) != ((OP(scan) == EXACTF)
2945 ? PL_fold : PL_fold_locale)[nextchr])
2947 if (PL_regeol - locinput < st->ln)
2949 if (st->ln > 1 && (OP(scan) == EXACTF
2950 ? ibcmp(s, locinput, st->ln)
2951 : ibcmp_locale(s, locinput, st->ln)))
2954 nextchr = UCHARAT(locinput);
2959 STRLEN inclasslen = PL_regeol - locinput;
2961 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2963 if (locinput >= PL_regeol)
2965 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2966 nextchr = UCHARAT(locinput);
2971 nextchr = UCHARAT(locinput);
2972 if (!REGINCLASS(scan, (U8*)locinput))
2974 if (!nextchr && locinput >= PL_regeol)
2976 nextchr = UCHARAT(++locinput);
2980 /* If we might have the case of the German sharp s
2981 * in a casefolding Unicode character class. */
2983 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2984 locinput += SHARP_S_SKIP;
2985 nextchr = UCHARAT(locinput);
2991 PL_reg_flags |= RF_tainted;
2997 LOAD_UTF8_CHARCLASS_ALNUM();
2998 if (!(OP(scan) == ALNUM
2999 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3000 : isALNUM_LC_utf8((U8*)locinput)))
3004 locinput += PL_utf8skip[nextchr];
3005 nextchr = UCHARAT(locinput);
3008 if (!(OP(scan) == ALNUM
3009 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3011 nextchr = UCHARAT(++locinput);
3014 PL_reg_flags |= RF_tainted;
3017 if (!nextchr && locinput >= PL_regeol)
3020 LOAD_UTF8_CHARCLASS_ALNUM();
3021 if (OP(scan) == NALNUM
3022 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3023 : isALNUM_LC_utf8((U8*)locinput))
3027 locinput += PL_utf8skip[nextchr];
3028 nextchr = UCHARAT(locinput);
3031 if (OP(scan) == NALNUM
3032 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3034 nextchr = UCHARAT(++locinput);
3038 PL_reg_flags |= RF_tainted;
3042 /* was last char in word? */
3044 if (locinput == PL_bostr)
3047 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3049 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
3051 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3052 st->ln = isALNUM_uni(st->ln);
3053 LOAD_UTF8_CHARCLASS_ALNUM();
3054 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3057 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3058 n = isALNUM_LC_utf8((U8*)locinput);
3062 st->ln = (locinput != PL_bostr) ?
3063 UCHARAT(locinput - 1) : '\n';
3064 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3065 st->ln = isALNUM(st->ln);
3066 n = isALNUM(nextchr);
3069 st->ln = isALNUM_LC(st->ln);
3070 n = isALNUM_LC(nextchr);
3073 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3074 OP(scan) == BOUNDL))
3078 PL_reg_flags |= RF_tainted;
3084 if (UTF8_IS_CONTINUED(nextchr)) {
3085 LOAD_UTF8_CHARCLASS_SPACE();
3086 if (!(OP(scan) == SPACE
3087 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3088 : isSPACE_LC_utf8((U8*)locinput)))
3092 locinput += PL_utf8skip[nextchr];
3093 nextchr = UCHARAT(locinput);
3096 if (!(OP(scan) == SPACE
3097 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3099 nextchr = UCHARAT(++locinput);
3102 if (!(OP(scan) == SPACE
3103 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3105 nextchr = UCHARAT(++locinput);
3109 PL_reg_flags |= RF_tainted;
3112 if (!nextchr && locinput >= PL_regeol)
3115 LOAD_UTF8_CHARCLASS_SPACE();
3116 if (OP(scan) == NSPACE
3117 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3118 : isSPACE_LC_utf8((U8*)locinput))
3122 locinput += PL_utf8skip[nextchr];
3123 nextchr = UCHARAT(locinput);
3126 if (OP(scan) == NSPACE
3127 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3129 nextchr = UCHARAT(++locinput);
3132 PL_reg_flags |= RF_tainted;
3138 LOAD_UTF8_CHARCLASS_DIGIT();
3139 if (!(OP(scan) == DIGIT
3140 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3141 : isDIGIT_LC_utf8((U8*)locinput)))
3145 locinput += PL_utf8skip[nextchr];
3146 nextchr = UCHARAT(locinput);
3149 if (!(OP(scan) == DIGIT
3150 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3152 nextchr = UCHARAT(++locinput);
3155 PL_reg_flags |= RF_tainted;
3158 if (!nextchr && locinput >= PL_regeol)
3161 LOAD_UTF8_CHARCLASS_DIGIT();
3162 if (OP(scan) == NDIGIT
3163 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3164 : isDIGIT_LC_utf8((U8*)locinput))
3168 locinput += PL_utf8skip[nextchr];
3169 nextchr = UCHARAT(locinput);
3172 if (OP(scan) == NDIGIT
3173 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3175 nextchr = UCHARAT(++locinput);
3178 if (locinput >= PL_regeol)
3181 LOAD_UTF8_CHARCLASS_MARK();
3182 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3184 locinput += PL_utf8skip[nextchr];
3185 while (locinput < PL_regeol &&
3186 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3187 locinput += UTF8SKIP(locinput);
3188 if (locinput > PL_regeol)
3193 nextchr = UCHARAT(locinput);
3196 PL_reg_flags |= RF_tainted;
3201 n = ARG(scan); /* which paren pair */
3202 st->ln = PL_regstartp[n];
3203 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3204 if ((I32)*PL_reglastparen < n || st->ln == -1)
3205 sayNO; /* Do not match unless seen CLOSEn. */
3206 if (st->ln == PL_regendp[n])
3209 s = PL_bostr + st->ln;
3210 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3212 const char *e = PL_bostr + PL_regendp[n];
3214 * Note that we can't do the "other character" lookup trick as
3215 * in the 8-bit case (no pun intended) because in Unicode we
3216 * have to map both upper and title case to lower case.
3218 if (OP(scan) == REFF) {
3220 STRLEN ulen1, ulen2;
3221 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3222 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3226 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3227 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3228 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3235 nextchr = UCHARAT(locinput);
3239 /* Inline the first character, for speed. */
3240 if (UCHARAT(s) != nextchr &&
3242 (UCHARAT(s) != ((OP(scan) == REFF
3243 ? PL_fold : PL_fold_locale)[nextchr]))))
3245 st->ln = PL_regendp[n] - st->ln;
3246 if (locinput + st->ln > PL_regeol)
3248 if (st->ln > 1 && (OP(scan) == REF
3249 ? memNE(s, locinput, st->ln)
3251 ? ibcmp(s, locinput, st->ln)
3252 : ibcmp_locale(s, locinput, st->ln))))
3255 nextchr = UCHARAT(locinput);
3267 OP_4tree * const oop = PL_op;
3268 COP * const ocurcop = PL_curcop;
3271 struct regexp * const oreg = PL_reg_re;
3274 PL_op = (OP_4tree*)PL_regdata->data[n];
3275 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3276 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
3277 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3280 SV ** const before = SP;
3281 CALLRUNOPS(aTHX); /* Scalar context. */
3284 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3292 PAD_RESTORE_LOCAL(old_comppad);
3293 PL_curcop = ocurcop;
3295 if (st->logical == 2) { /* Postponed subexpression. */
3302 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3303 mg = mg_find(sv, PERL_MAGIC_qr);
3304 else if (SvSMAGICAL(ret)) {
3305 if (SvGMAGICAL(ret))
3306 sv_unmagic(ret, PERL_MAGIC_qr);
3308 mg = mg_find(ret, PERL_MAGIC_qr);
3312 re = (regexp *)mg->mg_obj;
3313 (void)ReREFCNT_inc(re);
3317 const char * const t = SvPV_const(ret, len);
3319 char * const oprecomp = PL_regprecomp;
3320 const I32 osize = PL_regsize;
3321 const I32 onpar = PL_regnpar;
3324 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3325 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3327 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3329 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3331 PL_regprecomp = oprecomp;
3336 PerlIO_printf(Perl_debug_log,
3337 "Entering embedded \"%s%.60s%s%s\"\n",
3341 (strlen(re->precomp) > 60 ? "..." : ""))
3344 state.prev = PL_reg_call_cc;
3346 state.re = PL_reg_re;
3350 st->eval.cp = regcppush(0); /* Save *all* the positions. */
3351 REGCP_SET(st->eval.lastcp);
3353 state.ss = PL_savestack_ix;
3354 *PL_reglastparen = 0;
3355 *PL_reglastcloseparen = 0;
3356 PL_reg_call_cc = &state;
3357 PL_reginput = locinput;
3358 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3359 ((re->reganch & ROPT_UTF8) != 0);
3360 if (toggleutf) PL_reg_flags ^= RF_utf8;
3362 /* XXXX This is too dramatic a measure... */
3365 /* XXX the only recursion left in regmatch() */
3366 if (regmatch(re->program + 1)) {
3367 /* Even though we succeeded, we need to restore
3368 global variables, since we may be wrapped inside
3369 SUSPEND, thus the match may be not finished yet. */
3371 /* XXXX Do this only if SUSPENDed? */
3372 PL_reg_call_cc = state.prev;
3374 PL_reg_re = state.re;
3375 cache_re(PL_reg_re);
3376 if (toggleutf) PL_reg_flags ^= RF_utf8;
3378 /* XXXX This is too dramatic a measure... */
3381 /* These are needed even if not SUSPEND. */
3383 regcpblow(st->eval.cp);
3387 REGCP_UNWIND(st->eval.lastcp);
3389 PL_reg_call_cc = state.prev;
3391 PL_reg_re = state.re;
3392 cache_re(PL_reg_re);
3393 if (toggleutf) PL_reg_flags ^= RF_utf8;
3395 /* XXXX This is too dramatic a measure... */
3401 st->sw = SvTRUE(ret);
3405 sv_setsv(save_scalar(PL_replgv), ret);
3411 n = ARG(scan); /* which paren pair */
3412 PL_reg_start_tmp[n] = locinput;
3417 n = ARG(scan); /* which paren pair */
3418 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3419 PL_regendp[n] = locinput - PL_bostr;
3420 if (n > (I32)*PL_reglastparen)
3421 *PL_reglastparen = n;
3422 *PL_reglastcloseparen = n;
3425 n = ARG(scan); /* which paren pair */
3426 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3429 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3431 next = NEXTOPER(NEXTOPER(scan));
3433 next = scan + ARG(scan);
3434 if (OP(next) == IFTHEN) /* Fake one. */
3435 next = NEXTOPER(NEXTOPER(next));
3439 st->logical = scan->flags;
3441 /*******************************************************************
3442 cc contains infoblock about the innermost (...)* loop, and
3443 a pointer to the next outer infoblock.
3445 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3447 1) After matching Y, regnode for CURLYX is processed;
3449 2) This regnode mallocs an infoblock, and calls regmatch() recursively
3450 with the starting point at WHILEM node;
3452 3) Each hit of WHILEM node tries to match A and Z (in the order
3453 depending on the current iteration, min/max of {min,max} and
3454 greediness). The information about where are nodes for "A"
3455 and "Z" is read from the infoblock, as is info on how many times "A"
3456 was already matched, and greediness.
3458 4) After A matches, the same WHILEM node is hit again.
3460 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3461 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3462 resets cc, since this Y(A)*Z can be a part of some other loop:
3463 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3464 of the external loop.
3466 Currently present infoblocks form a tree with a stem formed by PL_curcc
3467 and whatever it mentions via ->next, and additional attached trees
3468 corresponding to temporarily unset infoblocks as in "5" above.
3470 In the following picture, infoblocks for outer loop of
3471 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3472 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3473 infoblocks are drawn below the "reset" infoblock.
3475 In fact in the picture below we do not show failed matches for Z and T
3476 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3477 more obvious *why* one needs to *temporary* unset infoblocks.]
3479 Matched REx position InfoBlocks Comment
3483 Y A)*?Z)*?T x <- O <- I
3484 YA )*?Z)*?T x <- O <- I
3485 YA A)*?Z)*?T x <- O <- I
3486 YAA )*?Z)*?T x <- O <- I
3487 YAA Z)*?T x <- O # Temporary unset I
3490 YAAZ Y(A)*?Z)*?T x <- O
3493 YAAZY (A)*?Z)*?T x <- O
3496 YAAZY A)*?Z)*?T x <- O <- I
3499 YAAZYA )*?Z)*?T x <- O <- I
3502 YAAZYA Z)*?T x <- O # Temporary unset I
3508 YAAZYAZ T x # Temporary unset O
3515 *******************************************************************/
3518 /* No need to save/restore up to this paren */
3519 I32 parenfloor = scan->flags;
3523 Newx(newcc, 1, CURCUR);
3524 st->curlyx.savecc = st->cc;
3525 newcc->oldcc = st->cc;
3528 st->curlyx.cp = PL_savestack_ix;
3529 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3531 /* XXXX Probably it is better to teach regpush to support
3532 parenfloor > PL_regsize... */
3533 if (parenfloor > (I32)*PL_reglastparen)
3534 parenfloor = *PL_reglastparen; /* Pessimization... */
3535 st->cc->parenfloor = parenfloor;
3537 st->cc->min = ARG1(scan);
3538 st->cc->max = ARG2(scan);
3539 st->cc->scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3540 st->cc->next = next;
3541 st->cc->minmod = st->minmod;
3542 st->cc->lastloc = 0;
3543 PL_reginput = locinput;
3544 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3545 /*** all unsaved local vars undefined at this point */
3546 regcpblow(st->curlyx.cp);
3548 st->cc = st->curlyx.savecc;
3554 * This is really hard to understand, because after we match
3555 * what we're trying to match, we must make sure the rest of
3556 * the REx is going to match for sure, and to do that we have
3557 * to go back UP the parse tree by recursing ever deeper. And
3558 * if it fails, we have to reset our parent's current state
3559 * that we can try again after backing off.
3562 st->whilem.lastloc = st->cc->lastloc; /* Detection of 0-len. */
3563 st->whilem.cache_offset = 0;
3564 st->whilem.cache_bit = 0;
3566 n = st->cc->cur + 1; /* how many we know we matched */
3567 PL_reginput = locinput;
3570 PerlIO_printf(Perl_debug_log,
3571 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3572 REPORT_CODE_OFF+PL_regindent*2, "",
3573 (long)n, (long)st->cc->min,
3574 (long)st->cc->max, PTR2UV(st->cc))
3577 /* If degenerate scan matches "", assume scan done. */
3579 if (locinput == st->cc->lastloc && n >= st->cc->min) {
3580 st->whilem.savecc = st->cc;
3581 st->cc = st->cc->oldcc;
3583 st->ln = st->cc->cur;
3585 PerlIO_printf(Perl_debug_log,
3586 "%*s empty match detected, try continuation...\n",
3587 REPORT_CODE_OFF+PL_regindent*2, "")
3589 REGMATCH(st->whilem.savecc->next, WHILEM1);
3590 /*** all unsaved local vars undefined at this point */
3591 st->cc = st->whilem.savecc;
3595 st->cc->oldcc->cur = st->ln;
3599 /* First just match a string of min scans. */
3601 if (n < st->cc->min) {
3603 st->cc->lastloc = locinput;
3604 REGMATCH(st->cc->scan, WHILEM2);
3605 /*** all unsaved local vars undefined at this point */
3608 st->cc->cur = n - 1;
3609 st->cc->lastloc = st->whilem.lastloc;
3614 /* Check whether we already were at this position.
3615 Postpone detection until we know the match is not
3616 *that* much linear. */
3617 if (!PL_reg_maxiter) {
3618 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3619 PL_reg_leftiter = PL_reg_maxiter;
3621 if (PL_reg_leftiter-- == 0) {
3622 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3623 if (PL_reg_poscache) {
3624 if ((I32)PL_reg_poscache_size < size) {
3625 Renew(PL_reg_poscache, size, char);
3626 PL_reg_poscache_size = size;
3628 Zero(PL_reg_poscache, size, char);
3631 PL_reg_poscache_size = size;
3632 Newxz(PL_reg_poscache, size, char);
3635 PerlIO_printf(Perl_debug_log,
3636 "%sDetected a super-linear match, switching on caching%s...\n",
3637 PL_colors[4], PL_colors[5])
3640 if (PL_reg_leftiter < 0) {
3641 st->whilem.cache_offset = locinput - PL_bostr;
3643 st->whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3644 + st->whilem.cache_offset * (scan->flags>>4);
3645 st->whilem.cache_bit = st->whilem.cache_offset % 8;
3646 st->whilem.cache_offset /= 8;
3647 if (PL_reg_poscache[st->whilem.cache_offset] & (1<<st->whilem.cache_bit)) {
3649 PerlIO_printf(Perl_debug_log,
3650 "%*s already tried at this position...\n",
3651 REPORT_CODE_OFF+PL_regindent*2, "")
3653 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3654 /* cache records success */
3657 /* cache records failure */
3660 PL_reg_poscache[st->whilem.cache_offset] |= (1<<st->whilem.cache_bit);
3664 /* Prefer next over scan for minimal matching. */
3666 if (st->cc->minmod) {
3667 st->whilem.savecc = st->cc;
3668 st->cc = st->cc->oldcc;
3670 st->ln = st->cc->cur;
3671 st->whilem.cp = regcppush(st->whilem.savecc->parenfloor);
3672 REGCP_SET(st->whilem.lastcp);
3673 REGMATCH(st->whilem.savecc->next, WHILEM3);
3674 /*** all unsaved local vars undefined at this point */
3675 st->cc = st->whilem.savecc;
3677 regcpblow(st->whilem.cp);
3678 CACHEsayYES; /* All done. */
3680 REGCP_UNWIND(st->whilem.lastcp);
3683 st->cc->oldcc->cur = st->ln;
3685 if (n >= st->cc->max) { /* Maximum greed exceeded? */
3686 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3687 && !(PL_reg_flags & RF_warned)) {
3688 PL_reg_flags |= RF_warned;
3689 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3690 "Complex regular subexpression recursion",
3697 PerlIO_printf(Perl_debug_log,
3698 "%*s trying longer...\n",
3699 REPORT_CODE_OFF+PL_regindent*2, "")
3701 /* Try scanning more and see if it helps. */
3702 PL_reginput = locinput;
3704 st->cc->lastloc = locinput;
3705 st->whilem.cp = regcppush(st->cc->parenfloor);
3706 REGCP_SET(st->whilem.lastcp);
3707 REGMATCH(st->cc->scan, WHILEM4);
3708 /*** all unsaved local vars undefined at this point */
3710 regcpblow(st->whilem.cp);
3713 REGCP_UNWIND(st->whilem.lastcp);
3715 st->cc->cur = n - 1;
3716 st->cc->lastloc = st->whilem.lastloc;
3720 /* Prefer scan over next for maximal matching. */
3722 if (n < st->cc->max) { /* More greed allowed? */
3723 st->whilem.cp = regcppush(st->cc->parenfloor);
3725 st->cc->lastloc = locinput;
3726 REGCP_SET(st->whilem.lastcp);
3727 REGMATCH(st->cc->scan, WHILEM5);
3728 /*** all unsaved local vars undefined at this point */
3730 regcpblow(st->whilem.cp);
3733 REGCP_UNWIND(st->whilem.lastcp);
3734 regcppop(); /* Restore some previous $<digit>s? */
3735 PL_reginput = locinput;
3737 PerlIO_printf(Perl_debug_log,
3738 "%*s failed, try continuation...\n",
3739 REPORT_CODE_OFF+PL_regindent*2, "")
3742 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3743 && !(PL_reg_flags & RF_warned)) {
3744 PL_reg_flags |= RF_warned;
3745 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3746 "Complex regular subexpression recursion",
3750 /* Failed deeper matches of scan, so see if this one works. */
3751 st->whilem.savecc = st->cc;
3752 st->cc = st->cc->oldcc;
3754 st->ln = st->cc->cur;
3755 REGMATCH(st->whilem.savecc->next, WHILEM6);
3756 /*** all unsaved local vars undefined at this point */
3757 st->cc = st->whilem.savecc;
3761 st->cc->oldcc->cur = st->ln;
3762 st->cc->cur = n - 1;
3763 st->cc->lastloc = st->whilem.lastloc;
3768 next = scan + ARG(scan);
3771 inner = NEXTOPER(NEXTOPER(scan));
3774 inner = NEXTOPER(scan);
3779 if (OP(next) != type) /* No choice. */
3780 next = inner; /* Avoid recursion. */
3782 const I32 lastparen = *PL_reglastparen;
3783 /* Put unwinding data on stack */
3784 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3785 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3787 uw->prev = st->unwind;
3788 st->unwind = unwind1;
3789 uw->type = ((type == BRANCH)
3791 : RE_UNWIND_BRANCHJ);
3792 uw->lastparen = lastparen;
3794 uw->locinput = locinput;
3795 uw->nextchr = nextchr;
3797 uw->regindent = ++PL_regindent;
3800 REGCP_SET(uw->lastcp);
3802 /* Now go into the first branch */
3812 st->curlym.l = st->curlym.matches = 0;
3814 /* We suppose that the next guy does not need
3815 backtracking: in particular, it is of constant non-zero length,
3816 and has no parenths to influence future backrefs. */
3817 st->ln = ARG1(scan); /* min to match */
3818 n = ARG2(scan); /* max to match */
3819 st->curlym.paren = scan->flags;
3820 if (st->curlym.paren) {
3821 if (st->curlym.paren > PL_regsize)
3822 PL_regsize = st->curlym.paren;
3823 if (st->curlym.paren > (I32)*PL_reglastparen)
3824 *PL_reglastparen = st->curlym.paren;
3826 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3827 if (st->curlym.paren)
3828 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3829 PL_reginput = locinput;
3830 st->curlym.maxwanted = st->minmod ? st->ln : n;
3831 if (st->curlym.maxwanted) {
3832 while (PL_reginput < PL_regeol && st->curlym.matches < st->curlym.maxwanted) {
3833 REGMATCH(scan, CURLYM1);
3834 /*** all unsaved local vars undefined at this point */
3837 /* on first match, determine length, curlym.l */
3838 if (!st->curlym.matches++) {
3839 if (PL_reg_match_utf8) {
3841 while (s < PL_reginput) {
3847 st->curlym.l = PL_reginput - locinput;
3849 if (st->curlym.l == 0) {
3850 st->curlym.matches = st->curlym.maxwanted;
3854 locinput = PL_reginput;
3858 PL_reginput = locinput;
3862 if (st->ln && st->curlym.matches < st->ln)
3864 if (HAS_TEXT(next) || JUMPABLE(next)) {
3865 regnode *text_node = next;
3867 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3869 if (! HAS_TEXT(text_node)) st->curlym.c1 = st->curlym.c2 = -1000;
3871 if (PL_regkind[(U8)OP(text_node)] == REF) {
3872 st->curlym.c1 = st->curlym.c2 = -1000;
3875 else { st->curlym.c1 = (U8)*STRING(text_node); }
3876 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3877 st->curlym.c2 = PL_fold[st->curlym.c1];
3878 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3879 st->curlym.c2 = PL_fold_locale[st->curlym.c1];
3881 st->curlym.c2 = st->curlym.c1;
3885 st->curlym.c1 = st->curlym.c2 = -1000;
3887 REGCP_SET(st->curlym.lastcp);
3888 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
3889 /* If it could work, try it. */
3890 if (st->curlym.c1 == -1000 ||
3891 UCHARAT(PL_reginput) == st->curlym.c1 ||
3892 UCHARAT(PL_reginput) == st->curlym.c2)