5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
92 #define PERL_IN_REGEXEC_C
97 #define RF_tainted 1 /* tainted information used? */
98 #define RF_warned 2 /* warned about big count? */
99 #define RF_evaled 4 /* Did an EVAL with setting? */
100 #define RF_utf8 8 /* String contains multibyte chars? */
102 #define UTF ((PL_reg_flags & RF_utf8) != 0)
104 #define RS_init 1 /* eval environment created */
105 #define RS_set 2 /* replsv value is set */
108 #define STATIC static
111 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
117 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
118 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
120 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
121 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
122 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
123 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
124 #define HOPc(pos,off) ((char*)HOP(pos,off))
125 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
127 #define HOPBACK(pos, off) ( \
128 (PL_reg_match_utf8) \
129 ? reghopmaybe((U8*)pos, -off) \
130 : (pos - off >= PL_bostr) \
134 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
136 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
137 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
138 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
140 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
141 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
143 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
144 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
145 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
146 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
147 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
148 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
150 /* for use after a quantifier and before an EXACT-like node -- japhy */
151 #define JUMPABLE(rn) ( \
152 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
153 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
154 OP(rn) == PLUS || OP(rn) == MINMOD || \
155 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
158 #define HAS_TEXT(rn) ( \
159 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
163 Search for mandatory following text node; for lookahead, the text must
164 follow but for lookbehind (rn->flags != 0) we skip to the next step.
166 #define FIND_NEXT_IMPT(rn) STMT_START { \
167 while (JUMPABLE(rn)) \
168 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
169 rn = NEXTOPER(NEXTOPER(rn)); \
170 else if (OP(rn) == PLUS) \
172 else if (OP(rn) == IFMATCH) \
173 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
174 else rn += NEXT_OFF(rn); \
177 static void restore_pos(pTHX_ void *arg);
180 S_regcppush(pTHX_ I32 parenfloor)
182 const int retval = PL_savestack_ix;
183 #define REGCP_PAREN_ELEMS 4
184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
190 #define REGCP_OTHER_ELEMS 6
191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
192 for (p = PL_regsize; p > parenfloor; p--) {
193 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
196 SSPUSHPTR(PL_reg_start_tmp[p]);
199 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
202 SSPUSHINT(*PL_reglastcloseparen);
203 SSPUSHPTR(PL_reginput);
204 #define REGCP_FRAME_ELEMS 2
205 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
213 /* These are needed since we do not localize EVAL nodes: */
214 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
218 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
219 PerlIO_printf(Perl_debug_log, \
220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
230 GET_RE_DEBUG_FLAGS_DECL;
232 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
234 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
235 i = SSPOPINT; /* Parentheses elements to pop. */
236 input = (char *) SSPOPPTR;
237 *PL_reglastcloseparen = SSPOPINT;
238 *PL_reglastparen = SSPOPINT;
239 PL_regsize = SSPOPINT;
241 /* Now restore the parentheses context. */
242 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
243 i > 0; i -= REGCP_PAREN_ELEMS) {
245 paren = (U32)SSPOPINT;
246 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
247 PL_regstartp[paren] = SSPOPINT;
249 if (paren <= *PL_reglastparen)
250 PL_regendp[paren] = tmps;
252 PerlIO_printf(Perl_debug_log,
253 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
254 (UV)paren, (IV)PL_regstartp[paren],
255 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
256 (IV)PL_regendp[paren],
257 (paren > *PL_reglastparen ? "(no)" : ""));
261 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
262 PerlIO_printf(Perl_debug_log,
263 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
264 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
268 /* It would seem that the similar code in regtry()
269 * already takes care of this, and in fact it is in
270 * a better location to since this code can #if 0-ed out
271 * but the code in regtry() is needed or otherwise tests
272 * requiring null fields (pat.t#187 and split.t#{13,14}
273 * (as of patchlevel 7877) will fail. Then again,
274 * this code seems to be necessary or otherwise
275 * building DynaLoader will fail:
276 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
278 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
279 if ((I32)paren > PL_regsize)
280 PL_regstartp[paren] = -1;
281 PL_regendp[paren] = -1;
288 S_regcp_set_to(pTHX_ I32 ss)
290 const I32 tmp = PL_savestack_ix;
292 PL_savestack_ix = ss;
294 PL_savestack_ix = tmp;
298 typedef struct re_cc_state
302 struct re_cc_state *prev;
307 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
309 #define TRYPAREN(paren, n, input) { \
312 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
313 PL_regendp[paren] = input - PL_bostr; \
316 PL_regendp[paren] = -1; \
318 if (regmatch(next)) \
321 PL_regendp[paren] = -1; \
326 * pregexec and friends
330 - pregexec - match a regexp against a string
333 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
334 char *strbeg, I32 minend, SV *screamer, U32 nosave)
335 /* strend: pointer to null at end of string */
336 /* strbeg: real beginning of string */
337 /* minend: end of match must be >=minend after stringarg. */
338 /* nosave: For optimizations. */
341 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
342 nosave ? 0 : REXEC_COPY_STR);
346 S_cache_re(pTHX_ regexp *prog)
348 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
350 PL_regprogram = prog->program;
352 PL_regnpar = prog->nparens;
353 PL_regdata = prog->data;
358 * Need to implement the following flags for reg_anch:
360 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
362 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
363 * INTUIT_AUTORITATIVE_ML
364 * INTUIT_ONCE_NOML - Intuit can match in one location only.
367 * Another flag for this function: SECOND_TIME (so that float substrs
368 * with giant delta may be not rechecked).
371 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
373 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
374 Otherwise, only SvCUR(sv) is used to get strbeg. */
376 /* XXXX We assume that strpos is strbeg unless sv. */
378 /* XXXX Some places assume that there is a fixed substring.
379 An update may be needed if optimizer marks as "INTUITable"
380 RExen without fixed substrings. Similarly, it is assumed that
381 lengths of all the strings are no more than minlen, thus they
382 cannot come from lookahead.
383 (Or minlen should take into account lookahead.) */
385 /* A failure to find a constant substring means that there is no need to make
386 an expensive call to REx engine, thus we celebrate a failure. Similarly,
387 finding a substring too deep into the string means that less calls to
388 regtry() should be needed.
390 REx compiler's optimizer found 4 possible hints:
391 a) Anchored substring;
393 c) Whether we are anchored (beginning-of-line or \G);
394 d) First node (of those at offset 0) which may distingush positions;
395 We use a)b)d) and multiline-part of c), and try to find a position in the
396 string which does not contradict any of them.
399 /* Most of decisions we do here should have been done at compile time.
400 The nodes of the REx which we used for the search should have been
401 deleted from the finite automaton. */
404 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
405 char *strend, U32 flags, re_scream_pos_data *data)
407 register I32 start_shift = 0;
408 /* Should be nonnegative! */
409 register I32 end_shift = 0;
414 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
416 register char *other_last = Nullch; /* other substr checked before this */
417 char *check_at = Nullch; /* check substr found at this pos */
418 const I32 multiline = prog->reganch & PMf_MULTILINE;
420 char *i_strpos = strpos;
421 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
424 GET_RE_DEBUG_FLAGS_DECL;
426 RX_MATCH_UTF8_set(prog,do_utf8);
428 if (prog->reganch & ROPT_UTF8) {
429 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
430 "UTF-8 regex...\n"));
431 PL_reg_flags |= RF_utf8;
435 const char *s = PL_reg_match_utf8 ?
436 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
438 const int len = PL_reg_match_utf8 ?
439 strlen(s) : strend - strpos;
442 if (PL_reg_match_utf8)
443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
444 "UTF-8 target...\n"));
445 PerlIO_printf(Perl_debug_log,
446 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
447 PL_colors[4], PL_colors[5], PL_colors[0],
450 (strlen(prog->precomp) > 60 ? "..." : ""),
452 (int)(len > 60 ? 60 : len),
454 (len > 60 ? "..." : "")
458 /* CHR_DIST() would be more correct here but it makes things slow. */
459 if (prog->minlen > strend - strpos) {
460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
461 "String too short... [re_intuit_start]\n"));
464 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
467 if (!prog->check_utf8 && prog->check_substr)
468 to_utf8_substr(prog);
469 check = prog->check_utf8;
471 if (!prog->check_substr && prog->check_utf8)
472 to_byte_substr(prog);
473 check = prog->check_substr;
475 if (check == &PL_sv_undef) {
476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
477 "Non-utf string cannot match utf check string\n"));
480 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
481 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
482 || ( (prog->reganch & ROPT_ANCH_BOL)
483 && !multiline ) ); /* Check after \n? */
486 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
487 | ROPT_IMPLICIT)) /* not a real BOL */
488 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
490 && (strpos != strbeg)) {
491 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
494 if (prog->check_offset_min == prog->check_offset_max &&
495 !(prog->reganch & ROPT_CANY_SEEN)) {
496 /* Substring at constant offset from beg-of-str... */
499 s = HOP3c(strpos, prog->check_offset_min, strend);
501 slen = SvCUR(check); /* >= 1 */
503 if ( strend - s > slen || strend - s < slen - 1
504 || (strend - s == slen && strend[-1] != '\n')) {
505 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
508 /* Now should match s[0..slen-2] */
510 if (slen && (*SvPVX_const(check) != *s
512 && memNE(SvPVX_const(check), s, slen)))) {
514 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
518 else if (*SvPVX_const(check) != *s
519 || ((slen = SvCUR(check)) > 1
520 && memNE(SvPVX_const(check), s, slen)))
522 goto success_at_start;
525 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
527 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
528 end_shift = prog->minlen - start_shift -
529 CHR_SVLEN(check) + (SvTAIL(check) != 0);
531 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
532 - (SvTAIL(check) != 0);
533 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
535 if (end_shift < eshift)
539 else { /* Can match at random position */
542 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
543 /* Should be nonnegative! */
544 end_shift = prog->minlen - start_shift -
545 CHR_SVLEN(check) + (SvTAIL(check) != 0);
548 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
550 Perl_croak(aTHX_ "panic: end_shift");
554 /* Find a possible match in the region s..strend by looking for
555 the "check" substring in the region corrected by start/end_shift. */
556 if (flags & REXEC_SCREAM) {
557 I32 p = -1; /* Internal iterator of scream. */
558 I32 * const pp = data ? data->scream_pos : &p;
560 if (PL_screamfirst[BmRARE(check)] >= 0
561 || ( BmRARE(check) == '\n'
562 && (BmPREVIOUS(check) == SvCUR(check) - 1)
564 s = screaminstr(sv, check,
565 start_shift + (s - strbeg), end_shift, pp, 0);
568 /* we may be pointing at the wrong string */
569 if (s && RX_MATCH_COPIED(prog))
570 s = strbeg + (s - SvPVX_const(sv));
572 *data->scream_olds = s;
574 else if (prog->reganch & ROPT_CANY_SEEN)
575 s = fbm_instr((U8*)(s + start_shift),
576 (U8*)(strend - end_shift),
577 check, multiline ? FBMrf_MULTILINE : 0);
579 s = fbm_instr(HOP3(s, start_shift, strend),
580 HOP3(strend, -end_shift, strbeg),
581 check, multiline ? FBMrf_MULTILINE : 0);
583 /* Update the count-of-usability, remove useless subpatterns,
586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
587 (s ? "Found" : "Did not find"),
588 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
590 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
592 PL_colors[1], (SvTAIL(check) ? "$" : ""),
593 (s ? " at offset " : "...\n") ) );
600 /* Finish the diagnostic message */
601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
603 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
604 Start with the other substr.
605 XXXX no SCREAM optimization yet - and a very coarse implementation
606 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
607 *always* match. Probably should be marked during compile...
608 Probably it is right to do no SCREAM here...
611 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
612 /* Take into account the "other" substring. */
613 /* XXXX May be hopelessly wrong for UTF... */
616 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
619 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
623 t = s - prog->check_offset_max;
624 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
626 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
631 t = HOP3c(t, prog->anchored_offset, strend);
632 if (t < other_last) /* These positions already checked */
634 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
637 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
638 /* On end-of-str: see comment below. */
639 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
640 if (must == &PL_sv_undef) {
642 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
647 HOP3(HOP3(last1, prog->anchored_offset, strend)
648 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
650 multiline ? FBMrf_MULTILINE : 0
652 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
653 "%s anchored substr \"%s%.*s%s\"%s",
654 (s ? "Found" : "Contradicts"),
657 - (SvTAIL(must)!=0)),
659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
661 if (last1 >= last2) {
662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
663 ", giving up...\n"));
666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
667 ", trying floating at offset %ld...\n",
668 (long)(HOP3c(s1, 1, strend) - i_strpos)));
669 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
670 s = HOP3c(last, 1, strend);
674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
675 (long)(s - i_strpos)));
676 t = HOP3c(s, -prog->anchored_offset, strbeg);
677 other_last = HOP3c(s, 1, strend);
685 else { /* Take into account the floating substring. */
690 t = HOP3c(s, -start_shift, strbeg);
692 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
693 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
694 last = HOP3c(t, prog->float_max_offset, strend);
695 s = HOP3c(t, prog->float_min_offset, strend);
698 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
699 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
700 /* fbm_instr() takes into account exact value of end-of-str
701 if the check is SvTAIL(ed). Since false positives are OK,
702 and end-of-str is not later than strend we are OK. */
703 if (must == &PL_sv_undef) {
705 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
708 s = fbm_instr((unsigned char*)s,
709 (unsigned char*)last + SvCUR(must)
711 must, multiline ? FBMrf_MULTILINE : 0);
712 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
713 (s ? "Found" : "Contradicts"),
715 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
717 PL_colors[1], (SvTAIL(must) ? "$" : "")));
720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
721 ", giving up...\n"));
724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
725 ", trying anchored starting at offset %ld...\n",
726 (long)(s1 + 1 - i_strpos)));
728 s = HOP3c(t, 1, strend);
732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
733 (long)(s - i_strpos)));
734 other_last = s; /* Fix this later. --Hugo */
743 t = s - prog->check_offset_max;
744 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
746 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
748 /* Fixed substring is found far enough so that the match
749 cannot start at strpos. */
751 if (ml_anch && t[-1] != '\n') {
752 /* Eventually fbm_*() should handle this, but often
753 anchored_offset is not 0, so this check will not be wasted. */
754 /* XXXX In the code below we prefer to look for "^" even in
755 presence of anchored substrings. And we search even
756 beyond the found float position. These pessimizations
757 are historical artefacts only. */
759 while (t < strend - prog->minlen) {
761 if (t < check_at - prog->check_offset_min) {
762 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
763 /* Since we moved from the found position,
764 we definitely contradict the found anchored
765 substr. Due to the above check we do not
766 contradict "check" substr.
767 Thus we can arrive here only if check substr
768 is float. Redo checking for "other"=="fixed".
771 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
772 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
773 goto do_other_anchored;
775 /* We don't contradict the found floating substring. */
776 /* XXXX Why not check for STCLASS? */
778 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
779 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
782 /* Position contradicts check-string */
783 /* XXXX probably better to look for check-string
784 than for "\n", so one should lower the limit for t? */
785 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
786 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
787 other_last = strpos = s = t + 1;
792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
793 PL_colors[0], PL_colors[1]));
797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
798 PL_colors[0], PL_colors[1]));
802 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
805 /* The found string does not prohibit matching at strpos,
806 - no optimization of calling REx engine can be performed,
807 unless it was an MBOL and we are not after MBOL,
808 or a future STCLASS check will fail this. */
810 /* Even in this situation we may use MBOL flag if strpos is offset
811 wrt the start of the string. */
812 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
813 && (strpos != strbeg) && strpos[-1] != '\n'
814 /* May be due to an implicit anchor of m{.*foo} */
815 && !(prog->reganch & ROPT_IMPLICIT))
820 DEBUG_EXECUTE_r( if (ml_anch)
821 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
822 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
825 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
827 prog->check_utf8 /* Could be deleted already */
828 && --BmUSEFUL(prog->check_utf8) < 0
829 && (prog->check_utf8 == prog->float_utf8)
831 prog->check_substr /* Could be deleted already */
832 && --BmUSEFUL(prog->check_substr) < 0
833 && (prog->check_substr == prog->float_substr)
836 /* If flags & SOMETHING - do not do it many times on the same match */
837 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
838 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
839 if (do_utf8 ? prog->check_substr : prog->check_utf8)
840 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
841 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
842 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
843 check = Nullsv; /* abort */
845 /* XXXX This is a remnant of the old implementation. It
846 looks wasteful, since now INTUIT can use many
848 prog->reganch &= ~RE_USE_INTUIT;
855 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
856 if (prog->regstclass) {
857 /* minlen == 0 is possible if regstclass is \b or \B,
858 and the fixed substr is ''$.
859 Since minlen is already taken into account, s+1 is before strend;
860 accidentally, minlen >= 1 guaranties no false positives at s + 1
861 even for \b or \B. But (minlen? 1 : 0) below assumes that
862 regstclass does not come from lookahead... */
863 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
864 This leaves EXACTF only, which is dealt with in find_byclass(). */
865 const U8* str = (U8*)STRING(prog->regstclass);
866 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
867 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
869 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
871 : (prog->float_substr || prog->float_utf8
872 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
878 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
881 const char *what = 0;
883 if (endpos == strend) {
884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
885 "Could not match STCLASS...\n") );
888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
889 "This position contradicts STCLASS...\n") );
890 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
892 /* Contradict one of substrings */
893 if (prog->anchored_substr || prog->anchored_utf8) {
894 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
895 DEBUG_EXECUTE_r( what = "anchored" );
897 s = HOP3c(t, 1, strend);
898 if (s + start_shift + end_shift > strend) {
899 /* XXXX Should be taken into account earlier? */
900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
901 "Could not match STCLASS...\n") );
906 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
907 "Looking for %s substr starting at offset %ld...\n",
908 what, (long)(s + start_shift - i_strpos)) );
911 /* Have both, check_string is floating */
912 if (t + start_shift >= check_at) /* Contradicts floating=check */
913 goto retry_floating_check;
914 /* Recheck anchored substring, but not floating... */
918 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
919 "Looking for anchored substr starting at offset %ld...\n",
920 (long)(other_last - i_strpos)) );
921 goto do_other_anchored;
923 /* Another way we could have checked stclass at the
924 current position only: */
929 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
930 "Looking for /%s^%s/m starting at offset %ld...\n",
931 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
934 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
936 /* Check is floating subtring. */
937 retry_floating_check:
938 t = check_at - start_shift;
939 DEBUG_EXECUTE_r( what = "floating" );
940 goto hop_and_restart;
943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
944 "By STCLASS: moving %ld --> %ld\n",
945 (long)(t - i_strpos), (long)(s - i_strpos))
949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
950 "Does not contradict STCLASS...\n");
955 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
956 PL_colors[4], (check ? "Guessed" : "Giving up"),
957 PL_colors[5], (long)(s - i_strpos)) );
960 fail_finish: /* Substring not found */
961 if (prog->check_substr || prog->check_utf8) /* could be removed already */
962 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
965 PL_colors[4], PL_colors[5]));
969 /* We know what class REx starts with. Try to find this position... */
971 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
974 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
978 register STRLEN uskip;
982 register I32 tmp = 1; /* Scratch variable? */
983 register const bool do_utf8 = PL_reg_match_utf8;
985 /* We know what class it must start with. */
989 while (s + (uskip = UTF8SKIP(s)) <= strend) {
990 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
991 !UTF8_IS_INVARIANT((U8)s[0]) ?
992 reginclass(c, (U8*)s, 0, do_utf8) :
993 REGINCLASS(c, (U8*)s)) {
994 if (tmp && (norun || regtry(prog, s)))
1005 while (s < strend) {
1008 if (REGINCLASS(c, (U8*)s) ||
1009 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1010 /* The assignment of 2 is intentional:
1011 * for the folded sharp s, the skip is 2. */
1012 (skip = SHARP_S_SKIP))) {
1013 if (tmp && (norun || regtry(prog, s)))
1025 while (s < strend) {
1026 if (tmp && (norun || regtry(prog, s)))
1035 ln = STR_LEN(c); /* length to match in octets/bytes */
1036 lnc = (I32) ln; /* length to match in characters */
1038 STRLEN ulen1, ulen2;
1040 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1041 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1043 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1044 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1046 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1047 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1048 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1049 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1051 while (sm < ((U8 *) m + ln)) {
1066 c2 = PL_fold_locale[c1];
1068 e = HOP3c(strend, -((I32)lnc), s);
1071 e = s; /* Due to minlen logic of intuit() */
1073 /* The idea in the EXACTF* cases is to first find the
1074 * first character of the EXACTF* node and then, if
1075 * necessary, case-insensitively compare the full
1076 * text of the node. The c1 and c2 are the first
1077 * characters (though in Unicode it gets a bit
1078 * more complicated because there are more cases
1079 * than just upper and lower: one needs to use
1080 * the so-called folding case for case-insensitive
1081 * matching (called "loose matching" in Unicode).
1082 * ibcmp_utf8() will do just that. */
1086 U8 tmpbuf [UTF8_MAXBYTES+1];
1087 STRLEN len, foldlen;
1090 /* Upper and lower of 1st char are equal -
1091 * probably not a "letter". */
1093 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1095 0 : UTF8_ALLOW_ANY);
1098 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1099 m, (char **)0, ln, (bool)UTF))
1100 && (norun || regtry(prog, s)) )
1103 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1104 uvchr_to_utf8(tmpbuf, c);
1105 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1107 && (f == c1 || f == c2)
1108 && (ln == foldlen ||
1109 !ibcmp_utf8((char *) foldbuf,
1110 (char **)0, foldlen, do_utf8,
1112 (char **)0, ln, (bool)UTF))
1113 && (norun || regtry(prog, s)) )
1121 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1123 0 : UTF8_ALLOW_ANY);
1125 /* Handle some of the three Greek sigmas cases.
1126 * Note that not all the possible combinations
1127 * are handled here: some of them are handled
1128 * by the standard folding rules, and some of
1129 * them (the character class or ANYOF cases)
1130 * are handled during compiletime in
1131 * regexec.c:S_regclass(). */
1132 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1133 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1134 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1136 if ( (c == c1 || c == c2)
1138 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1139 m, (char **)0, ln, (bool)UTF))
1140 && (norun || regtry(prog, s)) )
1143 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1144 uvchr_to_utf8(tmpbuf, c);
1145 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1147 && (f == c1 || f == c2)
1148 && (ln == foldlen ||
1149 !ibcmp_utf8((char *) foldbuf,
1150 (char **)0, foldlen, do_utf8,
1152 (char **)0, ln, (bool)UTF))
1153 && (norun || regtry(prog, s)) )
1164 && (ln == 1 || !(OP(c) == EXACTF
1166 : ibcmp_locale(s, m, ln)))
1167 && (norun || regtry(prog, s)) )
1173 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1174 && (ln == 1 || !(OP(c) == EXACTF
1176 : ibcmp_locale(s, m, ln)))
1177 && (norun || regtry(prog, s)) )
1184 PL_reg_flags |= RF_tainted;
1191 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1193 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1195 tmp = ((OP(c) == BOUND ?
1196 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1197 LOAD_UTF8_CHARCLASS_ALNUM();
1198 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1199 if (tmp == !(OP(c) == BOUND ?
1200 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1201 isALNUM_LC_utf8((U8*)s)))
1204 if ((norun || regtry(prog, s)))
1211 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1212 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1213 while (s < strend) {
1215 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1217 if ((norun || regtry(prog, s)))
1223 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1227 PL_reg_flags |= RF_tainted;
1234 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1236 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1238 tmp = ((OP(c) == NBOUND ?
1239 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1240 LOAD_UTF8_CHARCLASS_ALNUM();
1241 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1242 if (tmp == !(OP(c) == NBOUND ?
1243 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1244 isALNUM_LC_utf8((U8*)s)))
1246 else if ((norun || regtry(prog, s)))
1252 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1253 tmp = ((OP(c) == NBOUND ?
1254 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1255 while (s < strend) {
1257 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1259 else if ((norun || regtry(prog, s)))
1264 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1269 LOAD_UTF8_CHARCLASS_ALNUM();
1270 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1271 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1272 if (tmp && (norun || regtry(prog, s)))
1283 while (s < strend) {
1285 if (tmp && (norun || regtry(prog, s)))
1297 PL_reg_flags |= RF_tainted;
1299 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1300 if (isALNUM_LC_utf8((U8*)s)) {
1301 if (tmp && (norun || regtry(prog, s)))
1312 while (s < strend) {
1313 if (isALNUM_LC(*s)) {
1314 if (tmp && (norun || regtry(prog, s)))
1327 LOAD_UTF8_CHARCLASS_ALNUM();
1328 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1329 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1330 if (tmp && (norun || regtry(prog, s)))
1341 while (s < strend) {
1343 if (tmp && (norun || regtry(prog, s)))
1355 PL_reg_flags |= RF_tainted;
1357 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1358 if (!isALNUM_LC_utf8((U8*)s)) {
1359 if (tmp && (norun || regtry(prog, s)))
1370 while (s < strend) {
1371 if (!isALNUM_LC(*s)) {
1372 if (tmp && (norun || regtry(prog, s)))
1385 LOAD_UTF8_CHARCLASS_SPACE();
1386 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1387 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1388 if (tmp && (norun || regtry(prog, s)))
1399 while (s < strend) {
1401 if (tmp && (norun || regtry(prog, s)))
1413 PL_reg_flags |= RF_tainted;
1415 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1416 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1417 if (tmp && (norun || regtry(prog, s)))
1428 while (s < strend) {
1429 if (isSPACE_LC(*s)) {
1430 if (tmp && (norun || regtry(prog, s)))
1443 LOAD_UTF8_CHARCLASS_SPACE();
1444 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1445 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1446 if (tmp && (norun || regtry(prog, s)))
1457 while (s < strend) {
1459 if (tmp && (norun || regtry(prog, s)))
1471 PL_reg_flags |= RF_tainted;
1473 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1474 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1475 if (tmp && (norun || regtry(prog, s)))
1486 while (s < strend) {
1487 if (!isSPACE_LC(*s)) {
1488 if (tmp && (norun || regtry(prog, s)))
1501 LOAD_UTF8_CHARCLASS_DIGIT();
1502 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1503 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1504 if (tmp && (norun || regtry(prog, s)))
1515 while (s < strend) {
1517 if (tmp && (norun || regtry(prog, s)))
1529 PL_reg_flags |= RF_tainted;
1531 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1532 if (isDIGIT_LC_utf8((U8*)s)) {
1533 if (tmp && (norun || regtry(prog, s)))
1544 while (s < strend) {
1545 if (isDIGIT_LC(*s)) {
1546 if (tmp && (norun || regtry(prog, s)))
1559 LOAD_UTF8_CHARCLASS_DIGIT();
1560 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1561 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1562 if (tmp && (norun || regtry(prog, s)))
1573 while (s < strend) {
1575 if (tmp && (norun || regtry(prog, s)))
1587 PL_reg_flags |= RF_tainted;
1589 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1590 if (!isDIGIT_LC_utf8((U8*)s)) {
1591 if (tmp && (norun || regtry(prog, s)))
1602 while (s < strend) {
1603 if (!isDIGIT_LC(*s)) {
1604 if (tmp && (norun || regtry(prog, s)))
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1625 - regexec_flags - match a regexp against a string
1628 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630 /* strend: pointer to null at end of string */
1631 /* strbeg: real beginning of string */
1632 /* minend: end of match must be >=minend after stringarg. */
1633 /* data: May be used for some additional optimizations. */
1634 /* nosave: For optimizations. */
1637 register regnode *c;
1638 register char *startpos = stringarg;
1639 I32 minlen; /* must match at least this many chars */
1640 I32 dontbother = 0; /* how many characters not to try at end */
1641 I32 end_shift = 0; /* Same for the end. */ /* CC */
1642 I32 scream_pos = -1; /* Internal iterator of scream. */
1644 SV* oreplsv = GvSV(PL_replgv);
1645 const bool do_utf8 = DO_UTF8(sv);
1646 const I32 multiline = prog->reganch & PMf_MULTILINE;
1648 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1649 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1652 GET_RE_DEBUG_FLAGS_DECL;
1654 (void)data; /* Currently unused */
1655 RX_MATCH_UTF8_set(prog,do_utf8);
1661 PL_regnarrate = DEBUG_r_TEST;
1664 /* Be paranoid... */
1665 if (prog == NULL || startpos == NULL) {
1666 Perl_croak(aTHX_ "NULL regexp parameter");
1670 minlen = prog->minlen;
1671 if (strend - startpos < minlen) {
1672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1673 "String too short [regexec_flags]...\n"));
1677 /* Check validity of program. */
1678 if (UCHARAT(prog->program) != REG_MAGIC) {
1679 Perl_croak(aTHX_ "corrupted regexp program");
1683 PL_reg_eval_set = 0;
1686 if (prog->reganch & ROPT_UTF8)
1687 PL_reg_flags |= RF_utf8;
1689 /* Mark beginning of line for ^ and lookbehind. */
1690 PL_regbol = startpos;
1694 /* Mark end of line for $ (and such) */
1697 /* see how far we have to get to not match where we matched before */
1698 PL_regtill = startpos+minend;
1700 /* We start without call_cc context. */
1703 /* If there is a "must appear" string, look for it. */
1706 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1709 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1710 PL_reg_ganch = startpos;
1711 else if (sv && SvTYPE(sv) >= SVt_PVMG
1713 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1714 && mg->mg_len >= 0) {
1715 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1716 if (prog->reganch & ROPT_ANCH_GPOS) {
1717 if (s > PL_reg_ganch)
1722 else /* pos() not defined */
1723 PL_reg_ganch = strbeg;
1726 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1727 re_scream_pos_data d;
1729 d.scream_olds = &scream_olds;
1730 d.scream_pos = &scream_pos;
1731 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1734 goto phooey; /* not present */
1739 const char * const s0 = UTF
1740 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1743 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1744 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1745 UNI_DISPLAY_REGEX) : startpos;
1746 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1749 PerlIO_printf(Perl_debug_log,
1750 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1751 PL_colors[4], PL_colors[5], PL_colors[0],
1754 len0 > 60 ? "..." : "",
1756 (int)(len1 > 60 ? 60 : len1),
1758 (len1 > 60 ? "..." : "")
1762 /* Simplest case: anchored match need be tried only once. */
1763 /* [unless only anchor is BOL and multiline is set] */
1764 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1765 if (s == startpos && regtry(prog, startpos))
1767 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1768 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1773 dontbother = minlen - 1;
1774 end = HOP3c(strend, -dontbother, strbeg) - 1;
1775 /* for multiline we only have to try after newlines */
1776 if (prog->check_substr || prog->check_utf8) {
1780 if (regtry(prog, s))
1785 if (prog->reganch & RE_USE_INTUIT) {
1786 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1797 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1798 if (regtry(prog, s))
1805 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1806 if (regtry(prog, PL_reg_ganch))
1811 /* Messy cases: unanchored match. */
1812 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1813 /* we have /x+whatever/ */
1814 /* it must be a one character string (XXXX Except UTF?) */
1819 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1820 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1821 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1824 while (s < strend) {
1826 DEBUG_EXECUTE_r( did_match = 1 );
1827 if (regtry(prog, s)) goto got_it;
1829 while (s < strend && *s == ch)
1836 while (s < strend) {
1838 DEBUG_EXECUTE_r( did_match = 1 );
1839 if (regtry(prog, s)) goto got_it;
1841 while (s < strend && *s == ch)
1847 DEBUG_EXECUTE_r(if (!did_match)
1848 PerlIO_printf(Perl_debug_log,
1849 "Did not find anchored character...\n")
1853 else if (prog->anchored_substr != Nullsv
1854 || prog->anchored_utf8 != Nullsv
1855 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1856 && prog->float_max_offset < strend - s)) {
1861 char *last1; /* Last position checked before */
1865 if (prog->anchored_substr || prog->anchored_utf8) {
1866 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1867 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1868 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1869 back_max = back_min = prog->anchored_offset;
1871 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1872 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1873 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1874 back_max = prog->float_max_offset;
1875 back_min = prog->float_min_offset;
1877 if (must == &PL_sv_undef)
1878 /* could not downgrade utf8 check substring, so must fail */
1881 last = HOP3c(strend, /* Cannot start after this */
1882 -(I32)(CHR_SVLEN(must)
1883 - (SvTAIL(must) != 0) + back_min), strbeg);
1886 last1 = HOPc(s, -1);
1888 last1 = s - 1; /* bogus */
1890 /* XXXX check_substr already used to find "s", can optimize if
1891 check_substr==must. */
1893 dontbother = end_shift;
1894 strend = HOPc(strend, -dontbother);
1895 while ( (s <= last) &&
1896 ((flags & REXEC_SCREAM)
1897 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1898 end_shift, &scream_pos, 0))
1899 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1900 (unsigned char*)strend, must,
1901 multiline ? FBMrf_MULTILINE : 0))) ) {
1902 /* we may be pointing at the wrong string */
1903 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1904 s = strbeg + (s - SvPVX_const(sv));
1905 DEBUG_EXECUTE_r( did_match = 1 );
1906 if (HOPc(s, -back_max) > last1) {
1907 last1 = HOPc(s, -back_min);
1908 s = HOPc(s, -back_max);
1911 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1913 last1 = HOPc(s, -back_min);
1917 while (s <= last1) {
1918 if (regtry(prog, s))
1924 while (s <= last1) {
1925 if (regtry(prog, s))
1931 DEBUG_EXECUTE_r(if (!did_match)
1932 PerlIO_printf(Perl_debug_log,
1933 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1934 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1935 ? "anchored" : "floating"),
1937 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1939 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1943 else if ((c = prog->regstclass)) {
1945 I32 op = (U8)OP(prog->regstclass);
1946 /* don't bother with what can't match */
1947 if (PL_regkind[op] != EXACT && op != CANY)
1948 strend = HOPc(strend, -(minlen - 1));
1951 SV *prop = sv_newmortal();
1959 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1960 UNI_DISPLAY_REGEX) :
1962 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1964 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1965 len1 = UTF ? SvCUR(dsv1) : strend - s;
1966 PerlIO_printf(Perl_debug_log,
1967 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1971 if (find_byclass(prog, c, s, strend, 0))
1973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1977 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1982 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1983 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1984 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1986 if (flags & REXEC_SCREAM) {
1987 last = screaminstr(sv, float_real, s - strbeg,
1988 end_shift, &scream_pos, 1); /* last one */
1990 last = scream_olds; /* Only one occurrence. */
1991 /* we may be pointing at the wrong string */
1992 else if (RX_MATCH_COPIED(prog))
1993 s = strbeg + (s - SvPVX_const(sv));
1997 const char * const little = SvPV(float_real, len);
1999 if (SvTAIL(float_real)) {
2000 if (memEQ(strend - len + 1, little, len - 1))
2001 last = strend - len + 1;
2002 else if (!multiline)
2003 last = memEQ(strend - len, little, len)
2004 ? strend - len : Nullch;
2010 last = rninstr(s, strend, little, little + len);
2012 last = strend; /* matching "$" */
2016 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2017 "%sCan't trim the tail, match fails (should not happen)%s\n",
2018 PL_colors[4], PL_colors[5]));
2019 goto phooey; /* Should not happen! */
2021 dontbother = strend - last + prog->float_min_offset;
2023 if (minlen && (dontbother < minlen))
2024 dontbother = minlen - 1;
2025 strend -= dontbother; /* this one's always in bytes! */
2026 /* We don't know much -- general case. */
2029 if (regtry(prog, s))
2038 if (regtry(prog, s))
2040 } while (s++ < strend);
2048 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2050 if (PL_reg_eval_set) {
2051 /* Preserve the current value of $^R */
2052 if (oreplsv != GvSV(PL_replgv))
2053 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2054 restored, the value remains
2056 restore_pos(aTHX_ 0);
2059 /* make sure $`, $&, $', and $digit will work later */
2060 if ( !(flags & REXEC_NOT_FIRST) ) {
2061 RX_MATCH_COPY_FREE(prog);
2062 if (flags & REXEC_COPY_STR) {
2063 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2064 #ifdef PERL_OLD_COPY_ON_WRITE
2066 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2068 PerlIO_printf(Perl_debug_log,
2069 "Copy on write: regexp capture, type %d\n",
2072 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2073 prog->subbeg = SvPVX(prog->saved_copy);
2074 assert (SvPOKp(prog->saved_copy));
2078 RX_MATCH_COPIED_on(prog);
2079 s = savepvn(strbeg, i);
2085 prog->subbeg = strbeg;
2086 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2093 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2094 PL_colors[4], PL_colors[5]));
2095 if (PL_reg_eval_set)
2096 restore_pos(aTHX_ 0);
2101 - regtry - try match at specific point
2103 STATIC I32 /* 0 failure, 1 success */
2104 S_regtry(pTHX_ regexp *prog, char *startpos)
2110 GET_RE_DEBUG_FLAGS_DECL;
2113 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2115 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2118 PL_reg_eval_set = RS_init;
2119 DEBUG_EXECUTE_r(DEBUG_s(
2120 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2121 (IV)(PL_stack_sp - PL_stack_base));
2123 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2124 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2125 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2127 /* Apparently this is not needed, judging by wantarray. */
2128 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2129 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2132 /* Make $_ available to executed code. */
2133 if (PL_reg_sv != DEFSV) {
2138 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2139 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2140 /* prepare for quick setting of pos */
2141 sv_magic(PL_reg_sv, (SV*)0,
2142 PERL_MAGIC_regex_global, Nullch, 0);
2143 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2147 PL_reg_oldpos = mg->mg_len;
2148 SAVEDESTRUCTOR_X(restore_pos, 0);
2150 if (!PL_reg_curpm) {
2151 Newz(22, PL_reg_curpm, 1, PMOP);
2154 SV* repointer = newSViv(0);
2155 /* so we know which PL_regex_padav element is PL_reg_curpm */
2156 SvFLAGS(repointer) |= SVf_BREAK;
2157 av_push(PL_regex_padav,repointer);
2158 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2159 PL_regex_pad = AvARRAY(PL_regex_padav);
2163 PM_SETRE(PL_reg_curpm, prog);
2164 PL_reg_oldcurpm = PL_curpm;
2165 PL_curpm = PL_reg_curpm;
2166 if (RX_MATCH_COPIED(prog)) {
2167 /* Here is a serious problem: we cannot rewrite subbeg,
2168 since it may be needed if this match fails. Thus
2169 $` inside (?{}) could fail... */
2170 PL_reg_oldsaved = prog->subbeg;
2171 PL_reg_oldsavedlen = prog->sublen;
2172 #ifdef PERL_OLD_COPY_ON_WRITE
2173 PL_nrs = prog->saved_copy;
2175 RX_MATCH_COPIED_off(prog);
2178 PL_reg_oldsaved = Nullch;
2179 prog->subbeg = PL_bostr;
2180 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2182 prog->startp[0] = startpos - PL_bostr;
2183 PL_reginput = startpos;
2184 PL_regstartp = prog->startp;
2185 PL_regendp = prog->endp;
2186 PL_reglastparen = &prog->lastparen;
2187 PL_reglastcloseparen = &prog->lastcloseparen;
2188 prog->lastparen = 0;
2189 prog->lastcloseparen = 0;
2191 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2192 if (PL_reg_start_tmpl <= prog->nparens) {
2193 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2194 if(PL_reg_start_tmp)
2195 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2197 New(22, PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2200 /* XXXX What this code is doing here?!!! There should be no need
2201 to do this again and again, PL_reglastparen should take care of
2204 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2205 * Actually, the code in regcppop() (which Ilya may be meaning by
2206 * PL_reglastparen), is not needed at all by the test suite
2207 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2208 * enough, for building DynaLoader, or otherwise this
2209 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2210 * will happen. Meanwhile, this code *is* needed for the
2211 * above-mentioned test suite tests to succeed. The common theme
2212 * on those tests seems to be returning null fields from matches.
2217 if (prog->nparens) {
2218 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2225 if (regmatch(prog->program + 1)) {
2226 prog->endp[0] = PL_reginput - PL_bostr;
2229 REGCP_UNWIND(lastcp);
2233 #define RE_UNWIND_BRANCH 1
2234 #define RE_UNWIND_BRANCHJ 2
2238 typedef struct { /* XX: makes sense to enlarge it... */
2242 } re_unwind_generic_t;
2255 } re_unwind_branch_t;
2257 typedef union re_unwind_t {
2259 re_unwind_generic_t generic;
2260 re_unwind_branch_t branch;
2263 #define sayYES goto yes
2264 #define sayNO goto no
2265 #define sayNO_ANYOF goto no_anyof
2266 #define sayYES_FINAL goto yes_final
2267 #define sayYES_LOUD goto yes_loud
2268 #define sayNO_FINAL goto no_final
2269 #define sayNO_SILENT goto do_no
2270 #define saySAME(x) if (x) goto yes; else goto no
2272 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2273 #define POSCACHE_SEEN 1 /* we know what we're caching */
2274 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2275 #define CACHEsayYES STMT_START { \
2276 if (cache_offset | cache_bit) { \
2277 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2278 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2279 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2280 /* cache records failure, but this is success */ \
2282 PerlIO_printf(Perl_debug_log, \
2283 "%*s (remove success from failure cache)\n", \
2284 REPORT_CODE_OFF+PL_regindent*2, "") \
2286 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2291 #define CACHEsayNO STMT_START { \
2292 if (cache_offset | cache_bit) { \
2293 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2294 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2295 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2296 /* cache records success, but this is failure */ \
2298 PerlIO_printf(Perl_debug_log, \
2299 "%*s (remove failure from success cache)\n", \
2300 REPORT_CODE_OFF+PL_regindent*2, "") \
2302 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2308 /* this is used to determine how far from the left messages like
2309 'failed...' are printed. Currently 29 makes these messages line
2310 up with the opcode they refer to. Earlier perls used 25 which
2311 left these messages outdented making reviewing a debug output
2314 #define REPORT_CODE_OFF 29
2317 /* Make sure there is a test for this +1 options in re_tests */
2318 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2320 #define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2321 if ( trie->states[ state ].wordnum ) { \
2322 if ( !accepted ) { \
2325 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2326 sv_accept_buff=NEWSV( 1234, \
2327 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2328 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2329 SvPOK_on( sv_accept_buff ); \
2330 sv_2mortal( sv_accept_buff ); \
2331 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2333 if ( accepted >= bufflen ) { \
2335 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2336 bufflen * sizeof(reg_trie_accepted) ); \
2338 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2339 + sizeof( reg_trie_accepted ) ); \
2341 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2342 accept_buff[ accepted ].endpos = uc; \
2346 #define TRIE_HANDLE_CHAR STMT_START { \
2347 if ( uvc < 256 ) { \
2348 charid = trie->charmap[ uvc ]; \
2351 if( trie->widecharmap ) { \
2352 SV** svpp = (SV**)NULL; \
2353 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2354 sizeof( UV ), 0 ); \
2356 charid = (U16)SvIV( *svpp ); \
2361 ( base + charid > trie->uniquecharcount ) && \
2362 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
2363 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2365 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2373 - regmatch - main matching routine
2375 * Conceptually the strategy is simple: check to see whether the current
2376 * node matches, call self recursively to see whether the rest matches,
2377 * and then act accordingly. In practice we make some effort to avoid
2378 * recursion, in particular by going through "ordinary" nodes (that don't
2379 * need to know whether the rest of the match failed) by a loop instead of
2382 /* [lwall] I've hoisted the register declarations to the outer block in order to
2383 * maybe save a little bit of pushing and popping on the stack. It also takes
2384 * advantage of machines that use a register save mask on subroutine entry.
2386 STATIC I32 /* 0 failure, 1 success */
2387 S_regmatch(pTHX_ regnode *prog)
2390 register regnode *scan; /* Current node. */
2391 regnode *next; /* Next node. */
2392 regnode *inner; /* Next node in internal branch. */
2393 register I32 nextchr; /* renamed nextchr - nextchar colides with
2394 function of same name */
2395 register I32 n; /* no or next */
2396 register I32 ln = 0; /* len or last */
2397 register char *s = Nullch; /* operand or save */
2398 register char *locinput = PL_reginput;
2399 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2400 int minmod = 0, sw = 0, logical = 0;
2403 /* used by the trie code */
2404 SV *sv_accept_buff = 0; /* accepting states we have traversed */
2405 reg_trie_accepted *accept_buff = 0; /* "" */
2406 reg_trie_data *trie; /* what trie are we using right now */
2407 U32 accepted = 0; /* how many accepting states we have seen*/
2410 I32 firstcp = PL_savestack_ix;
2412 const register bool do_utf8 = PL_reg_match_utf8;
2414 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2415 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2416 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2418 SV *re_debug_flags = NULL;
2428 /* Note that nextchr is a byte even in UTF */
2429 nextchr = UCHARAT(locinput);
2431 while (scan != NULL) {
2434 SV *prop = sv_newmortal();
2435 const int docolor = *PL_colors[0];
2436 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2437 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2438 /* The part of the string before starttry has one color
2439 (pref0_len chars), between starttry and current
2440 position another one (pref_len - pref0_len chars),
2441 after the current position the third one.
2442 We assume that pref0_len <= pref_len, otherwise we
2443 decrease pref0_len. */
2444 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2445 ? (5 + taill) - l : locinput - PL_bostr;
2448 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2450 pref0_len = pref_len - (locinput - PL_reg_starttry);
2451 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2452 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2453 ? (5 + taill) - pref_len : PL_regeol - locinput);
2454 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2458 if (pref0_len > pref_len)
2459 pref0_len = pref_len;
2460 regprop(prop, scan);
2462 const char * const s0 =
2463 do_utf8 && OP(scan) != CANY ?
2464 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2465 pref0_len, 60, UNI_DISPLAY_REGEX) :
2466 locinput - pref_len;
2467 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2468 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2469 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2470 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2471 locinput - pref_len + pref0_len;
2472 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2473 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2474 pv_uni_display(dsv2, (U8*)locinput,
2475 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2477 const int len2 = do_utf8 ? strlen(s2) : l;
2478 PerlIO_printf(Perl_debug_log,
2479 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2480 (IV)(locinput - PL_bostr),
2487 (docolor ? "" : "> <"),
2491 15 - l - pref_len + 1,
2493 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2498 next = scan + NEXT_OFF(scan);
2504 if (locinput == PL_bostr)
2506 /* regtill = regbol; */
2511 if (locinput == PL_bostr ||
2512 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2518 if (locinput == PL_bostr)
2522 if (locinput == PL_reg_ganch)
2528 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2533 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2535 if (PL_regeol - locinput > 1)
2539 if (PL_regeol != locinput)
2543 if (!nextchr && locinput >= PL_regeol)
2546 locinput += PL_utf8skip[nextchr];
2547 if (locinput > PL_regeol)
2549 nextchr = UCHARAT(locinput);
2552 nextchr = UCHARAT(++locinput);
2555 if (!nextchr && locinput >= PL_regeol)
2557 nextchr = UCHARAT(++locinput);
2560 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2563 locinput += PL_utf8skip[nextchr];
2564 if (locinput > PL_regeol)
2566 nextchr = UCHARAT(locinput);
2569 nextchr = UCHARAT(++locinput);
2575 traverse the TRIE keeping track of all accepting states
2576 we transition through until we get to a failing node.
2578 we use two slightly different pieces of code to handle
2579 the traversal depending on whether its case sensitive or
2580 not. we reuse the accept code however. (this should probably
2581 be turned into a macro.)
2588 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2589 U8 *uc = ( U8* )locinput;
2596 U8 *uscan = (U8*)NULL;
2600 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2602 while ( state && uc <= (U8*)PL_regeol ) {
2604 TRIE_CHECK_STATE_IS_ACCEPTING;
2606 base = trie->states[ state ].trans.base;
2608 DEBUG_TRIE_EXECUTE_r(
2609 PerlIO_printf( Perl_debug_log,
2610 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2611 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2612 (UV)state, (UV)base, (UV)accepted );
2617 if ( do_utf8 || UTF ) {
2619 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2624 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2625 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2626 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2627 foldlen -= UNISKIP( uvc );
2628 uscan = foldbuf + UNISKIP( uvc );
2640 DEBUG_TRIE_EXECUTE_r(
2641 PerlIO_printf( Perl_debug_log,
2642 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2643 charid, uvc, (UV)state, PL_colors[5] );
2652 /* unreached codepoint: we jump into the middle of the next case
2653 from previous if blocks */
2656 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2657 U8 *uc = (U8*)locinput;
2666 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2668 while ( state && uc <= (U8*)PL_regeol ) {
2670 TRIE_CHECK_STATE_IS_ACCEPTING;
2672 base = trie->states[ state ].trans.base;
2674 DEBUG_TRIE_EXECUTE_r(
2675 PerlIO_printf( Perl_debug_log,
2676 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2677 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2678 (UV)state, (UV)base, (UV)accepted );
2683 if ( do_utf8 || UTF ) {
2684 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2695 DEBUG_TRIE_EXECUTE_r(
2696 PerlIO_printf( Perl_debug_log,
2697 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2698 charid, uvc, (UV)state, PL_colors[5] );
2708 There was at least one accepting state that we
2709 transitioned through. Presumably the number of accepting
2710 states is going to be low, typically one or two. So we
2711 simply scan through to find the one with lowest wordnum.
2712 Once we find it, we swap the last state into its place
2713 and decrement the size. We then try to match the rest of
2714 the pattern at the point where the word ends, if we
2715 succeed then we end the loop, otherwise the loop
2716 eventually terminates once all of the accepting states
2723 if ( accepted == 1 ) {
2725 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2726 PerlIO_printf( Perl_debug_log,
2727 "%*s %sonly one match : #%d <%s>%s\n",
2728 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2729 accept_buff[ 0 ].wordnum,
2730 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",
2733 PL_reginput = (char *)accept_buff[ 0 ].endpos;
2734 /* in this case we free tmps/leave before we call regmatch
2735 as we wont be using accept_buff again. */
2738 gotit = regmatch( scan + NEXT_OFF( scan ) );
2741 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2742 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
2745 while ( !gotit && accepted-- ) {
2748 for( cur = 1 ; cur <= accepted ; cur++ ) {
2749 DEBUG_TRIE_EXECUTE_r(
2750 PerlIO_printf( Perl_debug_log,
2751 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2752 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2753 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2754 accept_buff[ cur ].wordnum, PL_colors[5] );
2757 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2761 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2762 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2763 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2764 accept_buff[best].wordnum,
2765 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",scan,
2768 if ( best<accepted ) {
2769 reg_trie_accepted tmp = accept_buff[ best ];
2770 accept_buff[ best ] = accept_buff[ accepted ];
2771 accept_buff[ accepted ] = tmp;
2774 PL_reginput = (char *)accept_buff[ best ].endpos;
2777 as far as I can tell we only need the SAVETMPS/FREETMPS
2778 for re's with EVAL in them but I'm leaving them in for
2779 all until I can be sure.
2782 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2795 /* unreached codepoint */
2799 if (do_utf8 != UTF) {
2800 /* The target and the pattern have differing utf8ness. */
2802 const char *e = s + ln;
2805 /* The target is utf8, the pattern is not utf8. */
2810 if (NATIVE_TO_UNI(*(U8*)s) !=
2811 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2813 0 : UTF8_ALLOW_ANY))
2820 /* The target is not utf8, the pattern is utf8. */
2825 if (NATIVE_TO_UNI(*((U8*)l)) !=
2826 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2828 0 : UTF8_ALLOW_ANY))
2835 nextchr = UCHARAT(locinput);
2838 /* The target and the pattern have the same utf8ness. */
2839 /* Inline the first character, for speed. */
2840 if (UCHARAT(s) != nextchr)
2842 if (PL_regeol - locinput < ln)
2844 if (ln > 1 && memNE(s, locinput, ln))
2847 nextchr = UCHARAT(locinput);
2850 PL_reg_flags |= RF_tainted;
2856 if (do_utf8 || UTF) {
2857 /* Either target or the pattern are utf8. */
2859 char *e = PL_regeol;
2861 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2862 l, &e, 0, do_utf8)) {
2863 /* One more case for the sharp s:
2864 * pack("U0U*", 0xDF) =~ /ss/i,
2865 * the 0xC3 0x9F are the UTF-8
2866 * byte sequence for the U+00DF. */
2868 toLOWER(s[0]) == 's' &&
2870 toLOWER(s[1]) == 's' &&
2877 nextchr = UCHARAT(locinput);
2881 /* Neither the target and the pattern are utf8. */
2883 /* Inline the first character, for speed. */
2884 if (UCHARAT(s) != nextchr &&
2885 UCHARAT(s) != ((OP(scan) == EXACTF)
2886 ? PL_fold : PL_fold_locale)[nextchr])
2888 if (PL_regeol - locinput < ln)
2890 if (ln > 1 && (OP(scan) == EXACTF
2891 ? ibcmp(s, locinput, ln)
2892 : ibcmp_locale(s, locinput, ln)))
2895 nextchr = UCHARAT(locinput);
2899 STRLEN inclasslen = PL_regeol - locinput;
2901 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2903 if (locinput >= PL_regeol)
2905 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2906 nextchr = UCHARAT(locinput);
2911 nextchr = UCHARAT(locinput);
2912 if (!REGINCLASS(scan, (U8*)locinput))
2914 if (!nextchr && locinput >= PL_regeol)
2916 nextchr = UCHARAT(++locinput);
2920 /* If we might have the case of the German sharp s
2921 * in a casefolding Unicode character class. */
2923 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2924 locinput += SHARP_S_SKIP;
2925 nextchr = UCHARAT(locinput);
2931 PL_reg_flags |= RF_tainted;
2937 LOAD_UTF8_CHARCLASS_ALNUM();
2938 if (!(OP(scan) == ALNUM
2939 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2940 : isALNUM_LC_utf8((U8*)locinput)))
2944 locinput += PL_utf8skip[nextchr];
2945 nextchr = UCHARAT(locinput);
2948 if (!(OP(scan) == ALNUM
2949 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2951 nextchr = UCHARAT(++locinput);
2954 PL_reg_flags |= RF_tainted;
2957 if (!nextchr && locinput >= PL_regeol)
2960 LOAD_UTF8_CHARCLASS_ALNUM();
2961 if (OP(scan) == NALNUM
2962 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2963 : isALNUM_LC_utf8((U8*)locinput))
2967 locinput += PL_utf8skip[nextchr];
2968 nextchr = UCHARAT(locinput);
2971 if (OP(scan) == NALNUM
2972 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2974 nextchr = UCHARAT(++locinput);
2978 PL_reg_flags |= RF_tainted;
2982 /* was last char in word? */
2984 if (locinput == PL_bostr)
2987 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2989 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2991 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2992 ln = isALNUM_uni(ln);
2993 LOAD_UTF8_CHARCLASS_ALNUM();
2994 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2997 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2998 n = isALNUM_LC_utf8((U8*)locinput);
3002 ln = (locinput != PL_bostr) ?
3003 UCHARAT(locinput - 1) : '\n';
3004 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3006 n = isALNUM(nextchr);
3009 ln = isALNUM_LC(ln);
3010 n = isALNUM_LC(nextchr);
3013 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3014 OP(scan) == BOUNDL))
3018 PL_reg_flags |= RF_tainted;
3024 if (UTF8_IS_CONTINUED(nextchr)) {
3025 LOAD_UTF8_CHARCLASS_SPACE();
3026 if (!(OP(scan) == SPACE
3027 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3028 : isSPACE_LC_utf8((U8*)locinput)))
3032 locinput += PL_utf8skip[nextchr];
3033 nextchr = UCHARAT(locinput);
3036 if (!(OP(scan) == SPACE
3037 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3039 nextchr = UCHARAT(++locinput);
3042 if (!(OP(scan) == SPACE
3043 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3045 nextchr = UCHARAT(++locinput);
3049 PL_reg_flags |= RF_tainted;
3052 if (!nextchr && locinput >= PL_regeol)
3055 LOAD_UTF8_CHARCLASS_SPACE();
3056 if (OP(scan) == NSPACE
3057 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3058 : isSPACE_LC_utf8((U8*)locinput))
3062 locinput += PL_utf8skip[nextchr];
3063 nextchr = UCHARAT(locinput);
3066 if (OP(scan) == NSPACE
3067 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3069 nextchr = UCHARAT(++locinput);
3072 PL_reg_flags |= RF_tainted;
3078 LOAD_UTF8_CHARCLASS_DIGIT();
3079 if (!(OP(scan) == DIGIT
3080 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3081 : isDIGIT_LC_utf8((U8*)locinput)))
3085 locinput += PL_utf8skip[nextchr];
3086 nextchr = UCHARAT(locinput);
3089 if (!(OP(scan) == DIGIT
3090 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3092 nextchr = UCHARAT(++locinput);
3095 PL_reg_flags |= RF_tainted;
3098 if (!nextchr && locinput >= PL_regeol)
3101 LOAD_UTF8_CHARCLASS_DIGIT();
3102 if (OP(scan) == NDIGIT
3103 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3104 : isDIGIT_LC_utf8((U8*)locinput))
3108 locinput += PL_utf8skip[nextchr];
3109 nextchr = UCHARAT(locinput);
3112 if (OP(scan) == NDIGIT
3113 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3115 nextchr = UCHARAT(++locinput);
3118 if (locinput >= PL_regeol)
3121 LOAD_UTF8_CHARCLASS_MARK();
3122 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3124 locinput += PL_utf8skip[nextchr];
3125 while (locinput < PL_regeol &&
3126 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3127 locinput += UTF8SKIP(locinput);
3128 if (locinput > PL_regeol)
3133 nextchr = UCHARAT(locinput);
3136 PL_reg_flags |= RF_tainted;
3140 n = ARG(scan); /* which paren pair */
3141 ln = PL_regstartp[n];
3142 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3143 if ((I32)*PL_reglastparen < n || ln == -1)
3144 sayNO; /* Do not match unless seen CLOSEn. */
3145 if (ln == PL_regendp[n])
3149 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3151 const char *e = PL_bostr + PL_regendp[n];
3153 * Note that we can't do the "other character" lookup trick as
3154 * in the 8-bit case (no pun intended) because in Unicode we
3155 * have to map both upper and title case to lower case.
3157 if (OP(scan) == REFF) {
3159 STRLEN ulen1, ulen2;
3160 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3161 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3165 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3166 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3167 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3174 nextchr = UCHARAT(locinput);
3178 /* Inline the first character, for speed. */
3179 if (UCHARAT(s) != nextchr &&
3181 (UCHARAT(s) != ((OP(scan) == REFF
3182 ? PL_fold : PL_fold_locale)[nextchr]))))
3184 ln = PL_regendp[n] - ln;
3185 if (locinput + ln > PL_regeol)
3187 if (ln > 1 && (OP(scan) == REF
3188 ? memNE(s, locinput, ln)
3190 ? ibcmp(s, locinput, ln)
3191 : ibcmp_locale(s, locinput, ln))))
3194 nextchr = UCHARAT(locinput);
3205 OP_4tree *oop = PL_op;
3206 COP *ocurcop = PL_curcop;
3209 struct regexp *oreg = PL_reg_re;
3212 PL_op = (OP_4tree*)PL_regdata->data[n];
3213 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3214 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
3215 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3219 CALLRUNOPS(aTHX); /* Scalar context. */
3222 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3230 PAD_RESTORE_LOCAL(old_comppad);
3231 PL_curcop = ocurcop;
3233 if (logical == 2) { /* Postponed subexpression. */
3235 MAGIC *mg = Null(MAGIC*);
3237 CHECKPOINT cp, lastcp;
3241 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3242 mg = mg_find(sv, PERL_MAGIC_qr);
3243 else if (SvSMAGICAL(ret)) {
3244 if (SvGMAGICAL(ret))
3245 sv_unmagic(ret, PERL_MAGIC_qr);
3247 mg = mg_find(ret, PERL_MAGIC_qr);
3251 re = (regexp *)mg->mg_obj;
3252 (void)ReREFCNT_inc(re);
3256 const char *t = SvPV_const(ret, len);
3258 char * const oprecomp = PL_regprecomp;
3259 const I32 osize = PL_regsize;
3260 const I32 onpar = PL_regnpar;
3263 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3264 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3266 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3268 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3270 PL_regprecomp = oprecomp;
3275 PerlIO_printf(Perl_debug_log,
3276 "Entering embedded \"%s%.60s%s%s\"\n",
3280 (strlen(re->precomp) > 60 ? "..." : ""))
3283 state.prev = PL_reg_call_cc;
3284 state.cc = PL_regcc;
3285 state.re = PL_reg_re;
3289 cp = regcppush(0); /* Save *all* the positions. */
3292 state.ss = PL_savestack_ix;
3293 *PL_reglastparen = 0;
3294 *PL_reglastcloseparen = 0;
3295 PL_reg_call_cc = &state;
3296 PL_reginput = locinput;
3297 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3298 ((re->reganch & ROPT_UTF8) != 0);
3299 if (toggleutf) PL_reg_flags ^= RF_utf8;
3301 /* XXXX This is too dramatic a measure... */
3304 if (regmatch(re->program + 1)) {
3305 /* Even though we succeeded, we need to restore
3306 global variables, since we may be wrapped inside
3307 SUSPEND, thus the match may be not finished yet. */
3309 /* XXXX Do this only if SUSPENDed? */
3310 PL_reg_call_cc = state.prev;
3311 PL_regcc = state.cc;
3312 PL_reg_re = state.re;
3313 cache_re(PL_reg_re);
3314 if (toggleutf) PL_reg_flags ^= RF_utf8;
3316 /* XXXX This is too dramatic a measure... */
3319 /* These are needed even if not SUSPEND. */
3325 REGCP_UNWIND(lastcp);
3327 PL_reg_call_cc = state.prev;
3328 PL_regcc = state.cc;
3329 PL_reg_re = state.re;
3330 cache_re(PL_reg_re);
3331 if (toggleutf) PL_reg_flags ^= RF_utf8;
3333 /* XXXX This is too dramatic a measure... */
3343 sv_setsv(save_scalar(PL_replgv), ret);
3349 n = ARG(scan); /* which paren pair */
3350 PL_reg_start_tmp[n] = locinput;
3355 n = ARG(scan); /* which paren pair */
3356 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3357 PL_regendp[n] = locinput - PL_bostr;
3358 if (n > (I32)*PL_reglastparen)
3359 *PL_reglastparen = n;
3360 *PL_reglastcloseparen = n;
3363 n = ARG(scan); /* which paren pair */
3364 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3367 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3369 next = NEXTOPER(NEXTOPER(scan));
3371 next = scan + ARG(scan);
3372 if (OP(next) == IFTHEN) /* Fake one. */
3373 next = NEXTOPER(NEXTOPER(next));
3377 logical = scan->flags;
3379 /*******************************************************************
3380 PL_regcc contains infoblock about the innermost (...)* loop, and
3381 a pointer to the next outer infoblock.
3383 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3385 1) After matching X, regnode for CURLYX is processed;
3387 2) This regnode creates infoblock on the stack, and calls
3388 regmatch() recursively with the starting point at WHILEM node;
3390 3) Each hit of WHILEM node tries to match A and Z (in the order
3391 depending on the current iteration, min/max of {min,max} and
3392 greediness). The information about where are nodes for "A"
3393 and "Z" is read from the infoblock, as is info on how many times "A"
3394 was already matched, and greediness.
3396 4) After A matches, the same WHILEM node is hit again.
3398 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3399 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3400 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3401 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3402 of the external loop.
3404 Currently present infoblocks form a tree with a stem formed by PL_curcc
3405 and whatever it mentions via ->next, and additional attached trees
3406 corresponding to temporarily unset infoblocks as in "5" above.
3408 In the following picture infoblocks for outer loop of
3409 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3410 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3411 infoblocks are drawn below the "reset" infoblock.
3413 In fact in the picture below we do not show failed matches for Z and T
3414 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3415 more obvious *why* one needs to *temporary* unset infoblocks.]
3417 Matched REx position InfoBlocks Comment
3421 Y A)*?Z)*?T x <- O <- I
3422 YA )*?Z)*?T x <- O <- I
3423 YA A)*?Z)*?T x <- O <- I
3424 YAA )*?Z)*?T x <- O <- I
3425 YAA Z)*?T x <- O # Temporary unset I
3428 YAAZ Y(A)*?Z)*?T x <- O
3431 YAAZY (A)*?Z)*?T x <- O
3434 YAAZY A)*?Z)*?T x <- O <- I
3437 YAAZYA )*?Z)*?T x <- O <- I
3440 YAAZYA Z)*?T x <- O # Temporary unset I
3446 YAAZYAZ T x # Temporary unset O
3453 *******************************************************************/
3456 CHECKPOINT cp = PL_savestack_ix;
3457 /* No need to save/restore up to this paren */
3458 I32 parenfloor = scan->flags;
3460 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3462 cc.oldcc = PL_regcc;
3464 /* XXXX Probably it is better to teach regpush to support
3465 parenfloor > PL_regsize... */
3466 if (parenfloor > (I32)*PL_reglastparen)
3467 parenfloor = *PL_reglastparen; /* Pessimization... */
3468 cc.parenfloor = parenfloor;
3470 cc.min = ARG1(scan);
3471 cc.max = ARG2(scan);
3472 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3476 PL_reginput = locinput;
3477 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3479 PL_regcc = cc.oldcc;
3485 * This is really hard to understand, because after we match
3486 * what we're trying to match, we must make sure the rest of
3487 * the REx is going to match for sure, and to do that we have
3488 * to go back UP the parse tree by recursing ever deeper. And
3489 * if it fails, we have to reset our parent's current state
3490 * that we can try again after backing off.
3493 CHECKPOINT cp, lastcp;
3494 CURCUR* cc = PL_regcc;
3495 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3496 I32 cache_offset = 0, cache_bit = 0;
3498 n = cc->cur + 1; /* how many we know we matched */
3499 PL_reginput = locinput;
3502 PerlIO_printf(Perl_debug_log,
3503 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3504 REPORT_CODE_OFF+PL_regindent*2, "",
3505 (long)n, (long)cc->min,
3506 (long)cc->max, PTR2UV(cc))
3509 /* If degenerate scan matches "", assume scan done. */
3511 if (locinput == cc->lastloc && n >= cc->min) {
3512 PL_regcc = cc->oldcc;
3516 PerlIO_printf(Perl_debug_log,
3517 "%*s empty match detected, try continuation...\n",
3518 REPORT_CODE_OFF+PL_regindent*2, "")
3520 if (regmatch(cc->next))
3528 /* First just match a string of min scans. */
3532 cc->lastloc = locinput;
3533 if (regmatch(cc->scan))
3536 cc->lastloc = lastloc;
3541 /* Check whether we already were at this position.
3542 Postpone detection until we know the match is not
3543 *that* much linear. */
3544 if (!PL_reg_maxiter) {
3545 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3546 PL_reg_leftiter = PL_reg_maxiter;
3548 if (PL_reg_leftiter-- == 0) {
3549 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3550 if (PL_reg_poscache) {
3551 if ((I32)PL_reg_poscache_size < size) {
3552 Renew(PL_reg_poscache, size, char);
3553 PL_reg_poscache_size = size;
3555 Zero(PL_reg_poscache, size, char);
3558 PL_reg_poscache_size = size;
3559 Newz(29, PL_reg_poscache, size, char);
3562 PerlIO_printf(Perl_debug_log,
3563 "%sDetected a super-linear match, switching on caching%s...\n",
3564 PL_colors[4], PL_colors[5])
3567 if (PL_reg_leftiter < 0) {
3568 cache_offset = locinput - PL_bostr;
3570 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3571 + cache_offset * (scan->flags>>4);
3572 cache_bit = cache_offset % 8;
3574 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3576 PerlIO_printf(Perl_debug_log,
3577 "%*s already tried at this position...\n",
3578 REPORT_CODE_OFF+PL_regindent*2, "")
3580 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3581 /* cache records success */
3584 /* cache records failure */
3587 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3591 /* Prefer next over scan for minimal matching. */
3594 PL_regcc = cc->oldcc;
3597 cp = regcppush(cc->parenfloor);
3599 if (regmatch(cc->next)) {
3601 CACHEsayYES; /* All done. */
3603 REGCP_UNWIND(lastcp);
3609 if (n >= cc->max) { /* Maximum greed exceeded? */
3610 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3611 && !(PL_reg_flags & RF_warned)) {
3612 PL_reg_flags |= RF_warned;
3613 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3614 "Complex regular subexpression recursion",
3621 PerlIO_printf(Perl_debug_log,
3622 "%*s trying longer...\n",
3623 REPORT_CODE_OFF+PL_regindent*2, "")
3625 /* Try scanning more and see if it helps. */
3626 PL_reginput = locinput;
3628 cc->lastloc = locinput;
3629 cp = regcppush(cc->parenfloor);
3631 if (regmatch(cc->scan)) {
3635 REGCP_UNWIND(lastcp);
3638 cc->lastloc = lastloc;
3642 /* Prefer scan over next for maximal matching. */
3644 if (n < cc->max) { /* More greed allowed? */
3645 cp = regcppush(cc->parenfloor);
3647 cc->lastloc = locinput;
3649 if (regmatch(cc->scan)) {
3653 REGCP_UNWIND(lastcp);
3654 regcppop(); /* Restore some previous $<digit>s? */
3655 PL_reginput = locinput;
3657 PerlIO_printf(Perl_debug_log,
3658 "%*s failed, try continuation...\n",
3659 REPORT_CODE_OFF+PL_regindent*2, "")
3662 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3663 && !(PL_reg_flags & RF_warned)) {
3664 PL_reg_flags |= RF_warned;
3665 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3666 "Complex regular subexpression recursion",
3670 /* Failed deeper matches of scan, so see if this one works. */
3671 PL_regcc = cc->oldcc;
3674 if (regmatch(cc->next))
3680 cc->lastloc = lastloc;
3685 next = scan + ARG(scan);
3688 inner = NEXTOPER(NEXTOPER(scan));
3691 inner = NEXTOPER(scan);
3695 if (OP(next) != c1) /* No choice. */
3696 next = inner; /* Avoid recursion. */
3698 const I32 lastparen = *PL_reglastparen;
3700 re_unwind_branch_t *uw;
3702 /* Put unwinding data on stack */
3703 unwind1 = SSNEWt(1,re_unwind_branch_t);
3704 uw = SSPTRt(unwind1,re_unwind_branch_t);
3707 uw->type = ((c1 == BRANCH)
3709 : RE_UNWIND_BRANCHJ);
3710 uw->lastparen = lastparen;
3712 uw->locinput = locinput;
3713 uw->nextchr = nextchr;
3715 uw->regindent = ++PL_regindent;
3718 REGCP_SET(uw->lastcp);
3720 /* Now go into the first branch */
3733 /* We suppose that the next guy does not need
3734 backtracking: in particular, it is of constant non-zero length,
3735 and has no parenths to influence future backrefs. */
3736 ln = ARG1(scan); /* min to match */
3737 n = ARG2(scan); /* max to match */
3738 paren = scan->flags;
3740 if (paren > PL_regsize)
3742 if (paren > (I32)*PL_reglastparen)
3743 *PL_reglastparen = paren;
3745 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3747 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3748 PL_reginput = locinput;
3751 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3753 locinput = PL_reginput;
3754 if (HAS_TEXT(next) || JUMPABLE(next)) {
3755 regnode *text_node = next;
3757 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3759 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3761 if (PL_regkind[(U8)OP(text_node)] == REF) {
3765 else { c1 = (U8)*STRING(text_node); }
3766 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3768 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3769 c2 = PL_fold_locale[c1];
3778 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3779 /* If it could work, try it. */
3781 UCHARAT(PL_reginput) == c1 ||
3782 UCHARAT(PL_reginput) == c2)
3786 PL_regstartp[paren] =
3787 HOPc(PL_reginput, -l) - PL_bostr;
3788 PL_regendp[paren] = PL_reginput - PL_bostr;
3791 PL_regendp[paren] = -1;
3795 REGCP_UNWIND(lastcp);
3797 /* Couldn't or didn't -- move forward. */
3798 PL_reginput = locinput;
3799 if (regrepeat_hard(scan, 1, &l)) {
3801 locinput = PL_reginput;
3808 n = regrepeat_hard(scan, n, &l);
3809 locinput = PL_reginput;
3811 PerlIO_printf(Perl_debug_log,
3812 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3813 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3817 if (HAS_TEXT(next) || JUMPABLE(next)) {
3818 regnode *text_node = next;
3820 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3822 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3824 if (PL_regkind[(U8)OP(text_node)] == REF) {
3828 else { c1 = (U8)*STRING(text_node); }
3830 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3832 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3833 c2 = PL_fold_locale[c1];
3844 /* If it could work, try it. */
3846 UCHARAT(PL_reginput) == c1 ||
3847 UCHARAT(PL_reginput) == c2)
3850 PerlIO_printf(Perl_debug_log,
3851 "%*s trying tail with n=%"IVdf"...\n",
3852 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3856 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3857 PL_regendp[paren] = PL_reginput - PL_bostr;