5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
13 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
14 * confused with the original package (see point 3 below). Thanks, Henry!
17 /* Additional note: this code is very heavily munged from Henry's version
18 * in places. In some spots I've traded clarity for efficiency, so don't
19 * blame Henry for some of the lack of readability.
22 /* The names of the functions have been changed from regcomp and
23 * regexec to pregcomp and pregexec in order to avoid conflicts
24 * with the POSIX routines of the same names.
27 #ifdef PERL_EXT_RE_BUILD
28 /* need to replace pregcomp et al, so enable that */
29 # ifndef PERL_IN_XSUB_RE
30 # define PERL_IN_XSUB_RE
32 /* need access to debugger hooks */
33 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
38 #ifdef PERL_IN_XSUB_RE
39 /* We *really* need to overwrite these symbols: */
40 # define Perl_regexec_flags my_regexec
41 # define Perl_regdump my_regdump
42 # define Perl_regprop my_regprop
43 # define Perl_re_intuit_start my_re_intuit_start
44 /* *These* symbols are masked to allow static link. */
45 # define Perl_pregexec my_pregexec
46 # define Perl_reginitcolors my_reginitcolors
47 # define Perl_regclass_swash my_regclass_swash
49 # define PERL_NO_GET_CONTEXT
54 * pregcomp and pregexec -- regsub and regerror are not used in perl
56 * Copyright (c) 1986 by University of Toronto.
57 * Written by Henry Spencer. Not derived from licensed software.
59 * Permission is granted to anyone to use this software for any
60 * purpose on any computer system, and to redistribute it freely,
61 * subject to the following restrictions:
63 * 1. The author is not responsible for the consequences of use of
64 * this software, no matter how awful, even if they arise
67 * 2. The origin of this software must not be misrepresented, either
68 * by explicit claim or by omission.
70 * 3. Altered versions must be plainly marked as such, and must not
71 * be misrepresented as being the original software.
73 **** Alterations to Henry's code are...
75 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
76 **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
78 **** You may distribute under the terms of either the GNU General Public
79 **** License or the Artistic License, as specified in the README file.
81 * Beware that some of this code is subtly aware of the way operator
82 * precedence is structured in regular expressions. Serious changes in
83 * regular-expression syntax might require a total rethink.
86 #define PERL_IN_REGEXEC_C
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define RF_false 16 /* odd number of nested negatives */
97 #define UTF ((PL_reg_flags & RF_utf8) != 0)
99 #define RS_init 1 /* eval environment created */
100 #define RS_set 2 /* replsv value is set */
103 #define STATIC static
106 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
112 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
113 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
115 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
116 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
117 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
119 #define HOPc(pos,off) ((char*)HOP(pos,off))
120 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
122 #define HOPBACK(pos, off) ( \
123 (PL_reg_match_utf8) \
124 ? reghopmaybe((U8*)pos, -off) \
125 : (pos - off >= PL_bostr) \
129 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
131 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
132 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
134 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
135 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
136 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
138 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
140 /* for use after a quantifier and before an EXACT-like node -- japhy */
141 #define JUMPABLE(rn) ( \
142 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
143 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
144 OP(rn) == PLUS || OP(rn) == MINMOD || \
145 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
148 #define HAS_TEXT(rn) ( \
149 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
153 Search for mandatory following text node; for lookahead, the text must
154 follow but for lookbehind (rn->flags != 0) we skip to the next step.
156 #define FIND_NEXT_IMPT(rn) STMT_START { \
157 while (JUMPABLE(rn)) \
158 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
159 rn = NEXTOPER(NEXTOPER(rn)); \
160 else if (OP(rn) == PLUS) \
162 else if (OP(rn) == IFMATCH) \
163 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
164 else rn += NEXT_OFF(rn); \
167 static void restore_pos(pTHX_ void *arg);
170 S_regcppush(pTHX_ I32 parenfloor)
172 int retval = PL_savestack_ix;
173 #define REGCP_PAREN_ELEMS 4
174 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
177 if (paren_elems_to_push < 0)
178 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
180 #define REGCP_OTHER_ELEMS 6
181 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
182 for (p = PL_regsize; p > parenfloor; p--) {
183 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
184 SSPUSHINT(PL_regendp[p]);
185 SSPUSHINT(PL_regstartp[p]);
186 SSPUSHPTR(PL_reg_start_tmp[p]);
189 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
190 SSPUSHINT(PL_regsize);
191 SSPUSHINT(*PL_reglastparen);
192 SSPUSHINT(*PL_reglastcloseparen);
193 SSPUSHPTR(PL_reginput);
194 #define REGCP_FRAME_ELEMS 2
195 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
196 * are needed for the regexp context stack bookkeeping. */
197 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
198 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
203 /* These are needed since we do not localize EVAL nodes: */
204 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
206 (IV)PL_savestack_ix)); cp = PL_savestack_ix
208 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
209 PerlIO_printf(Perl_debug_log, \
210 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
211 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
221 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
223 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
224 i = SSPOPINT; /* Parentheses elements to pop. */
225 input = (char *) SSPOPPTR;
226 *PL_reglastcloseparen = SSPOPINT;
227 *PL_reglastparen = SSPOPINT;
228 PL_regsize = SSPOPINT;
230 /* Now restore the parentheses context. */
231 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
232 i > 0; i -= REGCP_PAREN_ELEMS) {
233 paren = (U32)SSPOPINT;
234 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
235 PL_regstartp[paren] = SSPOPINT;
237 if (paren <= *PL_reglastparen)
238 PL_regendp[paren] = tmps;
240 PerlIO_printf(Perl_debug_log,
241 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
242 (UV)paren, (IV)PL_regstartp[paren],
243 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
244 (IV)PL_regendp[paren],
245 (paren > *PL_reglastparen ? "(no)" : ""));
249 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
250 PerlIO_printf(Perl_debug_log,
251 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
252 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
256 /* It would seem that the similar code in regtry()
257 * already takes care of this, and in fact it is in
258 * a better location to since this code can #if 0-ed out
259 * but the code in regtry() is needed or otherwise tests
260 * requiring null fields (pat.t#187 and split.t#{13,14}
261 * (as of patchlevel 7877) will fail. Then again,
262 * this code seems to be necessary or otherwise
263 * building DynaLoader will fail:
264 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
266 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
267 if ((I32)paren > PL_regsize)
268 PL_regstartp[paren] = -1;
269 PL_regendp[paren] = -1;
276 S_regcp_set_to(pTHX_ I32 ss)
278 I32 tmp = PL_savestack_ix;
280 PL_savestack_ix = ss;
282 PL_savestack_ix = tmp;
286 typedef struct re_cc_state
290 struct re_cc_state *prev;
295 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
297 #define TRYPAREN(paren, n, input) { \
300 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
301 PL_regendp[paren] = input - PL_bostr; \
304 PL_regendp[paren] = -1; \
306 if (regmatch(next)) \
309 PL_regendp[paren] = -1; \
314 * pregexec and friends
318 - pregexec - match a regexp against a string
321 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
322 char *strbeg, I32 minend, SV *screamer, U32 nosave)
323 /* strend: pointer to null at end of string */
324 /* strbeg: real beginning of string */
325 /* minend: end of match must be >=minend after stringarg. */
326 /* nosave: For optimizations. */
329 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
330 nosave ? 0 : REXEC_COPY_STR);
334 S_cache_re(pTHX_ regexp *prog)
336 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
338 PL_regprogram = prog->program;
340 PL_regnpar = prog->nparens;
341 PL_regdata = prog->data;
346 * Need to implement the following flags for reg_anch:
348 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
350 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
351 * INTUIT_AUTORITATIVE_ML
352 * INTUIT_ONCE_NOML - Intuit can match in one location only.
355 * Another flag for this function: SECOND_TIME (so that float substrs
356 * with giant delta may be not rechecked).
359 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
361 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
362 Otherwise, only SvCUR(sv) is used to get strbeg. */
364 /* XXXX We assume that strpos is strbeg unless sv. */
366 /* XXXX Some places assume that there is a fixed substring.
367 An update may be needed if optimizer marks as "INTUITable"
368 RExen without fixed substrings. Similarly, it is assumed that
369 lengths of all the strings are no more than minlen, thus they
370 cannot come from lookahead.
371 (Or minlen should take into account lookahead.) */
373 /* A failure to find a constant substring means that there is no need to make
374 an expensive call to REx engine, thus we celebrate a failure. Similarly,
375 finding a substring too deep into the string means that less calls to
376 regtry() should be needed.
378 REx compiler's optimizer found 4 possible hints:
379 a) Anchored substring;
381 c) Whether we are anchored (beginning-of-line or \G);
382 d) First node (of those at offset 0) which may distingush positions;
383 We use a)b)d) and multiline-part of c), and try to find a position in the
384 string which does not contradict any of them.
387 /* Most of decisions we do here should have been done at compile time.
388 The nodes of the REx which we used for the search should have been
389 deleted from the finite automaton. */
392 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
393 char *strend, U32 flags, re_scream_pos_data *data)
395 register I32 start_shift = 0;
396 /* Should be nonnegative! */
397 register I32 end_shift = 0;
402 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
404 register char *other_last = Nullch; /* other substr checked before this */
405 char *check_at = Nullch; /* check substr found at this pos */
406 I32 multiline = prog->reganch & PMf_MULTILINE;
408 char *i_strpos = strpos;
409 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
411 RX_MATCH_UTF8_set(prog,do_utf8);
413 if (prog->reganch & ROPT_UTF8) {
414 DEBUG_r(PerlIO_printf(Perl_debug_log,
415 "UTF-8 regex...\n"));
416 PL_reg_flags |= RF_utf8;
420 char *s = PL_reg_match_utf8 ?
421 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
423 int len = PL_reg_match_utf8 ?
424 strlen(s) : strend - strpos;
427 if (PL_reg_match_utf8)
428 DEBUG_r(PerlIO_printf(Perl_debug_log,
429 "UTF-8 target...\n"));
430 PerlIO_printf(Perl_debug_log,
431 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
432 PL_colors[4],PL_colors[5],PL_colors[0],
435 (strlen(prog->precomp) > 60 ? "..." : ""),
437 (int)(len > 60 ? 60 : len),
439 (len > 60 ? "..." : "")
443 /* CHR_DIST() would be more correct here but it makes things slow. */
444 if (prog->minlen > strend - strpos) {
445 DEBUG_r(PerlIO_printf(Perl_debug_log,
446 "String too short... [re_intuit_start]\n"));
449 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
452 if (!prog->check_utf8 && prog->check_substr)
453 to_utf8_substr(prog);
454 check = prog->check_utf8;
456 if (!prog->check_substr && prog->check_utf8)
457 to_byte_substr(prog);
458 check = prog->check_substr;
460 if (check == &PL_sv_undef) {
461 DEBUG_r(PerlIO_printf(Perl_debug_log,
462 "Non-utf string cannot match utf check string\n"));
465 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
466 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
467 || ( (prog->reganch & ROPT_ANCH_BOL)
468 && !multiline ) ); /* Check after \n? */
471 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
472 | ROPT_IMPLICIT)) /* not a real BOL */
473 /* SvCUR is not set on references: SvRV and SvPVX overlap */
475 && (strpos != strbeg)) {
476 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
479 if (prog->check_offset_min == prog->check_offset_max &&
480 !(prog->reganch & ROPT_CANY_SEEN)) {
481 /* Substring at constant offset from beg-of-str... */
484 s = HOP3c(strpos, prog->check_offset_min, strend);
486 slen = SvCUR(check); /* >= 1 */
488 if ( strend - s > slen || strend - s < slen - 1
489 || (strend - s == slen && strend[-1] != '\n')) {
490 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
493 /* Now should match s[0..slen-2] */
495 if (slen && (*SvPVX(check) != *s
497 && memNE(SvPVX(check), s, slen)))) {
499 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
503 else if (*SvPVX(check) != *s
504 || ((slen = SvCUR(check)) > 1
505 && memNE(SvPVX(check), s, slen)))
507 goto success_at_start;
510 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
512 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
513 end_shift = prog->minlen - start_shift -
514 CHR_SVLEN(check) + (SvTAIL(check) != 0);
516 I32 end = prog->check_offset_max + CHR_SVLEN(check)
517 - (SvTAIL(check) != 0);
518 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
520 if (end_shift < eshift)
524 else { /* Can match at random position */
527 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
528 /* Should be nonnegative! */
529 end_shift = prog->minlen - start_shift -
530 CHR_SVLEN(check) + (SvTAIL(check) != 0);
533 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
535 Perl_croak(aTHX_ "panic: end_shift");
539 /* Find a possible match in the region s..strend by looking for
540 the "check" substring in the region corrected by start/end_shift. */
541 if (flags & REXEC_SCREAM) {
542 I32 p = -1; /* Internal iterator of scream. */
543 I32 *pp = data ? data->scream_pos : &p;
545 if (PL_screamfirst[BmRARE(check)] >= 0
546 || ( BmRARE(check) == '\n'
547 && (BmPREVIOUS(check) == SvCUR(check) - 1)
549 s = screaminstr(sv, check,
550 start_shift + (s - strbeg), end_shift, pp, 0);
553 /* we may be pointing at the wrong string */
554 if (s && RX_MATCH_COPIED(prog))
555 s = strbeg + (s - SvPVX(sv));
557 *data->scream_olds = s;
559 else if (prog->reganch & ROPT_CANY_SEEN)
560 s = fbm_instr((U8*)(s + start_shift),
561 (U8*)(strend - end_shift),
562 check, multiline ? FBMrf_MULTILINE : 0);
564 s = fbm_instr(HOP3(s, start_shift, strend),
565 HOP3(strend, -end_shift, strbeg),
566 check, multiline ? FBMrf_MULTILINE : 0);
568 /* Update the count-of-usability, remove useless subpatterns,
571 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
572 (s ? "Found" : "Did not find"),
573 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
575 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
577 PL_colors[1], (SvTAIL(check) ? "$" : ""),
578 (s ? " at offset " : "...\n") ) );
585 /* Finish the diagnostic message */
586 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
588 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
589 Start with the other substr.
590 XXXX no SCREAM optimization yet - and a very coarse implementation
591 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
592 *always* match. Probably should be marked during compile...
593 Probably it is right to do no SCREAM here...
596 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
597 /* Take into account the "other" substring. */
598 /* XXXX May be hopelessly wrong for UTF... */
601 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
604 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
608 t = s - prog->check_offset_max;
609 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
611 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
616 t = HOP3c(t, prog->anchored_offset, strend);
617 if (t < other_last) /* These positions already checked */
619 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
622 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
623 /* On end-of-str: see comment below. */
624 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
625 if (must == &PL_sv_undef) {
627 DEBUG_r(must = prog->anchored_utf8); /* for debug */
632 HOP3(HOP3(last1, prog->anchored_offset, strend)
633 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
635 multiline ? FBMrf_MULTILINE : 0
637 DEBUG_r(PerlIO_printf(Perl_debug_log,
638 "%s anchored substr `%s%.*s%s'%s",
639 (s ? "Found" : "Contradicts"),
642 - (SvTAIL(must)!=0)),
644 PL_colors[1], (SvTAIL(must) ? "$" : "")));
646 if (last1 >= last2) {
647 DEBUG_r(PerlIO_printf(Perl_debug_log,
648 ", giving up...\n"));
651 DEBUG_r(PerlIO_printf(Perl_debug_log,
652 ", trying floating at offset %ld...\n",
653 (long)(HOP3c(s1, 1, strend) - i_strpos)));
654 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
655 s = HOP3c(last, 1, strend);
659 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
660 (long)(s - i_strpos)));
661 t = HOP3c(s, -prog->anchored_offset, strbeg);
662 other_last = HOP3c(s, 1, strend);
670 else { /* Take into account the floating substring. */
675 t = HOP3c(s, -start_shift, strbeg);
677 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
678 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
679 last = HOP3c(t, prog->float_max_offset, strend);
680 s = HOP3c(t, prog->float_min_offset, strend);
683 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
684 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
685 /* fbm_instr() takes into account exact value of end-of-str
686 if the check is SvTAIL(ed). Since false positives are OK,
687 and end-of-str is not later than strend we are OK. */
688 if (must == &PL_sv_undef) {
690 DEBUG_r(must = prog->float_utf8); /* for debug message */
693 s = fbm_instr((unsigned char*)s,
694 (unsigned char*)last + SvCUR(must)
696 must, multiline ? FBMrf_MULTILINE : 0);
697 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
698 (s ? "Found" : "Contradicts"),
700 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
702 PL_colors[1], (SvTAIL(must) ? "$" : "")));
705 DEBUG_r(PerlIO_printf(Perl_debug_log,
706 ", giving up...\n"));
709 DEBUG_r(PerlIO_printf(Perl_debug_log,
710 ", trying anchored starting at offset %ld...\n",
711 (long)(s1 + 1 - i_strpos)));
713 s = HOP3c(t, 1, strend);
717 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
718 (long)(s - i_strpos)));
719 other_last = s; /* Fix this later. --Hugo */
728 t = s - prog->check_offset_max;
729 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
731 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
733 /* Fixed substring is found far enough so that the match
734 cannot start at strpos. */
736 if (ml_anch && t[-1] != '\n') {
737 /* Eventually fbm_*() should handle this, but often
738 anchored_offset is not 0, so this check will not be wasted. */
739 /* XXXX In the code below we prefer to look for "^" even in
740 presence of anchored substrings. And we search even
741 beyond the found float position. These pessimizations
742 are historical artefacts only. */
744 while (t < strend - prog->minlen) {
746 if (t < check_at - prog->check_offset_min) {
747 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
748 /* Since we moved from the found position,
749 we definitely contradict the found anchored
750 substr. Due to the above check we do not
751 contradict "check" substr.
752 Thus we can arrive here only if check substr
753 is float. Redo checking for "other"=="fixed".
756 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
757 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
758 goto do_other_anchored;
760 /* We don't contradict the found floating substring. */
761 /* XXXX Why not check for STCLASS? */
763 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
764 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
767 /* Position contradicts check-string */
768 /* XXXX probably better to look for check-string
769 than for "\n", so one should lower the limit for t? */
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
771 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
772 other_last = strpos = s = t + 1;
777 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
778 PL_colors[0],PL_colors[1]));
782 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
783 PL_colors[0],PL_colors[1]));
787 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
790 /* The found string does not prohibit matching at strpos,
791 - no optimization of calling REx engine can be performed,
792 unless it was an MBOL and we are not after MBOL,
793 or a future STCLASS check will fail this. */
795 /* Even in this situation we may use MBOL flag if strpos is offset
796 wrt the start of the string. */
797 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
798 && (strpos != strbeg) && strpos[-1] != '\n'
799 /* May be due to an implicit anchor of m{.*foo} */
800 && !(prog->reganch & ROPT_IMPLICIT))
805 DEBUG_r( if (ml_anch)
806 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
807 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
810 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
812 prog->check_utf8 /* Could be deleted already */
813 && --BmUSEFUL(prog->check_utf8) < 0
814 && (prog->check_utf8 == prog->float_utf8)
816 prog->check_substr /* Could be deleted already */
817 && --BmUSEFUL(prog->check_substr) < 0
818 && (prog->check_substr == prog->float_substr)
821 /* If flags & SOMETHING - do not do it many times on the same match */
822 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
823 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
824 if (do_utf8 ? prog->check_substr : prog->check_utf8)
825 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
826 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
827 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
828 check = Nullsv; /* abort */
830 /* XXXX This is a remnant of the old implementation. It
831 looks wasteful, since now INTUIT can use many
833 prog->reganch &= ~RE_USE_INTUIT;
840 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
841 if (prog->regstclass) {
842 /* minlen == 0 is possible if regstclass is \b or \B,
843 and the fixed substr is ''$.
844 Since minlen is already taken into account, s+1 is before strend;
845 accidentally, minlen >= 1 guaranties no false positives at s + 1
846 even for \b or \B. But (minlen? 1 : 0) below assumes that
847 regstclass does not come from lookahead... */
848 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
849 This leaves EXACTF only, which is dealt with in find_byclass(). */
850 U8* str = (U8*)STRING(prog->regstclass);
851 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
852 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
854 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
855 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
856 : (prog->float_substr || prog->float_utf8
857 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
860 char *startpos = strbeg;
864 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
869 if (endpos == strend) {
870 DEBUG_r( PerlIO_printf(Perl_debug_log,
871 "Could not match STCLASS...\n") );
874 DEBUG_r( PerlIO_printf(Perl_debug_log,
875 "This position contradicts STCLASS...\n") );
876 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
878 /* Contradict one of substrings */
879 if (prog->anchored_substr || prog->anchored_utf8) {
880 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
881 DEBUG_r( what = "anchored" );
883 s = HOP3c(t, 1, strend);
884 if (s + start_shift + end_shift > strend) {
885 /* XXXX Should be taken into account earlier? */
886 DEBUG_r( PerlIO_printf(Perl_debug_log,
887 "Could not match STCLASS...\n") );
892 DEBUG_r( PerlIO_printf(Perl_debug_log,
893 "Looking for %s substr starting at offset %ld...\n",
894 what, (long)(s + start_shift - i_strpos)) );
897 /* Have both, check_string is floating */
898 if (t + start_shift >= check_at) /* Contradicts floating=check */
899 goto retry_floating_check;
900 /* Recheck anchored substring, but not floating... */
904 DEBUG_r( PerlIO_printf(Perl_debug_log,
905 "Looking for anchored substr starting at offset %ld...\n",
906 (long)(other_last - i_strpos)) );
907 goto do_other_anchored;
909 /* Another way we could have checked stclass at the
910 current position only: */
915 DEBUG_r( PerlIO_printf(Perl_debug_log,
916 "Looking for /%s^%s/m starting at offset %ld...\n",
917 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
920 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
922 /* Check is floating subtring. */
923 retry_floating_check:
924 t = check_at - start_shift;
925 DEBUG_r( what = "floating" );
926 goto hop_and_restart;
929 DEBUG_r(PerlIO_printf(Perl_debug_log,
930 "By STCLASS: moving %ld --> %ld\n",
931 (long)(t - i_strpos), (long)(s - i_strpos))
935 DEBUG_r(PerlIO_printf(Perl_debug_log,
936 "Does not contradict STCLASS...\n");
941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
942 PL_colors[4], (check ? "Guessed" : "Giving up"),
943 PL_colors[5], (long)(s - i_strpos)) );
946 fail_finish: /* Substring not found */
947 if (prog->check_substr || prog->check_utf8) /* could be removed already */
948 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
950 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
951 PL_colors[4],PL_colors[5]));
955 /* We know what class REx starts with. Try to find this position... */
957 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
959 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
963 register STRLEN uskip;
967 register I32 tmp = 1; /* Scratch variable? */
968 register bool do_utf8 = PL_reg_match_utf8;
970 /* We know what class it must start with. */
974 while (s + (uskip = UTF8SKIP(s)) <= strend) {
975 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
976 !UTF8_IS_INVARIANT((U8)s[0]) ?
977 reginclass(c, (U8*)s, 0, do_utf8) :
978 REGINCLASS(c, (U8*)s)) {
979 if (tmp && (norun || regtry(prog, s)))
993 if (REGINCLASS(c, (U8*)s) ||
994 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
995 /* The assignment of 2 is intentional:
996 * for the folded sharp s, the skip is 2. */
997 (skip = SHARP_S_SKIP))) {
998 if (tmp && (norun || regtry(prog, s)))
1010 while (s < strend) {
1011 if (tmp && (norun || regtry(prog, s)))
1020 ln = STR_LEN(c); /* length to match in octets/bytes */
1021 lnc = (I32) ln; /* length to match in characters */
1023 STRLEN ulen1, ulen2;
1025 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1026 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
1028 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1029 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1031 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1032 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1033 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1034 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1036 while (sm < ((U8 *) m + ln)) {
1051 c2 = PL_fold_locale[c1];
1053 e = HOP3c(strend, -((I32)lnc), s);
1056 e = s; /* Due to minlen logic of intuit() */
1058 /* The idea in the EXACTF* cases is to first find the
1059 * first character of the EXACTF* node and then, if
1060 * necessary, case-insensitively compare the full
1061 * text of the node. The c1 and c2 are the first
1062 * characters (though in Unicode it gets a bit
1063 * more complicated because there are more cases
1064 * than just upper and lower: one needs to use
1065 * the so-called folding case for case-insensitive
1066 * matching (called "loose matching" in Unicode).
1067 * ibcmp_utf8() will do just that. */
1071 U8 tmpbuf [UTF8_MAXLEN+1];
1072 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1073 STRLEN len, foldlen;
1076 /* Upper and lower of 1st char are equal -
1077 * probably not a "letter". */
1079 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1081 0 : UTF8_ALLOW_ANY);
1084 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1085 m, (char **)0, ln, (bool)UTF))
1086 && (norun || regtry(prog, s)) )
1089 uvchr_to_utf8(tmpbuf, c);
1090 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1092 && (f == c1 || f == c2)
1093 && (ln == foldlen ||
1094 !ibcmp_utf8((char *) foldbuf,
1095 (char **)0, foldlen, do_utf8,
1097 (char **)0, ln, (bool)UTF))
1098 && (norun || regtry(prog, s)) )
1106 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1108 0 : UTF8_ALLOW_ANY);
1110 /* Handle some of the three Greek sigmas cases.
1111 * Note that not all the possible combinations
1112 * are handled here: some of them are handled
1113 * by the standard folding rules, and some of
1114 * them (the character class or ANYOF cases)
1115 * are handled during compiletime in
1116 * regexec.c:S_regclass(). */
1117 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1118 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1119 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1121 if ( (c == c1 || c == c2)
1123 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1124 m, (char **)0, ln, (bool)UTF))
1125 && (norun || regtry(prog, s)) )
1128 uvchr_to_utf8(tmpbuf, c);
1129 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1131 && (f == c1 || f == c2)
1132 && (ln == foldlen ||
1133 !ibcmp_utf8((char *) foldbuf,
1134 (char **)0, foldlen, do_utf8,
1136 (char **)0, ln, (bool)UTF))
1137 && (norun || regtry(prog, s)) )
1148 && (ln == 1 || !(OP(c) == EXACTF
1150 : ibcmp_locale(s, m, ln)))
1151 && (norun || regtry(prog, s)) )
1157 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1158 && (ln == 1 || !(OP(c) == EXACTF
1160 : ibcmp_locale(s, m, ln)))
1161 && (norun || regtry(prog, s)) )
1168 PL_reg_flags |= RF_tainted;
1175 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1177 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1179 tmp = ((OP(c) == BOUND ?
1180 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1181 LOAD_UTF8_CHARCLASS(alnum,"a");
1182 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1183 if (tmp == !(OP(c) == BOUND ?
1184 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1185 isALNUM_LC_utf8((U8*)s)))
1188 if ((norun || regtry(prog, s)))
1195 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1196 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1197 while (s < strend) {
1199 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1201 if ((norun || regtry(prog, s)))
1207 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1211 PL_reg_flags |= RF_tainted;
1218 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1220 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1222 tmp = ((OP(c) == NBOUND ?
1223 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1224 LOAD_UTF8_CHARCLASS(alnum,"a");
1225 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1226 if (tmp == !(OP(c) == NBOUND ?
1227 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1228 isALNUM_LC_utf8((U8*)s)))
1230 else if ((norun || regtry(prog, s)))
1236 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1237 tmp = ((OP(c) == NBOUND ?
1238 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1239 while (s < strend) {
1241 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1243 else if ((norun || regtry(prog, s)))
1248 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1253 LOAD_UTF8_CHARCLASS(alnum,"a");
1254 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1255 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1256 if (tmp && (norun || regtry(prog, s)))
1267 while (s < strend) {
1269 if (tmp && (norun || regtry(prog, s)))
1281 PL_reg_flags |= RF_tainted;
1283 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1284 if (isALNUM_LC_utf8((U8*)s)) {
1285 if (tmp && (norun || regtry(prog, s)))
1296 while (s < strend) {
1297 if (isALNUM_LC(*s)) {
1298 if (tmp && (norun || regtry(prog, s)))
1311 LOAD_UTF8_CHARCLASS(alnum,"a");
1312 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1313 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1314 if (tmp && (norun || regtry(prog, s)))
1325 while (s < strend) {
1327 if (tmp && (norun || regtry(prog, s)))
1339 PL_reg_flags |= RF_tainted;
1341 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1342 if (!isALNUM_LC_utf8((U8*)s)) {
1343 if (tmp && (norun || regtry(prog, s)))
1354 while (s < strend) {
1355 if (!isALNUM_LC(*s)) {
1356 if (tmp && (norun || regtry(prog, s)))
1369 LOAD_UTF8_CHARCLASS(space," ");
1370 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1371 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1372 if (tmp && (norun || regtry(prog, s)))
1383 while (s < strend) {
1385 if (tmp && (norun || regtry(prog, s)))
1397 PL_reg_flags |= RF_tainted;
1399 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1400 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1401 if (tmp && (norun || regtry(prog, s)))
1412 while (s < strend) {
1413 if (isSPACE_LC(*s)) {
1414 if (tmp && (norun || regtry(prog, s)))
1427 LOAD_UTF8_CHARCLASS(space," ");
1428 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1429 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1430 if (tmp && (norun || regtry(prog, s)))
1441 while (s < strend) {
1443 if (tmp && (norun || regtry(prog, s)))
1455 PL_reg_flags |= RF_tainted;
1457 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1458 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1459 if (tmp && (norun || regtry(prog, s)))
1470 while (s < strend) {
1471 if (!isSPACE_LC(*s)) {
1472 if (tmp && (norun || regtry(prog, s)))
1485 LOAD_UTF8_CHARCLASS(digit,"0");
1486 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1487 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1488 if (tmp && (norun || regtry(prog, s)))
1499 while (s < strend) {
1501 if (tmp && (norun || regtry(prog, s)))
1513 PL_reg_flags |= RF_tainted;
1515 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1516 if (isDIGIT_LC_utf8((U8*)s)) {
1517 if (tmp && (norun || regtry(prog, s)))
1528 while (s < strend) {
1529 if (isDIGIT_LC(*s)) {
1530 if (tmp && (norun || regtry(prog, s)))
1543 LOAD_UTF8_CHARCLASS(digit,"0");
1544 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1545 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1546 if (tmp && (norun || regtry(prog, s)))
1557 while (s < strend) {
1559 if (tmp && (norun || regtry(prog, s)))
1571 PL_reg_flags |= RF_tainted;
1573 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1574 if (!isDIGIT_LC_utf8((U8*)s)) {
1575 if (tmp && (norun || regtry(prog, s)))
1586 while (s < strend) {
1587 if (!isDIGIT_LC(*s)) {
1588 if (tmp && (norun || regtry(prog, s)))
1600 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1609 - regexec_flags - match a regexp against a string
1612 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1613 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1614 /* strend: pointer to null at end of string */
1615 /* strbeg: real beginning of string */
1616 /* minend: end of match must be >=minend after stringarg. */
1617 /* data: May be used for some additional optimizations. */
1618 /* nosave: For optimizations. */
1621 register regnode *c;
1622 register char *startpos = stringarg;
1623 I32 minlen; /* must match at least this many chars */
1624 I32 dontbother = 0; /* how many characters not to try at end */
1625 /* I32 start_shift = 0; */ /* Offset of the start to find
1626 constant substr. */ /* CC */
1627 I32 end_shift = 0; /* Same for the end. */ /* CC */
1628 I32 scream_pos = -1; /* Internal iterator of scream. */
1630 SV* oreplsv = GvSV(PL_replgv);
1631 bool do_utf8 = DO_UTF8(sv);
1632 I32 multiline = prog->reganch & PMf_MULTILINE;
1634 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1635 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1637 RX_MATCH_UTF8_set(prog,do_utf8);
1643 PL_regnarrate = DEBUG_r_TEST;
1646 /* Be paranoid... */
1647 if (prog == NULL || startpos == NULL) {
1648 Perl_croak(aTHX_ "NULL regexp parameter");
1652 minlen = prog->minlen;
1653 if (strend - startpos < minlen) {
1654 DEBUG_r(PerlIO_printf(Perl_debug_log,
1655 "String too short [regexec_flags]...\n"));
1659 /* Check validity of program. */
1660 if (UCHARAT(prog->program) != REG_MAGIC) {
1661 Perl_croak(aTHX_ "corrupted regexp program");
1665 PL_reg_eval_set = 0;
1668 if (prog->reganch & ROPT_UTF8)
1669 PL_reg_flags |= RF_utf8;
1671 /* Mark beginning of line for ^ and lookbehind. */
1672 PL_regbol = startpos;
1676 /* Mark end of line for $ (and such) */
1679 /* see how far we have to get to not match where we matched before */
1680 PL_regtill = startpos+minend;
1682 /* We start without call_cc context. */
1685 /* If there is a "must appear" string, look for it. */
1688 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1691 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1692 PL_reg_ganch = startpos;
1693 else if (sv && SvTYPE(sv) >= SVt_PVMG
1695 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1696 && mg->mg_len >= 0) {
1697 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1698 if (prog->reganch & ROPT_ANCH_GPOS) {
1699 if (s > PL_reg_ganch)
1704 else /* pos() not defined */
1705 PL_reg_ganch = strbeg;
1708 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1709 re_scream_pos_data d;
1711 d.scream_olds = &scream_olds;
1712 d.scream_pos = &scream_pos;
1713 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1715 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1716 goto phooey; /* not present */
1722 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1723 UNI_DISPLAY_REGEX) :
1725 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1726 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1727 UNI_DISPLAY_REGEX) : startpos;
1728 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1731 PerlIO_printf(Perl_debug_log,
1732 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1733 PL_colors[4],PL_colors[5],PL_colors[0],
1736 len0 > 60 ? "..." : "",
1738 (int)(len1 > 60 ? 60 : len1),
1740 (len1 > 60 ? "..." : "")
1744 /* Simplest case: anchored match need be tried only once. */
1745 /* [unless only anchor is BOL and multiline is set] */
1746 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1747 if (s == startpos && regtry(prog, startpos))
1749 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1750 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1755 dontbother = minlen - 1;
1756 end = HOP3c(strend, -dontbother, strbeg) - 1;
1757 /* for multiline we only have to try after newlines */
1758 if (prog->check_substr || prog->check_utf8) {
1762 if (regtry(prog, s))
1767 if (prog->reganch & RE_USE_INTUIT) {
1768 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1779 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1780 if (regtry(prog, s))
1787 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1788 if (regtry(prog, PL_reg_ganch))
1793 /* Messy cases: unanchored match. */
1794 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1795 /* we have /x+whatever/ */
1796 /* it must be a one character string (XXXX Except UTF?) */
1801 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1802 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1803 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1806 while (s < strend) {
1808 DEBUG_r( did_match = 1 );
1809 if (regtry(prog, s)) goto got_it;
1811 while (s < strend && *s == ch)
1818 while (s < strend) {
1820 DEBUG_r( did_match = 1 );
1821 if (regtry(prog, s)) goto got_it;
1823 while (s < strend && *s == ch)
1829 DEBUG_r(if (!did_match)
1830 PerlIO_printf(Perl_debug_log,
1831 "Did not find anchored character...\n")
1835 else if (prog->anchored_substr != Nullsv
1836 || prog->anchored_utf8 != Nullsv
1837 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1838 && prog->float_max_offset < strend - s)) {
1843 char *last1; /* Last position checked before */
1847 if (prog->anchored_substr || prog->anchored_utf8) {
1848 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1849 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1850 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1851 back_max = back_min = prog->anchored_offset;
1853 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1854 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1855 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1856 back_max = prog->float_max_offset;
1857 back_min = prog->float_min_offset;
1859 if (must == &PL_sv_undef)
1860 /* could not downgrade utf8 check substring, so must fail */
1863 last = HOP3c(strend, /* Cannot start after this */
1864 -(I32)(CHR_SVLEN(must)
1865 - (SvTAIL(must) != 0) + back_min), strbeg);
1868 last1 = HOPc(s, -1);
1870 last1 = s - 1; /* bogus */
1872 /* XXXX check_substr already used to find `s', can optimize if
1873 check_substr==must. */
1875 dontbother = end_shift;
1876 strend = HOPc(strend, -dontbother);
1877 while ( (s <= last) &&
1878 ((flags & REXEC_SCREAM)
1879 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1880 end_shift, &scream_pos, 0))
1881 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1882 (unsigned char*)strend, must,
1883 multiline ? FBMrf_MULTILINE : 0))) ) {
1884 /* we may be pointing at the wrong string */
1885 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1886 s = strbeg + (s - SvPVX(sv));
1887 DEBUG_r( did_match = 1 );
1888 if (HOPc(s, -back_max) > last1) {
1889 last1 = HOPc(s, -back_min);
1890 s = HOPc(s, -back_max);
1893 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1895 last1 = HOPc(s, -back_min);
1899 while (s <= last1) {
1900 if (regtry(prog, s))
1906 while (s <= last1) {
1907 if (regtry(prog, s))
1913 DEBUG_r(if (!did_match)
1914 PerlIO_printf(Perl_debug_log,
1915 "Did not find %s substr `%s%.*s%s'%s...\n",
1916 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1917 ? "anchored" : "floating"),
1919 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1921 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1925 else if ((c = prog->regstclass)) {
1927 I32 op = (U8)OP(prog->regstclass);
1928 /* don't bother with what can't match */
1929 if (PL_regkind[op] != EXACT && op != CANY)
1930 strend = HOPc(strend, -(minlen - 1));
1933 SV *prop = sv_newmortal();
1941 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1942 UNI_DISPLAY_REGEX) :
1944 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1946 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1947 len1 = UTF ? SvCUR(dsv1) : strend - s;
1948 PerlIO_printf(Perl_debug_log,
1949 "Matching stclass `%*.*s' against `%*.*s'\n",
1953 if (find_byclass(prog, c, s, strend, startpos, 0))
1955 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1959 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1964 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1966 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1968 if (flags & REXEC_SCREAM) {
1969 last = screaminstr(sv, float_real, s - strbeg,
1970 end_shift, &scream_pos, 1); /* last one */
1972 last = scream_olds; /* Only one occurrence. */
1973 /* we may be pointing at the wrong string */
1974 else if (RX_MATCH_COPIED(prog))
1975 s = strbeg + (s - SvPVX(sv));
1979 char *little = SvPV(float_real, len);
1981 if (SvTAIL(float_real)) {
1982 if (memEQ(strend - len + 1, little, len - 1))
1983 last = strend - len + 1;
1984 else if (!multiline)
1985 last = memEQ(strend - len, little, len)
1986 ? strend - len : Nullch;
1992 last = rninstr(s, strend, little, little + len);
1994 last = strend; /* matching `$' */
1998 DEBUG_r(PerlIO_printf(Perl_debug_log,
1999 "%sCan't trim the tail, match fails (should not happen)%s\n",
2000 PL_colors[4],PL_colors[5]));
2001 goto phooey; /* Should not happen! */
2003 dontbother = strend - last + prog->float_min_offset;
2005 if (minlen && (dontbother < minlen))
2006 dontbother = minlen - 1;
2007 strend -= dontbother; /* this one's always in bytes! */
2008 /* We don't know much -- general case. */
2011 if (regtry(prog, s))
2020 if (regtry(prog, s))
2022 } while (s++ < strend);
2030 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2032 if (PL_reg_eval_set) {
2033 /* Preserve the current value of $^R */
2034 if (oreplsv != GvSV(PL_replgv))
2035 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2036 restored, the value remains
2038 restore_pos(aTHX_ 0);
2041 /* make sure $`, $&, $', and $digit will work later */
2042 if ( !(flags & REXEC_NOT_FIRST) ) {
2043 RX_MATCH_COPY_FREE(prog);
2044 if (flags & REXEC_COPY_STR) {
2045 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2046 #ifdef PERL_COPY_ON_WRITE
2048 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2050 PerlIO_printf(Perl_debug_log,
2051 "Copy on write: regexp capture, type %d\n",
2054 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2055 prog->subbeg = SvPVX(prog->saved_copy);
2056 assert (SvPOKp(prog->saved_copy));
2060 RX_MATCH_COPIED_on(prog);
2061 s = savepvn(strbeg, i);
2067 prog->subbeg = strbeg;
2068 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2075 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2076 PL_colors[4],PL_colors[5]));
2077 if (PL_reg_eval_set)
2078 restore_pos(aTHX_ 0);
2083 - regtry - try match at specific point
2085 STATIC I32 /* 0 failure, 1 success */
2086 S_regtry(pTHX_ regexp *prog, char *startpos)
2094 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2096 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2099 PL_reg_eval_set = RS_init;
2101 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2102 (IV)(PL_stack_sp - PL_stack_base));
2104 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2105 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2106 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2108 /* Apparently this is not needed, judging by wantarray. */
2109 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2110 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2113 /* Make $_ available to executed code. */
2114 if (PL_reg_sv != DEFSV) {
2119 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2120 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2121 /* prepare for quick setting of pos */
2122 sv_magic(PL_reg_sv, (SV*)0,
2123 PERL_MAGIC_regex_global, Nullch, 0);
2124 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2128 PL_reg_oldpos = mg->mg_len;
2129 SAVEDESTRUCTOR_X(restore_pos, 0);
2131 if (!PL_reg_curpm) {
2132 Newz(22,PL_reg_curpm, 1, PMOP);
2135 SV* repointer = newSViv(0);
2136 /* so we know which PL_regex_padav element is PL_reg_curpm */
2137 SvFLAGS(repointer) |= SVf_BREAK;
2138 av_push(PL_regex_padav,repointer);
2139 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2140 PL_regex_pad = AvARRAY(PL_regex_padav);
2144 PM_SETRE(PL_reg_curpm, prog);
2145 PL_reg_oldcurpm = PL_curpm;
2146 PL_curpm = PL_reg_curpm;
2147 if (RX_MATCH_COPIED(prog)) {
2148 /* Here is a serious problem: we cannot rewrite subbeg,
2149 since it may be needed if this match fails. Thus
2150 $` inside (?{}) could fail... */
2151 PL_reg_oldsaved = prog->subbeg;
2152 PL_reg_oldsavedlen = prog->sublen;
2153 #ifdef PERL_COPY_ON_WRITE
2154 PL_nrs = prog->saved_copy;
2156 RX_MATCH_COPIED_off(prog);
2159 PL_reg_oldsaved = Nullch;
2160 prog->subbeg = PL_bostr;
2161 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2163 prog->startp[0] = startpos - PL_bostr;
2164 PL_reginput = startpos;
2165 PL_regstartp = prog->startp;
2166 PL_regendp = prog->endp;
2167 PL_reglastparen = &prog->lastparen;
2168 PL_reglastcloseparen = &prog->lastcloseparen;
2169 prog->lastparen = 0;
2170 prog->lastcloseparen = 0;
2172 DEBUG_r(PL_reg_starttry = startpos);
2173 if (PL_reg_start_tmpl <= prog->nparens) {
2174 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2175 if(PL_reg_start_tmp)
2176 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2178 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2181 /* XXXX What this code is doing here?!!! There should be no need
2182 to do this again and again, PL_reglastparen should take care of
2185 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2186 * Actually, the code in regcppop() (which Ilya may be meaning by
2187 * PL_reglastparen), is not needed at all by the test suite
2188 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2189 * enough, for building DynaLoader, or otherwise this
2190 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2191 * will happen. Meanwhile, this code *is* needed for the
2192 * above-mentioned test suite tests to succeed. The common theme
2193 * on those tests seems to be returning null fields from matches.
2198 if (prog->nparens) {
2199 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2206 if (regmatch(prog->program + 1)) {
2207 prog->endp[0] = PL_reginput - PL_bostr;
2210 REGCP_UNWIND(lastcp);
2214 #define RE_UNWIND_BRANCH 1
2215 #define RE_UNWIND_BRANCHJ 2
2219 typedef struct { /* XX: makes sense to enlarge it... */
2223 } re_unwind_generic_t;
2236 } re_unwind_branch_t;
2238 typedef union re_unwind_t {
2240 re_unwind_generic_t generic;
2241 re_unwind_branch_t branch;
2244 #define sayYES goto yes
2245 #define sayNO goto no
2246 #define sayNO_ANYOF goto no_anyof
2247 #define sayYES_FINAL goto yes_final
2248 #define sayYES_LOUD goto yes_loud
2249 #define sayNO_FINAL goto no_final
2250 #define sayNO_SILENT goto do_no
2251 #define saySAME(x) if (x) goto yes; else goto no
2253 #define REPORT_CODE_OFF 24
2256 - regmatch - main matching routine
2258 * Conceptually the strategy is simple: check to see whether the current
2259 * node matches, call self recursively to see whether the rest matches,
2260 * and then act accordingly. In practice we make some effort to avoid
2261 * recursion, in particular by going through "ordinary" nodes (that don't
2262 * need to know whether the rest of the match failed) by a loop instead of
2265 /* [lwall] I've hoisted the register declarations to the outer block in order to
2266 * maybe save a little bit of pushing and popping on the stack. It also takes
2267 * advantage of machines that use a register save mask on subroutine entry.
2269 STATIC I32 /* 0 failure, 1 success */
2270 S_regmatch(pTHX_ regnode *prog)
2272 register regnode *scan; /* Current node. */
2273 regnode *next; /* Next node. */
2274 regnode *inner; /* Next node in internal branch. */
2275 register I32 nextchr; /* renamed nextchr - nextchar colides with
2276 function of same name */
2277 register I32 n; /* no or next */
2278 register I32 ln = 0; /* len or last */
2279 register char *s = Nullch; /* operand or save */
2280 register char *locinput = PL_reginput;
2281 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2282 int minmod = 0, sw = 0, logical = 0;
2285 I32 firstcp = PL_savestack_ix;
2287 register bool do_utf8 = PL_reg_match_utf8;
2289 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2290 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2291 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2298 /* Note that nextchr is a byte even in UTF */
2299 nextchr = UCHARAT(locinput);
2301 while (scan != NULL) {
2304 SV *prop = sv_newmortal();
2305 int docolor = *PL_colors[0];
2306 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2307 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2308 /* The part of the string before starttry has one color
2309 (pref0_len chars), between starttry and current
2310 position another one (pref_len - pref0_len chars),
2311 after the current position the third one.
2312 We assume that pref0_len <= pref_len, otherwise we
2313 decrease pref0_len. */
2314 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2315 ? (5 + taill) - l : locinput - PL_bostr;
2318 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2320 pref0_len = pref_len - (locinput - PL_reg_starttry);
2321 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2322 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2323 ? (5 + taill) - pref_len : PL_regeol - locinput);
2324 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2328 if (pref0_len > pref_len)
2329 pref0_len = pref_len;
2330 regprop(prop, scan);
2333 do_utf8 && OP(scan) != CANY ?
2334 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2335 pref0_len, 60, UNI_DISPLAY_REGEX) :
2336 locinput - pref_len;
2337 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2338 char *s1 = do_utf8 && OP(scan) != CANY ?
2339 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2340 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2341 locinput - pref_len + pref0_len;
2342 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2343 char *s2 = do_utf8 && OP(scan) != CANY ?
2344 pv_uni_display(dsv2, (U8*)locinput,
2345 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2347 int len2 = do_utf8 ? strlen(s2) : l;
2348 PerlIO_printf(Perl_debug_log,
2349 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2350 (IV)(locinput - PL_bostr),
2357 (docolor ? "" : "> <"),
2361 15 - l - pref_len + 1,
2363 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2368 next = scan + NEXT_OFF(scan);
2374 if (locinput == PL_bostr)
2376 /* regtill = regbol; */
2381 if (locinput == PL_bostr ||
2382 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2388 if (locinput == PL_bostr)
2392 if (locinput == PL_reg_ganch)
2398 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2403 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2405 if (PL_regeol - locinput > 1)
2409 if (PL_regeol != locinput)
2413 if (!nextchr && locinput >= PL_regeol)
2416 locinput += PL_utf8skip[nextchr];
2417 if (locinput > PL_regeol)
2419 nextchr = UCHARAT(locinput);
2422 nextchr = UCHARAT(++locinput);
2425 if (!nextchr && locinput >= PL_regeol)
2427 nextchr = UCHARAT(++locinput);
2430 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2433 locinput += PL_utf8skip[nextchr];
2434 if (locinput > PL_regeol)
2436 nextchr = UCHARAT(locinput);
2439 nextchr = UCHARAT(++locinput);
2444 if (do_utf8 != UTF) {
2445 /* The target and the pattern have differing utf8ness. */
2451 /* The target is utf8, the pattern is not utf8. */
2455 if (NATIVE_TO_UNI(*(U8*)s) !=
2456 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2458 0 : UTF8_ALLOW_ANY))
2465 /* The target is not utf8, the pattern is utf8. */
2469 if (NATIVE_TO_UNI(*((U8*)l)) !=
2470 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2472 0 : UTF8_ALLOW_ANY))
2479 nextchr = UCHARAT(locinput);
2482 /* The target and the pattern have the same utf8ness. */
2483 /* Inline the first character, for speed. */
2484 if (UCHARAT(s) != nextchr)
2486 if (PL_regeol - locinput < ln)
2488 if (ln > 1 && memNE(s, locinput, ln))
2491 nextchr = UCHARAT(locinput);
2494 PL_reg_flags |= RF_tainted;
2500 if (do_utf8 || UTF) {
2501 /* Either target or the pattern are utf8. */
2503 char *e = PL_regeol;
2505 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2506 l, &e, 0, do_utf8)) {
2507 /* One more case for the sharp s:
2508 * pack("U0U*", 0xDF) =~ /ss/i,
2509 * the 0xC3 0x9F are the UTF-8
2510 * byte sequence for the U+00DF. */
2512 toLOWER(s[0]) == 's' &&
2514 toLOWER(s[1]) == 's' &&
2521 nextchr = UCHARAT(locinput);
2525 /* Neither the target and the pattern are utf8. */
2527 /* Inline the first character, for speed. */
2528 if (UCHARAT(s) != nextchr &&
2529 UCHARAT(s) != ((OP(scan) == EXACTF)
2530 ? PL_fold : PL_fold_locale)[nextchr])
2532 if (PL_regeol - locinput < ln)
2534 if (ln > 1 && (OP(scan) == EXACTF
2535 ? ibcmp(s, locinput, ln)
2536 : ibcmp_locale(s, locinput, ln)))
2539 nextchr = UCHARAT(locinput);
2543 STRLEN inclasslen = PL_regeol - locinput;
2545 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2547 if (locinput >= PL_regeol)
2549 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2550 nextchr = UCHARAT(locinput);
2555 nextchr = UCHARAT(locinput);
2556 if (!REGINCLASS(scan, (U8*)locinput))
2558 if (!nextchr && locinput >= PL_regeol)
2560 nextchr = UCHARAT(++locinput);
2564 /* If we might have the case of the German sharp s
2565 * in a casefolding Unicode character class. */
2567 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2568 locinput += SHARP_S_SKIP;
2569 nextchr = UCHARAT(locinput);
2575 PL_reg_flags |= RF_tainted;
2581 LOAD_UTF8_CHARCLASS(alnum,"a");
2582 if (!(OP(scan) == ALNUM
2583 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2584 : isALNUM_LC_utf8((U8*)locinput)))
2588 locinput += PL_utf8skip[nextchr];
2589 nextchr = UCHARAT(locinput);
2592 if (!(OP(scan) == ALNUM
2593 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2595 nextchr = UCHARAT(++locinput);
2598 PL_reg_flags |= RF_tainted;
2601 if (!nextchr && locinput >= PL_regeol)
2604 LOAD_UTF8_CHARCLASS(alnum,"a");
2605 if (OP(scan) == NALNUM
2606 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2607 : isALNUM_LC_utf8((U8*)locinput))
2611 locinput += PL_utf8skip[nextchr];
2612 nextchr = UCHARAT(locinput);
2615 if (OP(scan) == NALNUM
2616 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2618 nextchr = UCHARAT(++locinput);
2622 PL_reg_flags |= RF_tainted;
2626 /* was last char in word? */
2628 if (locinput == PL_bostr)
2631 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2633 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2635 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2636 ln = isALNUM_uni(ln);
2637 LOAD_UTF8_CHARCLASS(alnum,"a");
2638 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2641 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2642 n = isALNUM_LC_utf8((U8*)locinput);
2646 ln = (locinput != PL_bostr) ?
2647 UCHARAT(locinput - 1) : '\n';
2648 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2650 n = isALNUM(nextchr);
2653 ln = isALNUM_LC(ln);
2654 n = isALNUM_LC(nextchr);
2657 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2658 OP(scan) == BOUNDL))
2662 PL_reg_flags |= RF_tainted;
2668 if (UTF8_IS_CONTINUED(nextchr)) {
2669 LOAD_UTF8_CHARCLASS(space," ");
2670 if (!(OP(scan) == SPACE
2671 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2672 : isSPACE_LC_utf8((U8*)locinput)))
2676 locinput += PL_utf8skip[nextchr];
2677 nextchr = UCHARAT(locinput);
2680 if (!(OP(scan) == SPACE
2681 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2683 nextchr = UCHARAT(++locinput);
2686 if (!(OP(scan) == SPACE
2687 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2689 nextchr = UCHARAT(++locinput);
2693 PL_reg_flags |= RF_tainted;
2696 if (!nextchr && locinput >= PL_regeol)
2699 LOAD_UTF8_CHARCLASS(space," ");
2700 if (OP(scan) == NSPACE
2701 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2702 : isSPACE_LC_utf8((U8*)locinput))
2706 locinput += PL_utf8skip[nextchr];
2707 nextchr = UCHARAT(locinput);
2710 if (OP(scan) == NSPACE
2711 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2713 nextchr = UCHARAT(++locinput);
2716 PL_reg_flags |= RF_tainted;
2722 LOAD_UTF8_CHARCLASS(digit,"0");
2723 if (!(OP(scan) == DIGIT
2724 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2725 : isDIGIT_LC_utf8((U8*)locinput)))
2729 locinput += PL_utf8skip[nextchr];
2730 nextchr = UCHARAT(locinput);
2733 if (!(OP(scan) == DIGIT
2734 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2736 nextchr = UCHARAT(++locinput);
2739 PL_reg_flags |= RF_tainted;
2742 if (!nextchr && locinput >= PL_regeol)
2745 LOAD_UTF8_CHARCLASS(digit,"0");
2746 if (OP(scan) == NDIGIT
2747 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2748 : isDIGIT_LC_utf8((U8*)locinput))
2752 locinput += PL_utf8skip[nextchr];
2753 nextchr = UCHARAT(locinput);
2756 if (OP(scan) == NDIGIT
2757 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2759 nextchr = UCHARAT(++locinput);
2762 if (locinput >= PL_regeol)
2765 LOAD_UTF8_CHARCLASS(mark,"~");
2766 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2768 locinput += PL_utf8skip[nextchr];
2769 while (locinput < PL_regeol &&
2770 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2771 locinput += UTF8SKIP(locinput);
2772 if (locinput > PL_regeol)
2777 nextchr = UCHARAT(locinput);
2780 PL_reg_flags |= RF_tainted;
2784 n = ARG(scan); /* which paren pair */
2785 ln = PL_regstartp[n];
2786 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2787 if ((I32)*PL_reglastparen < n || ln == -1)
2788 sayNO; /* Do not match unless seen CLOSEn. */
2789 if (ln == PL_regendp[n])
2793 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2795 char *e = PL_bostr + PL_regendp[n];
2797 * Note that we can't do the "other character" lookup trick as
2798 * in the 8-bit case (no pun intended) because in Unicode we
2799 * have to map both upper and title case to lower case.
2801 if (OP(scan) == REFF) {
2802 STRLEN ulen1, ulen2;
2803 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2804 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
2808 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2809 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2810 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2817 nextchr = UCHARAT(locinput);
2821 /* Inline the first character, for speed. */
2822 if (UCHARAT(s) != nextchr &&
2824 (UCHARAT(s) != ((OP(scan) == REFF
2825 ? PL_fold : PL_fold_locale)[nextchr]))))
2827 ln = PL_regendp[n] - ln;
2828 if (locinput + ln > PL_regeol)
2830 if (ln > 1 && (OP(scan) == REF
2831 ? memNE(s, locinput, ln)
2833 ? ibcmp(s, locinput, ln)
2834 : ibcmp_locale(s, locinput, ln))))
2837 nextchr = UCHARAT(locinput);
2848 OP_4tree *oop = PL_op;
2849 COP *ocurcop = PL_curcop;
2852 struct regexp *oreg = PL_reg_re;
2855 PL_op = (OP_4tree*)PL_regdata->data[n];
2856 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2857 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2858 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2862 CALLRUNOPS(aTHX); /* Scalar context. */
2865 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2873 PAD_RESTORE_LOCAL(old_comppad);
2874 PL_curcop = ocurcop;
2876 if (logical == 2) { /* Postponed subexpression. */
2878 MAGIC *mg = Null(MAGIC*);
2880 CHECKPOINT cp, lastcp;
2884 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2885 mg = mg_find(sv, PERL_MAGIC_qr);
2886 else if (SvSMAGICAL(ret)) {
2887 if (SvGMAGICAL(ret))
2888 sv_unmagic(ret, PERL_MAGIC_qr);
2890 mg = mg_find(ret, PERL_MAGIC_qr);
2894 re = (regexp *)mg->mg_obj;
2895 (void)ReREFCNT_inc(re);
2899 char *t = SvPV(ret, len);
2901 char *oprecomp = PL_regprecomp;
2902 I32 osize = PL_regsize;
2903 I32 onpar = PL_regnpar;
2906 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2907 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2909 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2911 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2913 PL_regprecomp = oprecomp;
2918 PerlIO_printf(Perl_debug_log,
2919 "Entering embedded `%s%.60s%s%s'\n",
2923 (strlen(re->precomp) > 60 ? "..." : ""))
2926 state.prev = PL_reg_call_cc;
2927 state.cc = PL_regcc;
2928 state.re = PL_reg_re;
2932 cp = regcppush(0); /* Save *all* the positions. */
2935 state.ss = PL_savestack_ix;
2936 *PL_reglastparen = 0;
2937 *PL_reglastcloseparen = 0;
2938 PL_reg_call_cc = &state;
2939 PL_reginput = locinput;
2940 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2941 ((re->reganch & ROPT_UTF8) != 0);
2942 if (toggleutf) PL_reg_flags ^= RF_utf8;
2944 /* XXXX This is too dramatic a measure... */
2947 if (regmatch(re->program + 1)) {
2948 /* Even though we succeeded, we need to restore
2949 global variables, since we may be wrapped inside
2950 SUSPEND, thus the match may be not finished yet. */
2952 /* XXXX Do this only if SUSPENDed? */
2953 PL_reg_call_cc = state.prev;
2954 PL_regcc = state.cc;
2955 PL_reg_re = state.re;
2956 cache_re(PL_reg_re);
2957 if (toggleutf) PL_reg_flags ^= RF_utf8;
2959 /* XXXX This is too dramatic a measure... */
2962 /* These are needed even if not SUSPEND. */
2968 REGCP_UNWIND(lastcp);
2970 PL_reg_call_cc = state.prev;
2971 PL_regcc = state.cc;
2972 PL_reg_re = state.re;
2973 cache_re(PL_reg_re);
2974 if (toggleutf) PL_reg_flags ^= RF_utf8;
2976 /* XXXX This is too dramatic a measure... */
2986 sv_setsv(save_scalar(PL_replgv), ret);
2992 n = ARG(scan); /* which paren pair */
2993 PL_reg_start_tmp[n] = locinput;
2998 n = ARG(scan); /* which paren pair */
2999 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3000 PL_regendp[n] = locinput - PL_bostr;
3001 if (n > (I32)*PL_reglastparen)
3002 *PL_reglastparen = n;
3003 *PL_reglastcloseparen = n;
3006 n = ARG(scan); /* which paren pair */
3007 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3010 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3012 next = NEXTOPER(NEXTOPER(scan));
3014 next = scan + ARG(scan);
3015 if (OP(next) == IFTHEN) /* Fake one. */
3016 next = NEXTOPER(NEXTOPER(next));
3020 logical = scan->flags;
3022 /*******************************************************************
3023 PL_regcc contains infoblock about the innermost (...)* loop, and
3024 a pointer to the next outer infoblock.
3026 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3028 1) After matching X, regnode for CURLYX is processed;
3030 2) This regnode creates infoblock on the stack, and calls
3031 regmatch() recursively with the starting point at WHILEM node;
3033 3) Each hit of WHILEM node tries to match A and Z (in the order
3034 depending on the current iteration, min/max of {min,max} and
3035 greediness). The information about where are nodes for "A"
3036 and "Z" is read from the infoblock, as is info on how many times "A"
3037 was already matched, and greediness.
3039 4) After A matches, the same WHILEM node is hit again.
3041 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3042 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3043 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3044 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3045 of the external loop.
3047 Currently present infoblocks form a tree with a stem formed by PL_curcc
3048 and whatever it mentions via ->next, and additional attached trees
3049 corresponding to temporarily unset infoblocks as in "5" above.
3051 In the following picture infoblocks for outer loop of
3052 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3053 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3054 infoblocks are drawn below the "reset" infoblock.
3056 In fact in the picture below we do not show failed matches for Z and T
3057 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3058 more obvious *why* one needs to *temporary* unset infoblocks.]
3060 Matched REx position InfoBlocks Comment
3064 Y A)*?Z)*?T x <- O <- I
3065 YA )*?Z)*?T x <- O <- I
3066 YA A)*?Z)*?T x <- O <- I
3067 YAA )*?Z)*?T x <- O <- I
3068 YAA Z)*?T x <- O # Temporary unset I
3071 YAAZ Y(A)*?Z)*?T x <- O
3074 YAAZY (A)*?Z)*?T x <- O
3077 YAAZY A)*?Z)*?T x <- O <- I
3080 YAAZYA )*?Z)*?T x <- O <- I
3083 YAAZYA Z)*?T x <- O # Temporary unset I
3089 YAAZYAZ T x # Temporary unset O
3096 *******************************************************************/
3099 CHECKPOINT cp = PL_savestack_ix;
3100 /* No need to save/restore up to this paren */
3101 I32 parenfloor = scan->flags;
3103 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3105 cc.oldcc = PL_regcc;
3107 /* XXXX Probably it is better to teach regpush to support
3108 parenfloor > PL_regsize... */
3109 if (parenfloor > (I32)*PL_reglastparen)
3110 parenfloor = *PL_reglastparen; /* Pessimization... */
3111 cc.parenfloor = parenfloor;
3113 cc.min = ARG1(scan);
3114 cc.max = ARG2(scan);
3115 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3119 PL_reginput = locinput;
3120 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3122 PL_regcc = cc.oldcc;
3128 * This is really hard to understand, because after we match
3129 * what we're trying to match, we must make sure the rest of
3130 * the REx is going to match for sure, and to do that we have
3131 * to go back UP the parse tree by recursing ever deeper. And
3132 * if it fails, we have to reset our parent's current state
3133 * that we can try again after backing off.
3136 CHECKPOINT cp, lastcp;
3137 CURCUR* cc = PL_regcc;
3138 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3140 n = cc->cur + 1; /* how many we know we matched */
3141 PL_reginput = locinput;
3144 PerlIO_printf(Perl_debug_log,
3145 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3146 REPORT_CODE_OFF+PL_regindent*2, "",
3147 (long)n, (long)cc->min,
3148 (long)cc->max, PTR2UV(cc))
3151 /* If degenerate scan matches "", assume scan done. */
3153 if (locinput == cc->lastloc && n >= cc->min) {
3154 PL_regcc = cc->oldcc;
3158 PerlIO_printf(Perl_debug_log,
3159 "%*s empty match detected, try continuation...\n",
3160 REPORT_CODE_OFF+PL_regindent*2, "")
3162 if (regmatch(cc->next))
3170 /* First just match a string of min scans. */
3174 cc->lastloc = locinput;
3175 if (regmatch(cc->scan))
3178 cc->lastloc = lastloc;
3183 /* Check whether we already were at this position.
3184 Postpone detection until we know the match is not
3185 *that* much linear. */
3186 if (!PL_reg_maxiter) {
3187 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3188 PL_reg_leftiter = PL_reg_maxiter;
3190 if (PL_reg_leftiter-- == 0) {
3191 I32 size = (PL_reg_maxiter + 7)/8;
3192 if (PL_reg_poscache) {
3193 if ((I32)PL_reg_poscache_size < size) {
3194 Renew(PL_reg_poscache, size, char);
3195 PL_reg_poscache_size = size;
3197 Zero(PL_reg_poscache, size, char);
3200 PL_reg_poscache_size = size;
3201 Newz(29, PL_reg_poscache, size, char);
3204 PerlIO_printf(Perl_debug_log,
3205 "%sDetected a super-linear match, switching on caching%s...\n",
3206 PL_colors[4], PL_colors[5])
3209 if (PL_reg_leftiter < 0) {
3210 I32 o = locinput - PL_bostr, b;
3212 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3215 if (PL_reg_poscache[o] & (1<<b)) {
3217 PerlIO_printf(Perl_debug_log,
3218 "%*s already tried at this position...\n",
3219 REPORT_CODE_OFF+PL_regindent*2, "")
3221 if (PL_reg_flags & RF_false)
3226 PL_reg_poscache[o] |= (1<<b);
3230 /* Prefer next over scan for minimal matching. */
3233 PL_regcc = cc->oldcc;
3236 cp = regcppush(cc->parenfloor);
3238 if (regmatch(cc->next)) {
3240 sayYES; /* All done. */
3242 REGCP_UNWIND(lastcp);
3248 if (n >= cc->max) { /* Maximum greed exceeded? */
3249 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3250 && !(PL_reg_flags & RF_warned)) {
3251 PL_reg_flags |= RF_warned;
3252 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3253 "Complex regular subexpression recursion",
3260 PerlIO_printf(Perl_debug_log,
3261 "%*s trying longer...\n",
3262 REPORT_CODE_OFF+PL_regindent*2, "")
3264 /* Try scanning more and see if it helps. */
3265 PL_reginput = locinput;
3267 cc->lastloc = locinput;
3268 cp = regcppush(cc->parenfloor);
3270 if (regmatch(cc->scan)) {
3274 REGCP_UNWIND(lastcp);
3277 cc->lastloc = lastloc;
3281 /* Prefer scan over next for maximal matching. */
3283 if (n < cc->max) { /* More greed allowed? */
3284 cp = regcppush(cc->parenfloor);
3286 cc->lastloc = locinput;
3288 if (regmatch(cc->scan)) {
3292 REGCP_UNWIND(lastcp);
3293 regcppop(); /* Restore some previous $<digit>s? */
3294 PL_reginput = locinput;
3296 PerlIO_printf(Perl_debug_log,
3297 "%*s failed, try continuation...\n",
3298 REPORT_CODE_OFF+PL_regindent*2, "")
3301 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3302 && !(PL_reg_flags & RF_warned)) {
3303 PL_reg_flags |= RF_warned;
3304 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3305 "Complex regular subexpression recursion",
3309 /* Failed deeper matches of scan, so see if this one works. */
3310 PL_regcc = cc->oldcc;
3313 if (regmatch(cc->next))
3319 cc->lastloc = lastloc;
3324 next = scan + ARG(scan);
3327 inner = NEXTOPER(NEXTOPER(scan));
3330 inner = NEXTOPER(scan);
3334 if (OP(next) != c1) /* No choice. */
3335 next = inner; /* Avoid recursion. */
3337 I32 lastparen = *PL_reglastparen;
3339 re_unwind_branch_t *uw;
3341 /* Put unwinding data on stack */
3342 unwind1 = SSNEWt(1,re_unwind_branch_t);
3343 uw = SSPTRt(unwind1,re_unwind_branch_t);
3346 uw->type = ((c1 == BRANCH)
3348 : RE_UNWIND_BRANCHJ);
3349 uw->lastparen = lastparen;
3351 uw->locinput = locinput;
3352 uw->nextchr = nextchr;
3354 uw->regindent = ++PL_regindent;
3357 REGCP_SET(uw->lastcp);
3359 /* Now go into the first branch */
3372 /* We suppose that the next guy does not need
3373 backtracking: in particular, it is of constant non-zero length,
3374 and has no parenths to influence future backrefs. */
3375 ln = ARG1(scan); /* min to match */
3376 n = ARG2(scan); /* max to match */
3377 paren = scan->flags;
3379 if (paren > PL_regsize)
3381 if (paren > (I32)*PL_reglastparen)
3382 *PL_reglastparen = paren;
3384 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3386 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3387 PL_reginput = locinput;
3390 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3392 locinput = PL_reginput;
3393 if (HAS_TEXT(next) || JUMPABLE(next)) {
3394 regnode *text_node = next;
3396 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3398 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3400 if (PL_regkind[(U8)OP(text_node)] == REF) {
3404 else { c1 = (U8)*STRING(text_node); }
3405 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3407 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3408 c2 = PL_fold_locale[c1];
3417 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3418 /* If it could work, try it. */
3420 UCHARAT(PL_reginput) == c1 ||
3421 UCHARAT(PL_reginput) == c2)
3425 PL_regstartp[paren] =
3426 HOPc(PL_reginput, -l) - PL_bostr;
3427 PL_regendp[paren] = PL_reginput - PL_bostr;
3430 PL_regendp[paren] = -1;
3434 REGCP_UNWIND(lastcp);
3436 /* Couldn't or didn't -- move forward. */
3437 PL_reginput = locinput;
3438 if (regrepeat_hard(scan, 1, &l)) {
3440 locinput = PL_reginput;
3447 n = regrepeat_hard(scan, n, &l);
3448 locinput = PL_reginput;
3450 PerlIO_printf(Perl_debug_log,
3451 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3452 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3456 if (HAS_TEXT(next) || JUMPABLE(next)) {
3457 regnode *text_node = next;
3459 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3461 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3463 if (PL_regkind[(U8)OP(text_node)] == REF) {
3467 else { c1 = (U8)*STRING(text_node); }
3469 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3471 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3472 c2 = PL_fold_locale[c1];
3483 /* If it could work, try it. */
3485 UCHARAT(PL_reginput) == c1 ||
3486 UCHARAT(PL_reginput) == c2)
3489 PerlIO_printf(Perl_debug_log,
3490 "%*s trying tail with n=%"IVdf"...\n",
3491 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3495 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3496 PL_regendp[paren] = PL_reginput - PL_bostr;
3499 PL_regendp[paren] = -1;
3503 REGCP_UNWIND(lastcp);
3505 /* Couldn't or didn't -- back up. */
3507 locinput = HOPc(locinput, -l);
3508 PL_reginput = locinput;
3515 paren = scan->flags; /* Which paren to set */
3516 if (paren > PL_regsize)
3518 if (paren > (I32)*PL_reglastparen)
3519 *PL_reglastparen = paren;
3520 ln = ARG1(scan); /* min to match */
3521 n = ARG2(scan); /* max to match */
3522 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3526 ln = ARG1(scan); /* min to match */
3527 n = ARG2(scan); /* max to match */
3528 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3533 scan = NEXTOPER(scan);
3539 scan = NEXTOPER(scan);
3543 * Lookahead to avoid useless match attempts
3544 * when we know what character comes next.
3548 * Used to only do .*x and .*?x, but now it allows
3549 * for )'s, ('s and (?{ ... })'s to be in the way
3550 * of the quantifier and the EXACT-like node. -- japhy
3553 if (HAS_TEXT(next) || JUMPABLE(next)) {
3555 regnode *text_node = next;
3557 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3559 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3561 if (PL_regkind[(U8)OP(text_node)] == REF) {
3563 goto assume_ok_easy;
3565 else { s = (U8*)STRING(text_node); }
3569 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3571 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3572 c2 = PL_fold_locale[c1];
3575 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3576 STRLEN ulen1, ulen2;
3577 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3578 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
3580 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3581 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3583 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3585 0 : UTF8_ALLOW_ANY);
3586 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3588 0 : UTF8_ALLOW_ANY);
3591 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3593 0 : UTF8_ALLOW_ANY);
3601 PL_reginput = locinput;
3605 if (ln && regrepeat(scan, ln) < ln)
3607 locinput = PL_reginput;
3610 char *e; /* Should not check after this */
3611 char *old = locinput;
3614 if (n == REG_INFTY) {
3617 while (UTF8_IS_CONTINUATION(*(U8*)e))
3623 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3627 e = locinput + n - ln;
3632 /* Find place 'next' could work */
3635 while (locinput <= e &&
3636 UCHARAT(locinput) != c1)
3639 while (locinput <= e
3640 && UCHARAT(locinput) != c1
3641 && UCHARAT(locinput) != c2)
3644 count = locinput - old;
3649 /* count initialised to
3650 * utf8_distance(old, locinput) */
3651 while (locinput <= e &&
3652 utf8n_to_uvchr((U8*)locinput,
3655 0 : UTF8_ALLOW_ANY) != (UV)c1) {
3660 /* count initialised to
3661 * utf8_distance(old, locinput) */
3662 while (locinput <= e) {
3663 UV c = utf8n_to_uvchr((U8*)locinput,
3666 0 : UTF8_ALLOW_ANY);
3667 if (c == (UV)c1 || c == (UV)c2)
3676 /* PL_reginput == old now */
3677 if (locinput != old) {
3678 ln = 1; /* Did some */
3679 if (regrepeat(scan, count) < count)
3682 /* PL_reginput == locinput now */
3683 TRYPAREN(paren, ln, locinput);
3684 PL_reginput = locinput; /* Could be reset... */
3685 REGCP_UNWIND(lastcp);
3686 /* Couldn't or didn't -- move forward. */
3689 locinput += UTF8SKIP(locinput);
3696 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3700 c = utf8n_to_uvchr((U8*)PL_reginput,
3703 0 : UTF8_ALLOW_ANY);
3705 c = UCHARAT(PL_reginput);
3706 /* If it could work, try it. */
3707 if (c == (UV)c1 || c == (UV)c2)
3709 TRYPAREN(paren, ln, PL_reginput);
3710 REGCP_UNWIND(lastcp);
3713 /* If it could work, try it. */
3714 else if (c1 == -1000)
3716 TRYPAREN(paren, ln, PL_reginput);
3717 REGCP_UNWIND(lastcp);
3719 /* Couldn't or didn't -- move forward. */
3720 PL_reginput = locinput;
3721 if (regrepeat(scan, 1)) {
3723 locinput = PL_reginput;
3731 n = regrepeat(scan, n);
3732 locinput = PL_reginput;
3733 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3734 (OP(next) != MEOL ||
3735 OP(next) == SEOL || OP(next) == EOS))
3737 ln = n; /* why back off? */
3738 /* ...because $ and \Z can match before *and* after
3739 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3740 We should back off by one in this case. */
3741 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3750 c = utf8n_to_uvchr((U8*)PL_reginput,
3753 0 : UTF8_ALLOW_ANY);
3755 c = UCHARAT(PL_reginput);
3757 /* If it could work, try it. */
3758 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3760 TRYPAREN(paren, n, PL_reginput);
3761 REGCP_UNWIND(lastcp);
3763 /* Couldn't or didn't -- back up. */
3765 PL_reginput = locinput = HOPc(locinput, -1);
3773 c = utf8n_to_uvchr((U8*)PL_reginput,
3776 0 : UTF8_ALLOW_ANY);
3778 c = UCHARAT(PL_reginput);
3780 /* If it could work, try it. */
3781 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3783 TRYPAREN(paren, n, PL_reginput);
3784 REGCP_UNWIND(lastcp);
3786 /* Couldn't or didn't -- back up. */
3788 PL_reginput = locinput = HOPc(locinput, -1);
3795 if (PL_reg_call_cc) {
3796 re_cc_state *cur_call_cc = PL_reg_call_cc;
3797 CURCUR *cctmp = PL_regcc;
3798 regexp *re = PL_reg_re;
3799 CHECKPOINT cp, lastcp;
3801 cp = regcppush(0); /* Save *all* the positions. */
3803 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3805 PL_reginput = locinput; /* Make position available to
3807 cache_re(PL_reg_call_cc->re);
3808 PL_regcc = PL_reg_call_cc->cc;
3809 PL_reg_call_cc = PL_reg_call_cc->prev;
3810 if (regmatch(cur_call_cc->node)) {
3811 PL_reg_call_cc = cur_call_cc;
3815 REGCP_UNWIND(lastcp);
3817 PL_reg_call_cc = cur_call_cc;
3823 PerlIO_printf(Perl_debug_log,
3824 "%*s continuation failed...\n",
3825 REPORT_CODE_OFF+PL_regindent*2, "")
3829 if (locinput < PL_regtill) {
3830 DEBUG_r(PerlIO_printf(Perl_debug_log,
3831 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3833 (long)(locinput - PL_reg_starttry),
3834 (long)(PL_regtill - PL_reg_starttry),
3836 sayNO_FINAL; /* Cannot match: too short. */
3838 PL_reginput = locinput; /* put where regtry can find it */
3839 sayYES_FINAL; /* Success! */
3841 PL_reginput = locinput; /* put where regtry can find it */