5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
85 #define RF_tainted 1 /* tainted information used? */
86 #define RF_warned 2 /* warned about big count? */
87 #define RF_evaled 4 /* Did an EVAL with setting? */
88 #define RF_utf8 8 /* String contains multibyte chars? */
90 #define UTF (PL_reg_flags & RF_utf8)
92 #define RS_init 1 /* eval environment created */
93 #define RS_set 2 /* replsv value is set */
103 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
106 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
107 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
108 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
109 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
110 #define HOPc(pos,off) ((char*)HOP(pos,off))
111 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
113 #define HOPBACK(pos, off) ( \
114 (UTF && PL_reg_match_utf8) \
115 ? reghopmaybe((U8*)pos, -off) \
116 : (pos - off >= PL_bostr) \
120 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
122 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
123 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
124 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
125 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
126 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
127 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
129 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
131 /* for use after a quantifier and before an EXACT-like node -- japhy */
132 #define JUMPABLE(rn) ( \
133 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
134 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
135 OP(rn) == PLUS || OP(rn) == MINMOD || \
136 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
139 #define HAS_TEXT(rn) ( \
140 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143 #define FIND_NEXT_IMPT(rn) STMT_START { \
144 while (JUMPABLE(rn)) \
145 if (OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
146 PL_regkind[(U8)OP(rn)] == CURLY) \
147 rn = NEXTOPER(NEXTOPER(rn)); \
148 else if (OP(rn) == PLUS) \
150 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
158 int retval = PL_savestack_ix;
159 #define REGCP_PAREN_ELEMS 4
160 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 if (paren_elems_to_push < 0)
164 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
166 #define REGCP_OTHER_ELEMS 6
167 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
168 for (p = PL_regsize; p > parenfloor; p--) {
169 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
170 SSPUSHINT(PL_regendp[p]);
171 SSPUSHINT(PL_regstartp[p]);
172 SSPUSHPTR(PL_reg_start_tmp[p]);
175 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
176 SSPUSHINT(PL_regsize);
177 SSPUSHINT(*PL_reglastparen);
178 SSPUSHINT(*PL_reglastcloseparen);
179 SSPUSHPTR(PL_reginput);
180 #define REGCP_FRAME_ELEMS 2
181 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
182 * are needed for the regexp context stack bookkeeping. */
183 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
184 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
189 /* These are needed since we do not localize EVAL nodes: */
190 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
191 " Setting an EVAL scope, savestack=%"IVdf"\n", \
192 (IV)PL_savestack_ix)); cp = PL_savestack_ix
194 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
195 PerlIO_printf(Perl_debug_log, \
196 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
197 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
207 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
209 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
210 i = SSPOPINT; /* Parentheses elements to pop. */
211 input = (char *) SSPOPPTR;
212 *PL_reglastcloseparen = SSPOPINT;
213 *PL_reglastparen = SSPOPINT;
214 PL_regsize = SSPOPINT;
216 /* Now restore the parentheses context. */
217 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
218 i > 0; i -= REGCP_PAREN_ELEMS) {
219 paren = (U32)SSPOPINT;
220 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
221 PL_regstartp[paren] = SSPOPINT;
223 if (paren <= *PL_reglastparen)
224 PL_regendp[paren] = tmps;
226 PerlIO_printf(Perl_debug_log,
227 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
228 (UV)paren, (IV)PL_regstartp[paren],
229 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
230 (IV)PL_regendp[paren],
231 (paren > *PL_reglastparen ? "(no)" : ""));
235 if (*PL_reglastparen + 1 <= PL_regnpar) {
236 PerlIO_printf(Perl_debug_log,
237 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
238 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
242 /* It would seem that the similar code in regtry()
243 * already takes care of this, and in fact it is in
244 * a better location to since this code can #if 0-ed out
245 * but the code in regtry() is needed or otherwise tests
246 * requiring null fields (pat.t#187 and split.t#{13,14}
247 * (as of patchlevel 7877) will fail. Then again,
248 * this code seems to be necessary or otherwise
249 * building DynaLoader will fail:
250 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
252 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
253 if (paren > PL_regsize)
254 PL_regstartp[paren] = -1;
255 PL_regendp[paren] = -1;
262 S_regcp_set_to(pTHX_ I32 ss)
264 I32 tmp = PL_savestack_ix;
266 PL_savestack_ix = ss;
268 PL_savestack_ix = tmp;
272 typedef struct re_cc_state
276 struct re_cc_state *prev;
281 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
283 #define TRYPAREN(paren, n, input) { \
286 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
287 PL_regendp[paren] = input - PL_bostr; \
290 PL_regendp[paren] = -1; \
292 if (regmatch(next)) \
295 PL_regendp[paren] = -1; \
300 * pregexec and friends
304 - pregexec - match a regexp against a string
307 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
308 char *strbeg, I32 minend, SV *screamer, U32 nosave)
309 /* strend: pointer to null at end of string */
310 /* strbeg: real beginning of string */
311 /* minend: end of match must be >=minend after stringarg. */
312 /* nosave: For optimizations. */
315 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
316 nosave ? 0 : REXEC_COPY_STR);
320 S_cache_re(pTHX_ regexp *prog)
322 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
324 PL_regprogram = prog->program;
326 PL_regnpar = prog->nparens;
327 PL_regdata = prog->data;
332 * Need to implement the following flags for reg_anch:
334 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
336 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
337 * INTUIT_AUTORITATIVE_ML
338 * INTUIT_ONCE_NOML - Intuit can match in one location only.
341 * Another flag for this function: SECOND_TIME (so that float substrs
342 * with giant delta may be not rechecked).
345 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
347 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
348 Otherwise, only SvCUR(sv) is used to get strbeg. */
350 /* XXXX We assume that strpos is strbeg unless sv. */
352 /* XXXX Some places assume that there is a fixed substring.
353 An update may be needed if optimizer marks as "INTUITable"
354 RExen without fixed substrings. Similarly, it is assumed that
355 lengths of all the strings are no more than minlen, thus they
356 cannot come from lookahead.
357 (Or minlen should take into account lookahead.) */
359 /* A failure to find a constant substring means that there is no need to make
360 an expensive call to REx engine, thus we celebrate a failure. Similarly,
361 finding a substring too deep into the string means that less calls to
362 regtry() should be needed.
364 REx compiler's optimizer found 4 possible hints:
365 a) Anchored substring;
367 c) Whether we are anchored (beginning-of-line or \G);
368 d) First node (of those at offset 0) which may distingush positions;
369 We use a)b)d) and multiline-part of c), and try to find a position in the
370 string which does not contradict any of them.
373 /* Most of decisions we do here should have been done at compile time.
374 The nodes of the REx which we used for the search should have been
375 deleted from the finite automaton. */
378 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
379 char *strend, U32 flags, re_scream_pos_data *data)
381 register I32 start_shift = 0;
382 /* Should be nonnegative! */
383 register I32 end_shift = 0;
389 register char *other_last = Nullch; /* other substr checked before this */
390 char *check_at = Nullch; /* check substr found at this pos */
392 char *i_strpos = strpos;
393 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
396 if (prog->reganch & ROPT_UTF8) {
397 DEBUG_r(PerlIO_printf(Perl_debug_log,
398 "UTF-8 regex...\n"));
399 PL_reg_flags |= RF_utf8;
403 char *s = PL_reg_match_utf8 ?
404 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
406 int len = PL_reg_match_utf8 ?
407 strlen(s) : strend - strpos;
410 if (PL_reg_match_utf8)
411 DEBUG_r(PerlIO_printf(Perl_debug_log,
412 "UTF-8 target...\n"));
413 PerlIO_printf(Perl_debug_log,
414 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
415 PL_colors[4],PL_colors[5],PL_colors[0],
418 (strlen(prog->precomp) > 60 ? "..." : ""),
420 (int)(len > 60 ? 60 : len),
422 (len > 60 ? "..." : "")
426 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
427 DEBUG_r(PerlIO_printf(Perl_debug_log,
428 "String too short... [re_intuit_start]\n"));
431 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
433 check = prog->check_substr;
434 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
435 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
436 || ( (prog->reganch & ROPT_ANCH_BOL)
437 && !PL_multiline ) ); /* Check after \n? */
440 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
441 | ROPT_IMPLICIT)) /* not a real BOL */
442 /* SvCUR is not set on references: SvRV and SvPVX overlap */
444 && (strpos != strbeg)) {
445 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
448 if (prog->check_offset_min == prog->check_offset_max &&
449 !(prog->reganch & ROPT_CANY_SEEN)) {
450 /* Substring at constant offset from beg-of-str... */
453 s = HOP3c(strpos, prog->check_offset_min, strend);
455 slen = SvCUR(check); /* >= 1 */
457 if ( strend - s > slen || strend - s < slen - 1
458 || (strend - s == slen && strend[-1] != '\n')) {
459 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
462 /* Now should match s[0..slen-2] */
464 if (slen && (*SvPVX(check) != *s
466 && memNE(SvPVX(check), s, slen)))) {
468 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
472 else if (*SvPVX(check) != *s
473 || ((slen = SvCUR(check)) > 1
474 && memNE(SvPVX(check), s, slen)))
476 goto success_at_start;
479 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
481 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
482 end_shift = prog->minlen - start_shift -
483 CHR_SVLEN(check) + (SvTAIL(check) != 0);
485 I32 end = prog->check_offset_max + CHR_SVLEN(check)
486 - (SvTAIL(check) != 0);
487 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
489 if (end_shift < eshift)
493 else { /* Can match at random position */
496 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
497 /* Should be nonnegative! */
498 end_shift = prog->minlen - start_shift -
499 CHR_SVLEN(check) + (SvTAIL(check) != 0);
502 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
504 Perl_croak(aTHX_ "panic: end_shift");
508 /* Find a possible match in the region s..strend by looking for
509 the "check" substring in the region corrected by start/end_shift. */
510 if (flags & REXEC_SCREAM) {
511 I32 p = -1; /* Internal iterator of scream. */
512 I32 *pp = data ? data->scream_pos : &p;
514 if (PL_screamfirst[BmRARE(check)] >= 0
515 || ( BmRARE(check) == '\n'
516 && (BmPREVIOUS(check) == SvCUR(check) - 1)
518 s = screaminstr(sv, check,
519 start_shift + (s - strbeg), end_shift, pp, 0);
523 *data->scream_olds = s;
525 else if (prog->reganch & ROPT_CANY_SEEN)
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
528 check, PL_multiline ? FBMrf_MULTILINE : 0);
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
532 check, PL_multiline ? FBMrf_MULTILINE : 0);
534 /* Update the count-of-usability, remove useless subpatterns,
537 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
538 (s ? "Found" : "Did not find"),
539 ((check == prog->anchored_substr) ? "anchored" : "floating"),
541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
551 /* Finish the diagnostic message */
552 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
557 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
562 if (prog->float_substr && prog->anchored_substr) {
563 /* Take into account the "other" substring. */
564 /* XXXX May be hopelessly wrong for UTF... */
567 if (check == prog->float_substr) {
570 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
573 t = s - prog->check_offset_max;
574 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
575 && (!(prog->reganch & ROPT_UTF8)
576 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
581 t = HOP3c(t, prog->anchored_offset, strend);
582 if (t < other_last) /* These positions already checked */
584 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
587 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
588 /* On end-of-str: see comment below. */
589 s = fbm_instr((unsigned char*)t,
590 HOP3(HOP3(last1, prog->anchored_offset, strend)
591 + SvCUR(prog->anchored_substr),
592 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
593 prog->anchored_substr,
594 PL_multiline ? FBMrf_MULTILINE : 0);
595 DEBUG_r(PerlIO_printf(Perl_debug_log,
596 "%s anchored substr `%s%.*s%s'%s",
597 (s ? "Found" : "Contradicts"),
599 (int)(SvCUR(prog->anchored_substr)
600 - (SvTAIL(prog->anchored_substr)!=0)),
601 SvPVX(prog->anchored_substr),
602 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
604 if (last1 >= last2) {
605 DEBUG_r(PerlIO_printf(Perl_debug_log,
606 ", giving up...\n"));
609 DEBUG_r(PerlIO_printf(Perl_debug_log,
610 ", trying floating at offset %ld...\n",
611 (long)(HOP3c(s1, 1, strend) - i_strpos)));
612 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
613 s = HOP3c(last, 1, strend);
617 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
618 (long)(s - i_strpos)));
619 t = HOP3c(s, -prog->anchored_offset, strbeg);
620 other_last = HOP3c(s, 1, strend);
628 else { /* Take into account the floating substring. */
632 t = HOP3c(s, -start_shift, strbeg);
634 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
635 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
636 last = HOP3c(t, prog->float_max_offset, strend);
637 s = HOP3c(t, prog->float_min_offset, strend);
640 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
641 /* fbm_instr() takes into account exact value of end-of-str
642 if the check is SvTAIL(ed). Since false positives are OK,
643 and end-of-str is not later than strend we are OK. */
644 s = fbm_instr((unsigned char*)s,
645 (unsigned char*)last + SvCUR(prog->float_substr)
646 - (SvTAIL(prog->float_substr)!=0),
647 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
648 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
649 (s ? "Found" : "Contradicts"),
651 (int)(SvCUR(prog->float_substr)
652 - (SvTAIL(prog->float_substr)!=0)),
653 SvPVX(prog->float_substr),
654 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
657 DEBUG_r(PerlIO_printf(Perl_debug_log,
658 ", giving up...\n"));
661 DEBUG_r(PerlIO_printf(Perl_debug_log,
662 ", trying anchored starting at offset %ld...\n",
663 (long)(s1 + 1 - i_strpos)));
665 s = HOP3c(t, 1, strend);
669 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
670 (long)(s - i_strpos)));
671 other_last = s; /* Fix this later. --Hugo */
680 t = s - prog->check_offset_max;
681 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
682 && (!(prog->reganch & ROPT_UTF8)
683 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
685 /* Fixed substring is found far enough so that the match
686 cannot start at strpos. */
688 if (ml_anch && t[-1] != '\n') {
689 /* Eventually fbm_*() should handle this, but often
690 anchored_offset is not 0, so this check will not be wasted. */
691 /* XXXX In the code below we prefer to look for "^" even in
692 presence of anchored substrings. And we search even
693 beyond the found float position. These pessimizations
694 are historical artefacts only. */
696 while (t < strend - prog->minlen) {
698 if (t < check_at - prog->check_offset_min) {
699 if (prog->anchored_substr) {
700 /* Since we moved from the found position,
701 we definitely contradict the found anchored
702 substr. Due to the above check we do not
703 contradict "check" substr.
704 Thus we can arrive here only if check substr
705 is float. Redo checking for "other"=="fixed".
708 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
709 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
710 goto do_other_anchored;
712 /* We don't contradict the found floating substring. */
713 /* XXXX Why not check for STCLASS? */
715 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
716 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
719 /* Position contradicts check-string */
720 /* XXXX probably better to look for check-string
721 than for "\n", so one should lower the limit for t? */
722 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
723 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
724 other_last = strpos = s = t + 1;
729 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
730 PL_colors[0],PL_colors[1]));
734 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
735 PL_colors[0],PL_colors[1]));
739 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
742 /* The found string does not prohibit matching at strpos,
743 - no optimization of calling REx engine can be performed,
744 unless it was an MBOL and we are not after MBOL,
745 or a future STCLASS check will fail this. */
747 /* Even in this situation we may use MBOL flag if strpos is offset
748 wrt the start of the string. */
749 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
750 && (strpos != strbeg) && strpos[-1] != '\n'
751 /* May be due to an implicit anchor of m{.*foo} */
752 && !(prog->reganch & ROPT_IMPLICIT))
757 DEBUG_r( if (ml_anch)
758 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
759 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
762 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
763 && prog->check_substr /* Could be deleted already */
764 && --BmUSEFUL(prog->check_substr) < 0
765 && prog->check_substr == prog->float_substr)
767 /* If flags & SOMETHING - do not do it many times on the same match */
768 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
769 SvREFCNT_dec(prog->check_substr);
770 prog->check_substr = Nullsv; /* disable */
771 prog->float_substr = Nullsv; /* clear */
772 check = Nullsv; /* abort */
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
777 prog->reganch &= ~RE_USE_INTUIT;
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
785 if (prog->regstclass) {
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
794 U8* str = (U8*)STRING(prog->regstclass);
795 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
798 char *endpos = (prog->anchored_substr || ml_anch)
799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
800 : (prog->float_substr
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
804 char *startpos = strbeg;
807 if (prog->reganch & ROPT_UTF8) {
808 PL_regdata = prog->data;
811 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
816 if (endpos == strend) {
817 DEBUG_r( PerlIO_printf(Perl_debug_log,
818 "Could not match STCLASS...\n") );
821 DEBUG_r( PerlIO_printf(Perl_debug_log,
822 "This position contradicts STCLASS...\n") );
823 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
825 /* Contradict one of substrings */
826 if (prog->anchored_substr) {
827 if (prog->anchored_substr == check) {
828 DEBUG_r( what = "anchored" );
830 s = HOP3c(t, 1, strend);
831 if (s + start_shift + end_shift > strend) {
832 /* XXXX Should be taken into account earlier? */
833 DEBUG_r( PerlIO_printf(Perl_debug_log,
834 "Could not match STCLASS...\n") );
839 DEBUG_r( PerlIO_printf(Perl_debug_log,
840 "Looking for %s substr starting at offset %ld...\n",
841 what, (long)(s + start_shift - i_strpos)) );
844 /* Have both, check_string is floating */
845 if (t + start_shift >= check_at) /* Contradicts floating=check */
846 goto retry_floating_check;
847 /* Recheck anchored substring, but not floating... */
851 DEBUG_r( PerlIO_printf(Perl_debug_log,
852 "Looking for anchored substr starting at offset %ld...\n",
853 (long)(other_last - i_strpos)) );
854 goto do_other_anchored;
856 /* Another way we could have checked stclass at the
857 current position only: */
862 DEBUG_r( PerlIO_printf(Perl_debug_log,
863 "Looking for /%s^%s/m starting at offset %ld...\n",
864 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
867 if (!prog->float_substr) /* Could have been deleted */
869 /* Check is floating subtring. */
870 retry_floating_check:
871 t = check_at - start_shift;
872 DEBUG_r( what = "floating" );
873 goto hop_and_restart;
876 DEBUG_r(PerlIO_printf(Perl_debug_log,
877 "By STCLASS: moving %ld --> %ld\n",
878 (long)(t - i_strpos), (long)(s - i_strpos))
882 DEBUG_r(PerlIO_printf(Perl_debug_log,
883 "Does not contradict STCLASS...\n");
888 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
889 PL_colors[4], (check ? "Guessed" : "Giving up"),
890 PL_colors[5], (long)(s - i_strpos)) );
893 fail_finish: /* Substring not found */
894 if (prog->check_substr) /* could be removed already */
895 BmUSEFUL(prog->check_substr) += 5; /* hooray */
897 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
898 PL_colors[4],PL_colors[5]));
902 /* We know what class REx starts with. Try to find this position... */
904 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
906 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
912 register I32 tmp = 1; /* Scratch variable? */
913 register bool do_utf8 = PL_reg_match_utf8;
915 /* We know what class it must start with. */
919 if (reginclass(c, (U8*)s, do_utf8)) {
920 if (tmp && (norun || regtry(prog, s)))
927 s += do_utf8 ? UTF8SKIP(s) : 1;
932 if (tmp && (norun || regtry(prog, s)))
944 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
945 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
947 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
948 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
950 c1 = utf8_to_uvuni(tmpbuf1, 0);
951 c2 = utf8_to_uvuni(tmpbuf2, 0);
962 c2 = PL_fold_locale[c1];
964 e = do_utf8 ? s + ln : strend - ln;
967 e = s; /* Due to minlen logic of intuit() */
969 /* The idea in the EXACTF* cases is to first find the
970 * first character of the EXACTF* node and then, if
971 * necessary, case-insensitively compare the full
972 * text of the node. The c1 and c2 are the first
973 * characters (though in Unicode it gets a bit
974 * more complicated because there are more cases
975 * than just upper and lower: one is really supposed
976 * to use the so-called folding case for case-insensitive
977 * matching (called "loose matching" in Unicode). */
981 U8 tmpbuf [UTF8_MAXLEN+1];
982 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
987 c = utf8_to_uvchr((U8*)s, &len);
990 ibcmp_utf8(s, (char **)0, 0, do_utf8,
991 m, (char **)0, ln, UTF))
992 && (norun || regtry(prog, s)) )
995 uvchr_to_utf8(tmpbuf, c);
996 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
998 && (f == c1 || f == c2)
1000 !ibcmp_utf8((char *) foldbuf,
1001 (char **)0, foldlen, do_utf8,
1003 (char **)0, ln, UTF))
1004 && (norun || regtry(prog, s)) )
1012 c = utf8_to_uvchr((U8*)s, &len);
1014 /* Handle some of the three Greek sigmas cases.
1015 * Note that not all the possible combinations
1016 * are handled here: some of them are handled
1017 * handled by the standard folding rules, and
1018 * some of them (the character class or ANYOF
1019 * cases) are handled during compiletime in
1020 * regexec.c:S_regclass(). */
1021 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1022 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1023 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1025 if ( (c == c1 || c == c2)
1027 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1028 m, (char **)0, ln, UTF))
1029 && (norun || regtry(prog, s)) )
1032 uvchr_to_utf8(tmpbuf, c);
1033 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1035 && (f == c1 || f == c2)
1036 && (ln == foldlen ||
1037 !ibcmp_utf8((char *)foldbuf,
1038 (char **)0, foldlen, do_utf8,
1040 (char **)0, ln, UTF))
1041 && (norun || regtry(prog, s)) )
1052 && (ln == 1 || !(OP(c) == EXACTF
1054 : ibcmp_locale(s, m, ln)))
1055 && (norun || regtry(prog, s)) )
1061 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1062 && (ln == 1 || !(OP(c) == EXACTF
1064 : ibcmp_locale(s, m, ln)))
1065 && (norun || regtry(prog, s)) )
1072 PL_reg_flags |= RF_tainted;
1079 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1082 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1084 tmp = ((OP(c) == BOUND ?
1085 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1086 LOAD_UTF8_CHARCLASS(alnum,"a");
1087 while (s < strend) {
1088 if (tmp == !(OP(c) == BOUND ?
1089 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1090 isALNUM_LC_utf8((U8*)s)))
1093 if ((norun || regtry(prog, s)))
1100 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1101 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1102 while (s < strend) {
1104 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1106 if ((norun || regtry(prog, s)))
1112 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1116 PL_reg_flags |= RF_tainted;
1123 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1126 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1128 tmp = ((OP(c) == NBOUND ?
1129 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1130 LOAD_UTF8_CHARCLASS(alnum,"a");
1131 while (s < strend) {
1132 if (tmp == !(OP(c) == NBOUND ?
1133 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1134 isALNUM_LC_utf8((U8*)s)))
1136 else if ((norun || regtry(prog, s)))
1142 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1143 tmp = ((OP(c) == NBOUND ?
1144 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1145 while (s < strend) {
1147 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1149 else if ((norun || regtry(prog, s)))
1154 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1159 LOAD_UTF8_CHARCLASS(alnum,"a");
1160 while (s < strend) {
1161 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1162 if (tmp && (norun || regtry(prog, s)))
1173 while (s < strend) {
1175 if (tmp && (norun || regtry(prog, s)))
1187 PL_reg_flags |= RF_tainted;
1189 while (s < strend) {
1190 if (isALNUM_LC_utf8((U8*)s)) {
1191 if (tmp && (norun || regtry(prog, s)))
1202 while (s < strend) {
1203 if (isALNUM_LC(*s)) {
1204 if (tmp && (norun || regtry(prog, s)))
1217 LOAD_UTF8_CHARCLASS(alnum,"a");
1218 while (s < strend) {
1219 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1220 if (tmp && (norun || regtry(prog, s)))
1231 while (s < strend) {
1233 if (tmp && (norun || regtry(prog, s)))
1245 PL_reg_flags |= RF_tainted;
1247 while (s < strend) {
1248 if (!isALNUM_LC_utf8((U8*)s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1260 while (s < strend) {
1261 if (!isALNUM_LC(*s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1275 LOAD_UTF8_CHARCLASS(space," ");
1276 while (s < strend) {
1277 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1278 if (tmp && (norun || regtry(prog, s)))
1289 while (s < strend) {
1291 if (tmp && (norun || regtry(prog, s)))
1303 PL_reg_flags |= RF_tainted;
1305 while (s < strend) {
1306 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1318 while (s < strend) {
1319 if (isSPACE_LC(*s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1333 LOAD_UTF8_CHARCLASS(space," ");
1334 while (s < strend) {
1335 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1336 if (tmp && (norun || regtry(prog, s)))
1347 while (s < strend) {
1349 if (tmp && (norun || regtry(prog, s)))
1361 PL_reg_flags |= RF_tainted;
1363 while (s < strend) {
1364 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1365 if (tmp && (norun || regtry(prog, s)))
1376 while (s < strend) {
1377 if (!isSPACE_LC(*s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1391 LOAD_UTF8_CHARCLASS(digit,"0");
1392 while (s < strend) {
1393 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1394 if (tmp && (norun || regtry(prog, s)))
1405 while (s < strend) {
1407 if (tmp && (norun || regtry(prog, s)))
1419 PL_reg_flags |= RF_tainted;
1421 while (s < strend) {
1422 if (isDIGIT_LC_utf8((U8*)s)) {
1423 if (tmp && (norun || regtry(prog, s)))
1434 while (s < strend) {
1435 if (isDIGIT_LC(*s)) {
1436 if (tmp && (norun || regtry(prog, s)))
1449 LOAD_UTF8_CHARCLASS(digit,"0");
1450 while (s < strend) {
1451 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1452 if (tmp && (norun || regtry(prog, s)))
1463 while (s < strend) {
1465 if (tmp && (norun || regtry(prog, s)))
1477 PL_reg_flags |= RF_tainted;
1479 while (s < strend) {
1480 if (!isDIGIT_LC_utf8((U8*)s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1492 while (s < strend) {
1493 if (!isDIGIT_LC(*s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1506 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1515 - regexec_flags - match a regexp against a string
1518 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1519 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1520 /* strend: pointer to null at end of string */
1521 /* strbeg: real beginning of string */
1522 /* minend: end of match must be >=minend after stringarg. */
1523 /* data: May be used for some additional optimizations. */
1524 /* nosave: For optimizations. */
1527 register regnode *c;
1528 register char *startpos = stringarg;
1529 I32 minlen; /* must match at least this many chars */
1530 I32 dontbother = 0; /* how many characters not to try at end */
1531 /* I32 start_shift = 0; */ /* Offset of the start to find
1532 constant substr. */ /* CC */
1533 I32 end_shift = 0; /* Same for the end. */ /* CC */
1534 I32 scream_pos = -1; /* Internal iterator of scream. */
1536 SV* oreplsv = GvSV(PL_replgv);
1537 bool do_utf8 = DO_UTF8(sv);
1539 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1540 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1547 PL_regnarrate = DEBUG_r_TEST;
1550 /* Be paranoid... */
1551 if (prog == NULL || startpos == NULL) {
1552 Perl_croak(aTHX_ "NULL regexp parameter");
1556 minlen = prog->minlen;
1557 if (strend - startpos < minlen &&
1558 !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */
1560 DEBUG_r(PerlIO_printf(Perl_debug_log,
1561 "String too short [regexec_flags]...\n"));
1565 /* Check validity of program. */
1566 if (UCHARAT(prog->program) != REG_MAGIC) {
1567 Perl_croak(aTHX_ "corrupted regexp program");
1571 PL_reg_eval_set = 0;
1574 if (prog->reganch & ROPT_UTF8)
1575 PL_reg_flags |= RF_utf8;
1577 /* Mark beginning of line for ^ and lookbehind. */
1578 PL_regbol = startpos;
1582 /* Mark end of line for $ (and such) */
1585 /* see how far we have to get to not match where we matched before */
1586 PL_regtill = startpos+minend;
1588 /* We start without call_cc context. */
1591 /* If there is a "must appear" string, look for it. */
1594 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1597 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1598 PL_reg_ganch = startpos;
1599 else if (sv && SvTYPE(sv) >= SVt_PVMG
1601 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1602 && mg->mg_len >= 0) {
1603 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1604 if (prog->reganch & ROPT_ANCH_GPOS) {
1605 if (s > PL_reg_ganch)
1610 else /* pos() not defined */
1611 PL_reg_ganch = strbeg;
1614 if (do_utf8 == (UTF!=0) &&
1615 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1616 re_scream_pos_data d;
1618 d.scream_olds = &scream_olds;
1619 d.scream_pos = &scream_pos;
1620 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1622 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1623 goto phooey; /* not present */
1629 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1630 UNI_DISPLAY_REGEX) :
1632 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1633 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1634 UNI_DISPLAY_REGEX) : startpos;
1635 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1638 PerlIO_printf(Perl_debug_log,
1639 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1640 PL_colors[4],PL_colors[5],PL_colors[0],
1643 len0 > 60 ? "..." : "",
1645 (int)(len1 > 60 ? 60 : len1),
1647 (len1 > 60 ? "..." : "")
1651 /* Simplest case: anchored match need be tried only once. */
1652 /* [unless only anchor is BOL and multiline is set] */
1653 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1654 if (s == startpos && regtry(prog, startpos))
1656 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1657 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1662 dontbother = minlen - 1;
1663 end = HOP3c(strend, -dontbother, strbeg) - 1;
1664 /* for multiline we only have to try after newlines */
1665 if (prog->check_substr) {
1669 if (regtry(prog, s))
1674 if (prog->reganch & RE_USE_INTUIT) {
1675 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1686 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1687 if (regtry(prog, s))
1694 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1695 if (regtry(prog, PL_reg_ganch))
1700 /* Messy cases: unanchored match. */
1701 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1702 /* we have /x+whatever/ */
1703 /* it must be a one character string (XXXX Except UTF?) */
1704 char ch = SvPVX(prog->anchored_substr)[0];
1710 while (s < strend) {
1712 DEBUG_r( did_match = 1 );
1713 if (regtry(prog, s)) goto got_it;
1715 while (s < strend && *s == ch)
1722 while (s < strend) {
1724 DEBUG_r( did_match = 1 );
1725 if (regtry(prog, s)) goto got_it;
1727 while (s < strend && *s == ch)
1733 DEBUG_r(if (!did_match)
1734 PerlIO_printf(Perl_debug_log,
1735 "Did not find anchored character...\n")
1739 else if (do_utf8 == (UTF!=0) &&
1740 (prog->anchored_substr != Nullsv
1741 || (prog->float_substr != Nullsv
1742 && prog->float_max_offset < strend - s))) {
1743 SV *must = prog->anchored_substr
1744 ? prog->anchored_substr : prog->float_substr;
1746 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1748 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1749 char *last = HOP3c(strend, /* Cannot start after this */
1750 -(I32)(CHR_SVLEN(must)
1751 - (SvTAIL(must) != 0) + back_min), strbeg);
1752 char *last1; /* Last position checked before */
1758 last1 = HOPc(s, -1);
1760 last1 = s - 1; /* bogus */
1762 /* XXXX check_substr already used to find `s', can optimize if
1763 check_substr==must. */
1765 dontbother = end_shift;
1766 strend = HOPc(strend, -dontbother);
1767 while ( (s <= last) &&
1768 ((flags & REXEC_SCREAM)
1769 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1770 end_shift, &scream_pos, 0))
1771 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1772 (unsigned char*)strend, must,
1773 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1774 DEBUG_r( did_match = 1 );
1775 if (HOPc(s, -back_max) > last1) {
1776 last1 = HOPc(s, -back_min);
1777 s = HOPc(s, -back_max);
1780 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1782 last1 = HOPc(s, -back_min);
1786 while (s <= last1) {
1787 if (regtry(prog, s))
1793 while (s <= last1) {
1794 if (regtry(prog, s))
1800 DEBUG_r(if (!did_match)
1801 PerlIO_printf(Perl_debug_log,
1802 "Did not find %s substr `%s%.*s%s'%s...\n",
1803 ((must == prog->anchored_substr)
1804 ? "anchored" : "floating"),
1806 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1808 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1812 else if ((c = prog->regstclass)) {
1813 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1814 /* don't bother with what can't match */
1815 strend = HOPc(strend, -(minlen - 1));
1817 SV *prop = sv_newmortal();
1825 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1826 UNI_DISPLAY_REGEX) :
1828 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1830 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1831 len1 = UTF ? SvCUR(dsv1) : strend - s;
1832 PerlIO_printf(Perl_debug_log,
1833 "Matching stclass `%*.*s' against `%*.*s'\n",
1837 if (find_byclass(prog, c, s, strend, startpos, 0))
1839 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1843 if (prog->float_substr != Nullsv) { /* Trim the end. */
1846 if (flags & REXEC_SCREAM) {
1847 last = screaminstr(sv, prog->float_substr, s - strbeg,
1848 end_shift, &scream_pos, 1); /* last one */
1850 last = scream_olds; /* Only one occurrence. */
1854 char *little = SvPV(prog->float_substr, len);
1856 if (SvTAIL(prog->float_substr)) {
1857 if (memEQ(strend - len + 1, little, len - 1))
1858 last = strend - len + 1;
1859 else if (!PL_multiline)
1860 last = memEQ(strend - len, little, len)
1861 ? strend - len : Nullch;
1867 last = rninstr(s, strend, little, little + len);
1869 last = strend; /* matching `$' */
1873 DEBUG_r(PerlIO_printf(Perl_debug_log,
1874 "%sCan't trim the tail, match fails (should not happen)%s\n",
1875 PL_colors[4],PL_colors[5]));
1876 goto phooey; /* Should not happen! */
1878 dontbother = strend - last + prog->float_min_offset;
1880 if (minlen && (dontbother < minlen))
1881 dontbother = minlen - 1;
1882 strend -= dontbother; /* this one's always in bytes! */
1883 /* We don't know much -- general case. */
1886 if (regtry(prog, s))
1895 if (regtry(prog, s))
1897 } while (s++ < strend);
1905 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1907 if (PL_reg_eval_set) {
1908 /* Preserve the current value of $^R */
1909 if (oreplsv != GvSV(PL_replgv))
1910 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1911 restored, the value remains
1913 restore_pos(aTHX_ 0);
1916 /* make sure $`, $&, $', and $digit will work later */
1917 if ( !(flags & REXEC_NOT_FIRST) ) {
1918 if (RX_MATCH_COPIED(prog)) {
1919 Safefree(prog->subbeg);
1920 RX_MATCH_COPIED_off(prog);
1922 if (flags & REXEC_COPY_STR) {
1923 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1925 s = savepvn(strbeg, i);
1928 RX_MATCH_COPIED_on(prog);
1931 prog->subbeg = strbeg;
1932 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1939 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1940 PL_colors[4],PL_colors[5]));
1941 if (PL_reg_eval_set)
1942 restore_pos(aTHX_ 0);
1947 - regtry - try match at specific point
1949 STATIC I32 /* 0 failure, 1 success */
1950 S_regtry(pTHX_ regexp *prog, char *startpos)
1958 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1960 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1963 PL_reg_eval_set = RS_init;
1965 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1966 (IV)(PL_stack_sp - PL_stack_base));
1968 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1969 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1970 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1972 /* Apparently this is not needed, judging by wantarray. */
1973 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1974 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1977 /* Make $_ available to executed code. */
1978 if (PL_reg_sv != DEFSV) {
1979 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
1984 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1985 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1986 /* prepare for quick setting of pos */
1987 sv_magic(PL_reg_sv, (SV*)0,
1988 PERL_MAGIC_regex_global, Nullch, 0);
1989 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1993 PL_reg_oldpos = mg->mg_len;
1994 SAVEDESTRUCTOR_X(restore_pos, 0);
1996 if (!PL_reg_curpm) {
1997 Newz(22,PL_reg_curpm, 1, PMOP);
2000 SV* repointer = newSViv(0);
2001 /* so we know which PL_regex_padav element is PL_reg_curpm */
2002 SvFLAGS(repointer) |= SVf_BREAK;
2003 av_push(PL_regex_padav,repointer);
2004 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2005 PL_regex_pad = AvARRAY(PL_regex_padav);
2009 PM_SETRE(PL_reg_curpm, prog);
2010 PL_reg_oldcurpm = PL_curpm;
2011 PL_curpm = PL_reg_curpm;
2012 if (RX_MATCH_COPIED(prog)) {
2013 /* Here is a serious problem: we cannot rewrite subbeg,
2014 since it may be needed if this match fails. Thus
2015 $` inside (?{}) could fail... */
2016 PL_reg_oldsaved = prog->subbeg;
2017 PL_reg_oldsavedlen = prog->sublen;
2018 RX_MATCH_COPIED_off(prog);
2021 PL_reg_oldsaved = Nullch;
2022 prog->subbeg = PL_bostr;
2023 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2025 prog->startp[0] = startpos - PL_bostr;
2026 PL_reginput = startpos;
2027 PL_regstartp = prog->startp;
2028 PL_regendp = prog->endp;
2029 PL_reglastparen = &prog->lastparen;
2030 PL_reglastcloseparen = &prog->lastcloseparen;
2031 prog->lastparen = 0;
2033 DEBUG_r(PL_reg_starttry = startpos);
2034 if (PL_reg_start_tmpl <= prog->nparens) {
2035 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2036 if(PL_reg_start_tmp)
2037 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2039 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2043 sv_setpvn(PERL_DEBUG_PAD(0), "", 0);
2044 sv_setpvn(PERL_DEBUG_PAD(1), "", 0);
2045 sv_setpvn(PERL_DEBUG_PAD(2), "", 0);
2048 /* XXXX What this code is doing here?!!! There should be no need
2049 to do this again and again, PL_reglastparen should take care of
2052 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2053 * Actually, the code in regcppop() (which Ilya may be meaning by
2054 * PL_reglastparen), is not needed at all by the test suite
2055 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2056 * enough, for building DynaLoader, or otherwise this
2057 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2058 * will happen. Meanwhile, this code *is* needed for the
2059 * above-mentioned test suite tests to succeed. The common theme
2060 * on those tests seems to be returning null fields from matches.
2065 if (prog->nparens) {
2066 for (i = prog->nparens; i > *PL_reglastparen; i--) {
2073 if (regmatch(prog->program + 1)) {
2074 prog->endp[0] = PL_reginput - PL_bostr;
2077 REGCP_UNWIND(lastcp);
2081 #define RE_UNWIND_BRANCH 1
2082 #define RE_UNWIND_BRANCHJ 2
2086 typedef struct { /* XX: makes sense to enlarge it... */
2090 } re_unwind_generic_t;
2103 } re_unwind_branch_t;
2105 typedef union re_unwind_t {
2107 re_unwind_generic_t generic;
2108 re_unwind_branch_t branch;
2111 #define sayYES goto yes
2112 #define sayNO goto no
2113 #define sayYES_FINAL goto yes_final
2114 #define sayYES_LOUD goto yes_loud
2115 #define sayNO_FINAL goto no_final
2116 #define sayNO_SILENT goto do_no
2117 #define saySAME(x) if (x) goto yes; else goto no
2119 #define REPORT_CODE_OFF 24
2122 - regmatch - main matching routine
2124 * Conceptually the strategy is simple: check to see whether the current
2125 * node matches, call self recursively to see whether the rest matches,
2126 * and then act accordingly. In practice we make some effort to avoid
2127 * recursion, in particular by going through "ordinary" nodes (that don't
2128 * need to know whether the rest of the match failed) by a loop instead of
2131 /* [lwall] I've hoisted the register declarations to the outer block in order to
2132 * maybe save a little bit of pushing and popping on the stack. It also takes
2133 * advantage of machines that use a register save mask on subroutine entry.
2135 STATIC I32 /* 0 failure, 1 success */
2136 S_regmatch(pTHX_ regnode *prog)
2138 register regnode *scan; /* Current node. */
2139 regnode *next; /* Next node. */
2140 regnode *inner; /* Next node in internal branch. */
2141 register I32 nextchr; /* renamed nextchr - nextchar colides with
2142 function of same name */
2143 register I32 n; /* no or next */
2144 register I32 ln = 0; /* len or last */
2145 register char *s = Nullch; /* operand or save */
2146 register char *locinput = PL_reginput;
2147 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2148 int minmod = 0, sw = 0, logical = 0;
2151 I32 firstcp = PL_savestack_ix;
2153 register bool do_utf8 = PL_reg_match_utf8;
2155 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2156 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2157 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2164 /* Note that nextchr is a byte even in UTF */
2165 nextchr = UCHARAT(locinput);
2167 while (scan != NULL) {
2170 SV *prop = sv_newmortal();
2171 int docolor = *PL_colors[0];
2172 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2173 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2174 /* The part of the string before starttry has one color
2175 (pref0_len chars), between starttry and current
2176 position another one (pref_len - pref0_len chars),
2177 after the current position the third one.
2178 We assume that pref0_len <= pref_len, otherwise we
2179 decrease pref0_len. */
2180 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2181 ? (5 + taill) - l : locinput - PL_bostr;
2184 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2186 pref0_len = pref_len - (locinput - PL_reg_starttry);
2187 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2188 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2189 ? (5 + taill) - pref_len : PL_regeol - locinput);
2190 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2194 if (pref0_len > pref_len)
2195 pref0_len = pref_len;
2196 regprop(prop, scan);
2200 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2201 pref0_len, 60, UNI_DISPLAY_REGEX) :
2202 locinput - pref_len;
2203 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2204 char *s1 = do_utf8 ?
2205 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2206 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2207 locinput - pref_len + pref0_len;
2208 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2209 char *s2 = do_utf8 ?
2210 pv_uni_display(dsv2, (U8*)locinput,
2211 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2213 int len2 = do_utf8 ? strlen(s2) : l;
2214 PerlIO_printf(Perl_debug_log,
2215 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2216 (IV)(locinput - PL_bostr),
2223 (docolor ? "" : "> <"),
2227 15 - l - pref_len + 1,
2229 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2234 next = scan + NEXT_OFF(scan);
2240 if (locinput == PL_bostr || (PL_multiline &&
2241 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2243 /* regtill = regbol; */
2248 if (locinput == PL_bostr ||
2249 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2255 if (locinput == PL_bostr)
2259 if (locinput == PL_reg_ganch)
2269 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2274 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2276 if (PL_regeol - locinput > 1)
2280 if (PL_regeol != locinput)
2284 if (!nextchr && locinput >= PL_regeol)
2287 locinput += PL_utf8skip[nextchr];
2288 if (locinput > PL_regeol)
2290 nextchr = UCHARAT(locinput);
2293 nextchr = UCHARAT(++locinput);
2296 if (!nextchr && locinput >= PL_regeol)
2298 nextchr = UCHARAT(++locinput);
2301 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2304 locinput += PL_utf8skip[nextchr];
2305 if (locinput > PL_regeol)
2307 nextchr = UCHARAT(locinput);
2310 nextchr = UCHARAT(++locinput);
2315 if (do_utf8 != (UTF!=0)) {
2316 /* The target and the pattern have differing utf8ness. */
2322 /* The target is utf8, the pattern is not utf8. */
2326 if (NATIVE_TO_UNI(*(U8*)s) !=
2327 utf8_to_uvchr((U8*)l, &ulen))
2334 /* The target is not utf8, the pattern is utf8. */
2338 if (NATIVE_TO_UNI(*((U8*)l)) !=
2339 utf8_to_uvchr((U8*)s, &ulen))
2346 nextchr = UCHARAT(locinput);
2349 /* The target and the pattern have the same utf8ness. */
2350 /* Inline the first character, for speed. */
2351 if (UCHARAT(s) != nextchr)
2353 if (PL_regeol - locinput < ln)
2355 if (ln > 1 && memNE(s, locinput, ln))
2358 nextchr = UCHARAT(locinput);
2361 PL_reg_flags |= RF_tainted;
2367 if (do_utf8 || UTF) {
2368 /* Either target or the pattern are utf8. */
2370 char *e = PL_regeol;
2372 if (ibcmp_utf8(s, 0, ln, do_utf8,
2376 nextchr = UCHARAT(locinput);
2380 /* Neither the target and the pattern are utf8. */
2382 /* Inline the first character, for speed. */
2383 if (UCHARAT(s) != nextchr &&
2384 UCHARAT(s) != ((OP(scan) == EXACTF)
2385 ? PL_fold : PL_fold_locale)[nextchr])
2387 if (PL_regeol - locinput < ln)
2389 if (ln > 1 && (OP(scan) == EXACTF
2390 ? ibcmp(s, locinput, ln)
2391 : ibcmp_locale(s, locinput, ln)))
2394 nextchr = UCHARAT(locinput);
2398 STRLEN inclasslen = PL_regeol - locinput;
2400 if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
2402 if (locinput >= PL_regeol)
2404 locinput += inclasslen;
2405 nextchr = UCHARAT(locinput);
2409 nextchr = UCHARAT(locinput);
2410 if (!reginclass(scan, (U8*)locinput, do_utf8))
2412 if (!nextchr && locinput >= PL_regeol)
2414 nextchr = UCHARAT(++locinput);
2418 PL_reg_flags |= RF_tainted;
2424 LOAD_UTF8_CHARCLASS(alnum,"a");
2425 if (!(OP(scan) == ALNUM
2426 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2427 : isALNUM_LC_utf8((U8*)locinput)))
2431 locinput += PL_utf8skip[nextchr];
2432 nextchr = UCHARAT(locinput);
2435 if (!(OP(scan) == ALNUM
2436 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2438 nextchr = UCHARAT(++locinput);
2441 PL_reg_flags |= RF_tainted;
2444 if (!nextchr && locinput >= PL_regeol)
2447 LOAD_UTF8_CHARCLASS(alnum,"a");
2448 if (OP(scan) == NALNUM
2449 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2450 : isALNUM_LC_utf8((U8*)locinput))
2454 locinput += PL_utf8skip[nextchr];
2455 nextchr = UCHARAT(locinput);
2458 if (OP(scan) == NALNUM
2459 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2461 nextchr = UCHARAT(++locinput);
2465 PL_reg_flags |= RF_tainted;
2469 /* was last char in word? */
2471 if (locinput == PL_bostr)
2474 U8 *r = reghop((U8*)locinput, -1);
2476 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2478 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2479 ln = isALNUM_uni(ln);
2480 LOAD_UTF8_CHARCLASS(alnum,"a");
2481 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2484 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2485 n = isALNUM_LC_utf8((U8*)locinput);
2489 ln = (locinput != PL_bostr) ?
2490 UCHARAT(locinput - 1) : '\n';
2491 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2493 n = isALNUM(nextchr);
2496 ln = isALNUM_LC(ln);
2497 n = isALNUM_LC(nextchr);
2500 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2501 OP(scan) == BOUNDL))
2505 PL_reg_flags |= RF_tainted;
2511 if (UTF8_IS_CONTINUED(nextchr)) {
2512 LOAD_UTF8_CHARCLASS(space," ");
2513 if (!(OP(scan) == SPACE
2514 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2515 : isSPACE_LC_utf8((U8*)locinput)))
2519 locinput += PL_utf8skip[nextchr];
2520 nextchr = UCHARAT(locinput);
2523 if (!(OP(scan) == SPACE
2524 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2526 nextchr = UCHARAT(++locinput);
2529 if (!(OP(scan) == SPACE
2530 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2532 nextchr = UCHARAT(++locinput);
2536 PL_reg_flags |= RF_tainted;
2539 if (!nextchr && locinput >= PL_regeol)
2542 LOAD_UTF8_CHARCLASS(space," ");
2543 if (OP(scan) == NSPACE
2544 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2545 : isSPACE_LC_utf8((U8*)locinput))
2549 locinput += PL_utf8skip[nextchr];
2550 nextchr = UCHARAT(locinput);
2553 if (OP(scan) == NSPACE
2554 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2556 nextchr = UCHARAT(++locinput);
2559 PL_reg_flags |= RF_tainted;
2565 LOAD_UTF8_CHARCLASS(digit,"0");
2566 if (!(OP(scan) == DIGIT
2567 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2568 : isDIGIT_LC_utf8((U8*)locinput)))
2572 locinput += PL_utf8skip[nextchr];
2573 nextchr = UCHARAT(locinput);
2576 if (!(OP(scan) == DIGIT
2577 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2579 nextchr = UCHARAT(++locinput);
2582 PL_reg_flags |= RF_tainted;
2585 if (!nextchr && locinput >= PL_regeol)
2588 LOAD_UTF8_CHARCLASS(digit,"0");
2589 if (OP(scan) == NDIGIT
2590 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2591 : isDIGIT_LC_utf8((U8*)locinput))
2595 locinput += PL_utf8skip[nextchr];
2596 nextchr = UCHARAT(locinput);
2599 if (OP(scan) == NDIGIT
2600 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2602 nextchr = UCHARAT(++locinput);
2605 if (locinput >= PL_regeol)
2608 LOAD_UTF8_CHARCLASS(mark,"~");
2609 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2611 locinput += PL_utf8skip[nextchr];
2612 while (locinput < PL_regeol &&
2613 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2614 locinput += UTF8SKIP(locinput);
2615 if (locinput > PL_regeol)
2620 nextchr = UCHARAT(locinput);
2623 PL_reg_flags |= RF_tainted;
2627 n = ARG(scan); /* which paren pair */
2628 ln = PL_regstartp[n];
2629 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2630 if (*PL_reglastparen < n || ln == -1)
2631 sayNO; /* Do not match unless seen CLOSEn. */
2632 if (ln == PL_regendp[n])
2636 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2638 char *e = PL_bostr + PL_regendp[n];
2640 * Note that we can't do the "other character" lookup trick as
2641 * in the 8-bit case (no pun intended) because in Unicode we
2642 * have to map both upper and title case to lower case.
2644 if (OP(scan) == REFF) {
2645 STRLEN ulen1, ulen2;
2646 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2647 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
2651 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2652 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2653 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2660 nextchr = UCHARAT(locinput);
2664 /* Inline the first character, for speed. */
2665 if (UCHARAT(s) != nextchr &&
2667 (UCHARAT(s) != ((OP(scan) == REFF
2668 ? PL_fold : PL_fold_locale)[nextchr]))))
2670 ln = PL_regendp[n] - ln;
2671 if (locinput + ln > PL_regeol)
2673 if (ln > 1 && (OP(scan) == REF
2674 ? memNE(s, locinput, ln)
2676 ? ibcmp(s, locinput, ln)
2677 : ibcmp_locale(s, locinput, ln))))
2680 nextchr = UCHARAT(locinput);
2691 OP_4tree *oop = PL_op;
2692 COP *ocurcop = PL_curcop;
2693 SV **ocurpad = PL_curpad;
2697 PL_op = (OP_4tree*)PL_regdata->data[n];
2698 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2699 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2700 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2704 CALLRUNOPS(aTHX); /* Scalar context. */
2707 ret = Nullsv; /* protect against empty (?{}) blocks. */
2715 PL_curpad = ocurpad;
2716 PL_curcop = ocurcop;
2718 if (logical == 2) { /* Postponed subexpression. */
2720 MAGIC *mg = Null(MAGIC*);
2722 CHECKPOINT cp, lastcp;
2724 if(SvROK(ret) || SvRMAGICAL(ret)) {
2725 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2728 mg = mg_find(sv, PERL_MAGIC_qr);
2731 re = (regexp *)mg->mg_obj;
2732 (void)ReREFCNT_inc(re);
2736 char *t = SvPV(ret, len);
2738 char *oprecomp = PL_regprecomp;
2739 I32 osize = PL_regsize;
2740 I32 onpar = PL_regnpar;
2743 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2745 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2746 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2748 PL_regprecomp = oprecomp;
2753 PerlIO_printf(Perl_debug_log,
2754 "Entering embedded `%s%.60s%s%s'\n",
2758 (strlen(re->precomp) > 60 ? "..." : ""))
2761 state.prev = PL_reg_call_cc;
2762 state.cc = PL_regcc;
2763 state.re = PL_reg_re;
2767 cp = regcppush(0); /* Save *all* the positions. */
2770 state.ss = PL_savestack_ix;
2771 *PL_reglastparen = 0;
2772 *PL_reglastcloseparen = 0;
2773 PL_reg_call_cc = &state;
2774 PL_reginput = locinput;
2776 /* XXXX This is too dramatic a measure... */
2779 if (regmatch(re->program + 1)) {
2780 /* Even though we succeeded, we need to restore
2781 global variables, since we may be wrapped inside
2782 SUSPEND, thus the match may be not finished yet. */
2784 /* XXXX Do this only if SUSPENDed? */
2785 PL_reg_call_cc = state.prev;
2786 PL_regcc = state.cc;
2787 PL_reg_re = state.re;
2788 cache_re(PL_reg_re);
2790 /* XXXX This is too dramatic a measure... */
2793 /* These are needed even if not SUSPEND. */
2799 REGCP_UNWIND(lastcp);
2801 PL_reg_call_cc = state.prev;
2802 PL_regcc = state.cc;
2803 PL_reg_re = state.re;
2804 cache_re(PL_reg_re);
2806 /* XXXX This is too dramatic a measure... */
2816 sv_setsv(save_scalar(PL_replgv), ret);
2820 n = ARG(scan); /* which paren pair */
2821 PL_reg_start_tmp[n] = locinput;
2826 n = ARG(scan); /* which paren pair */
2827 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2828 PL_regendp[n] = locinput - PL_bostr;
2829 if (n > *PL_reglastparen)
2830 *PL_reglastparen = n;
2831 *PL_reglastcloseparen = n;
2834 n = ARG(scan); /* which paren pair */
2835 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2838 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2840 next = NEXTOPER(NEXTOPER(scan));
2842 next = scan + ARG(scan);
2843 if (OP(next) == IFTHEN) /* Fake one. */
2844 next = NEXTOPER(NEXTOPER(next));
2848 logical = scan->flags;
2850 /*******************************************************************
2851 PL_regcc contains infoblock about the innermost (...)* loop, and
2852 a pointer to the next outer infoblock.
2854 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2856 1) After matching X, regnode for CURLYX is processed;
2858 2) This regnode creates infoblock on the stack, and calls
2859 regmatch() recursively with the starting point at WHILEM node;
2861 3) Each hit of WHILEM node tries to match A and Z (in the order
2862 depending on the current iteration, min/max of {min,max} and
2863 greediness). The information about where are nodes for "A"
2864 and "Z" is read from the infoblock, as is info on how many times "A"
2865 was already matched, and greediness.
2867 4) After A matches, the same WHILEM node is hit again.
2869 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2870 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2871 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2872 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2873 of the external loop.
2875 Currently present infoblocks form a tree with a stem formed by PL_curcc
2876 and whatever it mentions via ->next, and additional attached trees
2877 corresponding to temporarily unset infoblocks as in "5" above.
2879 In the following picture infoblocks for outer loop of
2880 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2881 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2882 infoblocks are drawn below the "reset" infoblock.
2884 In fact in the picture below we do not show failed matches for Z and T
2885 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2886 more obvious *why* one needs to *temporary* unset infoblocks.]
2888 Matched REx position InfoBlocks Comment
2892 Y A)*?Z)*?T x <- O <- I
2893 YA )*?Z)*?T x <- O <- I
2894 YA A)*?Z)*?T x <- O <- I
2895 YAA )*?Z)*?T x <- O <- I
2896 YAA Z)*?T x <- O # Temporary unset I
2899 YAAZ Y(A)*?Z)*?T x <- O
2902 YAAZY (A)*?Z)*?T x <- O
2905 YAAZY A)*?Z)*?T x <- O <- I
2908 YAAZYA )*?Z)*?T x <- O <- I
2911 YAAZYA Z)*?T x <- O # Temporary unset I
2917 YAAZYAZ T x # Temporary unset O
2924 *******************************************************************/
2927 CHECKPOINT cp = PL_savestack_ix;
2928 /* No need to save/restore up to this paren */
2929 I32 parenfloor = scan->flags;
2931 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2933 cc.oldcc = PL_regcc;
2935 /* XXXX Probably it is better to teach regpush to support
2936 parenfloor > PL_regsize... */
2937 if (parenfloor > *PL_reglastparen)
2938 parenfloor = *PL_reglastparen; /* Pessimization... */
2939 cc.parenfloor = parenfloor;
2941 cc.min = ARG1(scan);
2942 cc.max = ARG2(scan);
2943 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2947 PL_reginput = locinput;
2948 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2950 PL_regcc = cc.oldcc;
2956 * This is really hard to understand, because after we match
2957 * what we're trying to match, we must make sure the rest of
2958 * the REx is going to match for sure, and to do that we have
2959 * to go back UP the parse tree by recursing ever deeper. And
2960 * if it fails, we have to reset our parent's current state
2961 * that we can try again after backing off.
2964 CHECKPOINT cp, lastcp;
2965 CURCUR* cc = PL_regcc;
2966 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2968 n = cc->cur + 1; /* how many we know we matched */
2969 PL_reginput = locinput;
2972 PerlIO_printf(Perl_debug_log,
2973 "%*s %ld out of %ld..%ld cc=%lx\n",
2974 REPORT_CODE_OFF+PL_regindent*2, "",
2975 (long)n, (long)cc->min,
2976 (long)cc->max, (long)cc)
2979 /* If degenerate scan matches "", assume scan done. */
2981 if (locinput == cc->lastloc && n >= cc->min) {
2982 PL_regcc = cc->oldcc;
2986 PerlIO_printf(Perl_debug_log,
2987 "%*s empty match detected, try continuation...\n",
2988 REPORT_CODE_OFF+PL_regindent*2, "")
2990 if (regmatch(cc->next))
2998 /* First just match a string of min scans. */
3002 cc->lastloc = locinput;
3003 if (regmatch(cc->scan))
3006 cc->lastloc = lastloc;
3011 /* Check whether we already were at this position.
3012 Postpone detection until we know the match is not
3013 *that* much linear. */
3014 if (!PL_reg_maxiter) {
3015 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3016 PL_reg_leftiter = PL_reg_maxiter;
3018 if (PL_reg_leftiter-- == 0) {
3019 I32 size = (PL_reg_maxiter + 7)/8;
3020 if (PL_reg_poscache) {
3021 if (PL_reg_poscache_size < size) {
3022 Renew(PL_reg_poscache, size, char);
3023 PL_reg_poscache_size = size;
3025 Zero(PL_reg_poscache, size, char);
3028 PL_reg_poscache_size = size;
3029 Newz(29, PL_reg_poscache, size, char);
3032 PerlIO_printf(Perl_debug_log,
3033 "%sDetected a super-linear match, switching on caching%s...\n",
3034 PL_colors[4], PL_colors[5])
3037 if (PL_reg_leftiter < 0) {
3038 I32 o = locinput - PL_bostr, b;
3040 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3043 if (PL_reg_poscache[o] & (1<<b)) {
3045 PerlIO_printf(Perl_debug_log,
3046 "%*s already tried at this position...\n",
3047 REPORT_CODE_OFF+PL_regindent*2, "")
3051 PL_reg_poscache[o] |= (1<<b);
3055 /* Prefer next over scan for minimal matching. */
3058 PL_regcc = cc->oldcc;
3061 cp = regcppush(cc->parenfloor);
3063 if (regmatch(cc->next)) {
3065 sayYES; /* All done. */
3067 REGCP_UNWIND(lastcp);
3073 if (n >= cc->max) { /* Maximum greed exceeded? */
3074 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3075 && !(PL_reg_flags & RF_warned)) {
3076 PL_reg_flags |= RF_warned;
3077 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
3078 "Complex regular subexpression recursion",
3085 PerlIO_printf(Perl_debug_log,
3086 "%*s trying longer...\n",
3087 REPORT_CODE_OFF+PL_regindent*2, "")
3089 /* Try scanning more and see if it helps. */
3090 PL_reginput = locinput;
3092 cc->lastloc = locinput;
3093 cp = regcppush(cc->parenfloor);
3095 if (regmatch(cc->scan)) {
3099 REGCP_UNWIND(lastcp);
3102 cc->lastloc = lastloc;
3106 /* Prefer scan over next for maximal matching. */
3108 if (n < cc->max) { /* More greed allowed? */
3109 cp = regcppush(cc->parenfloor);
3111 cc->lastloc = locinput;
3113 if (regmatch(cc->scan)) {
3117 REGCP_UNWIND(lastcp);
3118 regcppop(); /* Restore some previous $<digit>s? */
3119 PL_reginput = locinput;
3121 PerlIO_printf(Perl_debug_log,
3122 "%*s failed, try continuation...\n",
3123 REPORT_CODE_OFF+PL_regindent*2, "")
3126 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3127 && !(PL_reg_flags & RF_warned)) {
3128 PL_reg_flags |= RF_warned;
3129 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
3130 "Complex regular subexpression recursion",
3134 /* Failed deeper matches of scan, so see if this one works. */
3135 PL_regcc = cc->oldcc;
3138 if (regmatch(cc->next))
3144 cc->lastloc = lastloc;
3149 next = scan + ARG(scan);
3152 inner = NEXTOPER(NEXTOPER(scan));
3155 inner = NEXTOPER(scan);
3159 if (OP(next) != c1) /* No choice. */
3160 next = inner; /* Avoid recursion. */
3162 I32 lastparen = *PL_reglastparen;
3164 re_unwind_branch_t *uw;
3166 /* Put unwinding data on stack */
3167 unwind1 = SSNEWt(1,re_unwind_branch_t);
3168 uw = SSPTRt(unwind1,re_unwind_branch_t);
3171 uw->type = ((c1 == BRANCH)
3173 : RE_UNWIND_BRANCHJ);
3174 uw->lastparen = lastparen;
3176 uw->locinput = locinput;
3177 uw->nextchr = nextchr;
3179 uw->regindent = ++PL_regindent;
3182 REGCP_SET(uw->lastcp);
3184 /* Now go into the first branch */
3197 /* We suppose that the next guy does not need
3198 backtracking: in particular, it is of constant length,
3199 and has no parenths to influence future backrefs. */
3200 ln = ARG1(scan); /* min to match */
3201 n = ARG2(scan); /* max to match */
3202 paren = scan->flags;
3204 if (paren > PL_regsize)
3206 if (paren > *PL_reglastparen)
3207 *PL_reglastparen = paren;
3209 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3211 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3212 PL_reginput = locinput;
3215 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3217 /* if we matched something zero-length we don't need to
3218 backtrack - capturing parens are already defined, so
3219 the caveat in the maximal case doesn't apply
3221 XXXX if ln == 0, we can redo this check first time
3222 through the following loop
3225 n = ln; /* don't backtrack */
3226 locinput = PL_reginput;
3227 if (HAS_TEXT(next) || JUMPABLE(next)) {
3228 regnode *text_node = next;
3230 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3232 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3234 if (PL_regkind[(U8)OP(text_node)] == REF) {
3236 n = ARG(text_node); /* which paren pair */
3237 ln = PL_regstartp[n];
3238 /* assume yes if we haven't seen CLOSEn */
3240 *PL_reglastparen < n ||
3247 c1 = *(PL_bostr + ln);
3249 else { c1 = (U8)*STRING(text_node); }
3250 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3252 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3253 c2 = PL_fold_locale[c1];
3262 /* This may be improved if l == 0. */
3263 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3264 /* If it could work, try it. */
3266 UCHARAT(PL_reginput) == c1 ||
3267 UCHARAT(PL_reginput) == c2)
3271 PL_regstartp[paren] =
3272 HOPc(PL_reginput, -l) - PL_bostr;
3273 PL_regendp[paren] = PL_reginput - PL_bostr;
3276 PL_regendp[paren] = -1;
3280 REGCP_UNWIND(lastcp);
3282 /* Couldn't or didn't -- move forward. */
3283 PL_reginput = locinput;
3284 if (regrepeat_hard(scan, 1, &l)) {
3286 locinput = PL_reginput;
3293 n = regrepeat_hard(scan, n, &l);
3294 /* if we matched something zero-length we don't need to
3295 backtrack, unless the minimum count is zero and we
3296 are capturing the result - in that case the capture
3297 being defined or not may affect later execution
3299 if (n != 0 && l == 0 && !(paren && ln == 0))
3300 ln = n; /* don't backtrack */
3301 locinput = PL_reginput;
3303 PerlIO_printf(Perl_debug_log,
3304 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3305 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3309 if (HAS_TEXT(next) || JUMPABLE(next)) {
3310 regnode *text_node = next;
3312 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3314 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3316 if (PL_regkind[(U8)OP(text_node)] == REF) {
3318 n = ARG(text_node); /* which paren pair */
3319 ln = PL_regstartp[n];
3320 /* assume yes if we haven't seen CLOSEn */
3322 *PL_reglastparen < n ||
3329 c1 = *(PL_bostr + ln);
3331 else { c1 = (U8)*STRING(text_node); }
3333 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3335 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3336 c2 = PL_fold_locale[c1];
3347 /* If it could work, try it. */
3349 UCHARAT(PL_reginput) == c1 ||
3350 UCHARAT(PL_reginput) == c2)
3353 PerlIO_printf(Perl_debug_log,
3354 "%*s trying tail with n=%"IVdf"...\n",
3355 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3359 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3360 PL_regendp[paren] = PL_reginput - PL_bostr;
3363 PL_regendp[paren] = -1;
3367 REGCP_UNWIND(lastcp);
3369 /* Couldn't or didn't -- back up. */
3371 locinput = HOPc(locinput, -l);
3372 PL_reginput = locinput;
3379 paren = scan->flags; /* Which paren to set */
3380 if (paren > PL_regsize)
3382 if (paren > *PL_reglastparen)
3383 *PL_reglastparen = paren;
3384 ln = ARG1(scan); /* min to match */
3385 n = ARG2(scan); /* max to match */
3386 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3390 ln = ARG1(scan); /* min to match */
3391 n = ARG2(scan); /* max to match */
3392 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3397 scan = NEXTOPER(scan);
3403 scan = NEXTOPER(scan);
3407 * Lookahead to avoid useless match attempts
3408 * when we know what character comes next.
3412 * Used to only do .*x and .*?x, but now it allows
3413 * for )'s, ('s and (?{ ... })'s to be in the way
3414 * of the quantifier and the EXACT-like node. -- japhy
3417 if (HAS_TEXT(next) || JUMPABLE(next)) {
3419 regnode *text_node = next;
3421 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3423 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3425 if (PL_regkind[(U8)OP(text_node)] == REF) {
3427 n = ARG(text_node); /* which paren pair */
3428 ln = PL_regstartp[n];
3429 /* assume yes if we haven't seen CLOSEn */
3431 *PL_reglastparen < n ||
3436 goto assume_ok_easy;
3438 s = (U8*)PL_bostr + ln;
3440 else { s = (U8*)STRING(text_node); }
3444 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3446 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3447 c2 = PL_fold_locale[c1];
3450 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3451 STRLEN ulen1, ulen2;
3452 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3453 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
3455 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3456 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3458 c1 = utf8_to_uvuni(tmpbuf1, 0);
3459 c2 = utf8_to_uvuni(tmpbuf2, 0);
3462 c2 = c1 = utf8_to_uvchr(s, NULL);
3470 PL_reginput = locinput;
3474 if (ln && regrepeat(scan, ln) < ln)
3476 locinput = PL_reginput;
3479 char *e; /* Should not check after this */
3480 char *old = locinput;
3482 if (n == REG_INFTY) {
3485 while (UTF8_IS_CONTINUATION(*(U8*)e))
3491 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3495 e = locinput + n - ln;
3501 /* Find place 'next' could work */
3504 while (locinput <= e &&
3505 UCHARAT(locinput) != c1)
3508 while (locinput <= e
3509 && UCHARAT(locinput) != c1
3510 && UCHARAT(locinput) != c2)
3513 count = locinput - old;
3520 utf8_to_uvchr((U8*)locinput, &len) != c1;
3525 for (count = 0; locinput <= e; count++) {
3526 UV c = utf8_to_uvchr((U8*)locinput, &len);
3527 if (c == c1 || c == c2)
3535 /* PL_reginput == old now */
3536 if (locinput != old) {
3537 ln = 1; /* Did some */
3538 if (regrepeat(scan, count) < count)
3541 /* PL_reginput == locinput now */
3542 TRYPAREN(paren, ln, locinput);
3543 PL_reginput = locinput; /* Could be reset... */
3544 REGCP_UNWIND(lastcp);
3545 /* Couldn't or didn't -- move forward. */
3548 locinput += UTF8SKIP(locinput);
3554 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3558 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3560 c = UCHARAT(PL_reginput);
3561 /* If it could work, try it. */
3562 if (c == c1 || c == c2)
3564 TRYPAREN(paren, n, PL_reginput);
3565 REGCP_UNWIND(lastcp);
3568 /* If it could work, try it. */
3569 else if (c1 == -1000)
3571 TRYPAREN(paren, n, PL_reginput);
3572 REGCP_UNWIND(lastcp);
3574 /* Couldn't or didn't -- move forward. */
3575 PL_reginput = locinput;
3576 if (regrepeat(scan, 1)) {
3578 locinput = PL_reginput;
3586 n = regrepeat(scan, n);
3587 locinput = PL_reginput;
3588 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3589 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3590 ln = n; /* why back off? */
3591 /* ...because $ and \Z can match before *and* after
3592 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3593 We should back off by one in this case. */
3594 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3603 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3605 c = UCHARAT(PL_reginput);
3607 /* If it could work, try it. */
3608 if (c1 == -1000 || c == c1 || c == c2)
3610 TRYPAREN(paren, n, PL_reginput);
3611 REGCP_UNWIND(lastcp);
3613 /* Couldn't or didn't -- back up. */
3615 PL_reginput = locinput = HOPc(locinput, -1);
3623 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3625 c = UCHARAT(PL_reginput);
3627 /* If it could work, try it. */
3628 if (c1 == -1000 || c == c1 || c == c2)
3630 TRYPAREN(paren, n, PL_reginput);
3631 REGCP_UNWIND(lastcp);
3633 /* Couldn't or didn't -- back up. */
3635 PL_reginput = locinput = HOPc(locinput, -1);
3642 if (PL_reg_call_cc) {
3643 re_cc_state *cur_call_cc = PL_reg_call_cc;
3644 CURCUR *cctmp = PL_regcc;
3645 regexp *re = PL_reg_re;
3646 CHECKPOINT cp, lastcp;
3648 cp = regcppush(0); /* Save *all* the positions. */
3650 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3652 PL_reginput = locinput; /* Make position available to
3654 cache_re(PL_reg_call_cc->re);
3655 PL_regcc = PL_reg_call_cc->cc;
3656 PL_reg_call_cc = PL_reg_call_cc->prev;
3657 if (regmatch(cur_call_cc->node)) {
3658 PL_reg_call_cc = cur_call_cc;
3662 REGCP_UNWIND(lastcp);
3664 PL_reg_call_cc = cur_call_cc;
3670 PerlIO_printf(Perl_debug_log,
3671 "%*s continuation failed...\n",
3672 REPORT_CODE_OFF+PL_regindent*2, "")
3676 if (locinput < PL_regtill) {
3677 DEBUG_r(PerlIO_printf(Perl_debug_log,
3678 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3680 (long)(locinput - PL_reg_starttry),
3681 (long)(PL_regtill - PL_reg_starttry),
3683 sayNO_FINAL; /* Cannot match: too short. */
3685 PL_reginput = locinput; /* put where regtry can find it */
3686 sayYES_FINAL; /* Success! */
3688 PL_reginput = locinput; /* put where regtry can find it */
3689 sayYES_LOUD; /* Success! */
3692 PL_reginput = locinput;
3697 s = HOPBACKc(locinput, scan->flags);
3703 PL_reginput = locinput;
3708 s = HOPBACKc(locinput, scan->flags);
3714 PL_reginput = locinput;
3717 inner = NEXTOPER(NEXTOPER(scan));
3718 if (regmatch(inner) != n) {
3733 if (OP(scan) == SUSPEND) {
3734 locinput = PL_reginput;
3735 nextchr = UCHARAT(locinput);
3740 next = scan + ARG(scan);
3745 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3746 PTR2UV(scan), OP(scan));
3747 Perl_croak(aTHX_ "regexp memory corruption");
3754 * We get here only if there's trouble -- normally "case END" is
3755 * the terminating point.
3757 Perl_croak(aTHX_ "corrupted regexp pointers");
3763 PerlIO_printf(Perl_debug_log,
3764 "%*s %scould match...%s\n",
3765 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3769 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3770 PL_colors[4],PL_colors[5]));
3776 #if 0 /* Breaks $^R */
3784 PerlIO_printf(Perl_debug_log,
3785 "%*s %sfailed...%s\n",
3786 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3792 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3795 case RE_UNWIND_BRANCH:
3796 case RE_UNWIND_BRANCHJ:
3798 re_unwind_branch_t *uwb = &(uw->branch);
3799 I32 lastparen = uwb->lastparen;
3801 REGCP_UNWIND(uwb->lastcp);
3802 for (n = *PL_reglastparen; n > lastparen; n--)
3804 *PL_reglastparen = n;
3805 scan = next = uwb->next;
3807 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3808 ? BRANCH : BRANCHJ) ) { /* Failure */
3815 /* Have more choice yet. Reuse the same uwb. */
3817 if ((n = (uwb->type == RE_UNWIND_BRANCH
3818 ? NEXT_OFF(next) : ARG(next))))
3821 next = NULL; /* XXXX Needn't unwinding in this case... */
3823 next = NEXTOPER(scan);
3824 if (uwb->type == RE_UNWIND_BRANCHJ)
3825 next = NEXTOPER(next);
3826 locinput = uwb->locinput;
3827 nextchr = uwb->nextchr;
3829 PL_regindent = uwb->regindent;
3836 Perl_croak(aTHX_ "regexp unwind memory corruption");
3847 - regrepeat - repeatedly match something simple, report how many
3850 * [This routine now assumes that it will only match on things of length 1.
3851 * That was true before, but now we assume scan - reginput is the count,
3852 * rather than incrementing count on every character. [Er, except utf8.]]
3855 S_regrepeat(pTHX_ regnode *p, I32 max)
3857 register char *scan;
3859 register char *loceol = PL_regeol;
3860 register I32 hardcount = 0;
3861 register bool do_utf8 = PL_reg_match_utf8;
3864 if (max != REG_INFTY && max < loceol - scan)
3865 loceol = scan + max;
3870 while (scan < loceol && hardcount < max && *scan != '\n') {
3871 scan += UTF8SKIP(scan);
3875 while (scan < loceol && *scan != '\n')
3885 case EXACT: /* length of string is 1 */
3887 while (scan < loceol && UCHARAT(scan) == c)
3890 case EXACTF: /* length of string is 1 */
3892 while (scan < loceol &&
3893 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3896 case EXACTFL: /* length of string is 1 */
3897 PL_reg_flags |= RF_tainted;
3899 while (scan < loceol &&
3900 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))