5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
38 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
70 #define PERL_IN_REGEXEC_C
73 #ifdef PERL_IN_XSUB_RE
79 #define RF_tainted 1 /* tainted information used? */
80 #define RF_warned 2 /* warned about big count? */
81 #define RF_evaled 4 /* Did an EVAL with setting? */
82 #define RF_utf8 8 /* String contains multibyte chars? */
84 #define UTF ((PL_reg_flags & RF_utf8) != 0)
86 #define RS_init 1 /* eval environment created */
87 #define RS_set 2 /* replsv value is set */
93 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
99 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
100 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
102 #define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
106 #define HOPBACKc(pos, off) \
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
113 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
114 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
116 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
123 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
125 /* for use after a quantifier and before an EXACT-like node -- japhy */
126 #define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
133 #define HAS_TEXT(rn) ( \
134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
141 #define FIND_NEXT_IMPT(rn) STMT_START { \
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
145 rn = NEXTOPER(NEXTOPER(rn)); \
146 else if (type == PLUS) \
148 else if (type == IFMATCH) \
149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
150 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 6
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
187 SSPUSHINT(*PL_reglastcloseparen);
188 SSPUSHPTR(PL_reginput);
189 #define REGCP_FRAME_ELEMS 2
190 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
198 /* These are needed since we do not localize EVAL nodes: */
199 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
203 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
204 PerlIO_printf(Perl_debug_log, \
205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
209 S_regcppop(pTHX_ const regexp *rex)
215 GET_RE_DEBUG_FLAGS_DECL;
217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
221 input = (char *) SSPOPPTR;
222 *PL_reglastcloseparen = SSPOPINT;
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
226 /* Now restore the parentheses context. */
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
230 U32 paren = (U32)SSPOPINT;
231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
232 PL_regstartp[paren] = SSPOPINT;
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
237 PerlIO_printf(Perl_debug_log,
238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
239 (UV)paren, (IV)PL_regstartp[paren],
240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
241 (IV)PL_regendp[paren],
242 (paren > *PL_reglastparen ? "(no)" : ""));
246 if (*PL_reglastparen + 1 <= rex->nparens) {
247 PerlIO_printf(Perl_debug_log,
248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
265 PL_regstartp[i] = -1;
272 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
274 #define TRYPAREN(paren, n, input, where) { \
277 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
278 PL_regendp[paren] = input - PL_bostr; \
281 PL_regendp[paren] = -1; \
283 REGMATCH(next, where); \
287 PL_regendp[paren] = -1; \
292 * pregexec and friends
295 #ifndef PERL_IN_XSUB_RE
297 - pregexec - match a regexp against a string
300 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
301 char *strbeg, I32 minend, SV *screamer, U32 nosave)
302 /* strend: pointer to null at end of string */
303 /* strbeg: real beginning of string */
304 /* minend: end of match must be >=minend after stringarg. */
305 /* nosave: For optimizations. */
308 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
309 nosave ? 0 : REXEC_COPY_STR);
314 * Need to implement the following flags for reg_anch:
316 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
318 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
319 * INTUIT_AUTORITATIVE_ML
320 * INTUIT_ONCE_NOML - Intuit can match in one location only.
323 * Another flag for this function: SECOND_TIME (so that float substrs
324 * with giant delta may be not rechecked).
327 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
329 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
330 Otherwise, only SvCUR(sv) is used to get strbeg. */
332 /* XXXX We assume that strpos is strbeg unless sv. */
334 /* XXXX Some places assume that there is a fixed substring.
335 An update may be needed if optimizer marks as "INTUITable"
336 RExen without fixed substrings. Similarly, it is assumed that
337 lengths of all the strings are no more than minlen, thus they
338 cannot come from lookahead.
339 (Or minlen should take into account lookahead.) */
341 /* A failure to find a constant substring means that there is no need to make
342 an expensive call to REx engine, thus we celebrate a failure. Similarly,
343 finding a substring too deep into the string means that less calls to
344 regtry() should be needed.
346 REx compiler's optimizer found 4 possible hints:
347 a) Anchored substring;
349 c) Whether we are anchored (beginning-of-line or \G);
350 d) First node (of those at offset 0) which may distingush positions;
351 We use a)b)d) and multiline-part of c), and try to find a position in the
352 string which does not contradict any of them.
355 /* Most of decisions we do here should have been done at compile time.
356 The nodes of the REx which we used for the search should have been
357 deleted from the finite automaton. */
360 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
361 char *strend, U32 flags, re_scream_pos_data *data)
364 register I32 start_shift = 0;
365 /* Should be nonnegative! */
366 register I32 end_shift = 0;
371 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
373 register char *other_last = NULL; /* other substr checked before this */
374 char *check_at = NULL; /* check substr found at this pos */
375 const I32 multiline = prog->reganch & PMf_MULTILINE;
377 const char * const i_strpos = strpos;
378 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
381 GET_RE_DEBUG_FLAGS_DECL;
383 RX_MATCH_UTF8_set(prog,do_utf8);
385 if (prog->reganch & ROPT_UTF8) {
386 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
387 "UTF-8 regex...\n"));
388 PL_reg_flags |= RF_utf8;
392 const char *s = PL_reg_match_utf8 ?
393 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
395 const int len = PL_reg_match_utf8 ?
396 (int)strlen(s) : strend - strpos;
399 if (PL_reg_match_utf8)
400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
401 "UTF-8 target...\n"));
402 PerlIO_printf(Perl_debug_log,
403 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
404 PL_colors[4], PL_colors[5], PL_colors[0],
407 (strlen(prog->precomp) > 60 ? "..." : ""),
409 (int)(len > 60 ? 60 : len),
411 (len > 60 ? "..." : "")
415 /* CHR_DIST() would be more correct here but it makes things slow. */
416 if (prog->minlen > strend - strpos) {
417 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
418 "String too short... [re_intuit_start]\n"));
421 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
424 if (!prog->check_utf8 && prog->check_substr)
425 to_utf8_substr(prog);
426 check = prog->check_utf8;
428 if (!prog->check_substr && prog->check_utf8)
429 to_byte_substr(prog);
430 check = prog->check_substr;
432 if (check == &PL_sv_undef) {
433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
434 "Non-utf string cannot match utf check string\n"));
437 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
438 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
439 || ( (prog->reganch & ROPT_ANCH_BOL)
440 && !multiline ) ); /* Check after \n? */
443 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
444 | ROPT_IMPLICIT)) /* not a real BOL */
445 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
447 && (strpos != strbeg)) {
448 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
451 if (prog->check_offset_min == prog->check_offset_max &&
452 !(prog->reganch & ROPT_CANY_SEEN)) {
453 /* Substring at constant offset from beg-of-str... */
456 s = HOP3c(strpos, prog->check_offset_min, strend);
458 slen = SvCUR(check); /* >= 1 */
460 if ( strend - s > slen || strend - s < slen - 1
461 || (strend - s == slen && strend[-1] != '\n')) {
462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
465 /* Now should match s[0..slen-2] */
467 if (slen && (*SvPVX_const(check) != *s
469 && memNE(SvPVX_const(check), s, slen)))) {
471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
475 else if (*SvPVX_const(check) != *s
476 || ((slen = SvCUR(check)) > 1
477 && memNE(SvPVX_const(check), s, slen)))
480 goto success_at_start;
483 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
485 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
486 end_shift = prog->minlen - start_shift -
487 CHR_SVLEN(check) + (SvTAIL(check) != 0);
489 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
490 - (SvTAIL(check) != 0);
491 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
493 if (end_shift < eshift)
497 else { /* Can match at random position */
500 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
501 /* Should be nonnegative! */
502 end_shift = prog->minlen - start_shift -
503 CHR_SVLEN(check) + (SvTAIL(check) != 0);
506 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
508 Perl_croak(aTHX_ "panic: end_shift");
512 /* Find a possible match in the region s..strend by looking for
513 the "check" substring in the region corrected by start/end_shift. */
514 if (flags & REXEC_SCREAM) {
515 I32 p = -1; /* Internal iterator of scream. */
516 I32 * const pp = data ? data->scream_pos : &p;
518 if (PL_screamfirst[BmRARE(check)] >= 0
519 || ( BmRARE(check) == '\n'
520 && (BmPREVIOUS(check) == SvCUR(check) - 1)
522 s = screaminstr(sv, check,
523 start_shift + (s - strbeg), end_shift, pp, 0);
526 /* we may be pointing at the wrong string */
527 if (s && RX_MATCH_COPIED(prog))
528 s = strbeg + (s - SvPVX_const(sv));
530 *data->scream_olds = s;
532 else if (prog->reganch & ROPT_CANY_SEEN)
533 s = fbm_instr((U8*)(s + start_shift),
534 (U8*)(strend - end_shift),
535 check, multiline ? FBMrf_MULTILINE : 0);
537 s = fbm_instr(HOP3(s, start_shift, strend),
538 HOP3(strend, -end_shift, strbeg),
539 check, multiline ? FBMrf_MULTILINE : 0);
541 /* Update the count-of-usability, remove useless subpatterns,
544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
545 (s ? "Found" : "Did not find"),
546 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
548 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
550 PL_colors[1], (SvTAIL(check) ? "$" : ""),
551 (s ? " at offset " : "...\n") ) );
558 /* Finish the diagnostic message */
559 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
561 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
562 Start with the other substr.
563 XXXX no SCREAM optimization yet - and a very coarse implementation
564 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
565 *always* match. Probably should be marked during compile...
566 Probably it is right to do no SCREAM here...
569 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
570 /* Take into account the "other" substring. */
571 /* XXXX May be hopelessly wrong for UTF... */
574 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
577 char * const last = HOP3c(s, -start_shift, strbeg);
579 char * const saved_s = s;
582 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
585 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
590 t = HOP3c(t, prog->anchored_offset, strend);
591 if (t < other_last) /* These positions already checked */
593 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
596 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
597 /* On end-of-str: see comment below. */
598 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
599 if (must == &PL_sv_undef) {
601 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
606 HOP3(HOP3(last1, prog->anchored_offset, strend)
607 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
609 multiline ? FBMrf_MULTILINE : 0
611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
612 "%s anchored substr \"%s%.*s%s\"%s",
613 (s ? "Found" : "Contradicts"),
616 - (SvTAIL(must)!=0)),
618 PL_colors[1], (SvTAIL(must) ? "$" : "")));
620 if (last1 >= last2) {
621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
622 ", giving up...\n"));
625 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
626 ", trying floating at offset %ld...\n",
627 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
628 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
629 s = HOP3c(last, 1, strend);
633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
634 (long)(s - i_strpos)));
635 t = HOP3c(s, -prog->anchored_offset, strbeg);
636 other_last = HOP3c(s, 1, strend);
644 else { /* Take into account the floating substring. */
646 char * const saved_s = s;
649 t = HOP3c(s, -start_shift, strbeg);
651 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
652 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
653 last = HOP3c(t, prog->float_max_offset, strend);
654 s = HOP3c(t, prog->float_min_offset, strend);
657 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
658 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
659 /* fbm_instr() takes into account exact value of end-of-str
660 if the check is SvTAIL(ed). Since false positives are OK,
661 and end-of-str is not later than strend we are OK. */
662 if (must == &PL_sv_undef) {
664 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
667 s = fbm_instr((unsigned char*)s,
668 (unsigned char*)last + SvCUR(must)
670 must, multiline ? FBMrf_MULTILINE : 0);
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
672 (s ? "Found" : "Contradicts"),
674 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
676 PL_colors[1], (SvTAIL(must) ? "$" : "")));
679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
680 ", giving up...\n"));
683 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
684 ", trying anchored starting at offset %ld...\n",
685 (long)(saved_s + 1 - i_strpos)));
687 s = HOP3c(t, 1, strend);
691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
692 (long)(s - i_strpos)));
693 other_last = s; /* Fix this later. --Hugo */
702 t = s - prog->check_offset_max;
703 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
705 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
707 /* Fixed substring is found far enough so that the match
708 cannot start at strpos. */
710 if (ml_anch && t[-1] != '\n') {
711 /* Eventually fbm_*() should handle this, but often
712 anchored_offset is not 0, so this check will not be wasted. */
713 /* XXXX In the code below we prefer to look for "^" even in
714 presence of anchored substrings. And we search even
715 beyond the found float position. These pessimizations
716 are historical artefacts only. */
718 while (t < strend - prog->minlen) {
720 if (t < check_at - prog->check_offset_min) {
721 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
722 /* Since we moved from the found position,
723 we definitely contradict the found anchored
724 substr. Due to the above check we do not
725 contradict "check" substr.
726 Thus we can arrive here only if check substr
727 is float. Redo checking for "other"=="fixed".
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
731 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
732 goto do_other_anchored;
734 /* We don't contradict the found floating substring. */
735 /* XXXX Why not check for STCLASS? */
737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
738 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
741 /* Position contradicts check-string */
742 /* XXXX probably better to look for check-string
743 than for "\n", so one should lower the limit for t? */
744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
745 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
746 other_last = strpos = s = t + 1;
751 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
752 PL_colors[0], PL_colors[1]));
756 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
757 PL_colors[0], PL_colors[1]));
761 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
764 /* The found string does not prohibit matching at strpos,
765 - no optimization of calling REx engine can be performed,
766 unless it was an MBOL and we are not after MBOL,
767 or a future STCLASS check will fail this. */
769 /* Even in this situation we may use MBOL flag if strpos is offset
770 wrt the start of the string. */
771 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
772 && (strpos != strbeg) && strpos[-1] != '\n'
773 /* May be due to an implicit anchor of m{.*foo} */
774 && !(prog->reganch & ROPT_IMPLICIT))
779 DEBUG_EXECUTE_r( if (ml_anch)
780 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
781 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
784 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
786 prog->check_utf8 /* Could be deleted already */
787 && --BmUSEFUL(prog->check_utf8) < 0
788 && (prog->check_utf8 == prog->float_utf8)
790 prog->check_substr /* Could be deleted already */
791 && --BmUSEFUL(prog->check_substr) < 0
792 && (prog->check_substr == prog->float_substr)
795 /* If flags & SOMETHING - do not do it many times on the same match */
796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
797 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
798 if (do_utf8 ? prog->check_substr : prog->check_utf8)
799 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
800 prog->check_substr = prog->check_utf8 = NULL; /* disable */
801 prog->float_substr = prog->float_utf8 = NULL; /* clear */
802 check = NULL; /* abort */
804 /* XXXX This is a remnant of the old implementation. It
805 looks wasteful, since now INTUIT can use many
807 prog->reganch &= ~RE_USE_INTUIT;
814 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
815 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
816 /* minlen == 0 is possible if regstclass is \b or \B,
817 and the fixed substr is ''$.
818 Since minlen is already taken into account, s+1 is before strend;
819 accidentally, minlen >= 1 guaranties no false positives at s + 1
820 even for \b or \B. But (minlen? 1 : 0) below assumes that
821 regstclass does not come from lookahead... */
822 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
823 This leaves EXACTF only, which is dealt with in find_byclass(). */
824 const U8* const str = (U8*)STRING(prog->regstclass);
825 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
826 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
828 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
829 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
830 : (prog->float_substr || prog->float_utf8
831 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
834 /*if (OP(prog->regstclass) == TRIE)
837 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
840 const char *what = NULL;
842 if (endpos == strend) {
843 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
844 "Could not match STCLASS...\n") );
847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
848 "This position contradicts STCLASS...\n") );
849 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
851 /* Contradict one of substrings */
852 if (prog->anchored_substr || prog->anchored_utf8) {
853 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
854 DEBUG_EXECUTE_r( what = "anchored" );
856 s = HOP3c(t, 1, strend);
857 if (s + start_shift + end_shift > strend) {
858 /* XXXX Should be taken into account earlier? */
859 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
860 "Could not match STCLASS...\n") );
865 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
866 "Looking for %s substr starting at offset %ld...\n",
867 what, (long)(s + start_shift - i_strpos)) );
870 /* Have both, check_string is floating */
871 if (t + start_shift >= check_at) /* Contradicts floating=check */
872 goto retry_floating_check;
873 /* Recheck anchored substring, but not floating... */
877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
878 "Looking for anchored substr starting at offset %ld...\n",
879 (long)(other_last - i_strpos)) );
880 goto do_other_anchored;
882 /* Another way we could have checked stclass at the
883 current position only: */
888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
889 "Looking for /%s^%s/m starting at offset %ld...\n",
890 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
893 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
895 /* Check is floating subtring. */
896 retry_floating_check:
897 t = check_at - start_shift;
898 DEBUG_EXECUTE_r( what = "floating" );
899 goto hop_and_restart;
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
903 "By STCLASS: moving %ld --> %ld\n",
904 (long)(t - i_strpos), (long)(s - i_strpos))
908 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
909 "Does not contradict STCLASS...\n");
914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
915 PL_colors[4], (check ? "Guessed" : "Giving up"),
916 PL_colors[5], (long)(s - i_strpos)) );
919 fail_finish: /* Substring not found */
920 if (prog->check_substr || prog->check_utf8) /* could be removed already */
921 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
924 PL_colors[4], PL_colors[5]));
928 /* We know what class REx starts with. Try to find this position... */
929 /* if reginfo is NULL, its a dryrun */
930 /* annoyingly all the vars in this routine have different names from their counterparts
931 in regmatch. /grrr */
934 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
935 const char *strend, const regmatch_info *reginfo)
938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
942 register STRLEN uskip;
946 register I32 tmp = 1; /* Scratch variable? */
947 register const bool do_utf8 = PL_reg_match_utf8;
949 /* We know what class it must start with. */
953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
958 if (tmp && (!reginfo || regtry(reginfo, s)))
972 if (REGINCLASS(prog, c, (U8*)s) ||
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
977 if (tmp && (!reginfo || regtry(reginfo, s)))
990 if (tmp && (!reginfo || regtry(reginfo, s)))
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1002 STRLEN ulen1, ulen2;
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1016 while (sm < ((U8 *) m + ln)) {
1031 c2 = PL_fold_locale[c1];
1033 e = HOP3c(strend, -((I32)lnc), s);
1035 if (!reginfo && e < s)
1036 e = s; /* Due to minlen logic of intuit() */
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
1051 U8 tmpbuf [UTF8_MAXBYTES+1];
1052 STRLEN len, foldlen;
1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1062 ibcmp_utf8(s, NULL, 0, do_utf8,
1063 m, NULL, ln, (bool)UTF))
1064 && (!reginfo || regtry(reginfo, s)) )
1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1068 uvchr_to_utf8(tmpbuf, c);
1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
1073 !ibcmp_utf8((char *) foldbuf,
1074 NULL, foldlen, do_utf8,
1076 NULL, ln, (bool)UTF))
1077 && (!reginfo || regtry(reginfo, s)) )
1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1088 /* Handle some of the three Greek sigmas cases.
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1099 if ( (c == c1 || c == c2)
1101 ibcmp_utf8(s, NULL, 0, do_utf8,
1102 m, NULL, ln, (bool)UTF))
1103 && (!reginfo || regtry(reginfo, s)) )
1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1107 uvchr_to_utf8(tmpbuf, c);
1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
1112 !ibcmp_utf8((char *) foldbuf,
1113 NULL, foldlen, do_utf8,
1115 NULL, ln, (bool)UTF))
1116 && (!reginfo || regtry(reginfo, s)) )
1127 && (ln == 1 || !(OP(c) == EXACTF
1129 : ibcmp_locale(s, m, ln)))
1130 && (!reginfo || regtry(reginfo, s)) )
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1139 : ibcmp_locale(s, m, ln)))
1140 && (!reginfo || regtry(reginfo, s)) )
1147 PL_reg_flags |= RF_tainted;
1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1157 tmp = ((OP(c) == BOUND ?
1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1159 LOAD_UTF8_CHARCLASS_ALNUM();
1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1161 if (tmp == !(OP(c) == BOUND ?
1162 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1163 isALNUM_LC_utf8((U8*)s)))
1166 if ((!reginfo || regtry(reginfo, s)))
1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1179 if ((!reginfo || regtry(reginfo, s)))
1185 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1189 PL_reg_flags |= RF_tainted;
1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1199 tmp = ((OP(c) == NBOUND ?
1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1201 LOAD_UTF8_CHARCLASS_ALNUM();
1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1203 if (tmp == !(OP(c) == NBOUND ?
1204 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1205 isALNUM_LC_utf8((U8*)s)))
1207 else if ((!reginfo || regtry(reginfo, s)))
1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1220 else if ((!reginfo || regtry(reginfo, s)))
1225 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1230 LOAD_UTF8_CHARCLASS_ALNUM();
1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1233 if (tmp && (!reginfo || regtry(reginfo, s)))
1244 while (s < strend) {
1246 if (tmp && (!reginfo || regtry(reginfo, s)))
1258 PL_reg_flags |= RF_tainted;
1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1261 if (isALNUM_LC_utf8((U8*)s)) {
1262 if (tmp && (!reginfo || regtry(reginfo, s)))
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
1275 if (tmp && (!reginfo || regtry(reginfo, s)))
1288 LOAD_UTF8_CHARCLASS_ALNUM();
1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1291 if (tmp && (!reginfo || regtry(reginfo, s)))
1302 while (s < strend) {
1304 if (tmp && (!reginfo || regtry(reginfo, s)))
1316 PL_reg_flags |= RF_tainted;
1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1319 if (!isALNUM_LC_utf8((U8*)s)) {
1320 if (tmp && (!reginfo || regtry(reginfo, s)))
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
1333 if (tmp && (!reginfo || regtry(reginfo, s)))
1346 LOAD_UTF8_CHARCLASS_SPACE();
1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1349 if (tmp && (!reginfo || regtry(reginfo, s)))
1360 while (s < strend) {
1362 if (tmp && (!reginfo || regtry(reginfo, s)))
1374 PL_reg_flags |= RF_tainted;
1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1378 if (tmp && (!reginfo || regtry(reginfo, s)))
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
1391 if (tmp && (!reginfo || regtry(reginfo, s)))
1404 LOAD_UTF8_CHARCLASS_SPACE();
1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1407 if (tmp && (!reginfo || regtry(reginfo, s)))
1418 while (s < strend) {
1420 if (tmp && (!reginfo || regtry(reginfo, s)))
1432 PL_reg_flags |= RF_tainted;
1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1436 if (tmp && (!reginfo || regtry(reginfo, s)))
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
1449 if (tmp && (!reginfo || regtry(reginfo, s)))
1462 LOAD_UTF8_CHARCLASS_DIGIT();
1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1465 if (tmp && (!reginfo || regtry(reginfo, s)))
1476 while (s < strend) {
1478 if (tmp && (!reginfo || regtry(reginfo, s)))
1490 PL_reg_flags |= RF_tainted;
1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1493 if (isDIGIT_LC_utf8((U8*)s)) {
1494 if (tmp && (!reginfo || regtry(reginfo, s)))
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
1507 if (tmp && (!reginfo || regtry(reginfo, s)))
1520 LOAD_UTF8_CHARCLASS_DIGIT();
1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1523 if (tmp && (!reginfo || regtry(reginfo, s)))
1534 while (s < strend) {
1536 if (tmp && (!reginfo || regtry(reginfo, s)))
1548 PL_reg_flags |= RF_tainted;
1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1551 if (!isDIGIT_LC_utf8((U8*)s)) {
1552 if (tmp && (!reginfo || regtry(reginfo, s)))
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
1565 if (tmp && (!reginfo || regtry(reginfo, s)))
1577 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1579 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1580 trie_type = do_utf8 ?
1581 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1583 /* what trie are we using right now */
1585 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1586 reg_trie_data *trie=aho->trie;
1588 const char *last_start = strend - trie->minlen;
1589 const char *real_start = s;
1590 STRLEN maxlen = trie->maxlen;
1592 U8 **points; /* map of where we were in the input string
1593 when reading a given string. For ASCII this
1594 is unnecessary overhead as the relationship
1595 is always 1:1, but for unicode, especially
1596 case folded unicode this is not true. */
1598 GET_RE_DEBUG_FLAGS_DECL;
1600 /* We can't just allocate points here. We need to wrap it in
1601 * an SV so it gets freed properly if there is a croak while
1602 * running the match */
1605 sv_points=newSV(maxlen * sizeof(U8 *));
1606 SvCUR_set(sv_points,
1607 maxlen * sizeof(U8 *));
1608 SvPOK_on(sv_points);
1609 sv_2mortal(sv_points);
1610 points=(U8**)SvPV_nolen(sv_points );
1612 if (trie->bitmap && trie_type != trie_utf8_fold) {
1613 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1618 while (s <= last_start) {
1619 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1627 U8 *uscan = (U8*)NULL;
1628 U8 *leftmost = NULL;
1632 while ( state && uc <= (U8*)strend ) {
1634 if (aho->states[ state ].wordnum) {
1635 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1636 if (!leftmost || lpos < leftmost)
1640 points[pointpos++ % maxlen]= uc;
1641 switch (trie_type) {
1642 case trie_utf8_fold:
1644 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1649 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1651 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1652 foldlen -= UNISKIP( uvc );
1653 uscan = foldbuf + UNISKIP( uvc );
1657 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1666 charid = trie->charmap[ uvc ];
1670 if (trie->widecharmap) {
1671 SV** const svpp = hv_fetch(trie->widecharmap,
1672 (char*)&uvc, sizeof(UV), 0);
1674 charid = (U16)SvIV(*svpp);
1677 DEBUG_TRIE_EXECUTE_r(
1678 PerlIO_printf(Perl_debug_log,
1679 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1680 (int)((const char*)uc - real_start), charid, uvc)
1685 U32 word = aho->states[ state ].wordnum;
1686 base = aho->states[ state ].trans.base;
1688 DEBUG_TRIE_EXECUTE_r(
1689 PerlIO_printf( Perl_debug_log,
1690 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1691 failed ? "Fail transition to " : "",
1692 state, base, uvc, word)
1697 (base + charid > trie->uniquecharcount )
1698 && (base + charid - 1 - trie->uniquecharcount
1700 && trie->trans[base + charid - 1 -
1701 trie->uniquecharcount].check == state
1702 && (tmp=trie->trans[base + charid - 1 -
1703 trie->uniquecharcount ].next))
1713 state = aho->fail[state];
1717 /* we must be accepting here */
1725 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1726 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1732 if ( aho->states[ state ].wordnum ) {
1733 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1734 if (!leftmost || lpos < leftmost)
1737 DEBUG_TRIE_EXECUTE_r(
1738 PerlIO_printf( Perl_debug_log,
1739 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1744 s = (char*)leftmost;
1745 if (!reginfo || regtry(reginfo, s)) {
1760 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1769 - regexec_flags - match a regexp against a string
1772 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1773 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1774 /* strend: pointer to null at end of string */
1775 /* strbeg: real beginning of string */
1776 /* minend: end of match must be >=minend after stringarg. */
1777 /* data: May be used for some additional optimizations. */
1778 /* nosave: For optimizations. */
1782 register regnode *c;
1783 register char *startpos = stringarg;
1784 I32 minlen; /* must match at least this many chars */
1785 I32 dontbother = 0; /* how many characters not to try at end */
1786 I32 end_shift = 0; /* Same for the end. */ /* CC */
1787 I32 scream_pos = -1; /* Internal iterator of scream. */
1788 char *scream_olds = NULL;
1789 SV* const oreplsv = GvSV(PL_replgv);
1790 const bool do_utf8 = DO_UTF8(sv);
1796 regmatch_info reginfo; /* create some info to pass to regtry etc */
1798 GET_RE_DEBUG_FLAGS_DECL;
1800 PERL_UNUSED_ARG(data);
1802 /* Be paranoid... */
1803 if (prog == NULL || startpos == NULL) {
1804 Perl_croak(aTHX_ "NULL regexp parameter");
1808 multiline = prog->reganch & PMf_MULTILINE;
1809 reginfo.prog = prog;
1812 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1813 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1816 RX_MATCH_UTF8_set(prog, do_utf8);
1818 minlen = prog->minlen;
1819 if (strend - startpos < minlen) {
1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1821 "String too short [regexec_flags]...\n"));
1825 /* Check validity of program. */
1826 if (UCHARAT(prog->program) != REG_MAGIC) {
1827 Perl_croak(aTHX_ "corrupted regexp program");
1831 PL_reg_eval_set = 0;
1834 if (prog->reganch & ROPT_UTF8)
1835 PL_reg_flags |= RF_utf8;
1837 /* Mark beginning of line for ^ and lookbehind. */
1838 reginfo.bol = startpos; /* XXX not used ??? */
1842 /* Mark end of line for $ (and such) */
1845 /* see how far we have to get to not match where we matched before */
1846 reginfo.till = startpos+minend;
1848 /* If there is a "must appear" string, look for it. */
1851 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1854 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1855 reginfo.ganch = startpos;
1856 else if (sv && SvTYPE(sv) >= SVt_PVMG
1858 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1859 && mg->mg_len >= 0) {
1860 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1861 if (prog->reganch & ROPT_ANCH_GPOS) {
1862 if (s > reginfo.ganch)
1867 else /* pos() not defined */
1868 reginfo.ganch = strbeg;
1871 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1872 re_scream_pos_data d;
1874 d.scream_olds = &scream_olds;
1875 d.scream_pos = &scream_pos;
1876 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1879 goto phooey; /* not present */
1884 const char * const s0 = UTF
1885 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1888 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1889 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1890 UNI_DISPLAY_REGEX) : startpos;
1891 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
1894 PerlIO_printf(Perl_debug_log,
1895 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1896 PL_colors[4], PL_colors[5], PL_colors[0],
1899 len0 > 60 ? "..." : "",
1901 (int)(len1 > 60 ? 60 : len1),
1903 (len1 > 60 ? "..." : "")
1907 /* Simplest case: anchored match need be tried only once. */
1908 /* [unless only anchor is BOL and multiline is set] */
1909 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1910 if (s == startpos && regtry(®info, startpos))
1912 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1913 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1918 dontbother = minlen - 1;
1919 end = HOP3c(strend, -dontbother, strbeg) - 1;
1920 /* for multiline we only have to try after newlines */
1921 if (prog->check_substr || prog->check_utf8) {
1925 if (regtry(®info, s))
1930 if (prog->reganch & RE_USE_INTUIT) {
1931 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1942 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1943 if (regtry(®info, s))
1950 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1951 if (regtry(®info, reginfo.ganch))
1956 /* Messy cases: unanchored match. */
1957 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1958 /* we have /x+whatever/ */
1959 /* it must be a one character string (XXXX Except UTF?) */
1964 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1966 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1969 while (s < strend) {
1971 DEBUG_EXECUTE_r( did_match = 1 );
1972 if (regtry(®info, s)) goto got_it;
1974 while (s < strend && *s == ch)
1981 while (s < strend) {
1983 DEBUG_EXECUTE_r( did_match = 1 );
1984 if (regtry(®info, s)) goto got_it;
1986 while (s < strend && *s == ch)
1992 DEBUG_EXECUTE_r(if (!did_match)
1993 PerlIO_printf(Perl_debug_log,
1994 "Did not find anchored character...\n")
1997 else if (prog->anchored_substr != NULL
1998 || prog->anchored_utf8 != NULL
1999 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2000 && prog->float_max_offset < strend - s)) {
2005 char *last1; /* Last position checked before */
2009 if (prog->anchored_substr || prog->anchored_utf8) {
2010 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2011 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2012 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2013 back_max = back_min = prog->anchored_offset;
2015 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2016 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2017 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2018 back_max = prog->float_max_offset;
2019 back_min = prog->float_min_offset;
2021 if (must == &PL_sv_undef)
2022 /* could not downgrade utf8 check substring, so must fail */
2025 last = HOP3c(strend, /* Cannot start after this */
2026 -(I32)(CHR_SVLEN(must)
2027 - (SvTAIL(must) != 0) + back_min), strbeg);
2030 last1 = HOPc(s, -1);
2032 last1 = s - 1; /* bogus */
2034 /* XXXX check_substr already used to find "s", can optimize if
2035 check_substr==must. */
2037 dontbother = end_shift;
2038 strend = HOPc(strend, -dontbother);
2039 while ( (s <= last) &&
2040 ((flags & REXEC_SCREAM)
2041 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
2042 end_shift, &scream_pos, 0))
2043 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
2044 (unsigned char*)strend, must,
2045 multiline ? FBMrf_MULTILINE : 0))) ) {
2046 /* we may be pointing at the wrong string */
2047 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
2048 s = strbeg + (s - SvPVX_const(sv));
2049 DEBUG_EXECUTE_r( did_match = 1 );
2050 if (HOPc(s, -back_max) > last1) {
2051 last1 = HOPc(s, -back_min);
2052 s = HOPc(s, -back_max);
2055 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2057 last1 = HOPc(s, -back_min);
2061 while (s <= last1) {
2062 if (regtry(®info, s))
2068 while (s <= last1) {
2069 if (regtry(®info, s))
2075 DEBUG_EXECUTE_r(if (!did_match)
2076 PerlIO_printf(Perl_debug_log,
2077 "Did not find %s substr \"%s%.*s%s\"%s...\n",
2078 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2079 ? "anchored" : "floating"),
2081 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
2083 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2087 else if ((c = prog->regstclass)) {
2089 const OPCODE op = OP(prog->regstclass);
2090 /* don't bother with what can't match */
2091 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
2092 strend = HOPc(strend, -(minlen - 1));
2095 SV * const prop = sv_newmortal();
2101 regprop(prog, prop, c);
2103 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
2104 UNI_DISPLAY_REGEX) :
2106 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2108 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
2109 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
2110 PerlIO_printf(Perl_debug_log,
2111 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
2113 len1, len1, s1, (int)(strend - s));
2115 if (find_byclass(prog, c, s, strend, ®info))
2117 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2121 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2126 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2127 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2128 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2130 if (flags & REXEC_SCREAM) {
2131 last = screaminstr(sv, float_real, s - strbeg,
2132 end_shift, &scream_pos, 1); /* last one */
2134 last = scream_olds; /* Only one occurrence. */
2135 /* we may be pointing at the wrong string */
2136 else if (RX_MATCH_COPIED(prog))
2137 s = strbeg + (s - SvPVX_const(sv));
2141 const char * const little = SvPV_const(float_real, len);
2143 if (SvTAIL(float_real)) {
2144 if (memEQ(strend - len + 1, little, len - 1))
2145 last = strend - len + 1;
2146 else if (!multiline)
2147 last = memEQ(strend - len, little, len)
2148 ? strend - len : NULL;
2154 last = rninstr(s, strend, little, little + len);
2156 last = strend; /* matching "$" */
2160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2161 "%sCan't trim the tail, match fails (should not happen)%s\n",
2162 PL_colors[4], PL_colors[5]));
2163 goto phooey; /* Should not happen! */
2165 dontbother = strend - last + prog->float_min_offset;
2167 if (minlen && (dontbother < minlen))
2168 dontbother = minlen - 1;
2169 strend -= dontbother; /* this one's always in bytes! */
2170 /* We don't know much -- general case. */
2173 if (regtry(®info, s))
2182 if (regtry(®info, s))
2184 } while (s++ < strend);
2192 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2194 if (PL_reg_eval_set) {
2195 /* Preserve the current value of $^R */
2196 if (oreplsv != GvSV(PL_replgv))
2197 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2198 restored, the value remains
2200 restore_pos(aTHX_ prog);
2203 /* make sure $`, $&, $', and $digit will work later */
2204 if ( !(flags & REXEC_NOT_FIRST) ) {
2205 RX_MATCH_COPY_FREE(prog);
2206 if (flags & REXEC_COPY_STR) {
2207 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2208 #ifdef PERL_OLD_COPY_ON_WRITE
2210 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2212 PerlIO_printf(Perl_debug_log,
2213 "Copy on write: regexp capture, type %d\n",
2216 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2217 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2218 assert (SvPOKp(prog->saved_copy));
2222 RX_MATCH_COPIED_on(prog);
2223 s = savepvn(strbeg, i);
2229 prog->subbeg = strbeg;
2230 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2237 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2238 PL_colors[4], PL_colors[5]));
2239 if (PL_reg_eval_set)
2240 restore_pos(aTHX_ prog);
2245 - regtry - try match at specific point
2247 STATIC I32 /* 0 failure, 1 success */
2248 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2254 regexp *prog = reginfo->prog;
2255 GET_RE_DEBUG_FLAGS_DECL;
2258 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2260 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2263 PL_reg_eval_set = RS_init;
2264 DEBUG_EXECUTE_r(DEBUG_s(
2265 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2266 (IV)(PL_stack_sp - PL_stack_base));
2268 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2269 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2270 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2272 /* Apparently this is not needed, judging by wantarray. */
2273 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2274 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2277 /* Make $_ available to executed code. */
2278 if (reginfo->sv != DEFSV) {
2280 DEFSV = reginfo->sv;
2283 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2284 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2285 /* prepare for quick setting of pos */
2286 #ifdef PERL_OLD_COPY_ON_WRITE
2288 sv_force_normal_flags(sv, 0);
2290 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2291 &PL_vtbl_mglob, NULL, 0);
2295 PL_reg_oldpos = mg->mg_len;
2296 SAVEDESTRUCTOR_X(restore_pos, prog);
2298 if (!PL_reg_curpm) {
2299 Newxz(PL_reg_curpm, 1, PMOP);
2302 SV* const repointer = newSViv(0);
2303 /* so we know which PL_regex_padav element is PL_reg_curpm */
2304 SvFLAGS(repointer) |= SVf_BREAK;
2305 av_push(PL_regex_padav,repointer);
2306 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2307 PL_regex_pad = AvARRAY(PL_regex_padav);
2311 PM_SETRE(PL_reg_curpm, prog);
2312 PL_reg_oldcurpm = PL_curpm;
2313 PL_curpm = PL_reg_curpm;
2314 if (RX_MATCH_COPIED(prog)) {
2315 /* Here is a serious problem: we cannot rewrite subbeg,
2316 since it may be needed if this match fails. Thus
2317 $` inside (?{}) could fail... */
2318 PL_reg_oldsaved = prog->subbeg;
2319 PL_reg_oldsavedlen = prog->sublen;
2320 #ifdef PERL_OLD_COPY_ON_WRITE
2321 PL_nrs = prog->saved_copy;
2323 RX_MATCH_COPIED_off(prog);
2326 PL_reg_oldsaved = NULL;
2327 prog->subbeg = PL_bostr;
2328 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2330 prog->startp[0] = startpos - PL_bostr;
2331 PL_reginput = startpos;
2332 PL_regstartp = prog->startp;
2333 PL_regendp = prog->endp;
2334 PL_reglastparen = &prog->lastparen;
2335 PL_reglastcloseparen = &prog->lastcloseparen;
2336 prog->lastparen = 0;
2337 prog->lastcloseparen = 0;
2339 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2340 if (PL_reg_start_tmpl <= prog->nparens) {
2341 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2342 if(PL_reg_start_tmp)
2343 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2345 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2348 /* XXXX What this code is doing here?!!! There should be no need
2349 to do this again and again, PL_reglastparen should take care of
2352 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2353 * Actually, the code in regcppop() (which Ilya may be meaning by
2354 * PL_reglastparen), is not needed at all by the test suite
2355 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2356 * enough, for building DynaLoader, or otherwise this
2357 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2358 * will happen. Meanwhile, this code *is* needed for the
2359 * above-mentioned test suite tests to succeed. The common theme
2360 * on those tests seems to be returning null fields from matches.
2365 if (prog->nparens) {
2367 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2374 if (regmatch(reginfo, prog->program + 1)) {
2375 prog->endp[0] = PL_reginput - PL_bostr;
2378 REGCP_UNWIND(lastcp);
2383 #define sayYES goto yes
2384 #define sayNO goto no
2385 #define sayNO_ANYOF goto no_anyof
2386 #define sayYES_FINAL goto yes_final
2387 #define sayNO_FINAL goto no_final
2388 #define sayNO_SILENT goto do_no
2389 #define saySAME(x) if (x) goto yes; else goto no
2391 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2392 #define POSCACHE_SEEN 1 /* we know what we're caching */
2393 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2395 #define CACHEsayYES STMT_START { \
2396 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2397 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2398 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2399 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2401 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2402 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2405 /* cache records failure, but this is success */ \
2407 PerlIO_printf(Perl_debug_log, \
2408 "%*s (remove success from failure cache)\n", \
2409 REPORT_CODE_OFF+PL_regindent*2, "") \
2411 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2417 #define CACHEsayNO STMT_START { \
2418 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2419 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2420 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2421 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2423 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2424 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2427 /* cache records success, but this is failure */ \
2429 PerlIO_printf(Perl_debug_log, \
2430 "%*s (remove failure from success cache)\n", \
2431 REPORT_CODE_OFF+PL_regindent*2, "") \
2433 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2439 /* this is used to determine how far from the left messages like
2440 'failed...' are printed. Currently 29 makes these messages line
2441 up with the opcode they refer to. Earlier perls used 25 which
2442 left these messages outdented making reviewing a debug output
2445 #define REPORT_CODE_OFF 29
2448 /* Make sure there is a test for this +1 options in re_tests */
2449 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2451 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2452 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2454 #define SLAB_FIRST(s) (&(s)->states[0])
2455 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2457 /* grab a new slab and return the first slot in it */
2459 STATIC regmatch_state *
2462 #if PERL_VERSION < 9
2465 regmatch_slab *s = PL_regmatch_slab->next;
2467 Newx(s, 1, regmatch_slab);
2468 s->prev = PL_regmatch_slab;
2470 PL_regmatch_slab->next = s;
2472 PL_regmatch_slab = s;
2473 return SLAB_FIRST(s);
2476 /* simulate a recursive call to regmatch */
2478 #define REGMATCH(ns, where) \
2481 st->resume_state = resume_##where; \
2482 goto start_recurse; \
2483 resume_point_##where:
2485 /* push a new state then goto it */
2487 #define PUSH_STATE_GOTO(state, node) \
2489 st->resume_state = state; \
2492 /* push a new state with success backtracking, then goto it */
2494 #define PUSH_YES_STATE_GOTO(state, node) \
2496 st->resume_state = state; \
2497 goto push_yes_state;
2502 - regmatch - main matching routine
2504 * Conceptually the strategy is simple: check to see whether the current
2505 * node matches, call self recursively to see whether the rest matches,
2506 * and then act accordingly. In practice we make some effort to avoid
2507 * recursion, in particular by going through "ordinary" nodes (that don't
2508 * need to know whether the rest of the match failed) by a loop instead of
2511 /* [lwall] I've hoisted the register declarations to the outer block in order to
2512 * maybe save a little bit of pushing and popping on the stack. It also takes
2513 * advantage of machines that use a register save mask on subroutine entry.
2515 * This function used to be heavily recursive, but since this had the
2516 * effect of blowing the CPU stack on complex regexes, it has been
2517 * restructured to be iterative, and to save state onto the heap rather
2518 * than the stack. Essentially whereever regmatch() used to be called, it
2519 * pushes the current state, notes where to return, then jumps back into
2522 * Originally the structure of this function used to look something like
2527 while (scan != NULL) {
2528 a++; // do stuff with a and b
2534 if (regmatch(...)) // recurse
2544 * Now it looks something like this:
2552 regmatch_state *st = new();
2554 st->a++; // do stuff with a and b
2556 while (scan != NULL) {
2564 st->resume_state = resume_FOO;
2565 goto start_recurse; // recurse
2574 st = new(); push a new state
2575 st->a = 1; st->b = 2;
2582 switch (resume_state) {
2584 goto resume_point_FOO;
2591 * WARNING: this means that any line in this function that contains a
2592 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2593 * regmatch() using gotos instead. Thus the values of any local variables
2594 * not saved in the regmatch_state structure will have been lost when
2595 * execution resumes on the next line .
2597 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2598 * PL_regmatch_state always points to the currently active state, and
2599 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2600 * The first time regmatch is called, the first slab is allocated, and is
2601 * never freed until interpreter desctruction. When the slab is full,
2602 * a new one is allocated chained to the end. At exit from regmatch, slabs
2603 * allocated since entry are freed.
2606 /* *** every FOO_fail should = FOO+1 */
2607 #define resume_TRIE1 (REGNODE_MAX+1)
2608 #define resume_TRIE2 (REGNODE_MAX+2)
2609 #define EVAL_A (REGNODE_MAX+3)
2610 #define EVAL_A_fail (REGNODE_MAX+4)
2611 #define resume_CURLYX (REGNODE_MAX+5)
2612 #define resume_WHILEM1 (REGNODE_MAX+6)
2613 #define resume_WHILEM2 (REGNODE_MAX+7)
2614 #define resume_WHILEM3 (REGNODE_MAX+8)
2615 #define resume_WHILEM4 (REGNODE_MAX+9)
2616 #define resume_WHILEM5 (REGNODE_MAX+10)
2617 #define resume_WHILEM6 (REGNODE_MAX+11)
2618 #define BRANCH_next (REGNODE_MAX+12)
2619 #define BRANCH_next_fail (REGNODE_MAX+13)
2620 #define CURLYM_A (REGNODE_MAX+14)
2621 #define CURLYM_A_fail (REGNODE_MAX+15)
2622 #define CURLYM_B (REGNODE_MAX+16)
2623 #define CURLYM_B_fail (REGNODE_MAX+17)
2624 #define IFMATCH_A (REGNODE_MAX+18)
2625 #define IFMATCH_A_fail (REGNODE_MAX+19)
2626 #define resume_PLUS1 (REGNODE_MAX+20)
2627 #define resume_PLUS2 (REGNODE_MAX+21)
2628 #define resume_PLUS3 (REGNODE_MAX+22)
2629 #define resume_PLUS4 (REGNODE_MAX+23)
2633 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2637 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2639 const int docolor = *PL_colors[0];
2640 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2641 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2642 /* The part of the string before starttry has one color
2643 (pref0_len chars), between starttry and current
2644 position another one (pref_len - pref0_len chars),
2645 after the current position the third one.
2646 We assume that pref0_len <= pref_len, otherwise we
2647 decrease pref0_len. */
2648 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2649 ? (5 + taill) - l : locinput - PL_bostr;
2652 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2654 pref0_len = pref_len - (locinput - PL_reg_starttry);
2655 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2656 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2657 ? (5 + taill) - pref_len : PL_regeol - locinput);
2658 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2662 if (pref0_len > pref_len)
2663 pref0_len = pref_len;
2665 const char * const s0 =
2666 do_utf8 && OP(scan) != CANY ?
2667 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2668 pref0_len, 60, UNI_DISPLAY_REGEX) :
2669 locinput - pref_len;
2670 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2671 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2672 pv_uni_display(PERL_DEBUG_PAD(1),
2673 (U8*)(locinput - pref_len + pref0_len),
2674 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2675 locinput - pref_len + pref0_len;
2676 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2677 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2678 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2679 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2681 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2682 PerlIO_printf(Perl_debug_log,
2683 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2684 (IV)(locinput - PL_bostr),
2691 (docolor ? "" : "> <"),
2695 15 - l - pref_len + 1,
2701 STATIC I32 /* 0 failure, 1 success */
2702 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2704 #if PERL_VERSION < 9
2708 register const bool do_utf8 = PL_reg_match_utf8;
2709 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2711 regexp *rex = reginfo->prog;
2713 regmatch_slab *orig_slab;
2714 regmatch_state *orig_state;
2716 /* the current state. This is a cached copy of PL_regmatch_state */
2717 register regmatch_state *st;
2719 /* cache heavy used fields of st in registers */
2720 register regnode *scan;
2721 register regnode *next;
2722 register I32 n = 0; /* initialize to shut up compiler warning */
2723 register char *locinput = PL_reginput;
2725 /* these variables are NOT saved during a recusive RFEGMATCH: */
2726 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2727 bool result; /* return value of S_regmatch */
2728 int depth = 0; /* depth of recursion */
2729 regmatch_state *yes_state = NULL; /* state to pop to on success of
2734 GET_RE_DEBUG_FLAGS_DECL;
2738 /* on first ever call to regmatch, allocate first slab */
2739 if (!PL_regmatch_slab) {
2740 Newx(PL_regmatch_slab, 1, regmatch_slab);
2741 PL_regmatch_slab->prev = NULL;
2742 PL_regmatch_slab->next = NULL;
2743 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2746 /* remember current high-water mark for exit */
2747 /* XXX this should be done with SAVE* instead */
2748 orig_slab = PL_regmatch_slab;
2749 orig_state = PL_regmatch_state;
2751 /* grab next free state slot */
2752 st = ++PL_regmatch_state;
2753 if (st > SLAB_LAST(PL_regmatch_slab))
2754 st = PL_regmatch_state = S_push_slab(aTHX);
2760 /* Note that nextchr is a byte even in UTF */
2761 nextchr = UCHARAT(locinput);
2763 while (scan != NULL) {
2766 SV * const prop = sv_newmortal();
2767 dump_exec_pos( locinput, scan, do_utf8 );
2768 regprop(rex, prop, scan);
2770 PerlIO_printf(Perl_debug_log,
2771 "%3"IVdf":%*s%s(%"IVdf")\n",
2772 (IV)(scan - rex->program), PL_regindent*2, "",
2774 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2777 next = scan + NEXT_OFF(scan);
2780 state_num = OP(scan);
2783 switch (state_num) {
2785 if (locinput == PL_bostr)
2787 /* reginfo->till = reginfo->bol; */
2792 if (locinput == PL_bostr ||
2793 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2799 if (locinput == PL_bostr)
2803 if (locinput == reginfo->ganch)
2809 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2814 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2816 if (PL_regeol - locinput > 1)
2820 if (PL_regeol != locinput)
2824 if (!nextchr && locinput >= PL_regeol)
2827 locinput += PL_utf8skip[nextchr];
2828 if (locinput > PL_regeol)
2830 nextchr = UCHARAT(locinput);
2833 nextchr = UCHARAT(++locinput);
2836 if (!nextchr && locinput >= PL_regeol)
2838 nextchr = UCHARAT(++locinput);
2841 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2844 locinput += PL_utf8skip[nextchr];
2845 if (locinput > PL_regeol)
2847 nextchr = UCHARAT(locinput);
2850 nextchr = UCHARAT(++locinput);
2854 /* what type of TRIE am I? (utf8 makes this contextual) */
2855 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2856 trie_type = do_utf8 ?
2857 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2860 /* what trie are we using right now */
2861 reg_trie_data * const trie
2862 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2863 U32 state = trie->startstate;
2865 if (trie->bitmap && trie_type != trie_utf8_fold &&
2866 !TRIE_BITMAP_TEST(trie,*locinput)
2868 if (trie->states[ state ].wordnum) {
2870 PerlIO_printf(Perl_debug_log,
2871 "%*s %smatched empty string...%s\n",
2872 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2877 PerlIO_printf(Perl_debug_log,
2878 "%*s %sfailed to match start class...%s\n",
2879 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2886 traverse the TRIE keeping track of all accepting states
2887 we transition through until we get to a failing node.
2890 U8 *uc = ( U8* )locinput;
2896 U8 *uscan = (U8*)NULL;
2898 SV *sv_accept_buff = NULL;
2900 st->u.trie.accepted = 0; /* how many accepting states we have seen */
2903 while ( state && uc <= (U8*)PL_regeol ) {
2905 if (trie->states[ state ].wordnum) {
2906 if (!st->u.trie.accepted ) {
2909 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2910 sv_accept_buff=newSV(bufflen *
2911 sizeof(reg_trie_accepted) - 1);
2912 SvCUR_set(sv_accept_buff,
2913 sizeof(reg_trie_accepted));
2914 SvPOK_on(sv_accept_buff);
2915 sv_2mortal(sv_accept_buff);
2916 st->u.trie.accept_buff =
2917 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2920 if (st->u.trie.accepted >= bufflen) {
2922 st->u.trie.accept_buff =(reg_trie_accepted*)
2923 SvGROW(sv_accept_buff,
2924 bufflen * sizeof(reg_trie_accepted));
2926 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2927 + sizeof(reg_trie_accepted));
2929 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2930 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2931 ++st->u.trie.accepted;
2934 base = trie->states[ state ].trans.base;
2936 DEBUG_TRIE_EXECUTE_r({
2937 dump_exec_pos( (char *)uc, scan, do_utf8 );
2938 PerlIO_printf( Perl_debug_log,
2939 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2940 2+PL_regindent * 2, "", PL_colors[4],
2941 (UV)state, (UV)base, (UV)st->u.trie.accepted );
2945 switch (trie_type) {
2946 case trie_utf8_fold:
2948 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2953 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2954 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2955 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2956 foldlen -= UNISKIP( uvc );
2957 uscan = foldbuf + UNISKIP( uvc );
2961 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2970 charid = trie->charmap[ uvc ];
2974 if (trie->widecharmap) {
2975 SV** const svpp = hv_fetch(trie->widecharmap,
2976 (char*)&uvc, sizeof(UV), 0);
2978 charid = (U16)SvIV(*svpp);
2983 (base + charid > trie->uniquecharcount )
2984 && (base + charid - 1 - trie->uniquecharcount
2986 && trie->trans[base + charid - 1 -
2987 trie->uniquecharcount].check == state)
2989 state = trie->trans[base + charid - 1 -
2990 trie->uniquecharcount ].next;
3001 DEBUG_TRIE_EXECUTE_r(
3002 PerlIO_printf( Perl_debug_log,
3003 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3004 charid, uvc, (UV)state, PL_colors[5] );
3007 if (!st->u.trie.accepted )
3011 There was at least one accepting state that we
3012 transitioned through. Presumably the number of accepting
3013 states is going to be low, typically one or two. So we
3014 simply scan through to find the one with lowest wordnum.
3015 Once we find it, we swap the last state into its place
3016 and decrement the size. We then try to match the rest of
3017 the pattern at the point where the word ends, if we
3018 succeed then we end the loop, otherwise the loop
3019 eventually terminates once all of the accepting states
3023 if ( st->u.trie.accepted == 1 ) {
3025 SV ** const tmp = RX_DEBUG(reginfo->prog)
3026 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3028 PerlIO_printf( Perl_debug_log,
3029 "%*s %sonly one match : #%d <%s>%s\n",
3030 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3031 st->u.trie.accept_buff[ 0 ].wordnum,
3032 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3035 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
3036 /* in this case we free tmps/leave before we call regmatch
3037 as we wont be using accept_buff again. */
3040 /* do we need this? why dont we just do a break? */
3041 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3042 /*** all unsaved local vars undefined at this point */
3045 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
3046 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
3049 while ( !result && st->u.trie.accepted-- ) {
3052 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
3053 DEBUG_TRIE_EXECUTE_r(
3054 PerlIO_printf( Perl_debug_log,
3055 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3056 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3057 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3058 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
3061 if (st->u.trie.accept_buff[cur].wordnum <
3062 st->u.trie.accept_buff[best].wordnum)
3066 reg_trie_data * const trie = (reg_trie_data*)
3067 rex->data->data[ARG(scan)];
3068 SV ** const tmp = RX_DEBUG(reginfo->prog)
3069 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3071 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3072 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3073 st->u.trie.accept_buff[best].wordnum,
3074 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3077 if ( best<st->u.trie.accepted ) {
3078 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3079 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3080 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3081 best = st->u.trie.accepted;
3083 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
3086 as far as I can tell we only need the SAVETMPS/FREETMPS
3087 for re's with EVAL in them but I'm leaving them in for
3088 all until I can be sure.
3091 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3092 /*** all unsaved local vars undefined at this point */
3105 /* unreached codepoint */
3107 char *s = STRING(scan);
3108 st->ln = STR_LEN(scan);
3109 if (do_utf8 != UTF) {
3110 /* The target and the pattern have differing utf8ness. */
3112 const char * const e = s + st->ln;
3115 /* The target is utf8, the pattern is not utf8. */
3120 if (NATIVE_TO_UNI(*(U8*)s) !=
3121 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3129 /* The target is not utf8, the pattern is utf8. */
3134 if (NATIVE_TO_UNI(*((U8*)l)) !=
3135 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3143 nextchr = UCHARAT(locinput);
3146 /* The target and the pattern have the same utf8ness. */
3147 /* Inline the first character, for speed. */
3148 if (UCHARAT(s) != nextchr)
3150 if (PL_regeol - locinput < st->ln)
3152 if (st->ln > 1 && memNE(s, locinput, st->ln))
3155 nextchr = UCHARAT(locinput);
3159 PL_reg_flags |= RF_tainted;
3162 char * const s = STRING(scan);
3163 st->ln = STR_LEN(scan);
3165 if (do_utf8 || UTF) {
3166 /* Either target or the pattern are utf8. */
3167 const char * const l = locinput;
3168 char *e = PL_regeol;
3170 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
3171 l, &e, 0, do_utf8)) {
3172 /* One more case for the sharp s:
3173 * pack("U0U*", 0xDF) =~ /ss/i,
3174 * the 0xC3 0x9F are the UTF-8
3175 * byte sequence for the U+00DF. */
3177 toLOWER(s[0]) == 's' &&
3179 toLOWER(s[1]) == 's' &&
3186 nextchr = UCHARAT(locinput);
3190 /* Neither the target and the pattern are utf8. */
3192 /* Inline the first character, for speed. */
3193 if (UCHARAT(s) != nextchr &&
3194 UCHARAT(s) != ((OP(scan) == EXACTF)
3195 ? PL_fold : PL_fold_locale)[nextchr])
3197 if (PL_regeol - locinput < st->ln)
3199 if (st->ln > 1 && (OP(scan) == EXACTF
3200 ? ibcmp(s, locinput, st->ln)
3201 : ibcmp_locale(s, locinput, st->ln)))
3204 nextchr = UCHARAT(locinput);
3209 STRLEN inclasslen = PL_regeol - locinput;
3211 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3213 if (locinput >= PL_regeol)
3215 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3216 nextchr = UCHARAT(locinput);
3221 nextchr = UCHARAT(locinput);
3222 if (!REGINCLASS(rex, scan, (U8*)locinput))
3224 if (!nextchr && locinput >= PL_regeol)
3226 nextchr = UCHARAT(++locinput);
3230 /* If we might have the case of the German sharp s
3231 * in a casefolding Unicode character class. */
3233 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3234 locinput += SHARP_S_SKIP;
3235 nextchr = UCHARAT(locinput);
3241 PL_reg_flags |= RF_tainted;
3247 LOAD_UTF8_CHARCLASS_ALNUM();
3248 if (!(OP(scan) == ALNUM
3249 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3250 : isALNUM_LC_utf8((U8*)locinput)))
3254 locinput += PL_utf8skip[nextchr];
3255 nextchr = UCHARAT(locinput);
3258 if (!(OP(scan) == ALNUM
3259 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3261 nextchr = UCHARAT(++locinput);
3264 PL_reg_flags |= RF_tainted;
3267 if (!nextchr && locinput >= PL_regeol)
3270 LOAD_UTF8_CHARCLASS_ALNUM();
3271 if (OP(scan) == NALNUM
3272 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3273 : isALNUM_LC_utf8((U8*)locinput))
3277 locinput += PL_utf8skip[nextchr];
3278 nextchr = UCHARAT(locinput);
3281 if (OP(scan) == NALNUM
3282 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3284 nextchr = UCHARAT(++locinput);
3288 PL_reg_flags |= RF_tainted;
3292 /* was last char in word? */
3294 if (locinput == PL_bostr)
3297 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3299 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3301 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3302 st->ln = isALNUM_uni(st->ln);
3303 LOAD_UTF8_CHARCLASS_ALNUM();
3304 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3307 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3308 n = isALNUM_LC_utf8((U8*)locinput);
3312 st->ln = (locinput != PL_bostr) ?
3313 UCHARAT(locinput - 1) : '\n';
3314 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3315 st->ln = isALNUM(st->ln);
3316 n = isALNUM(nextchr);
3319 st->ln = isALNUM_LC(st->ln);
3320 n = isALNUM_LC(nextchr);
3323 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3324 OP(scan) == BOUNDL))
3328 PL_reg_flags |= RF_tainted;
3334 if (UTF8_IS_CONTINUED(nextchr)) {
3335 LOAD_UTF8_CHARCLASS_SPACE();
3336 if (!(OP(scan) == SPACE
3337 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3338 : isSPACE_LC_utf8((U8*)locinput)))
3342 locinput += PL_utf8skip[nextchr];
3343 nextchr = UCHARAT(locinput);
3346 if (!(OP(scan) == SPACE
3347 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3349 nextchr = UCHARAT(++locinput);
3352 if (!(OP(scan) == SPACE
3353 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3355 nextchr = UCHARAT(++locinput);
3359 PL_reg_flags |= RF_tainted;
3362 if (!nextchr && locinput >= PL_regeol)
3365 LOAD_UTF8_CHARCLASS_SPACE();
3366 if (OP(scan) == NSPACE
3367 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3368 : isSPACE_LC_utf8((U8*)locinput))
3372 locinput += PL_utf8skip[nextchr];
3373 nextchr = UCHARAT(locinput);
3376 if (OP(scan) == NSPACE
3377 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3379 nextchr = UCHARAT(++locinput);
3382 PL_reg_flags |= RF_tainted;
3388 LOAD_UTF8_CHARCLASS_DIGIT();
3389 if (!(OP(scan) == DIGIT
3390 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3391 : isDIGIT_LC_utf8((U8*)locinput)))
3395 locinput += PL_utf8skip[nextchr];
3396 nextchr = UCHARAT(locinput);
3399 if (!(OP(scan) == DIGIT
3400 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3402 nextchr = UCHARAT(++locinput);
3405 PL_reg_flags |= RF_tainted;
3408 if (!nextchr && locinput >= PL_regeol)
3411 LOAD_UTF8_CHARCLASS_DIGIT();
3412 if (OP(scan) == NDIGIT
3413 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3414 : isDIGIT_LC_utf8((U8*)locinput))
3418 locinput += PL_utf8skip[nextchr];
3419 nextchr = UCHARAT(locinput);
3422 if (OP(scan) == NDIGIT
3423 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3425 nextchr = UCHARAT(++locinput);
3428 if (locinput >= PL_regeol)
3431 LOAD_UTF8_CHARCLASS_MARK();
3432 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3434 locinput += PL_utf8skip[nextchr];
3435 while (locinput < PL_regeol &&
3436 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3437 locinput += UTF8SKIP(locinput);
3438 if (locinput > PL_regeol)
3443 nextchr = UCHARAT(locinput);
3446 PL_reg_flags |= RF_tainted;
3451 n = ARG(scan); /* which paren pair */
3452 st->ln = PL_regstartp[n];
3453 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3454 if ((I32)*PL_reglastparen < n || st->ln == -1)
3455 sayNO; /* Do not match unless seen CLOSEn. */
3456 if (st->ln == PL_regendp[n])
3459 s = PL_bostr + st->ln;
3460 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3462 const char *e = PL_bostr + PL_regendp[n];
3464 * Note that we can't do the "other character" lookup trick as
3465 * in the 8-bit case (no pun intended) because in Unicode we
3466 * have to map both upper and title case to lower case.
3468 if (OP(scan) == REFF) {
3470 STRLEN ulen1, ulen2;
3471 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3472 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3476 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3477 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3478 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3485 nextchr = UCHARAT(locinput);
3489 /* Inline the first character, for speed. */
3490 if (UCHARAT(s) != nextchr &&
3492 (UCHARAT(s) != ((OP(scan) == REFF
3493 ? PL_fold : PL_fold_locale)[nextchr]))))
3495 st->ln = PL_regendp[n] - st->ln;
3496 if (locinput + st->ln > PL_regeol)
3498 if (st->ln > 1 && (OP(scan) == REF
3499 ? memNE(s, locinput, st->ln)
3501 ? ibcmp(s, locinput, st->ln)
3502 : ibcmp_locale(s, locinput, st->ln))))
3505 nextchr = UCHARAT(locinput);
3516 #define ST st->u.eval
3518 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3522 /* execute the code in the {...} */
3524 SV ** const before = SP;
3525 OP_4tree * const oop = PL_op;
3526 COP * const ocurcop = PL_curcop;
3530 PL_op = (OP_4tree*)rex->data->data[n];
3531 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3532 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3533 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3535 CALLRUNOPS(aTHX); /* Scalar context. */
3538 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3545 PAD_RESTORE_LOCAL(old_comppad);
3546 PL_curcop = ocurcop;
3549 sv_setsv(save_scalar(PL_replgv), ret);
3553 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3556 /* extract RE object from returned value; compiling if
3561 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3562 mg = mg_find(sv, PERL_MAGIC_qr);
3563 else if (SvSMAGICAL(ret)) {
3564 if (SvGMAGICAL(ret))
3565 sv_unmagic(ret, PERL_MAGIC_qr);
3567 mg = mg_find(ret, PERL_MAGIC_qr);
3571 re = (regexp *)mg->mg_obj;
3572 (void)ReREFCNT_inc(re);
3576 const char * const t = SvPV_const(ret, len);
3578 const I32 osize = PL_regsize;
3581 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3582 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3584 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3586 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3592 /* run the pattern returned from (??{...}) */
3595 PerlIO_printf(Perl_debug_log,
3596 "Entering embedded \"%s%.60s%s%s\"\n",
3600 (strlen(re->precomp) > 60 ? "..." : ""))
3603 ST.cp = regcppush(0); /* Save *all* the positions. */
3604 REGCP_SET(ST.lastcp);
3605 *PL_reglastparen = 0;
3606 *PL_reglastcloseparen = 0;
3607 PL_reginput = locinput;
3609 /* XXXX This is too dramatic a measure... */
3613 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3614 ((re->reganch & ROPT_UTF8) != 0);
3615 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3620 /* now continue from first node in postoned RE */
3621 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3624 /* /(?(?{...})X|Y)/ */
3625 st->sw = SvTRUE(ret);
3630 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3632 PL_reg_flags ^= RF_utf8;
3635 /* XXXX This is too dramatic a measure... */
3637 /* Restore parens of the caller without popping the
3640 const I32 tmp = PL_savestack_ix;
3641 PL_savestack_ix = ST.lastcp;
3643 PL_savestack_ix = tmp;
3645 PL_reginput = locinput;
3646 /* continue at the node following the (??{...}) */
3650 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3651 /* Restore state to the outer re then re-throw the failure */
3653 PL_reg_flags ^= RF_utf8;
3657 /* XXXX This is too dramatic a measure... */
3660 PL_reginput = locinput;
3661 REGCP_UNWIND(ST.lastcp);
3668 n = ARG(scan); /* which paren pair */
3669 PL_reg_start_tmp[n] = locinput;
3674 n = ARG(scan); /* which paren pair */
3675 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3676 PL_regendp[n] = locinput - PL_bostr;
3677 if (n > (I32)*PL_reglastparen)
3678 *PL_reglastparen = n;
3679 *PL_reglastcloseparen = n;
3682 n = ARG(scan); /* which paren pair */
3683 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3686 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3688 next = NEXTOPER(NEXTOPER(scan));
3690 next = scan + ARG(scan);
3691 if (OP(next) == IFTHEN) /* Fake one. */
3692 next = NEXTOPER(NEXTOPER(next));
3696 st->logical = scan->flags;
3698 /*******************************************************************
3699 cc points to the regmatch_state associated with the most recent CURLYX.
3700 This struct contains info about the innermost (...)* loop (an
3701 "infoblock"), and a pointer to the next outer cc.
3703 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3705 1) After matching Y, regnode for CURLYX is processed;
3707 2) This regnode populates cc, and calls regmatch() recursively
3708 with the starting point at WHILEM node;
3710 3) Each hit of WHILEM node tries to match A and Z (in the order
3711 depending on the current iteration, min/max of {min,max} and
3712 greediness). The information about where are nodes for "A"
3713 and "Z" is read from cc, as is info on how many times "A"
3714 was already matched, and greediness.
3716 4) After A matches, the same WHILEM node is hit again.
3718 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3719 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3720 resets cc, since this Y(A)*Z can be a part of some other loop:
3721 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3722 of the external loop.
3724 Currently present infoblocks form a tree with a stem formed by st->cc
3725 and whatever it mentions via ->next, and additional attached trees
3726 corresponding to temporarily unset infoblocks as in "5" above.
3728 In the following picture, infoblocks for outer loop of
3729 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3730 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3731 infoblocks are drawn below the "reset" infoblock.
3733 In fact in the picture below we do not show failed matches for Z and T
3734 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3735 more obvious *why* one needs to *temporary* unset infoblocks.]
3737 Matched REx position InfoBlocks Comment
3741 Y A)*?Z)*?T x <- O <- I
3742 YA )*?Z)*?T x <- O <- I
3743 YA A)*?Z)*?T x <- O <- I
3744 YAA )*?Z)*?T x <- O <- I
3745 YAA Z)*?T x <- O # Temporary unset I
3748 YAAZ Y(A)*?Z)*?T x <- O
3751 YAAZY (A)*?Z)*?T x <- O
3754 YAAZY A)*?Z)*?T x <- O <- I
3757 YAAZYA )*?Z)*?T x <- O <- I
3760 YAAZYA Z)*?T x <- O # Temporary unset I
3766 YAAZYAZ T x # Temporary unset O
3773 *******************************************************************/
3776 /* No need to save/restore up to this paren */
3777 I32 parenfloor = scan->flags;
3781 CURLYX and WHILEM are always paired: they're the moral
3782 equivalent of pp_enteriter anbd pp_iter.
3784 The only time next could be null is if the node tree is
3785 corrupt. This was mentioned on p5p a few days ago.
3787 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3788 So we'll assert that this is true:
3791 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3793 /* XXXX Probably it is better to teach regpush to support
3794 parenfloor > PL_regsize... */
3795 if (parenfloor > (I32)*PL_reglastparen)
3796 parenfloor = *PL_reglastparen; /* Pessimization... */
3798 st->u.curlyx.cp = PL_savestack_ix;
3799 st->u.curlyx.outercc = st->cc;
3801 /* these fields contain the state of the current curly.
3802 * they are accessed by subsequent WHILEMs;
3803 * cur and lastloc are also updated by WHILEM */
3804 st->u.curlyx.parenfloor = parenfloor;
3805 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3806 st->u.curlyx.min = ARG1(scan);
3807 st->u.curlyx.max = ARG2(scan);
3808 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3809 st->u.curlyx.lastloc = 0;
3810 /* st->next and st->minmod are also read by WHILEM */
3812 PL_reginput = locinput;
3813 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3814 /*** all unsaved local vars undefined at this point */
3815 regcpblow(st->u.curlyx.cp);
3816 st->cc = st->u.curlyx.outercc;
3822 * This is really hard to understand, because after we match
3823 * what we're trying to match, we must make sure the rest of
3824 * the REx is going to match for sure, and to do that we have
3825 * to go back UP the parse tree by recursing ever deeper. And
3826 * if it fails, we have to reset our parent's current state
3827 * that we can try again after backing off.
3832 st->cc gets initialised by CURLYX ready for use by WHILEM.
3833 So again, unless somethings been corrupted, st->cc cannot
3834 be null at that point in WHILEM.
3836 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3837 So we'll assert that this is true:
3840 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3841 st->u.whilem.cache_offset = 0;
3842 st->u.whilem.cache_bit = 0;
3844 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3845 PL_reginput = locinput;
3848 PerlIO_printf(Perl_debug_log,
3849 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3850 REPORT_CODE_OFF+PL_regindent*2, "",
3851 (long)n, (long)st->cc->u.curlyx.min,
3852 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3855 /* If degenerate scan matches "", assume scan done. */
3857 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3858 st->u.whilem.savecc = st->cc;
3859 st->cc = st->cc->u.curlyx.outercc;
3861 st->ln = st->cc->u.curlyx.cur;
3863 PerlIO_printf(Perl_debug_log,
3864 "%*s empty match detected, try continuation...\n",
3865 REPORT_CODE_OFF+PL_regindent*2, "")
3867 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3868 /*** all unsaved local vars undefined at this point */
3869 st->cc = st->u.whilem.savecc;
3872 if (st->cc->u.curlyx.outercc)
3873 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3877 /* First just match a string of min scans. */
3879 if (n < st->cc->u.curlyx.min) {
3880 st->cc->u.curlyx.cur = n;
3881 st->cc->u.curlyx.lastloc = locinput;
3882 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3883 /*** all unsaved local vars undefined at this point */
3886 st->cc->u.curlyx.cur = n - 1;
3887 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;