This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix a problem with jump-tries, add (?FAIL) pattern.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
c476f425 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
6bda09f9 169#define REGCP_OTHER_ELEMS 8
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 171
3280af22 172 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 173/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
174 SSPUSHINT(PL_regendp[p]);
175 SSPUSHINT(PL_regstartp[p]);
3280af22 176 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 177 SSPUSHINT(p);
40a82448
DM
178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
179 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
180 (UV)p, (IV)PL_regstartp[p],
181 (IV)(PL_reg_start_tmp[p] - PL_bostr),
182 (IV)PL_regendp[p]
183 ));
a0d0e21e 184 }
b1ce53c5 185/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
6bda09f9
YO
186 SSPUSHPTR(PL_regstartp);
187 SSPUSHPTR(PL_regendp);
3280af22
NIS
188 SSPUSHINT(PL_regsize);
189 SSPUSHINT(*PL_reglastparen);
a01268b5 190 SSPUSHINT(*PL_reglastcloseparen);
3280af22 191 SSPUSHPTR(PL_reginput);
41123dfd
JH
192#define REGCP_FRAME_ELEMS 2
193/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
194 * are needed for the regexp context stack bookkeeping. */
195 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 196 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 197
a0d0e21e
LW
198 return retval;
199}
200
c277df42 201/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
202#define REGCP_SET(cp) \
203 DEBUG_STATE_r( \
ab3bbdeb 204 PerlIO_printf(Perl_debug_log, \
e4f74956 205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
206 (IV)PL_savestack_ix)); \
207 cp = PL_savestack_ix
c3464db5 208
ab3bbdeb 209#define REGCP_UNWIND(cp) \
e4f74956 210 DEBUG_STATE_r( \
ab3bbdeb 211 if (cp != PL_savestack_ix) \
e4f74956
YO
212 PerlIO_printf(Perl_debug_log, \
213 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
214 (IV)(cp), (IV)PL_savestack_ix)); \
215 regcpblow(cp)
c277df42 216
76e3520e 217STATIC char *
097eb12c 218S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 219{
97aff369 220 dVAR;
b1ce53c5 221 I32 i;
a0d0e21e 222 char *input;
b1ce53c5 223
a3621e74
YO
224 GET_RE_DEBUG_FLAGS_DECL;
225
b1ce53c5 226 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 227 i = SSPOPINT;
b1ce53c5
JH
228 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
229 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 230 input = (char *) SSPOPPTR;
a01268b5 231 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
232 *PL_reglastparen = SSPOPINT;
233 PL_regsize = SSPOPINT;
6bda09f9
YO
234 PL_regendp=(I32 *) SSPOPPTR;
235 PL_regstartp=(I32 *) SSPOPPTR;
b1ce53c5 236
6bda09f9 237
b1ce53c5 238 /* Now restore the parentheses context. */
41123dfd
JH
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 241 I32 tmps;
097eb12c 242 U32 paren = (U32)SSPOPINT;
3280af22 243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
244 PL_regstartp[paren] = SSPOPINT;
245 tmps = SSPOPINT;
3280af22
NIS
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
a3621e74 248 DEBUG_EXECUTE_r(
c3464db5 249 PerlIO_printf(Perl_debug_log,
b900a521 250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 251 (UV)paren, (IV)PL_regstartp[paren],
b900a521 252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 253 (IV)PL_regendp[paren],
3280af22 254 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 255 );
a0d0e21e 256 }
a3621e74 257 DEBUG_EXECUTE_r(
bb7a0f54 258 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 259 PerlIO_printf(Perl_debug_log,
faccc32b 260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 261 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
262 }
263 );
daf18116 264#if 1
dafc8851
JH
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 * --jhi */
bb7a0f54 275 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
276 if (i > PL_regsize)
277 PL_regstartp[i] = -1;
278 PL_regendp[i] = -1;
a0d0e21e 279 }
dafc8851 280#endif
a0d0e21e
LW
281 return input;
282}
283
02db2b7b 284#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 285
a687059c 286/*
e50aee73 287 * pregexec and friends
a687059c
LW
288 */
289
76234dfb 290#ifndef PERL_IN_XSUB_RE
a687059c 291/*
c277df42 292 - pregexec - match a regexp against a string
a687059c 293 */
c277df42 294I32
864dbfa3 295Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 296 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
297/* strend: pointer to null at end of string */
298/* strbeg: real beginning of string */
299/* minend: end of match must be >=minend after stringarg. */
300/* nosave: For optimizations. */
301{
302 return
9041c2e3 303 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
304 nosave ? 0 : REXEC_COPY_STR);
305}
76234dfb 306#endif
22e551b9 307
9041c2e3 308/*
cad2e5aa
JH
309 * Need to implement the following flags for reg_anch:
310 *
311 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * USE_INTUIT_ML
313 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
314 * INTUIT_AUTORITATIVE_ML
315 * INTUIT_ONCE_NOML - Intuit can match in one location only.
316 * INTUIT_ONCE_ML
317 *
318 * Another flag for this function: SECOND_TIME (so that float substrs
319 * with giant delta may be not rechecked).
320 */
321
322/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323
3f7c398e 324/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
325 Otherwise, only SvCUR(sv) is used to get strbeg. */
326
327/* XXXX We assume that strpos is strbeg unless sv. */
328
6eb5f6b9
JH
329/* XXXX Some places assume that there is a fixed substring.
330 An update may be needed if optimizer marks as "INTUITable"
331 RExen without fixed substrings. Similarly, it is assumed that
332 lengths of all the strings are no more than minlen, thus they
333 cannot come from lookahead.
334 (Or minlen should take into account lookahead.) */
335
2c2d71f5
JH
336/* A failure to find a constant substring means that there is no need to make
337 an expensive call to REx engine, thus we celebrate a failure. Similarly,
338 finding a substring too deep into the string means that less calls to
30944b6d
IZ
339 regtry() should be needed.
340
341 REx compiler's optimizer found 4 possible hints:
342 a) Anchored substring;
343 b) Fixed substring;
344 c) Whether we are anchored (beginning-of-line or \G);
345 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 346 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
347 string which does not contradict any of them.
348 */
2c2d71f5 349
6eb5f6b9
JH
350/* Most of decisions we do here should have been done at compile time.
351 The nodes of the REx which we used for the search should have been
352 deleted from the finite automaton. */
353
cad2e5aa
JH
354char *
355Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
356 char *strend, U32 flags, re_scream_pos_data *data)
357{
97aff369 358 dVAR;
b7953727 359 register I32 start_shift = 0;
cad2e5aa 360 /* Should be nonnegative! */
b7953727 361 register I32 end_shift = 0;
2c2d71f5
JH
362 register char *s;
363 register SV *check;
a1933d95 364 char *strbeg;
cad2e5aa 365 char *t;
1de06328 366 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 367 I32 ml_anch;
bd61b366
SS
368 register char *other_last = NULL; /* other substr checked before this */
369 char *check_at = NULL; /* check substr found at this pos */
1df70142 370 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 371#ifdef DEBUGGING
890ce7af 372 const char * const i_strpos = strpos;
30944b6d 373#endif
a3621e74
YO
374
375 GET_RE_DEBUG_FLAGS_DECL;
376
a30b2f1f 377 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 378
b8d68ded 379 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
380 PL_reg_flags |= RF_utf8;
381 }
ab3bbdeb
YO
382 DEBUG_EXECUTE_r(
383 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
384 sv ? "Guessing start of match in sv for"
385 : "Guessing start of match in string for");
2a782b5b 386 );
cad2e5aa 387
c344f387
JH
388 /* CHR_DIST() would be more correct here but it makes things slow. */
389 if (prog->minlen > strend - strpos) {
a3621e74 390 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 391 "String too short... [re_intuit_start]\n"));
cad2e5aa 392 goto fail;
2c2d71f5 393 }
1de06328 394
a1933d95 395 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 396 PL_regeol = strend;
33b8afdf
JH
397 if (do_utf8) {
398 if (!prog->check_utf8 && prog->check_substr)
399 to_utf8_substr(prog);
400 check = prog->check_utf8;
401 } else {
402 if (!prog->check_substr && prog->check_utf8)
403 to_byte_substr(prog);
404 check = prog->check_substr;
405 }
1de06328 406 if (check == &PL_sv_undef) {
a3621e74 407 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 408 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
409 goto fail;
410 }
2c2d71f5 411 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
412 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
413 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 414 && !multiline ) ); /* Check after \n? */
cad2e5aa 415
7e25d62c
JH
416 if (!ml_anch) {
417 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
418 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 419 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
420 && sv && !SvROK(sv)
421 && (strpos != strbeg)) {
a3621e74 422 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
423 goto fail;
424 }
425 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 426 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 427 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
428 I32 slen;
429
1aa99e6b 430 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 431
653099ff
GS
432 if (SvTAIL(check)) {
433 slen = SvCUR(check); /* >= 1 */
cad2e5aa 434
9041c2e3 435 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 436 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 438 goto fail_finish;
cad2e5aa
JH
439 }
440 /* Now should match s[0..slen-2] */
441 slen--;
3f7c398e 442 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 443 || (slen > 1
3f7c398e 444 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 445 report_neq:
a3621e74 446 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
447 goto fail_finish;
448 }
cad2e5aa 449 }
3f7c398e 450 else if (*SvPVX_const(check) != *s
653099ff 451 || ((slen = SvCUR(check)) > 1
3f7c398e 452 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 453 goto report_neq;
c315bfe8 454 check_at = s;
2c2d71f5 455 goto success_at_start;
7e25d62c 456 }
cad2e5aa 457 }
2c2d71f5 458 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 459 s = strpos;
2c2d71f5 460 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
461 end_shift = prog->check_end_shift;
462
2c2d71f5 463 if (!ml_anch) {
a3b680e6 464 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 465 - (SvTAIL(check) != 0);
a3b680e6 466 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
467
468 if (end_shift < eshift)
469 end_shift = eshift;
470 }
cad2e5aa 471 }
2c2d71f5 472 else { /* Can match at random position */
cad2e5aa
JH
473 ml_anch = 0;
474 s = strpos;
1de06328
YO
475 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
476 end_shift = prog->check_end_shift;
477
478 /* end shift should be non negative here */
cad2e5aa
JH
479 }
480
2c2d71f5 481#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 482 if (end_shift < 0)
1de06328 483 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 484 (IV)end_shift, prog->precomp);
2c2d71f5
JH
485#endif
486
2c2d71f5
JH
487 restart:
488 /* Find a possible match in the region s..strend by looking for
489 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
490
491 {
492 I32 srch_start_shift = start_shift;
493 I32 srch_end_shift = end_shift;
494 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
495 srch_end_shift -= ((strbeg - s) - srch_start_shift);
496 srch_start_shift = strbeg - s;
497 }
6bda09f9 498 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
499 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
500 (IV)prog->check_offset_min,
501 (IV)srch_start_shift,
502 (IV)srch_end_shift,
503 (IV)prog->check_end_shift);
504 });
505
cad2e5aa 506 if (flags & REXEC_SCREAM) {
cad2e5aa 507 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 508 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 509
2c2d71f5
JH
510 if (PL_screamfirst[BmRARE(check)] >= 0
511 || ( BmRARE(check) == '\n'
512 && (BmPREVIOUS(check) == SvCUR(check) - 1)
513 && SvTAIL(check) ))
9041c2e3 514 s = screaminstr(sv, check,
1de06328 515 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 516 else
2c2d71f5 517 goto fail_finish;
4addbd3b
HS
518 /* we may be pointing at the wrong string */
519 if (s && RX_MATCH_COPIED(prog))
3f7c398e 520 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
521 if (data)
522 *data->scream_olds = s;
523 }
1de06328
YO
524 else {
525 U8* start_point;
526 U8* end_point;
527 if (prog->reganch & ROPT_CANY_SEEN) {
528 start_point= (U8*)(s + srch_start_shift);
529 end_point= (U8*)(strend - srch_end_shift);
530 } else {
531 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
532 end_point= HOP3(strend, -srch_end_shift, strbeg);
533 }
6bda09f9 534 DEBUG_OPTIMISE_MORE_r({
56570a2c 535 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 536 (int)(end_point - start_point),
fc8cd66c 537 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
538 start_point);
539 });
540
541 s = fbm_instr( start_point, end_point,
7fba1cd6 542 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
543 }
544 }
cad2e5aa
JH
545 /* Update the count-of-usability, remove useless subpatterns,
546 unshift s. */
2c2d71f5 547
ab3bbdeb
YO
548 DEBUG_EXECUTE_r({
549 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
550 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
551 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 552 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
553 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
554 ? "anchored" : "floating"),
555 quoted,
556 RE_SV_TAIL(check),
557 (s ? " at offset " : "...\n") );
558 });
2c2d71f5
JH
559
560 if (!s)
561 goto fail_finish;
2c2d71f5 562 /* Finish the diagnostic message */
a3621e74 563 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 564
1de06328
YO
565 /* XXX dmq: first branch is for positive lookbehind...
566 Our check string is offset from the beginning of the pattern.
567 So we need to do any stclass tests offset forward from that
568 point. I think. :-(
569 */
570
571
572
573 check_at=s;
574
575
2c2d71f5
JH
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 579 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
582 */
583
1de06328
YO
584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
585 : (prog->float_substr && prog->anchored_substr))
586 {
30944b6d 587 /* Take into account the "other" substring. */
2c2d71f5
JH
588 /* XXXX May be hopelessly wrong for UTF... */
589 if (!other_last)
6eb5f6b9 590 other_last = strpos;
33b8afdf 591 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
592 do_other_anchored:
593 {
890ce7af
AL
594 char * const last = HOP3c(s, -start_shift, strbeg);
595 char *last1, *last2;
be8e71aa 596 char * const saved_s = s;
33b8afdf 597 SV* must;
2c2d71f5 598
2c2d71f5
JH
599 t = s - prog->check_offset_max;
600 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 601 && (!do_utf8
0ce71af7 602 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 603 && t > strpos)))
6f207bd3 604 NOOP;
2c2d71f5
JH
605 else
606 t = strpos;
1aa99e6b 607 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
608 if (t < other_last) /* These positions already checked */
609 t = other_last;
1aa99e6b 610 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
611 if (last < last1)
612 last1 = last;
1de06328
YO
613 /* XXXX It is not documented what units *_offsets are in.
614 We assume bytes, but this is clearly wrong.
615 Meaning this code needs to be carefully reviewed for errors.
616 dmq.
617 */
618
2c2d71f5 619 /* On end-of-str: see comment below. */
33b8afdf
JH
620 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
621 if (must == &PL_sv_undef) {
622 s = (char*)NULL;
1de06328 623 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
624 }
625 else
626 s = fbm_instr(
627 (unsigned char*)t,
628 HOP3(HOP3(last1, prog->anchored_offset, strend)
629 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
630 must,
7fba1cd6 631 multiline ? FBMrf_MULTILINE : 0
33b8afdf 632 );
ab3bbdeb
YO
633 DEBUG_EXECUTE_r({
634 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
635 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
636 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 637 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
638 quoted, RE_SV_TAIL(must));
639 });
640
641
2c2d71f5
JH
642 if (!s) {
643 if (last1 >= last2) {
a3621e74 644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
645 ", giving up...\n"));
646 goto fail_finish;
647 }
a3621e74 648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 649 ", trying floating at offset %ld...\n",
be8e71aa 650 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
651 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
652 s = HOP3c(last, 1, strend);
2c2d71f5
JH
653 goto restart;
654 }
655 else {
a3621e74 656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 657 (long)(s - i_strpos)));
1aa99e6b
IH
658 t = HOP3c(s, -prog->anchored_offset, strbeg);
659 other_last = HOP3c(s, 1, strend);
be8e71aa 660 s = saved_s;
2c2d71f5
JH
661 if (t == strpos)
662 goto try_at_start;
2c2d71f5
JH
663 goto try_at_offset;
664 }
30944b6d 665 }
2c2d71f5
JH
666 }
667 else { /* Take into account the floating substring. */
33b8afdf 668 char *last, *last1;
be8e71aa 669 char * const saved_s = s;
33b8afdf
JH
670 SV* must;
671
672 t = HOP3c(s, -start_shift, strbeg);
673 last1 = last =
674 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
675 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
676 last = HOP3c(t, prog->float_max_offset, strend);
677 s = HOP3c(t, prog->float_min_offset, strend);
678 if (s < other_last)
679 s = other_last;
2c2d71f5 680 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
681 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
682 /* fbm_instr() takes into account exact value of end-of-str
683 if the check is SvTAIL(ed). Since false positives are OK,
684 and end-of-str is not later than strend we are OK. */
685 if (must == &PL_sv_undef) {
686 s = (char*)NULL;
1de06328 687 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
688 }
689 else
2c2d71f5 690 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
691 (unsigned char*)last + SvCUR(must)
692 - (SvTAIL(must)!=0),
7fba1cd6 693 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
694 DEBUG_EXECUTE_r({
695 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
696 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
697 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 698 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
699 quoted, RE_SV_TAIL(must));
700 });
33b8afdf
JH
701 if (!s) {
702 if (last1 == last) {
a3621e74 703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
704 ", giving up...\n"));
705 goto fail_finish;
2c2d71f5 706 }
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 708 ", trying anchored starting at offset %ld...\n",
be8e71aa 709 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
710 other_last = last;
711 s = HOP3c(t, 1, strend);
712 goto restart;
713 }
714 else {
a3621e74 715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
716 (long)(s - i_strpos)));
717 other_last = s; /* Fix this later. --Hugo */
be8e71aa 718 s = saved_s;
33b8afdf
JH
719 if (t == strpos)
720 goto try_at_start;
721 goto try_at_offset;
722 }
2c2d71f5 723 }
cad2e5aa 724 }
2c2d71f5 725
1de06328 726
9ef43ace 727 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 728
6bda09f9 729 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
730 PerlIO_printf(Perl_debug_log,
731 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
732 (IV)prog->check_offset_min,
733 (IV)prog->check_offset_max,
734 (IV)(s-strpos),
735 (IV)(t-strpos),
736 (IV)(t-s),
737 (IV)(strend-strpos)
738 )
739 );
740
2c2d71f5 741 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 742 && (!do_utf8
9ef43ace 743 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
744 && t > strpos)))
745 {
2c2d71f5
JH
746 /* Fixed substring is found far enough so that the match
747 cannot start at strpos. */
748 try_at_offset:
cad2e5aa 749 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
750 /* Eventually fbm_*() should handle this, but often
751 anchored_offset is not 0, so this check will not be wasted. */
752 /* XXXX In the code below we prefer to look for "^" even in
753 presence of anchored substrings. And we search even
754 beyond the found float position. These pessimizations
755 are historical artefacts only. */
756 find_anchor:
2c2d71f5 757 while (t < strend - prog->minlen) {
cad2e5aa 758 if (*t == '\n') {
4ee3650e 759 if (t < check_at - prog->check_offset_min) {
33b8afdf 760 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
761 /* Since we moved from the found position,
762 we definitely contradict the found anchored
30944b6d
IZ
763 substr. Due to the above check we do not
764 contradict "check" substr.
765 Thus we can arrive here only if check substr
766 is float. Redo checking for "other"=="fixed".
767 */
9041c2e3 768 strpos = t + 1;
a3621e74 769 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 770 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
771 goto do_other_anchored;
772 }
4ee3650e
GS
773 /* We don't contradict the found floating substring. */
774 /* XXXX Why not check for STCLASS? */
cad2e5aa 775 s = t + 1;
a3621e74 776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 777 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
778 goto set_useful;
779 }
4ee3650e
GS
780 /* Position contradicts check-string */
781 /* XXXX probably better to look for check-string
782 than for "\n", so one should lower the limit for t? */
a3621e74 783 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 784 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 785 other_last = strpos = s = t + 1;
cad2e5aa
JH
786 goto restart;
787 }
788 t++;
789 }
a3621e74 790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 791 PL_colors[0], PL_colors[1]));
2c2d71f5 792 goto fail_finish;
cad2e5aa 793 }
f5952150 794 else {
a3621e74 795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 796 PL_colors[0], PL_colors[1]));
f5952150 797 }
cad2e5aa
JH
798 s = t;
799 set_useful:
33b8afdf 800 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
801 }
802 else {
f5952150 803 /* The found string does not prohibit matching at strpos,
2c2d71f5 804 - no optimization of calling REx engine can be performed,
f5952150
GS
805 unless it was an MBOL and we are not after MBOL,
806 or a future STCLASS check will fail this. */
2c2d71f5
JH
807 try_at_start:
808 /* Even in this situation we may use MBOL flag if strpos is offset
809 wrt the start of the string. */
05b4157f 810 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 811 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
812 /* May be due to an implicit anchor of m{.*foo} */
813 && !(prog->reganch & ROPT_IMPLICIT))
814 {
cad2e5aa
JH
815 t = strpos;
816 goto find_anchor;
817 }
a3621e74 818 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 819 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 820 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 821 );
2c2d71f5 822 success_at_start:
30944b6d 823 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
824 && (do_utf8 ? (
825 prog->check_utf8 /* Could be deleted already */
826 && --BmUSEFUL(prog->check_utf8) < 0
827 && (prog->check_utf8 == prog->float_utf8)
828 ) : (
829 prog->check_substr /* Could be deleted already */
830 && --BmUSEFUL(prog->check_substr) < 0
831 && (prog->check_substr == prog->float_substr)
832 )))
66e933ab 833 {
cad2e5aa 834 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 835 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
836 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
837 if (do_utf8 ? prog->check_substr : prog->check_utf8)
838 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
839 prog->check_substr = prog->check_utf8 = NULL; /* disable */
840 prog->float_substr = prog->float_utf8 = NULL; /* clear */
841 check = NULL; /* abort */
cad2e5aa 842 s = strpos;
3cf5c195
IZ
843 /* XXXX This is a remnant of the old implementation. It
844 looks wasteful, since now INTUIT can use many
6eb5f6b9 845 other heuristics. */
cad2e5aa
JH
846 prog->reganch &= ~RE_USE_INTUIT;
847 }
848 else
849 s = strpos;
850 }
851
6eb5f6b9
JH
852 /* Last resort... */
853 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
854 /* trie stclasses are too expensive to use here, we are better off to
855 leave it to regmatch itself */
786e8c11 856 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
857 /* minlen == 0 is possible if regstclass is \b or \B,
858 and the fixed substr is ''$.
859 Since minlen is already taken into account, s+1 is before strend;
860 accidentally, minlen >= 1 guaranties no false positives at s + 1
861 even for \b or \B. But (minlen? 1 : 0) below assumes that
862 regstclass does not come from lookahead... */
863 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
864 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 865 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 866 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 867 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 868 : 1);
1de06328
YO
869 char * endpos;
870 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
871 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
872 else if (prog->float_substr || prog->float_utf8)
873 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
874 else
875 endpos= strend;
876
56570a2c 877 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
b57a0404 878 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
1de06328 879
6eb5f6b9 880 t = s;
3b0527fe 881 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
882 if (!s) {
883#ifdef DEBUGGING
cbbf8932 884 const char *what = NULL;
6eb5f6b9
JH
885#endif
886 if (endpos == strend) {
a3621e74 887 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
888 "Could not match STCLASS...\n") );
889 goto fail;
890 }
a3621e74 891 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 892 "This position contradicts STCLASS...\n") );
653099ff
GS
893 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
894 goto fail;
6eb5f6b9 895 /* Contradict one of substrings */
33b8afdf
JH
896 if (prog->anchored_substr || prog->anchored_utf8) {
897 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 898 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 899 hop_and_restart:
1aa99e6b 900 s = HOP3c(t, 1, strend);
66e933ab
GS
901 if (s + start_shift + end_shift > strend) {
902 /* XXXX Should be taken into account earlier? */
a3621e74 903 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
904 "Could not match STCLASS...\n") );
905 goto fail;
906 }
5e39e1e5
HS
907 if (!check)
908 goto giveup;
a3621e74 909 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 910 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
911 what, (long)(s + start_shift - i_strpos)) );
912 goto restart;
913 }
66e933ab 914 /* Have both, check_string is floating */
6eb5f6b9
JH
915 if (t + start_shift >= check_at) /* Contradicts floating=check */
916 goto retry_floating_check;
917 /* Recheck anchored substring, but not floating... */
9041c2e3 918 s = check_at;
5e39e1e5
HS
919 if (!check)
920 goto giveup;
a3621e74 921 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 922 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
923 (long)(other_last - i_strpos)) );
924 goto do_other_anchored;
925 }
60e71179
GS
926 /* Another way we could have checked stclass at the
927 current position only: */
928 if (ml_anch) {
929 s = t = t + 1;
5e39e1e5
HS
930 if (!check)
931 goto giveup;
a3621e74 932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 933 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 934 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 935 goto try_at_offset;
66e933ab 936 }
33b8afdf 937 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 938 goto fail;
6eb5f6b9
JH
939 /* Check is floating subtring. */
940 retry_floating_check:
941 t = check_at - start_shift;
a3621e74 942 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
943 goto hop_and_restart;
944 }
b7953727 945 if (t != s) {
a3621e74 946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 947 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
948 (long)(t - i_strpos), (long)(s - i_strpos))
949 );
950 }
951 else {
a3621e74 952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
953 "Does not contradict STCLASS...\n");
954 );
955 }
6eb5f6b9 956 }
5e39e1e5 957 giveup:
a3621e74 958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
959 PL_colors[4], (check ? "Guessed" : "Giving up"),
960 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 961 return s;
2c2d71f5
JH
962
963 fail_finish: /* Substring not found */
33b8afdf
JH
964 if (prog->check_substr || prog->check_utf8) /* could be removed already */
965 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 966 fail:
a3621e74 967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 968 PL_colors[4], PL_colors[5]));
bd61b366 969 return NULL;
cad2e5aa 970}
9661b544 971
786e8c11 972
3b0527fe 973
4cadc6a9
YO
974#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
975foldlen, foldbuf, uniflags) STMT_START { \
976 switch (trie_type) { \
977 case trie_utf8_fold: \
978 if ( foldlen>0 ) { \
979 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
980 foldlen -= len; \
981 uscan += len; \
982 len=0; \
983 } else { \
984 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
985 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
986 foldlen -= UNISKIP( uvc ); \
987 uscan = foldbuf + UNISKIP( uvc ); \
988 } \
989 break; \
990 case trie_utf8: \
991 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
992 break; \
993 case trie_plain: \
994 uvc = (UV)*uc; \
995 len = 1; \
996 } \
997 \
998 if (uvc < 256) { \
999 charid = trie->charmap[ uvc ]; \
1000 } \
1001 else { \
1002 charid = 0; \
1003 if (trie->widecharmap) { \
1004 SV** const svpp = hv_fetch(trie->widecharmap, \
1005 (char*)&uvc, sizeof(UV), 0); \
1006 if (svpp) \
1007 charid = (U16)SvIV(*svpp); \
1008 } \
1009 } \
1010} STMT_END
1011
1012#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1013 if ( (CoNd) \
1014 && (ln == len || \
1015 ibcmp_utf8(s, NULL, 0, do_utf8, \
1016 m, NULL, ln, (bool)UTF)) \
1017 && (!reginfo || regtry(reginfo, s)) ) \
1018 goto got_it; \
1019 else { \
1020 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1021 uvchr_to_utf8(tmpbuf, c); \
1022 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1023 if ( f != c \
1024 && (f == c1 || f == c2) \
1025 && (ln == foldlen || \
1026 !ibcmp_utf8((char *) foldbuf, \
1027 NULL, foldlen, do_utf8, \
1028 m, \
1029 NULL, ln, (bool)UTF)) \
1030 && (!reginfo || regtry(reginfo, s)) ) \
1031 goto got_it; \
1032 } \
1033 s += len
1034
1035#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1036STMT_START { \
1037 while (s <= e) { \
1038 if ( (CoNd) \
1039 && (ln == 1 || !(OP(c) == EXACTF \
1040 ? ibcmp(s, m, ln) \
1041 : ibcmp_locale(s, m, ln))) \
1042 && (!reginfo || regtry(reginfo, s)) ) \
1043 goto got_it; \
1044 s++; \
1045 } \
1046} STMT_END
1047
1048#define REXEC_FBC_UTF8_SCAN(CoDe) \
1049STMT_START { \
1050 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1051 CoDe \
1052 s += uskip; \
1053 } \
1054} STMT_END
1055
1056#define REXEC_FBC_SCAN(CoDe) \
1057STMT_START { \
1058 while (s < strend) { \
1059 CoDe \
1060 s++; \
1061 } \
1062} STMT_END
1063
1064#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1065REXEC_FBC_UTF8_SCAN( \
1066 if (CoNd) { \
1067 if (tmp && (!reginfo || regtry(reginfo, s))) \
1068 goto got_it; \
1069 else \
1070 tmp = doevery; \
1071 } \
1072 else \
1073 tmp = 1; \
1074)
1075
1076#define REXEC_FBC_CLASS_SCAN(CoNd) \
1077REXEC_FBC_SCAN( \
1078 if (CoNd) { \
1079 if (tmp && (!reginfo || regtry(reginfo, s))) \
1080 goto got_it; \
1081 else \
1082 tmp = doevery; \
1083 } \
1084 else \
1085 tmp = 1; \
1086)
1087
1088#define REXEC_FBC_TRYIT \
1089if ((!reginfo || regtry(reginfo, s))) \
1090 goto got_it
1091
1092#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1093 if (do_utf8) { \
1094 UtFpReLoAd; \
1095 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1096 } \
1097 else { \
1098 REXEC_FBC_CLASS_SCAN(CoNd); \
1099 } \
1100 break
1101
1102#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1103 PL_reg_flags |= RF_tainted; \
1104 if (do_utf8) { \
1105 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1106 } \
1107 else { \
1108 REXEC_FBC_CLASS_SCAN(CoNd); \
1109 } \
1110 break
1111
786e8c11
YO
1112#define DUMP_EXEC_POS(li,s,doutf8) \
1113 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1114
1115/* We know what class REx starts with. Try to find this position... */
1116/* if reginfo is NULL, its a dryrun */
1117/* annoyingly all the vars in this routine have different names from their counterparts
1118 in regmatch. /grrr */
1119
3c3eec57 1120STATIC char *
07be1b83
YO
1121S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1122 const char *strend, const regmatch_info *reginfo)
a687059c 1123{
27da23d5 1124 dVAR;
1df70142 1125 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1126 char *m;
d8093b23 1127 STRLEN ln;
5dab1207 1128 STRLEN lnc;
078c425b 1129 register STRLEN uskip;
d8093b23 1130 unsigned int c1;
1131 unsigned int c2;
6eb5f6b9
JH
1132 char *e;
1133 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1134 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1135
6eb5f6b9
JH
1136 /* We know what class it must start with. */
1137 switch (OP(c)) {
6eb5f6b9 1138 case ANYOF:
388cc4de 1139 if (do_utf8) {
4cadc6a9 1140 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1141 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1142 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1143 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1144 }
1145 else {
1146 while (s < strend) {
1147 STRLEN skip = 1;
1148
32fc9b6a 1149 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1150 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1151 /* The assignment of 2 is intentional:
1152 * for the folded sharp s, the skip is 2. */
1153 (skip = SHARP_S_SKIP))) {
3b0527fe 1154 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1155 goto got_it;
1156 else
1157 tmp = doevery;
1158 }
1159 else
1160 tmp = 1;
1161 s += skip;
1162 }
a0d0e21e 1163 }
6eb5f6b9 1164 break;
f33976b4 1165 case CANY:
4cadc6a9 1166 REXEC_FBC_SCAN(
3b0527fe 1167 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1168 goto got_it;
1169 else
1170 tmp = doevery;
4cadc6a9 1171 );
f33976b4 1172 break;
6eb5f6b9 1173 case EXACTF:
5dab1207
NIS
1174 m = STRING(c);
1175 ln = STR_LEN(c); /* length to match in octets/bytes */
1176 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1177 if (UTF) {
a2a2844f 1178 STRLEN ulen1, ulen2;
5dab1207 1179 U8 *sm = (U8 *) m;
89ebb4a3
JH
1180 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1181 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1182 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1183
1184 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1185 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1186
89ebb4a3 1187 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1188 0, uniflags);
89ebb4a3 1189 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1190 0, uniflags);
5dab1207
NIS
1191 lnc = 0;
1192 while (sm < ((U8 *) m + ln)) {
1193 lnc++;
1194 sm += UTF8SKIP(sm);
1195 }
1aa99e6b
IH
1196 }
1197 else {
1198 c1 = *(U8*)m;
1199 c2 = PL_fold[c1];
1200 }
6eb5f6b9
JH
1201 goto do_exactf;
1202 case EXACTFL:
5dab1207
NIS
1203 m = STRING(c);
1204 ln = STR_LEN(c);
1205 lnc = (I32) ln;
d8093b23 1206 c1 = *(U8*)m;
6eb5f6b9
JH
1207 c2 = PL_fold_locale[c1];
1208 do_exactf:
db12adc6 1209 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1210
3b0527fe 1211 if (!reginfo && e < s)
6eb5f6b9 1212 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1213
60a8b682
JH
1214 /* The idea in the EXACTF* cases is to first find the
1215 * first character of the EXACTF* node and then, if
1216 * necessary, case-insensitively compare the full
1217 * text of the node. The c1 and c2 are the first
1218 * characters (though in Unicode it gets a bit
1219 * more complicated because there are more cases
7f16dd3d
JH
1220 * than just upper and lower: one needs to use
1221 * the so-called folding case for case-insensitive
1222 * matching (called "loose matching" in Unicode).
1223 * ibcmp_utf8() will do just that. */
60a8b682 1224
1aa99e6b 1225 if (do_utf8) {
575cac57 1226 UV c, f;
89ebb4a3 1227 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1228 STRLEN len, foldlen;
4ad0818d 1229 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1230 if (c1 == c2) {
5dab1207
NIS
1231 /* Upper and lower of 1st char are equal -
1232 * probably not a "letter". */
1aa99e6b 1233 while (s <= e) {
89ebb4a3 1234 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1235 uniflags);
4cadc6a9 1236 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1237 }
09091399
JH
1238 }
1239 else {
1aa99e6b 1240 while (s <= e) {
89ebb4a3 1241 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1242 uniflags);
80aecb99 1243
60a8b682 1244 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1245 * Note that not all the possible combinations
1246 * are handled here: some of them are handled
1247 * by the standard folding rules, and some of
1248 * them (the character class or ANYOF cases)
1249 * are handled during compiletime in
1250 * regexec.c:S_regclass(). */
880bd946
JH
1251 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1252 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1253 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1254
4cadc6a9 1255 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1256 }
09091399 1257 }
1aa99e6b
IH
1258 }
1259 else {
1260 if (c1 == c2)
4cadc6a9 1261 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1262 else
4cadc6a9 1263 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1264 }
1265 break;
bbce6d69 1266 case BOUNDL:
3280af22 1267 PL_reg_flags |= RF_tainted;
bbce6d69 1268 /* FALL THROUGH */
a0d0e21e 1269 case BOUND:
ffc61ed2 1270 if (do_utf8) {
12d33761 1271 if (s == PL_bostr)
ffc61ed2
JH
1272 tmp = '\n';
1273 else {
6136c704 1274 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1275 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1276 }
1277 tmp = ((OP(c) == BOUND ?
9041c2e3 1278 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1279 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1280 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1281 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1282 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1283 isALNUM_LC_utf8((U8*)s)))
1284 {
1285 tmp = !tmp;
4cadc6a9 1286 REXEC_FBC_TRYIT;
a687059c 1287 }
4cadc6a9 1288 );
a0d0e21e 1289 }
667bb95a 1290 else {
12d33761 1291 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1292 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1293 REXEC_FBC_SCAN(
ffc61ed2
JH
1294 if (tmp ==
1295 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1296 tmp = !tmp;
4cadc6a9 1297 REXEC_FBC_TRYIT;
a0ed51b3 1298 }
4cadc6a9 1299 );
a0ed51b3 1300 }
3b0527fe 1301 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1302 goto got_it;
1303 break;
bbce6d69 1304 case NBOUNDL:
3280af22 1305 PL_reg_flags |= RF_tainted;
bbce6d69 1306 /* FALL THROUGH */
a0d0e21e 1307 case NBOUND:
ffc61ed2 1308 if (do_utf8) {
12d33761 1309 if (s == PL_bostr)
ffc61ed2
JH
1310 tmp = '\n';
1311 else {
6136c704 1312 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1313 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1314 }
1315 tmp = ((OP(c) == NBOUND ?
9041c2e3 1316 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1317 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1318 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1319 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1320 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1321 isALNUM_LC_utf8((U8*)s)))
1322 tmp = !tmp;
4cadc6a9
YO
1323 else REXEC_FBC_TRYIT;
1324 );
a0d0e21e 1325 }
667bb95a 1326 else {
12d33761 1327 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1328 tmp = ((OP(c) == NBOUND ?
1329 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1330 REXEC_FBC_SCAN(
ffc61ed2
JH
1331 if (tmp ==
1332 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1333 tmp = !tmp;
4cadc6a9
YO
1334 else REXEC_FBC_TRYIT;
1335 );
a0ed51b3 1336 }
3b0527fe 1337 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1338 goto got_it;
1339 break;
a0d0e21e 1340 case ALNUM:
4cadc6a9
YO
1341 REXEC_FBC_CSCAN_PRELOAD(
1342 LOAD_UTF8_CHARCLASS_ALNUM(),
1343 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1344 isALNUM(*s)
1345 );
bbce6d69 1346 case ALNUML:
4cadc6a9
YO
1347 REXEC_FBC_CSCAN_TAINT(
1348 isALNUM_LC_utf8((U8*)s),
1349 isALNUM_LC(*s)
1350 );
a0d0e21e 1351 case NALNUM:
4cadc6a9
YO
1352 REXEC_FBC_CSCAN_PRELOAD(
1353 LOAD_UTF8_CHARCLASS_ALNUM(),
1354 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1355 !isALNUM(*s)
1356 );
bbce6d69 1357 case NALNUML:
4cadc6a9
YO
1358 REXEC_FBC_CSCAN_TAINT(
1359 !isALNUM_LC_utf8((U8*)s),
1360 !isALNUM_LC(*s)
1361 );
a0d0e21e 1362 case SPACE:
4cadc6a9
YO
1363 REXEC_FBC_CSCAN_PRELOAD(
1364 LOAD_UTF8_CHARCLASS_SPACE(),
1365 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1366 isSPACE(*s)
1367 );
bbce6d69 1368 case SPACEL:
4cadc6a9
YO
1369 REXEC_FBC_CSCAN_TAINT(
1370 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1371 isSPACE_LC(*s)
1372 );
a0d0e21e 1373 case NSPACE:
4cadc6a9
YO
1374 REXEC_FBC_CSCAN_PRELOAD(
1375 LOAD_UTF8_CHARCLASS_SPACE(),
1376 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1377 !isSPACE(*s)
1378 );
bbce6d69 1379 case NSPACEL:
4cadc6a9
YO
1380 REXEC_FBC_CSCAN_TAINT(
1381 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1382 !isSPACE_LC(*s)
1383 );
a0d0e21e 1384 case DIGIT:
4cadc6a9
YO
1385 REXEC_FBC_CSCAN_PRELOAD(
1386 LOAD_UTF8_CHARCLASS_DIGIT(),
1387 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1388 isDIGIT(*s)
1389 );
b8c5462f 1390 case DIGITL:
4cadc6a9
YO
1391 REXEC_FBC_CSCAN_TAINT(
1392 isDIGIT_LC_utf8((U8*)s),
1393 isDIGIT_LC(*s)
1394 );
a0d0e21e 1395 case NDIGIT:
4cadc6a9
YO
1396 REXEC_FBC_CSCAN_PRELOAD(
1397 LOAD_UTF8_CHARCLASS_DIGIT(),
1398 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1399 !isDIGIT(*s)
1400 );
b8c5462f 1401 case NDIGITL:
4cadc6a9
YO
1402 REXEC_FBC_CSCAN_TAINT(
1403 !isDIGIT_LC_utf8((U8*)s),
1404 !isDIGIT_LC(*s)
1405 );
1de06328
YO
1406 case AHOCORASICKC:
1407 case AHOCORASICK:
07be1b83
YO
1408 {
1409 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1410 trie_type = do_utf8 ?
1411 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1412 : trie_plain;
1413 /* what trie are we using right now */
1414 reg_ac_data *aho
1415 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1416 reg_trie_data *trie=aho->trie;
1417
1418 const char *last_start = strend - trie->minlen;
6148ee25 1419#ifdef DEBUGGING
07be1b83 1420 const char *real_start = s;
6148ee25 1421#endif
07be1b83 1422 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1423 SV *sv_points;
1424 U8 **points; /* map of where we were in the input string
786e8c11 1425 when reading a given char. For ASCII this
be8e71aa
YO
1426 is unnecessary overhead as the relationship
1427 is always 1:1, but for unicode, especially
1428 case folded unicode this is not true. */
f9e705e8 1429 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1430 U8 *bitmap=NULL;
1431
07be1b83
YO
1432
1433 GET_RE_DEBUG_FLAGS_DECL;
1434
be8e71aa
YO
1435 /* We can't just allocate points here. We need to wrap it in
1436 * an SV so it gets freed properly if there is a croak while
1437 * running the match */
1438 ENTER;
1439 SAVETMPS;
1440 sv_points=newSV(maxlen * sizeof(U8 *));
1441 SvCUR_set(sv_points,
1442 maxlen * sizeof(U8 *));
1443 SvPOK_on(sv_points);
1444 sv_2mortal(sv_points);
1445 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1446 if ( trie_type != trie_utf8_fold
1447 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1448 {
786e8c11
YO
1449 if (trie->bitmap)
1450 bitmap=(U8*)trie->bitmap;
1451 else
1452 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1453 }
786e8c11
YO
1454 /* this is the Aho-Corasick algorithm modified a touch
1455 to include special handling for long "unknown char"
1456 sequences. The basic idea being that we use AC as long
1457 as we are dealing with a possible matching char, when
1458 we encounter an unknown char (and we have not encountered
1459 an accepting state) we scan forward until we find a legal
1460 starting char.
1461 AC matching is basically that of trie matching, except
1462 that when we encounter a failing transition, we fall back
1463 to the current states "fail state", and try the current char
1464 again, a process we repeat until we reach the root state,
1465 state 1, or a legal transition. If we fail on the root state
1466 then we can either terminate if we have reached an accepting
1467 state previously, or restart the entire process from the beginning
1468 if we have not.
1469
1470 */
07be1b83
YO
1471 while (s <= last_start) {
1472 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1473 U8 *uc = (U8*)s;
1474 U16 charid = 0;
1475 U32 base = 1;
1476 U32 state = 1;
1477 UV uvc = 0;
1478 STRLEN len = 0;
1479 STRLEN foldlen = 0;
1480 U8 *uscan = (U8*)NULL;
1481 U8 *leftmost = NULL;
786e8c11
YO
1482#ifdef DEBUGGING
1483 U32 accepted_word= 0;
1484#endif
07be1b83
YO
1485 U32 pointpos = 0;
1486
1487 while ( state && uc <= (U8*)strend ) {
1488 int failed=0;
786e8c11
YO
1489 U32 word = aho->states[ state ].wordnum;
1490
1de06328
YO
1491 if( state==1 ) {
1492 if ( bitmap ) {
1493 DEBUG_TRIE_EXECUTE_r(
1494 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1495 dump_exec_pos( (char *)uc, c, strend, real_start,
1496 (char *)uc, do_utf8 );
1497 PerlIO_printf( Perl_debug_log,
1498 " Scanning for legal start char...\n");
1499 }
1500 );
1501 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1502 uc++;
786e8c11 1503 }
1de06328 1504 s= (char *)uc;
786e8c11 1505 }
786e8c11
YO
1506 if (uc >(U8*)last_start) break;
1507 }
1508
1509 if ( word ) {
1510 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1511 if (!leftmost || lpos < leftmost) {
1512 DEBUG_r(accepted_word=word);
07be1b83 1513 leftmost= lpos;
786e8c11 1514 }
07be1b83 1515 if (base==0) break;
786e8c11 1516
07be1b83
YO
1517 }
1518 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1519 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1520 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1521 DEBUG_TRIE_EXECUTE_r({
1522 dump_exec_pos( (char *)uc, c, strend, real_start,
1523 s, do_utf8 );
07be1b83 1524 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1525 " Charid:%3u CP:%4"UVxf" ",
1526 charid, uvc);
1527 });
07be1b83
YO
1528
1529 do {
6148ee25 1530#ifdef DEBUGGING
786e8c11 1531 word = aho->states[ state ].wordnum;
6148ee25 1532#endif
07be1b83
YO
1533 base = aho->states[ state ].trans.base;
1534
786e8c11
YO
1535 DEBUG_TRIE_EXECUTE_r({
1536 if (failed)
1537 dump_exec_pos( (char *)uc, c, strend, real_start,
1538 s, do_utf8 );
07be1b83 1539 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1540 "%sState: %4"UVxf", word=%"UVxf,
1541 failed ? " Fail transition to " : "",
1542 (UV)state, (UV)word);
1543 });
07be1b83
YO
1544 if ( base ) {
1545 U32 tmp;
1546 if (charid &&
1547 (base + charid > trie->uniquecharcount )
1548 && (base + charid - 1 - trie->uniquecharcount
1549 < trie->lasttrans)
1550 && trie->trans[base + charid - 1 -
1551 trie->uniquecharcount].check == state
1552 && (tmp=trie->trans[base + charid - 1 -
1553 trie->uniquecharcount ].next))
1554 {
786e8c11
YO
1555 DEBUG_TRIE_EXECUTE_r(
1556 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1557 state = tmp;
1558 break;
1559 }
1560 else {
786e8c11
YO
1561 DEBUG_TRIE_EXECUTE_r(
1562 PerlIO_printf( Perl_debug_log," - fail\n"));
1563 failed = 1;
1564 state = aho->fail[state];
07be1b83
YO
1565 }
1566 }
1567 else {
1568 /* we must be accepting here */
786e8c11
YO
1569 DEBUG_TRIE_EXECUTE_r(
1570 PerlIO_printf( Perl_debug_log," - accepting\n"));
1571 failed = 1;
07be1b83
YO
1572 break;
1573 }
1574 } while(state);
786e8c11 1575 uc += len;
07be1b83
YO
1576 if (failed) {
1577 if (leftmost)
1578 break;
786e8c11 1579 if (!state) state = 1;
07be1b83
YO
1580 }
1581 }
1582 if ( aho->states[ state ].wordnum ) {
1583 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1584 if (!leftmost || lpos < leftmost) {
1585 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1586 leftmost = lpos;
786e8c11 1587 }
07be1b83 1588 }
07be1b83
YO
1589 if (leftmost) {
1590 s = (char*)leftmost;
786e8c11
YO
1591 DEBUG_TRIE_EXECUTE_r({
1592 PerlIO_printf(
1593 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1594 (UV)accepted_word, s - real_start
1595 );
1596 });
be8e71aa
YO
1597 if (!reginfo || regtry(reginfo, s)) {
1598 FREETMPS;
1599 LEAVE;
07be1b83 1600 goto got_it;
be8e71aa 1601 }
07be1b83 1602 s = HOPc(s,1);
786e8c11
YO
1603 DEBUG_TRIE_EXECUTE_r({
1604 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1605 });
07be1b83 1606 } else {
786e8c11
YO
1607 DEBUG_TRIE_EXECUTE_r(
1608 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1609 break;
1610 }
1611 }
be8e71aa
YO
1612 FREETMPS;
1613 LEAVE;
07be1b83
YO
1614 }
1615 break;
b3c9acc1 1616 default:
3c3eec57
GS
1617 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1618 break;
d6a28714 1619 }
6eb5f6b9
JH
1620 return 0;
1621 got_it:
1622 return s;
1623}
1624
1625/*
1626 - regexec_flags - match a regexp against a string
1627 */
1628I32
1629Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1630 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1631/* strend: pointer to null at end of string */
1632/* strbeg: real beginning of string */
1633/* minend: end of match must be >=minend after stringarg. */
1634/* data: May be used for some additional optimizations. */
1635/* nosave: For optimizations. */
1636{
97aff369 1637 dVAR;
6eb5f6b9
JH
1638 register char *s;
1639 register regnode *c;
1640 register char *startpos = stringarg;
6eb5f6b9
JH
1641 I32 minlen; /* must match at least this many chars */
1642 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1643 I32 end_shift = 0; /* Same for the end. */ /* CC */
1644 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1645 char *scream_olds = NULL;
3dab1dad 1646 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1647 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1648 I32 multiline;
0df25f3d 1649
3b0527fe 1650 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1651
1652 GET_RE_DEBUG_FLAGS_DECL;
1653
9d4ba2ae 1654 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1655
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1659 return 0;
1660 }
1661
2757e526 1662 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1663 reginfo.prog = prog;
2757e526 1664
bac06658 1665 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1666 DEBUG_EXECUTE_r(
1667 debug_start_match(prog, do_utf8, startpos, strend,
1668 "Matching");
1669 );
bac06658 1670
6eb5f6b9 1671 minlen = prog->minlen;
1de06328
YO
1672
1673 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1675 "String too short [regexec_flags]...\n"));
1676 goto phooey;
1aa99e6b 1677 }
6eb5f6b9 1678
1de06328 1679
6eb5f6b9
JH
1680 /* Check validity of program. */
1681 if (UCHARAT(prog->program) != REG_MAGIC) {
1682 Perl_croak(aTHX_ "corrupted regexp program");
1683 }
1684
1685 PL_reg_flags = 0;
1686 PL_reg_eval_set = 0;
1687 PL_reg_maxiter = 0;
1688
1689 if (prog->reganch & ROPT_UTF8)
1690 PL_reg_flags |= RF_utf8;
1691
1692 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1693 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1694 PL_bostr = strbeg;
3b0527fe 1695 reginfo.sv = sv;
6eb5f6b9
JH
1696
1697 /* Mark end of line for $ (and such) */
1698 PL_regeol = strend;
1699
1700 /* see how far we have to get to not match where we matched before */
3b0527fe 1701 reginfo.till = startpos+minend;
6eb5f6b9 1702
6eb5f6b9
JH
1703 /* If there is a "must appear" string, look for it. */
1704 s = startpos;
1705
3b0527fe 1706 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1707 MAGIC *mg;
1708
1709 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1710 reginfo.ganch = startpos;
6eb5f6b9
JH
1711 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && SvMAGIC(sv)
14befaf4
DM
1713 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1714 && mg->mg_len >= 0) {
3b0527fe 1715 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1716 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1717 if (s > reginfo.ganch)
6eb5f6b9 1718 goto phooey;
3b0527fe 1719 s = reginfo.ganch;
6eb5f6b9
JH
1720 }
1721 }
1722 else /* pos() not defined */
3b0527fe 1723 reginfo.ganch = strbeg;
6eb5f6b9
JH
1724 }
1725
a0714e2c 1726 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1727 re_scream_pos_data d;
1728
1729 d.scream_olds = &scream_olds;
1730 d.scream_pos = &scream_pos;
1731 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1732 if (!s) {
a3621e74 1733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1734 goto phooey; /* not present */
3fa9c3d7 1735 }
6eb5f6b9
JH
1736 }
1737
1de06328 1738
6eb5f6b9
JH
1739
1740 /* Simplest case: anchored match need be tried only once. */
1741 /* [unless only anchor is BOL and multiline is set] */
1742 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1743 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1744 goto got_it;
7fba1cd6 1745 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1746 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1747 {
1748 char *end;
1749
1750 if (minlen)
1751 dontbother = minlen - 1;
1aa99e6b 1752 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1753 /* for multiline we only have to try after newlines */
33b8afdf 1754 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1755 if (s == startpos)
1756 goto after_try;
1757 while (1) {
3b0527fe 1758 if (regtry(&reginfo, s))
6eb5f6b9
JH
1759 goto got_it;
1760 after_try:
1761 if (s >= end)
1762 goto phooey;
1763 if (prog->reganch & RE_USE_INTUIT) {
1764 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1765 if (!s)
1766 goto phooey;
1767 }
1768 else
1769 s++;
1770 }
1771 } else {
1772 if (s > startpos)
1773 s--;
1774 while (s < end) {
1775 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1776 if (regtry(&reginfo, s))
6eb5f6b9
JH
1777 goto got_it;
1778 }
1779 }
1780 }
1781 }
1782 goto phooey;
f9f4320a
YO
1783 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1784 {
1785 /* the warning about reginfo.ganch being used without intialization
1786 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1787 and we only enter this block when the same bit is set. */
3b0527fe 1788 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1789 goto got_it;
1790 goto phooey;
1791 }
1792
1793 /* Messy cases: unanchored match. */
33b8afdf 1794 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1795 /* we have /x+whatever/ */
1796 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1797 char ch;
bf93d4cc
GS
1798#ifdef DEBUGGING
1799 int did_match = 0;
1800#endif
33b8afdf
JH
1801 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1802 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1803 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1804
1aa99e6b 1805 if (do_utf8) {
4cadc6a9 1806 REXEC_FBC_SCAN(
6eb5f6b9 1807 if (*s == ch) {
a3621e74 1808 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1809 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1810 s += UTF8SKIP(s);
1811 while (s < strend && *s == ch)
1812 s += UTF8SKIP(s);
1813 }
4cadc6a9 1814 );
6eb5f6b9
JH
1815 }
1816 else {
4cadc6a9 1817 REXEC_FBC_SCAN(
6eb5f6b9 1818 if (*s == ch) {
a3621e74 1819 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1820 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1821 s++;
1822 while (s < strend && *s == ch)
1823 s++;
1824 }
4cadc6a9 1825 );
6eb5f6b9 1826 }
a3621e74 1827 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1828 PerlIO_printf(Perl_debug_log,
b7953727
JH
1829 "Did not find anchored character...\n")
1830 );
6eb5f6b9 1831 }
a0714e2c
SS
1832 else if (prog->anchored_substr != NULL
1833 || prog->anchored_utf8 != NULL
1834 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1835 && prog->float_max_offset < strend - s)) {
1836 SV *must;
1837 I32 back_max;
1838 I32 back_min;
1839 char *last;
6eb5f6b9 1840 char *last1; /* Last position checked before */
bf93d4cc
GS
1841#ifdef DEBUGGING
1842 int did_match = 0;
1843#endif
33b8afdf
JH
1844 if (prog->anchored_substr || prog->anchored_utf8) {
1845 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1846 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1847 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1848 back_max = back_min = prog->anchored_offset;
1849 } else {
1850 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1851 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1852 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1853 back_max = prog->float_max_offset;
1854 back_min = prog->float_min_offset;
1855 }
1de06328
YO
1856
1857
33b8afdf
JH
1858 if (must == &PL_sv_undef)
1859 /* could not downgrade utf8 check substring, so must fail */
1860 goto phooey;
1861
1de06328
YO
1862 if (back_min<0) {
1863 last = strend;
1864 } else {
1865 last = HOP3c(strend, /* Cannot start after this */
1866 -(I32)(CHR_SVLEN(must)
1867 - (SvTAIL(must) != 0) + back_min), strbeg);
1868 }
6eb5f6b9
JH
1869 if (s > PL_bostr)
1870 last1 = HOPc(s, -1);
1871 else
1872 last1 = s - 1; /* bogus */
1873
a0288114 1874 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1875 check_substr==must. */
1876 scream_pos = -1;
1877 dontbother = end_shift;
1878 strend = HOPc(strend, -dontbother);
1879 while ( (s <= last) &&
9041c2e3 1880 ((flags & REXEC_SCREAM)
1de06328 1881 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1882 end_shift, &scream_pos, 0))
1de06328 1883 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1884 (unsigned char*)strend, must,
7fba1cd6 1885 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1886 /* we may be pointing at the wrong string */
1887 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1888 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1889 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1890 if (HOPc(s, -back_max) > last1) {
1891 last1 = HOPc(s, -back_min);
1892 s = HOPc(s, -back_max);
1893 }
1894 else {
52657f30 1895 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1896
1897 last1 = HOPc(s, -back_min);
52657f30 1898 s = t;
6eb5f6b9 1899 }
1aa99e6b 1900 if (do_utf8) {
6eb5f6b9 1901 while (s <= last1) {
3b0527fe 1902 if (regtry(&reginfo, s))
6eb5f6b9
JH
1903 goto got_it;
1904 s += UTF8SKIP(s);
1905 }
1906 }
1907 else {
1908 while (s <= last1) {
3b0527fe 1909 if (regtry(&reginfo, s))
6eb5f6b9
JH
1910 goto got_it;
1911 s++;
1912 }
1913 }
1914 }
ab3bbdeb
YO
1915 DEBUG_EXECUTE_r(if (!did_match) {
1916 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1917 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1918 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1919 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1920 ? "anchored" : "floating"),
ab3bbdeb
YO
1921 quoted, RE_SV_TAIL(must));
1922 });
6eb5f6b9
JH
1923 goto phooey;
1924 }
786e8c11 1925 else if ( (c = prog->regstclass) ) {
f14c76ed 1926 if (minlen) {
be8e71aa 1927 const OPCODE op = OP(prog->regstclass);
66e933ab 1928 /* don't bother with what can't match */
786e8c11 1929 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1930 strend = HOPc(strend, -(minlen - 1));
1931 }
a3621e74 1932 DEBUG_EXECUTE_r({
be8e71aa 1933 SV * const prop = sv_newmortal();
32fc9b6a 1934 regprop(prog, prop, c);
0df25f3d 1935 {
ab3bbdeb
YO
1936 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1937 s,strend-s,60);
0df25f3d 1938 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1939 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1940 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1941 quoted, (int)(strend - s));
0df25f3d 1942 }
ffc61ed2 1943 });
3b0527fe 1944 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1945 goto got_it;
07be1b83 1946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1947 }
1948 else {
1949 dontbother = 0;
a0714e2c 1950 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1951 /* Trim the end. */
d6a28714 1952 char *last;
33b8afdf
JH
1953 SV* float_real;
1954
1955 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1956 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1957 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1958
1959 if (flags & REXEC_SCREAM) {
33b8afdf 1960 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1961 end_shift, &scream_pos, 1); /* last one */
1962 if (!last)
ffc61ed2 1963 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1964 /* we may be pointing at the wrong string */
1965 else if (RX_MATCH_COPIED(prog))
3f7c398e 1966 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1967 }
d6a28714
JH
1968 else {
1969 STRLEN len;
cfd0369c 1970 const char * const little = SvPV_const(float_real, len);
d6a28714 1971
33b8afdf 1972 if (SvTAIL(float_real)) {
d6a28714
JH
1973 if (memEQ(strend - len + 1, little, len - 1))
1974 last = strend - len + 1;
7fba1cd6 1975 else if (!multiline)
9041c2e3 1976 last = memEQ(strend - len, little, len)
bd61b366 1977 ? strend - len : NULL;
b8c5462f 1978 else
d6a28714
JH
1979 goto find_last;
1980 } else {
1981 find_last:
9041c2e3 1982 if (len)
d6a28714 1983 last = rninstr(s, strend, little, little + len);
b8c5462f 1984 else
a0288114 1985 last = strend; /* matching "$" */
b8c5462f 1986 }
b8c5462f 1987 }
bf93d4cc 1988 if (last == NULL) {
6bda09f9
YO
1989 DEBUG_EXECUTE_r(
1990 PerlIO_printf(Perl_debug_log,
1991 "%sCan't trim the tail, match fails (should not happen)%s\n",
1992 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1993 goto phooey; /* Should not happen! */
1994 }
d6a28714
JH
1995 dontbother = strend - last + prog->float_min_offset;
1996 }
1997 if (minlen && (dontbother < minlen))
1998 dontbother = minlen - 1;
1999 strend -= dontbother; /* this one's always in bytes! */
2000 /* We don't know much -- general case. */
1aa99e6b 2001 if (do_utf8) {
d6a28714 2002 for (;;) {
3b0527fe 2003 if (regtry(&reginfo, s))
d6a28714
JH
2004 goto got_it;
2005 if (s >= strend)
2006 break;
b8c5462f 2007 s += UTF8SKIP(s);
d6a28714
JH
2008 };
2009 }
2010 else {
2011 do {
3b0527fe 2012 if (regtry(&reginfo, s))
d6a28714
JH
2013 goto got_it;
2014 } while (s++ < strend);
2015 }
2016 }
2017
2018 /* Failure. */
2019 goto phooey;
2020
2021got_it:
2022 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023
2024 if (PL_reg_eval_set) {
2025 /* Preserve the current value of $^R */
2026 if (oreplsv != GvSV(PL_replgv))
2027 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2028 restored, the value remains
2029 the same. */
4f639d21 2030 restore_pos(aTHX_ prog);
d6a28714 2031 }
81714fb9
YO
2032 if (prog->paren_names)
2033 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2034
2035 /* make sure $`, $&, $', and $digit will work later */
2036 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2037 RX_MATCH_COPY_FREE(prog);
d6a28714 2038 if (flags & REXEC_COPY_STR) {
be8e71aa 2039 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2040#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2041 if ((SvIsCOW(sv)
2042 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2043 if (DEBUG_C_TEST) {
2044 PerlIO_printf(Perl_debug_log,
2045 "Copy on write: regexp capture, type %d\n",
2046 (int) SvTYPE(sv));
2047 }
2048 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2049 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2050 assert (SvPOKp(prog->saved_copy));
2051 } else
2052#endif
2053 {
2054 RX_MATCH_COPIED_on(prog);
2055 s = savepvn(strbeg, i);
2056 prog->subbeg = s;
2057 }
d6a28714 2058 prog->sublen = i;
d6a28714
JH
2059 }
2060 else {
2061 prog->subbeg = strbeg;
2062 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2063 }
2064 }
9041c2e3 2065
d6a28714
JH
2066 return 1;
2067
2068phooey:
a3621e74 2069 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2070 PL_colors[4], PL_colors[5]));
d6a28714 2071 if (PL_reg_eval_set)
4f639d21 2072 restore_pos(aTHX_ prog);
d6a28714
JH
2073 return 0;
2074}
2075
6bda09f9 2076
d6a28714
JH
2077/*
2078 - regtry - try match at specific point
2079 */
2080STATIC I32 /* 0 failure, 1 success */
3b0527fe 2081S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2082{
97aff369 2083 dVAR;
d6a28714
JH
2084 register I32 *sp;
2085 register I32 *ep;
2086 CHECKPOINT lastcp;
3b0527fe 2087 regexp *prog = reginfo->prog;
a3621e74 2088 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2089
2090 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2091 MAGIC *mg;
2092
2093 PL_reg_eval_set = RS_init;
a3621e74 2094 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2095 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2096 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2097 ));
ea8d6ae1 2098 SAVESTACK_CXPOS();
d6a28714
JH
2099 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2100 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2101 SAVETMPS;
2102 /* Apparently this is not needed, judging by wantarray. */
e8347627 2103 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2104 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2105
3b0527fe 2106 if (reginfo->sv) {
d6a28714 2107 /* Make $_ available to executed code. */
3b0527fe 2108 if (reginfo->sv != DEFSV) {
59f00321 2109 SAVE_DEFSV;
3b0527fe 2110 DEFSV = reginfo->sv;
b8c5462f 2111 }
d6a28714 2112
3b0527fe
DM
2113 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2114 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2115 /* prepare for quick setting of pos */
d300d9fa
NC
2116#ifdef PERL_OLD_COPY_ON_WRITE
2117 if (SvIsCOW(sv))
2118 sv_force_normal_flags(sv, 0);
2119#endif
3dab1dad 2120 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2121 &PL_vtbl_mglob, NULL, 0);
d6a28714 2122 mg->mg_len = -1;
b8c5462f 2123 }
d6a28714
JH
2124 PL_reg_magic = mg;
2125 PL_reg_oldpos = mg->mg_len;
4f639d21 2126 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2127 }
09687e5a 2128 if (!PL_reg_curpm) {
a02a5408 2129 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2130#ifdef USE_ITHREADS
2131 {
be8e71aa 2132 SV* const repointer = newSViv(0);
577e12cc 2133 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2134 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2135 av_push(PL_regex_padav,repointer);
2136 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2137 PL_regex_pad = AvARRAY(PL_regex_padav);
2138 }
2139#endif
2140 }
aaa362c4 2141 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2142 PL_reg_oldcurpm = PL_curpm;
2143 PL_curpm = PL_reg_curpm;
2144 if (RX_MATCH_COPIED(prog)) {
2145 /* Here is a serious problem: we cannot rewrite subbeg,
2146 since it may be needed if this match fails. Thus
2147 $` inside (?{}) could fail... */
2148 PL_reg_oldsaved = prog->subbeg;
2149 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2150#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2151 PL_nrs = prog->saved_copy;
2152#endif
d6a28714
JH
2153 RX_MATCH_COPIED_off(prog);
2154 }
2155 else
bd61b366 2156 PL_reg_oldsaved = NULL;
d6a28714
JH
2157 prog->subbeg = PL_bostr;
2158 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2159 }
6bda09f9 2160 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
973dddac 2161 prog->startp[0] = startpos - PL_bostr;
d6a28714 2162 PL_reginput = startpos;
d6a28714 2163 PL_reglastparen = &prog->lastparen;
a01268b5 2164 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2165 prog->lastparen = 0;
03994de8 2166 prog->lastcloseparen = 0;
d6a28714 2167 PL_regsize = 0;
6bda09f9
YO
2168 PL_regstartp = prog->startp;
2169 PL_regendp = prog->endp;
d6a28714
JH
2170 if (PL_reg_start_tmpl <= prog->nparens) {
2171 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2172 if(PL_reg_start_tmp)
2173 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 else
a02a5408 2175 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2176 }
2177
2178 /* XXXX What this code is doing here?!!! There should be no need
2179 to do this again and again, PL_reglastparen should take care of
3dd2943c 2180 this! --ilya*/
dafc8851
JH
2181
2182 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2183 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2184 * PL_reglastparen), is not needed at all by the test suite
2185 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2186 * enough, for building DynaLoader, or otherwise this
2187 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2188 * will happen. Meanwhile, this code *is* needed for the
2189 * above-mentioned test suite tests to succeed. The common theme
2190 * on those tests seems to be returning null fields from matches.
2191 * --jhi */
dafc8851 2192#if 1
d6a28714
JH
2193 sp = prog->startp;
2194 ep = prog->endp;
2195 if (prog->nparens) {
097eb12c 2196 register I32 i;
eb160463 2197 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2198 *++sp = -1;
2199 *++ep = -1;
2200 }
2201 }
dafc8851 2202#endif
02db2b7b 2203 REGCP_SET(lastcp);
3b0527fe 2204 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2205 prog->endp[0] = PL_reginput - PL_bostr;
2206 return 1;
2207 }
02db2b7b 2208 REGCP_UNWIND(lastcp);
d6a28714
JH
2209 return 0;
2210}
2211
02db2b7b 2212
8ba1375e
MJD
2213#define sayYES goto yes
2214#define sayNO goto no
262b90c4 2215#define sayNO_SILENT goto no_silent
8ba1375e 2216
f9f4320a
YO
2217/* we dont use STMT_START/END here because it leads to
2218 "unreachable code" warnings, which are bogus, but distracting. */
2219#define CACHEsayNO \
c476f425
DM
2220 if (ST.cache_mask) \
2221 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2222 sayNO
3298f257 2223
a3621e74 2224/* this is used to determine how far from the left messages like
265c4333
YO
2225 'failed...' are printed. It should be set such that messages
2226 are inline with the regop output that created them.
a3621e74 2227*/
265c4333 2228#define REPORT_CODE_OFF 32
a3621e74
YO
2229
2230
2231/* Make sure there is a test for this +1 options in re_tests */
2232#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2233
40a82448
DM
2234#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2235#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2236
86545054
DM
2237#define SLAB_FIRST(s) (&(s)->states[0])
2238#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2239
5d9a96ca
DM
2240/* grab a new slab and return the first slot in it */
2241
2242STATIC regmatch_state *
2243S_push_slab(pTHX)
2244{
54df2634
NC
2245#if PERL_VERSION < 9
2246 dMY_CXT;
2247#endif
5d9a96ca
DM
2248 regmatch_slab *s = PL_regmatch_slab->next;
2249 if (!s) {
2250 Newx(s, 1, regmatch_slab);
2251 s->prev = PL_regmatch_slab;
2252 s->next = NULL;
2253 PL_regmatch_slab->next = s;
2254 }
2255 PL_regmatch_slab = s;
86545054 2256 return SLAB_FIRST(s);
5d9a96ca 2257}
5b47454d 2258
95b24440 2259
40a82448
DM
2260/* push a new state then goto it */
2261
2262#define PUSH_STATE_GOTO(state, node) \
2263 scan = node; \
2264 st->resume_state = state; \
2265 goto push_state;
2266
2267/* push a new state with success backtracking, then goto it */
2268
2269#define PUSH_YES_STATE_GOTO(state, node) \
2270 scan = node; \
2271 st->resume_state = state; \
2272 goto push_yes_state;
2273
aa283a38 2274
aa283a38 2275
d6a28714 2276/*
95b24440 2277
bf1f174e
DM
2278regmatch() - main matching routine
2279
2280This is basically one big switch statement in a loop. We execute an op,
2281set 'next' to point the next op, and continue. If we come to a point which
2282we may need to backtrack to on failure such as (A|B|C), we push a
2283backtrack state onto the backtrack stack. On failure, we pop the top
2284state, and re-enter the loop at the state indicated. If there are no more
2285states to pop, we return failure.
2286
2287Sometimes we also need to backtrack on success; for example /A+/, where
2288after successfully matching one A, we need to go back and try to
2289match another one; similarly for lookahead assertions: if the assertion
2290completes successfully, we backtrack to the state just before the assertion
2291and then carry on. In these cases, the pushed state is marked as
2292'backtrack on success too'. This marking is in fact done by a chain of
2293pointers, each pointing to the previous 'yes' state. On success, we pop to
2294the nearest yes state, discarding any intermediate failure-only states.
2295Sometimes a yes state is pushed just to force some cleanup code to be
2296called at the end of a successful match or submatch; e.g. (??{$re}) uses
2297it to free the inner regex.
2298
2299Note that failure backtracking rewinds the cursor position, while
2300success backtracking leaves it alone.
2301
2302A pattern is complete when the END op is executed, while a subpattern
2303such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2304ops trigger the "pop to last yes state if any, otherwise return true"
2305behaviour.
2306
2307A common convention in this function is to use A and B to refer to the two
2308subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2309the subpattern to be matched possibly multiple times, while B is the entire
2310rest of the pattern. Variable and state names reflect this convention.
2311
2312The states in the main switch are the union of ops and failure/success of
2313substates associated with with that op. For example, IFMATCH is the op
2314that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2315'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2316successfully matched A and IFMATCH_A_fail is a state saying that we have
2317just failed to match A. Resume states always come in pairs. The backtrack
2318state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2319at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2320on success or failure.
2321
2322The struct that holds a backtracking state is actually a big union, with
2323one variant for each major type of op. The variable st points to the
2324top-most backtrack struct. To make the code clearer, within each
2325block of code we #define ST to alias the relevant union.
2326
2327Here's a concrete example of a (vastly oversimplified) IFMATCH
2328implementation:
2329
2330 switch (state) {
2331 ....
2332
2333#define ST st->u.ifmatch
2334
2335 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2336 ST.foo = ...; // some state we wish to save
95b24440 2337 ...
bf1f174e
DM
2338 // push a yes backtrack state with a resume value of
2339 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2340 // first node of A:
2341 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2342 // NOTREACHED
2343
2344 case IFMATCH_A: // we have successfully executed A; now continue with B
2345 next = B;
2346 bar = ST.foo; // do something with the preserved value
2347 break;
2348
2349 case IFMATCH_A_fail: // A failed, so the assertion failed
2350 ...; // do some housekeeping, then ...
2351 sayNO; // propagate the failure
2352
2353#undef ST
95b24440 2354
bf1f174e
DM
2355 ...
2356 }
95b24440 2357
bf1f174e
DM
2358For any old-timers reading this who are familiar with the old recursive
2359approach, the code above is equivalent to:
95b24440 2360
bf1f174e
DM
2361 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2362 {
2363 int foo = ...
95b24440 2364 ...
bf1f174e
DM
2365 if (regmatch(A)) {
2366 next = B;
2367 bar = foo;
2368 break;
95b24440 2369 }
bf1f174e
DM
2370 ...; // do some housekeeping, then ...
2371 sayNO; // propagate the failure
95b24440 2372 }
bf1f174e
DM
2373
2374The topmost backtrack state, pointed to by st, is usually free. If you
2375want to claim it, populate any ST.foo fields in it with values you wish to
2376save, then do one of
2377
2378 PUSH_STATE_GOTO(resume_state, node);
2379 PUSH_YES_STATE_GOTO(resume_state, node);
2380
2381which sets that backtrack state's resume value to 'resume_state', pushes a
2382new free entry to the top of the backtrack stack, then goes to 'node'.
2383On backtracking, the free slot is popped, and the saved state becomes the
2384new free state. An ST.foo field in this new top state can be temporarily
2385accessed to retrieve values, but once the main loop is re-entered, it
2386becomes available for reuse.
2387
2388Note that the depth of the backtrack stack constantly increases during the
2389left-to-right execution of the pattern, rather than going up and down with
2390the pattern nesting. For example the stack is at its maximum at Z at the
2391end of the pattern, rather than at X in the following:
2392
2393 /(((X)+)+)+....(Y)+....Z/
2394
2395The only exceptions to this are lookahead/behind assertions and the cut,
2396(?>A), which pop all the backtrack states associated with A before
2397continuing.
2398
2399Bascktrack state structs are allocated in slabs of about 4K in size.
2400PL_regmatch_state and st always point to the currently active state,
2401and PL_regmatch_slab points to the slab currently containing
2402PL_regmatch_state. The first time regmatch() is called, the first slab is
2403allocated, and is never freed until interpreter destruction. When the slab
2404is full, a new one is allocated and chained to the end. At exit from
2405regmatch(), slabs allocated since entry are freed.
2406
2407*/
95b24440 2408
40a82448 2409
5bc10b2c 2410#define DEBUG_STATE_pp(pp) \
265c4333 2411 DEBUG_STATE_r({ \
5bc10b2c
DM
2412 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2413 PerlIO_printf(Perl_debug_log, \
2414 " %*s"pp" %s\n", \
2415 depth*2, "", \
03363afd 2416 reg_name[st->resume_state] ); \
265c4333 2417 });
5bc10b2c 2418
40a82448 2419
3dab1dad 2420#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2421
3df15adc 2422#ifdef DEBUGGING
5bc10b2c 2423
ab3bbdeb
YO
2424STATIC void
2425S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2426 const char *start, const char *end, const char *blurb)
2427{
2428 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2429 if (!PL_colorset)
2430 reginitcolors();
2431 {
2432 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2433 prog->precomp, prog->prelen, 60);
2434
2435 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2436 start, end - start, 60);
2437
2438 PerlIO_printf(Perl_debug_log,
2439 "%s%s REx%s %s against %s\n",
2440 PL_colors[4], blurb, PL_colors[5], s0, s1);
2441
2442 if (do_utf8||utf8_pat)
1de06328
YO
2443 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2444 utf8_pat ? "pattern" : "",
2445 utf8_pat && do_utf8 ? " and " : "",
2446 do_utf8 ? "string" : ""
ab3bbdeb
YO
2447 );
2448 }
2449}
3df15adc
YO
2450
2451STATIC void
786e8c11
YO
2452S_dump_exec_pos(pTHX_ const char *locinput,
2453 const regnode *scan,
2454 const char *loc_regeol,
2455 const char *loc_bostr,
2456 const char *loc_reg_starttry,
2457 const bool do_utf8)
07be1b83 2458{
786e8c11 2459 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2460 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2461 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2462 /* The part of the string before starttry has one color
2463 (pref0_len chars), between starttry and current
2464 position another one (pref_len - pref0_len chars),
2465 after the current position the third one.
2466 We assume that pref0_len <= pref_len, otherwise we
2467 decrease pref0_len. */
786e8c11
YO
2468 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2469 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2470 int pref0_len;
2471
2472 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2473 pref_len++;
786e8c11
YO
2474 pref0_len = pref_len - (locinput - loc_reg_starttry);
2475 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2476 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2477 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2478 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2479 l--;
2480 if (pref0_len < 0)
2481 pref0_len = 0;
2482 if (pref0_len > pref_len)
2483 pref0_len = pref_len;
2484 {
3df15adc 2485 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2486
ab3bbdeb 2487 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2488 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2489
ab3bbdeb 2490 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2491 (locinput - pref_len + pref0_len),
1de06328 2492 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2493
ab3bbdeb 2494 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2495 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2496
1de06328 2497 const STRLEN tlen=len0+len1+len2;
3df15adc 2498 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2499 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2500 (IV)(locinput - loc_bostr),
07be1b83 2501 len0, s0,
07be1b83 2502 len1, s1,
07be1b83 2503 (docolor ? "" : "> <"),
07be1b83 2504 len2, s2,
f9f4320a 2505 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2506 "");
2507 }
2508}
3df15adc 2509
07be1b83
YO
2510#endif
2511
0a4db386
YO
2512/* reg_check_named_buff_matched()
2513 * Checks to see if a named buffer has matched. The data array of
2514 * buffer numbers corresponding to the buffer is expected to reside
2515 * in the regexp->data->data array in the slot stored in the ARG() of
2516 * node involved. Note that this routine doesn't actually care about the
2517 * name, that information is not preserved from compilation to execution.
2518 * Returns the index of the leftmost defined buffer with the given name
2519 * or 0 if non of the buffers matched.
2520 */
2521STATIC I32
2522S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2523 I32 n;
2524 SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];
2525 I32 *nums=(I32*)SvPVX(sv_dat);
2526 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2527 if ((I32)*PL_reglastparen >= nums[n] &&
2528 PL_regendp[nums[n]] != -1)
2529 {
2530 return nums[n];
2531 }
2532 }
2533 return 0;
2534}
2535
d6a28714 2536STATIC I32 /* 0 failure, 1 success */
3b0527fe 2537S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2538{
54df2634
NC
2539#if PERL_VERSION < 9
2540 dMY_CXT;
2541#endif
27da23d5 2542 dVAR;
95b24440 2543 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2544 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2545
3b0527fe
DM
2546 regexp *rex = reginfo->prog;
2547
5d9a96ca
DM
2548 regmatch_slab *orig_slab;
2549 regmatch_state *orig_state;
a3621e74 2550
5d9a96ca
DM
2551 /* the current state. This is a cached copy of PL_regmatch_state */
2552 register regmatch_state *st;
95b24440 2553
5d9a96ca
DM
2554 /* cache heavy used fields of st in registers */
2555 register regnode *scan;
2556 register regnode *next;
24d3c4a9
DM
2557 register I32 n = 0; /* general value; init to avoid compiler warning */
2558 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2559 register char *locinput = PL_reginput;
5d9a96ca 2560 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2561
b69b0499 2562 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2563 int depth = 0; /* depth of backtrack stack */
6bda09f9 2564 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
77cb431f
DM
2565 regmatch_state *yes_state = NULL; /* state to pop to on success of
2566 subpattern */
faec1544 2567 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2568 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2569 U32 state_num;
10edeb5d 2570
24d3c4a9
DM
2571 /* these three flags are set by various ops to signal information to
2572 * the very next op. They have a useful lifetime of exactly one loop
2573 * iteration, and are not preserved or restored by state pushes/pops
2574 */
2575 bool sw = 0; /* the condition value in (?(cond)a|b) */
2576 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2577 int logical = 0; /* the following EVAL is:
2578 0: (?{...})
2579 1: (?(?{...})X|Y)
2580 2: (??{...})
2581 or the following IFMATCH/UNLESSM is:
2582 false: plain (?=foo)
2583 true: used as a condition: (?(?=foo))
2584 */
2585
95b24440 2586#ifdef DEBUGGING
e68ec53f 2587 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2588#endif
2589
5d9a96ca
DM
2590 /* on first ever call to regmatch, allocate first slab */
2591 if (!PL_regmatch_slab) {
2592 Newx(PL_regmatch_slab, 1, regmatch_slab);
2593 PL_regmatch_slab->prev = NULL;
2594 PL_regmatch_slab->next = NULL;
86545054 2595 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2596 }
2597
2598 /* remember current high-water mark for exit */
2599 /* XXX this should be done with SAVE* instead */
2600 orig_slab = PL_regmatch_slab;
2601 orig_state = PL_regmatch_state;
2602
2603 /* grab next free state slot */
2604 st = ++PL_regmatch_state;
86545054 2605 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2606 st = PL_regmatch_state = S_push_slab(aTHX);
2607
d6a28714
JH
2608 /* Note that nextchr is a byte even in UTF */
2609 nextchr = UCHARAT(locinput);
2610 scan = prog;
2611 while (scan != NULL) {
8ba1375e 2612
a3621e74 2613 DEBUG_EXECUTE_r( {
6136c704 2614 SV * const prop = sv_newmortal();
1de06328 2615 regnode *rnext=regnext(scan);
786e8c11 2616 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2617 regprop(rex, prop, scan);
07be1b83
YO
2618
2619 PerlIO_printf(Perl_debug_log,
2620 "%3"IVdf":%*s%s(%"IVdf")\n",
5bc10b2c 2621 (IV)(scan - rex->program), depth*2, "",
07be1b83 2622 SvPVX_const(prop),
1de06328
YO
2623 (PL_regkind[OP(scan)] == END || !rnext) ?
2624 0 : (IV)(rnext - rex->program));
2a782b5b 2625 });
d6a28714
JH
2626
2627 next = scan + NEXT_OFF(scan);
2628 if (next == scan)
2629 next = NULL;
40a82448 2630 state_num = OP(scan);
d6a28714 2631
40a82448
DM
2632 reenter_switch:
2633 switch (state_num) {
d6a28714 2634 case BOL:
7fba1cd6 2635 if (locinput == PL_bostr)
d6a28714 2636 {
3b0527fe 2637 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2638 break;
2639 }
d6a28714
JH
2640 sayNO;
2641 case MBOL:
12d33761
HS
2642 if (locinput == PL_bostr ||
2643 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2644 {
b8c5462f
JH
2645 break;
2646 }
d6a28714
JH
2647 sayNO;
2648 case SBOL:
c2a73568 2649 if (locinput == PL_bostr)
b8c5462f 2650 break;
d6a28714
JH
2651 sayNO;
2652 case GPOS:
3b0527fe 2653 if (locinput == reginfo->ganch)
d6a28714
JH
2654 break;
2655 sayNO;
2656 case EOL:
d6a28714
JH
2657 goto seol;
2658 case MEOL:
d6a28714 2659 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2660 sayNO;
b8c5462f 2661 break;
d6a28714
JH
2662 case SEOL:
2663 seol:
2664 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2665 sayNO;
d6a28714 2666 if (PL_regeol - locinput > 1)
b8c5462f 2667 sayNO;
b8c5462f 2668 break;
d6a28714
JH
2669 case EOS:
2670 if (PL_regeol != locinput)
b8c5462f 2671 sayNO;
d6a28714 2672 break;
ffc61ed2 2673 case SANY:
d6a28714 2674 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2675 sayNO;
f33976b4
DB
2676 if (do_utf8) {
2677 locinput += PL_utf8skip[nextchr];
2678 if (locinput > PL_regeol)
2679 sayNO;
2680 nextchr = UCHARAT(locinput);
2681 }
2682 else
2683 nextchr = UCHARAT(++locinput);
2684 break;
2685 case CANY:
2686 if (!nextchr && locinput >= PL_regeol)
2687 sayNO;
b8c5462f 2688 nextchr = UCHARAT(++locinput);
a0d0e21e 2689 break;
ffc61ed2 2690 case REG_ANY:
1aa99e6b
IH
2691 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2692 sayNO;
2693 if (do_utf8) {
b8c5462f 2694 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2695 if (locinput > PL_regeol)
2696 sayNO;
a0ed51b3 2697 nextchr = UCHARAT(locinput);
a0ed51b3 2698 }
1aa99e6b
IH
2699 else
2700 nextchr = UCHARAT(++locinput);
a0ed51b3 2701 break;
166ba7cd
DM
2702
2703#undef ST
2704#define ST st->u.trie
786e8c11
YO
2705 case TRIEC:
2706 /* In this case the charclass data is available inline so
2707 we can fail fast without a lot of extra overhead.
2708 */
2709 if (scan->flags == EXACT || !do_utf8) {
2710 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2711 DEBUG_EXECUTE_r(
2712 PerlIO_printf(Perl_debug_log,
2713 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2714 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2715 );
2716 sayNO_SILENT;
2717 /* NOTREACHED */
2718 }
2719 }
2720 /* FALL THROUGH */
5b47454d 2721 case TRIE:
3dab1dad 2722 {
07be1b83 2723 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2724 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2725 trie_type = do_utf8 ?
2726 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2727 : trie_plain;
2728
2729 /* what trie are we using right now */
be8e71aa 2730 reg_trie_data * const trie
3dab1dad
YO
2731 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2732 U32 state = trie->startstate;
166ba7cd 2733
3dab1dad
YO
2734 if (trie->bitmap && trie_type != trie_utf8_fold &&
2735 !TRIE_BITMAP_TEST(trie,*locinput)
2736 ) {
2737 if (trie->states[ state ].wordnum) {
2738 DEBUG_EXECUTE_r(
2739 PerlIO_printf(Perl_debug_log,
2740 "%*s %smatched empty string...%s\n",
5bc10b2c 2741 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2742 );
2743 break;
2744 } else {
2745 DEBUG_EXECUTE_r(
2746 PerlIO_printf(Perl_debug_log,
786e8c11 2747 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2748 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2749 );
2750 sayNO_SILENT;
2751 }
2752 }
166ba7cd 2753
786e8c11
YO
2754 {
2755 U8 *uc = ( U8* )locinput;
2756
2757 STRLEN len = 0;
2758 STRLEN foldlen = 0;
2759 U8 *uscan = (U8*)NULL;
2760 STRLEN bufflen=0;
2761 SV *sv_accept_buff = NULL;
2762 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2763
2764 ST.accepted = 0; /* how many accepting states we have seen */
2765 ST.B = next;
2766 ST.jump = trie->jump;
786e8c11 2767 ST.me = scan;
786e8c11 2768
07be1b83
YO
2769 /*
2770 traverse the TRIE keeping track of all accepting states
2771 we transition through until we get to a failing node.
2772 */
2773
a3621e74 2774 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2775 U32 base = trie->states[ state ].trans.base;
f9f4320a 2776 UV uvc = 0;
786e8c11
YO
2777 U16 charid;
2778 /* We use charid to hold the wordnum as we don't use it
2779 for charid until after we have done the wordnum logic.
2780 We define an alias just so that the wordnum logic reads
2781 more naturally. */
2782
2783#define got_wordnum charid
2784 got_wordnum = trie->states[ state ].wordnum;
2785
2786 if ( got_wordnum ) {
2787 if ( ! ST.accepted ) {
5b47454d
DM
2788 ENTER;
2789 SAVETMPS;
2790 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2791 sv_accept_buff=newSV(bufflen *
2792 sizeof(reg_trie_accepted) - 1);
786e8c11 2793 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2794 SvPOK_on(sv_accept_buff);
2795 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2796 SAVETMPS;
2797 ST.accept_buff =
5b47454d
DM
2798 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2799 }
786e8c11 2800 do {
166ba7cd 2801 if (ST.accepted >= bufflen) {
5b47454d 2802 bufflen *= 2;
166ba7cd 2803 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2804 SvGROW(sv_accept_buff,
2805 bufflen * sizeof(reg_trie_accepted));
2806 }
2807 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2808 + sizeof(reg_trie_accepted));
a3621e74 2809
786e8c11
YO
2810
2811 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2812 ST.accept_buff[ST.accepted].endpos = uc;
2813 ++ST.accepted;
2814 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2815 }
2816#undef got_wordnum
a3621e74 2817
07be1b83 2818 DEBUG_TRIE_EXECUTE_r({
786e8c11 2819 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2820 PerlIO_printf( Perl_debug_log,
786e8c11 2821 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2822 2+depth * 2, "", PL_colors[4],
786e8c11 2823 (UV)state, (UV)ST.accepted );
07be1b83 2824 });
a3621e74
YO
2825
2826 if ( base ) {
4cadc6a9
YO
2827 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2828 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2829
5b47454d
DM
2830 if (charid &&
2831 (base + charid > trie->uniquecharcount )
2832 && (base + charid - 1 - trie->uniquecharcount
2833 < trie->lasttrans)
2834 && trie->trans[base + charid - 1 -
2835 trie->uniquecharcount].check == state)
2836 {
2837 state = trie->trans[base + charid - 1 -
2838 trie->uniquecharcount ].next;
2839 }
2840 else {
2841 state = 0;
2842 }
2843 uc += len;
2844
2845 }
2846 else {
a3621e74
YO
2847 state = 0;
2848 }
2849 DEBUG_TRIE_EXECUTE_r(
e4584336 2850 PerlIO_printf( Perl_debug_log,
786e8c11 2851 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2852 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2853 );
2854 }
166ba7cd 2855 if (!ST.accepted )
a3621e74 2856 sayNO;
a3621e74 2857
166ba7cd
DM
2858 DEBUG_EXECUTE_r(
2859 PerlIO_printf( Perl_debug_log,
2860 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2861 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2862 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2863 );
786e8c11 2864 }}
166ba7cd
DM
2865
2866 /* FALL THROUGH */
2867
2868 case TRIE_next_fail: /* we failed - try next alterative */
2869
2870 if ( ST.accepted == 1 ) {
2871 /* only one choice left - just continue */
2872 DEBUG_EXECUTE_r({
2873 reg_trie_data * const trie
2874 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2875 SV ** const tmp = RX_DEBUG(reginfo->prog)
2876 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2877 : NULL;
2878 PerlIO_printf( Perl_debug_log,
2879 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 2880 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd
DM
2881 ST.accept_buff[ 0 ].wordnum,
2882 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2883 PL_colors[5] );
2884 });
2885 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2886 /* in this case we free tmps/leave before we call regmatch
2887 as we wont be using accept_buff again. */
2888 FREETMPS;
2889 LEAVE;
2890 locinput = PL_reginput;
2891 nextchr = UCHARAT(locinput);
786e8c11 2892
7f69552c 2893 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
786e8c11
YO
2894 scan = ST.B;
2895 else
7f69552c 2896 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
786e8c11 2897
166ba7cd
DM
2898 continue; /* execute rest of RE */
2899 }
2900
2901 if (!ST.accepted-- ) {
2902 FREETMPS;
2903 LEAVE;
2904 sayNO;
2905 }
2906
a3621e74 2907 /*
166ba7cd
DM
2908 There are at least two accepting states left. Presumably
2909 the number of accepting states is going to be low,
2910 typically two. So we simply scan through to find the one
2911 with lowest wordnum. Once we find it, we swap the last
2912 state into its place and decrement the size. We then try to
2913 match the rest of the pattern at the point where the word
2914 ends. If we succeed, control just continues along the
2915 regex; if we fail we return here to try the next accepting
2916 state
2917 */
a3621e74 2918
166ba7cd
DM
2919 {
2920 U32 best = 0;
2921 U32 cur;
2922 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2923 DEBUG_TRIE_EXECUTE_r(
f2278c82 2924 PerlIO_printf( Perl_debug_log,
166ba7cd 2925 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 2926 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
2927 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2928 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2929 );
2930
2931 if (ST.accept_buff[cur].wordnum <
2932 ST.accept_buff[best].wordnum)
2933 best = cur;
a3621e74 2934 }
166ba7cd
DM
2935
2936 DEBUG_EXECUTE_r({
2937 reg_trie_data * const trie
2938 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2939 SV ** const tmp = RX_DEBUG(reginfo->prog)
2940 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2941 : NULL;
7f69552c 2942 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 2943 ST.B :
7f69552c 2944 ST.me + ST.jump[ST.accept_buff[best].wordnum];
265c4333
YO
2945 PerlIO_printf( Perl_debug_log,
2946 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 2947 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 2948 ST.accept_buff[best].wordnum,
265c4333
YO
2949 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2950 REG_NODE_NUM(nextop),
166ba7cd
DM
2951 PL_colors[5] );
2952 });
2953
2954 if ( best<ST.accepted ) {
2955 reg_trie_accepted tmp = ST.accept_buff[ best ];
2956 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2957 ST.accept_buff[ ST.accepted ] = tmp;
2958 best = ST.accepted;
a3621e74 2959 }
166ba7cd 2960 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 2961 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
786e8c11
YO
2962 PUSH_STATE_GOTO(TRIE_next, ST.B);
2963 /* NOTREACHED */
2964 } else {
7f69552c 2965 PUSH_STATE_GOTO(TRIE_next, ST.me + ST.jump[ST.accept_buff[best].wordnum]);
786e8c11
YO
2966 /* NOTREACHED */
2967 }
2968 /* NOTREACHED */
166ba7cd 2969 }
166ba7cd
DM
2970 /* NOTREACHED */
2971
2972#undef ST
2973
95b24440
DM
2974 case EXACT: {
2975 char *s = STRING(scan);
24d3c4a9 2976 ln = STR_LEN(scan);
eb160463 2977 if (do_utf8 != UTF) {
bc517b45 2978 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2979 char *l = locinput;
24d3c4a9 2980 const char * const e = s + ln;
a72c7584 2981
5ff6fc6d
JH
2982 if (do_utf8) {
2983 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2984 while (s < e) {
a3b680e6 2985 STRLEN ulen;
1aa99e6b 2986 if (l >= PL_regeol)
5ff6fc6d
JH
2987 sayNO;
2988 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2989 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2990 uniflags))
5ff6fc6d 2991 sayNO;
bc517b45 2992 l += ulen;
5ff6fc6d 2993 s ++;
1aa99e6b 2994 }
5ff6fc6d
JH
2995 }
2996 else {
2997 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2998 while (s < e) {
a3b680e6 2999 STRLEN ulen;
1aa99e6b
IH
3000 if (l >= PL_regeol)
3001 sayNO;
5ff6fc6d 3002 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3003 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3004 uniflags))
1aa99e6b 3005 sayNO;
bc517b45 3006 s += ulen;
a72c7584 3007 l ++;
1aa99e6b 3008 }
5ff6fc6d 3009 }
1aa99e6b
IH
3010 locinput = l;
3011 nextchr = UCHARAT(locinput);
3012 break;
3013 }
bc517b45 3014 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3015 /* Inline the first character, for speed. */
3016 if (UCHARAT(s) != nextchr)
3017 sayNO;
24d3c4a9 3018 if (PL_regeol - locinput < ln)
d6a28714 3019 sayNO;
24d3c4a9 3020 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3021 sayNO;
24d3c4a9 3022 locinput += ln;
d6a28714
JH
3023 nextchr = UCHARAT(locinput);
3024 break;
95b24440 3025 }
d6a28714 3026 case EXACTFL:
b8c5462f
JH
3027 PL_reg_flags |= RF_tainted;
3028 /* FALL THROUGH */
95b24440 3029 case EXACTF: {
be8e71aa 3030 char * const s = STRING(scan);
24d3c4a9 3031 ln = STR_LEN(scan);
d6a28714 3032
d07ddd77
JH
3033 if (do_utf8 || UTF) {
3034 /* Either target or the pattern are utf8. */
be8e71aa 3035 const char * const l = locinput;
d07ddd77 3036 char *e = PL_regeol;
bc517b45 3037
24d3c4a9 3038 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3039 l, &e, 0, do_utf8)) {
5486206c
JH
3040 /* One more case for the sharp s:
3041 * pack("U0U*", 0xDF) =~ /ss/i,
3042 * the 0xC3 0x9F are the UTF-8
3043 * byte sequence for the U+00DF. */
3044 if (!(do_utf8 &&
3045 toLOWER(s[0]) == 's' &&
24d3c4a9 3046 ln >= 2 &&
5486206c
JH
3047 toLOWER(s[1]) == 's' &&
3048 (U8)l[0] == 0xC3 &&
3049 e - l >= 2 &&
3050 (U8)l[1] == 0x9F))
3051 sayNO;
3052 }
d07ddd77
JH
3053 locinput = e;
3054 nextchr = UCHARAT(locinput);
3055 break;
a0ed51b3 3056 }
d6a28714 3057
bc517b45
JH
3058 /* Neither the target and the pattern are utf8. */
3059
d6a28714
JH
3060 /* Inline the first character, for speed. */
3061 if (UCHARAT(s) != nextchr &&
3062 UCHARAT(s) != ((OP(scan) == EXACTF)
3063 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3064 sayNO;
24d3c4a9 3065 if (PL_regeol - locinput < ln)
b8c5462f 3066 sayNO;
24d3c4a9
DM
3067 if (ln > 1 && (OP(scan) == EXACTF
3068 ? ibcmp(s, locinput, ln)
3069 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3070 sayNO;
24d3c4a9 3071 locinput += ln;
d6a28714 3072 nextchr = UCHARAT(locinput);
a0d0e21e 3073 break;
95b24440 3074 }
d6a28714 3075 case ANYOF:
ffc61ed2 3076 if (do_utf8) {
9e55ce06
JH
3077 STRLEN inclasslen = PL_regeol - locinput;
3078
32fc9b6a 3079 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3080 goto anyof_fail;
ffc61ed2
JH
3081 if (locinput >= PL_regeol)
3082 sayNO;
0f0076b4 3083 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3084 nextchr = UCHARAT(locinput);
e0f9d4a8 3085 break;
ffc61ed2
JH
3086 }
3087 else {
3088 if (nextchr < 0)
3089 nextchr = UCHARAT(locinput);
32fc9b6a 3090 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3091 goto anyof_fail;
ffc61ed2
JH
3092 if (!nextchr && locinput >= PL_regeol)
3093 sayNO;
3094 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3095 break;
3096 }
262b90c4 3097 anyof_fail:
e0f9d4a8
JH
3098 /* If we might have the case of the German sharp s
3099 * in a casefolding Unicode character class. */
3100
ebc501f0
JH
3101 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3102 locinput += SHARP_S_SKIP;
e0f9d4a8 3103 nextchr = UCHARAT(locinput);
ffc61ed2 3104 }
e0f9d4a8
JH
3105 else
3106 sayNO;
b8c5462f 3107 break;
d6a28714 3108 case ALNUML:
b8c5462f
JH
3109 PL_reg_flags |= RF_tainted;
3110 /* FALL THROUGH */
d6a28714 3111 case ALNUM:
b8c5462f 3112 if (!nextchr)
4633a7c4 3113 sayNO;
ffc61ed2 3114 if (do_utf8) {
1a4fad37 3115 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3116 if (!(OP(scan) == ALNUM
bb7a0f54 3117 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3118 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3119 {
3120 sayNO;
a0ed51b3 3121 }
b8c5462f 3122 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3123 nextchr = UCHARAT(locinput);
3124 break;
3125 }
ffc61ed2 3126 if (!(OP(scan) == ALNUM
d6a28714 3127 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3128 sayNO;
b8c5462f 3129 nextchr = UCHARAT(++locinput);
a0d0e21e 3130 break;
d6a28714 3131 case NALNUML:
b8c5462f
JH
3132 PL_reg_flags |= RF_tainted;
3133 /* FALL THROUGH */
d6a28714
JH
3134 case NALNUM:
3135 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3136 sayNO;
ffc61ed2 3137 if (do_utf8) {
1a4fad37 3138 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3139 if (OP(scan) == NALNUM
bb7a0f54 3140 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3141 : isALNUM_LC_utf8((U8*)locinput))
3142 {
b8c5462f 3143 sayNO;
d6a28714 3144 }
b8c5462f
JH
3145 locinput += PL_utf8skip[nextchr];
3146 nextchr = UCHARAT(locinput);
3147 break;
3148 }
ffc61ed2 3149 if (OP(scan) == NALNUM
d6a28714 3150 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3151 sayNO;
76e3520e 3152 nextchr = UCHARAT(++locinput);
a0d0e21e 3153 break;
d6a28714
JH
3154 case BOUNDL:
3155 case NBOUNDL:
3280af22 3156 PL_reg_flags |= RF_tainted;
bbce6d69 3157 /* FALL THROUGH */
d6a28714
JH
3158 case BOUND:
3159 case NBOUND:
3160 /* was last char in word? */
ffc61ed2 3161 if (do_utf8) {
12d33761 3162 if (locinput == PL_bostr)
24d3c4a9 3163 ln = '\n';
ffc61ed2 3164 else {
a3b680e6 3165 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3166
24d3c4a9 3167 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3168 }
3169 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3170 ln = isALNUM_uni(ln);
1a4fad37 3171 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3172 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3173 }
3174 else {
24d3c4a9 3175 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3176 n = isALNUM_LC_utf8((U8*)locinput);
3177 }
a0ed51b3 3178 }
d6a28714 3179 else {
24d3c4a9 3180 ln = (locinput != PL_bostr) ?
12d33761 3181 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3182 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3183 ln = isALNUM(ln);
ffc61ed2
JH
3184 n = isALNUM(nextchr);
3185 }
3186 else {
24d3c4a9 3187 ln = isALNUM_LC(ln);
ffc61ed2
JH
3188 n = isALNUM_LC(nextchr);
3189 }
d6a28714 3190 }
24d3c4a9 3191 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3192 OP(scan) == BOUNDL))
3193 sayNO;
a0ed51b3 3194 break;
d6a28714 3195 case SPACEL:
3280af22 3196 PL_reg_flags |= RF_tainted;
bbce6d69 3197 /* FALL THROUGH */
d6a28714 3198 case SPACE:
9442cb0e 3199 if (!nextchr)
4633a7c4 3200 sayNO;
1aa99e6b 3201 if (do_utf8) {
fd400ab9 3202 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3203 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3204 if (!(OP(scan) == SPACE
bb7a0f54 3205 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3206 : isSPACE_LC_utf8((U8*)locinput)))
3207 {
3208 sayNO;
3209 }
3210 locinput += PL_utf8skip[nextchr];
3211 nextchr = UCHARAT(locinput);
3212 break;
d6a28714 3213 }
ffc61ed2
JH
3214 if (!(OP(scan) == SPACE
3215 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3216 sayNO;
3217 nextchr = UCHARAT(++locinput);
3218 }
3219 else {
3220 if (!(OP(scan) == SPACE
3221 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3222 sayNO;
3223 nextchr = UCHARAT(++locinput);
a0ed51b3 3224 }
a0ed51b3 3225 break;
d6a28714 3226 case NSPACEL:
3280af22 3227 PL_reg_flags |= RF_tainted;
bbce6d69 3228 /* FALL THROUGH */
d6a28714 3229 case NSPACE:
9442cb0e 3230 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3231 sayNO;
1aa99e6b 3232 if (do_utf8) {
1a4fad37 3233 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3234 if (OP(scan) == NSPACE
bb7a0f54 3235 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3236 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3237 {
3238 sayNO;
3239 }
3240 locinput += PL_utf8skip[nextchr];
3241 nextchr = UCHARAT(locinput);
3242 break;
a0ed51b3 3243 }
ffc61ed2 3244 if (OP(scan) == NSPACE
d6a28714 3245 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3246 sayNO;
76e3520e 3247 nextchr = UCHARAT(++locinput);
a0d0e21e 3248 break;
d6a28714 3249 case DIGITL:
a0ed51b3
LW
3250 PL_reg_flags |= RF_tainted;
3251 /* FALL THROUGH */
d6a28714 3252 case DIGIT:
9442cb0e 3253 if (!nextchr)
a0ed51b3 3254 sayNO;
1aa99e6b 3255 if (do_utf8) {
1a4fad37 3256 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3257 if (!(OP(scan) == DIGIT
bb7a0f54 3258 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3259 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3260 {
a0ed51b3 3261 sayNO;
dfe13c55 3262 }
6f06b55f 3263 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3264 nextchr = UCHARAT(locinput);
3265 break;
3266 }
ffc61ed2 3267 if (!(OP(scan) == DIGIT
9442cb0e 3268 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3269 sayNO;
3270 nextchr = UCHARAT(++locinput);
3271 break;
d6a28714 3272 case NDIGITL:
b8c5462f
JH
3273 PL_reg_flags |= RF_tainted;
3274 /* FALL THROUGH */
d6a28714 3275 case NDIGIT:
9442cb0e 3276 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3277 sayNO;
1aa99e6b 3278 if (do_utf8) {
1a4fad37 3279 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3280 if (OP(scan) == NDIGIT
bb7a0f54 3281 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3282 : isDIGIT_LC_utf8((U8*)locinput))
3283 {
a0ed51b3 3284 sayNO;
9442cb0e 3285 }
6f06b55f 3286 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3287 nextchr = UCHARAT(locinput);
3288 break;
3289 }
ffc61ed2 3290 if (OP(scan) == NDIGIT
9442cb0e 3291 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3292 sayNO;
3293 nextchr = UCHARAT(++locinput);
3294 break;
3295 case CLUMP:
b7c83a7e 3296 if (locinput >= PL_regeol)
a0ed51b3 3297 sayNO;
b7c83a7e 3298 if (do_utf8) {
1a4fad37 3299 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3300 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3301 sayNO;
3302 locinput += PL_utf8skip[nextchr];
3303 while (locinput < PL_regeol &&
3304 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3305 locinput += UTF8SKIP(locinput);
3306 if (locinput > PL_regeol)
3307 sayNO;
eb08e2da
JH
3308 }
3309 else
3310 locinput++;
a0ed51b3
LW
3311 nextchr = UCHARAT(locinput);
3312 break;
81714fb9
YO
3313
3314 case NREFFL:
3315 {
3316 char *s;
ff1157ca 3317 char type;
81714fb9
YO
3318 PL_reg_flags |= RF_tainted;
3319 /* FALL THROUGH */
3320 case NREF:
3321 case NREFF:
ff1157ca 3322 type = OP(scan);
0a4db386
YO
3323 n = reg_check_named_buff_matched(rex,scan);
3324
3325 if ( n ) {
3326 type = REF + ( type - NREF );
3327 goto do_ref;
3328 } else {
81714fb9 3329 sayNO;
0a4db386
YO
3330 }
3331 /* unreached */
c8756f30 3332 case REFFL:
3280af22 3333 PL_reg_flags |= RF_tainted;
c8756f30 3334 /* FALL THROUGH */
c277df42 3335 case REF:
81714fb9 3336 case REFF:
c277df42 3337 n = ARG(scan); /* which paren pair */
81714fb9
YO
3338 type = OP(scan);
3339 do_ref:
24d3c4a9 3340 ln = PL_regstartp[n];
2c2d71f5 3341 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3342 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3343 sayNO; /* Do not match unless seen CLOSEn. */
24d3c4a9 3344 if (ln == PL_regendp[n])
a0d0e21e 3345 break;
a0ed51b3 3346
24d3c4a9 3347 s = PL_bostr + ln;
81714fb9 3348 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3349 char *l = locinput;
a3b680e6 3350 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3351 /*
3352 * Note that we can't do the "other character" lookup trick as
3353 * in the 8-bit case (no pun intended) because in Unicode we
3354 * have to map both upper and title case to lower case.
3355 */
81714fb9 3356 if (type == REFF) {
a0ed51b3 3357 while (s < e) {
a3b680e6
AL
3358 STRLEN ulen1, ulen2;
3359 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3360 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3361
a0ed51b3
LW
3362 if (l >= PL_regeol)
3363 sayNO;
a2a2844f
JH
3364 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3365 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3366 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3367 sayNO;
a2a2844f
JH
3368 s += ulen1;
3369 l += ulen2;
a0ed51b3
LW
3370 }
3371 }
3372 locinput = l;
3373 nextchr = UCHARAT(locinput);
3374 break;
3375 }
3376
a0d0e21e 3377 /* Inline the first character, for speed. */
76e3520e 3378 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3379 (type == REF ||
3380 (UCHARAT(s) != (type == REFF
3381 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3382 sayNO;
24d3c4a9
DM
3383 ln = PL_regendp[n] - ln;
3384 if (locinput + ln > PL_regeol)
4633a7c4 3385 sayNO;
81714fb9 3386 if (ln > 1 && (type == REF
24d3c4a9 3387 ? memNE(s, locinput, ln)
81714fb9 3388 : (type == REFF
24d3c4a9
DM
3389 ? ibcmp(s, locinput, ln)
3390 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3391 sayNO;
24d3c4a9 3392 locinput += ln;
76e3520e 3393 nextchr = UCHARAT(locinput);
a0d0e21e 3394 break;
81714fb9 3395 }
a0d0e21e 3396 case NOTHING:
c277df42 3397 case TAIL:
a0d0e21e
LW
3398 break;
3399 case BACK:
3400 break;
40a82448
DM
3401
3402#undef ST
3403#define ST st->u.eval
c277df42 3404 {
c277df42 3405 SV *ret;
6bda09f9
YO
3406 regexp *re;
3407 regnode *startpoint;
3408
3409 case SRECURSE:
3410 case RECURSE: /* /(...(?1))/ */
3411 if (cur_eval && cur_eval->locinput==locinput) {
3412 if (cur_eval->u.eval.close_paren == ARG(scan))
3413 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3414 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3415 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3416 } else {
3417 nochange_depth = 0;
3418 }
3419 re = rex;
3420 (void)ReREFCNT_inc(rex);
3421 if (OP(scan)==RECURSE) {
3422 startpoint = scan + ARG2L(scan);
3423 ST.close_paren = ARG(scan);
3424 } else {
3425 startpoint = re->program+1;
3426 ST.close_paren = 0;
3427 }
3428 goto eval_recurse_doit;
3429 /* NOTREACHED */
3430 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3431 if (cur_eval && cur_eval->locinput==locinput) {
3432 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3433 Perl_croak(aTHX_ "EVAL without pos change