This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlhack: some portability updates
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
c476f425 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
6bda09f9 169#define REGCP_OTHER_ELEMS 8
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 171
3280af22 172 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 173/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
174 SSPUSHINT(PL_regendp[p]);
175 SSPUSHINT(PL_regstartp[p]);
3280af22 176 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 177 SSPUSHINT(p);
40a82448
DM
178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
179 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
180 (UV)p, (IV)PL_regstartp[p],
181 (IV)(PL_reg_start_tmp[p] - PL_bostr),
182 (IV)PL_regendp[p]
183 ));
a0d0e21e 184 }
b1ce53c5 185/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
6bda09f9
YO
186 SSPUSHPTR(PL_regstartp);
187 SSPUSHPTR(PL_regendp);
3280af22
NIS
188 SSPUSHINT(PL_regsize);
189 SSPUSHINT(*PL_reglastparen);
a01268b5 190 SSPUSHINT(*PL_reglastcloseparen);
3280af22 191 SSPUSHPTR(PL_reginput);
41123dfd
JH
192#define REGCP_FRAME_ELEMS 2
193/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
194 * are needed for the regexp context stack bookkeeping. */
195 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 196 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 197
a0d0e21e
LW
198 return retval;
199}
200
c277df42 201/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
202#define REGCP_SET(cp) \
203 DEBUG_STATE_r( \
ab3bbdeb 204 PerlIO_printf(Perl_debug_log, \
e4f74956 205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
206 (IV)PL_savestack_ix)); \
207 cp = PL_savestack_ix
c3464db5 208
ab3bbdeb 209#define REGCP_UNWIND(cp) \
e4f74956 210 DEBUG_STATE_r( \
ab3bbdeb 211 if (cp != PL_savestack_ix) \
e4f74956
YO
212 PerlIO_printf(Perl_debug_log, \
213 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
214 (IV)(cp), (IV)PL_savestack_ix)); \
215 regcpblow(cp)
c277df42 216
76e3520e 217STATIC char *
097eb12c 218S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 219{
97aff369 220 dVAR;
3b6647e0 221 U32 i;
a0d0e21e 222 char *input;
b1ce53c5 223
a3621e74
YO
224 GET_RE_DEBUG_FLAGS_DECL;
225
b1ce53c5 226 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 227 i = SSPOPINT;
b1ce53c5
JH
228 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
229 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 230 input = (char *) SSPOPPTR;
a01268b5 231 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
232 *PL_reglastparen = SSPOPINT;
233 PL_regsize = SSPOPINT;
6bda09f9
YO
234 PL_regendp=(I32 *) SSPOPPTR;
235 PL_regstartp=(I32 *) SSPOPPTR;
b1ce53c5 236
6bda09f9 237
b1ce53c5 238 /* Now restore the parentheses context. */
41123dfd
JH
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 241 I32 tmps;
097eb12c 242 U32 paren = (U32)SSPOPINT;
3280af22 243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
244 PL_regstartp[paren] = SSPOPINT;
245 tmps = SSPOPINT;
3280af22
NIS
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
a3621e74 248 DEBUG_EXECUTE_r(
c3464db5 249 PerlIO_printf(Perl_debug_log,
b900a521 250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 251 (UV)paren, (IV)PL_regstartp[paren],
b900a521 252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 253 (IV)PL_regendp[paren],
3280af22 254 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 255 );
a0d0e21e 256 }
a3621e74 257 DEBUG_EXECUTE_r(
bb7a0f54 258 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 259 PerlIO_printf(Perl_debug_log,
faccc32b 260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 261 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
262 }
263 );
daf18116 264#if 1
dafc8851
JH
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 * --jhi */
3b6647e0 275 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c
AL
276 if (i > PL_regsize)
277 PL_regstartp[i] = -1;
278 PL_regendp[i] = -1;
a0d0e21e 279 }
dafc8851 280#endif
a0d0e21e
LW
281 return input;
282}
283
02db2b7b 284#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 285
a687059c 286/*
e50aee73 287 * pregexec and friends
a687059c
LW
288 */
289
76234dfb 290#ifndef PERL_IN_XSUB_RE
a687059c 291/*
c277df42 292 - pregexec - match a regexp against a string
a687059c 293 */
c277df42 294I32
864dbfa3 295Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 296 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
297/* strend: pointer to null at end of string */
298/* strbeg: real beginning of string */
299/* minend: end of match must be >=minend after stringarg. */
300/* nosave: For optimizations. */
301{
302 return
9041c2e3 303 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
304 nosave ? 0 : REXEC_COPY_STR);
305}
76234dfb 306#endif
22e551b9 307
9041c2e3 308/*
cad2e5aa
JH
309 * Need to implement the following flags for reg_anch:
310 *
311 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * USE_INTUIT_ML
313 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
314 * INTUIT_AUTORITATIVE_ML
315 * INTUIT_ONCE_NOML - Intuit can match in one location only.
316 * INTUIT_ONCE_ML
317 *
318 * Another flag for this function: SECOND_TIME (so that float substrs
319 * with giant delta may be not rechecked).
320 */
321
322/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323
3f7c398e 324/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
325 Otherwise, only SvCUR(sv) is used to get strbeg. */
326
327/* XXXX We assume that strpos is strbeg unless sv. */
328
6eb5f6b9
JH
329/* XXXX Some places assume that there is a fixed substring.
330 An update may be needed if optimizer marks as "INTUITable"
331 RExen without fixed substrings. Similarly, it is assumed that
332 lengths of all the strings are no more than minlen, thus they
333 cannot come from lookahead.
40d049e4
YO
334 (Or minlen should take into account lookahead.)
335 NOTE: Some of this comment is not correct. minlen does now take account
336 of lookahead/behind. Further research is required. -- demerphq
337
338*/
6eb5f6b9 339
2c2d71f5
JH
340/* A failure to find a constant substring means that there is no need to make
341 an expensive call to REx engine, thus we celebrate a failure. Similarly,
342 finding a substring too deep into the string means that less calls to
30944b6d
IZ
343 regtry() should be needed.
344
345 REx compiler's optimizer found 4 possible hints:
346 a) Anchored substring;
347 b) Fixed substring;
348 c) Whether we are anchored (beginning-of-line or \G);
349 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 350 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
351 string which does not contradict any of them.
352 */
2c2d71f5 353
6eb5f6b9
JH
354/* Most of decisions we do here should have been done at compile time.
355 The nodes of the REx which we used for the search should have been
356 deleted from the finite automaton. */
357
cad2e5aa
JH
358char *
359Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
360 char *strend, U32 flags, re_scream_pos_data *data)
361{
97aff369 362 dVAR;
b7953727 363 register I32 start_shift = 0;
cad2e5aa 364 /* Should be nonnegative! */
b7953727 365 register I32 end_shift = 0;
2c2d71f5
JH
366 register char *s;
367 register SV *check;
a1933d95 368 char *strbeg;
cad2e5aa 369 char *t;
1de06328 370 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 371 I32 ml_anch;
bd61b366
SS
372 register char *other_last = NULL; /* other substr checked before this */
373 char *check_at = NULL; /* check substr found at this pos */
bbe252da 374 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 375 RXi_GET_DECL(prog,progi);
30944b6d 376#ifdef DEBUGGING
890ce7af 377 const char * const i_strpos = strpos;
30944b6d 378#endif
a3621e74
YO
379
380 GET_RE_DEBUG_FLAGS_DECL;
381
a30b2f1f 382 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 383
bbe252da 384 if (prog->extflags & RXf_UTF8) {
b8d68ded
JH
385 PL_reg_flags |= RF_utf8;
386 }
ab3bbdeb
YO
387 DEBUG_EXECUTE_r(
388 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
389 sv ? "Guessing start of match in sv for"
390 : "Guessing start of match in string for");
2a782b5b 391 );
cad2e5aa 392
c344f387
JH
393 /* CHR_DIST() would be more correct here but it makes things slow. */
394 if (prog->minlen > strend - strpos) {
a3621e74 395 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 396 "String too short... [re_intuit_start]\n"));
cad2e5aa 397 goto fail;
2c2d71f5 398 }
1de06328 399
a1933d95 400 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 401 PL_regeol = strend;
33b8afdf
JH
402 if (do_utf8) {
403 if (!prog->check_utf8 && prog->check_substr)
404 to_utf8_substr(prog);
405 check = prog->check_utf8;
406 } else {
407 if (!prog->check_substr && prog->check_utf8)
408 to_byte_substr(prog);
409 check = prog->check_substr;
410 }
1de06328 411 if (check == &PL_sv_undef) {
a3621e74 412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 413 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
414 goto fail;
415 }
bbe252da
YO
416 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
417 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
418 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 419 && !multiline ) ); /* Check after \n? */
cad2e5aa 420
7e25d62c 421 if (!ml_anch) {
bbe252da
YO
422 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
423 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 424 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
425 && sv && !SvROK(sv)
426 && (strpos != strbeg)) {
a3621e74 427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
428 goto fail;
429 }
430 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 431 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 432 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
433 I32 slen;
434
1aa99e6b 435 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 436
653099ff
GS
437 if (SvTAIL(check)) {
438 slen = SvCUR(check); /* >= 1 */
cad2e5aa 439
9041c2e3 440 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 441 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 443 goto fail_finish;
cad2e5aa
JH
444 }
445 /* Now should match s[0..slen-2] */
446 slen--;
3f7c398e 447 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 448 || (slen > 1
3f7c398e 449 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 450 report_neq:
a3621e74 451 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
452 goto fail_finish;
453 }
cad2e5aa 454 }
3f7c398e 455 else if (*SvPVX_const(check) != *s
653099ff 456 || ((slen = SvCUR(check)) > 1
3f7c398e 457 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 458 goto report_neq;
c315bfe8 459 check_at = s;
2c2d71f5 460 goto success_at_start;
7e25d62c 461 }
cad2e5aa 462 }
2c2d71f5 463 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 464 s = strpos;
2c2d71f5 465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
466 end_shift = prog->check_end_shift;
467
2c2d71f5 468 if (!ml_anch) {
a3b680e6 469 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 470 - (SvTAIL(check) != 0);
a3b680e6 471 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
472
473 if (end_shift < eshift)
474 end_shift = eshift;
475 }
cad2e5aa 476 }
2c2d71f5 477 else { /* Can match at random position */
cad2e5aa
JH
478 ml_anch = 0;
479 s = strpos;
1de06328
YO
480 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
481 end_shift = prog->check_end_shift;
482
483 /* end shift should be non negative here */
cad2e5aa
JH
484 }
485
2c2d71f5 486#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 487 if (end_shift < 0)
1de06328 488 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 489 (IV)end_shift, prog->precomp);
2c2d71f5
JH
490#endif
491
2c2d71f5
JH
492 restart:
493 /* Find a possible match in the region s..strend by looking for
494 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
495
496 {
497 I32 srch_start_shift = start_shift;
498 I32 srch_end_shift = end_shift;
499 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
500 srch_end_shift -= ((strbeg - s) - srch_start_shift);
501 srch_start_shift = strbeg - s;
502 }
6bda09f9 503 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
504 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
505 (IV)prog->check_offset_min,
506 (IV)srch_start_shift,
507 (IV)srch_end_shift,
508 (IV)prog->check_end_shift);
509 });
510
cad2e5aa 511 if (flags & REXEC_SCREAM) {
cad2e5aa 512 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 513 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 514
2c2d71f5
JH
515 if (PL_screamfirst[BmRARE(check)] >= 0
516 || ( BmRARE(check) == '\n'
517 && (BmPREVIOUS(check) == SvCUR(check) - 1)
518 && SvTAIL(check) ))
9041c2e3 519 s = screaminstr(sv, check,
1de06328 520 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 521 else
2c2d71f5 522 goto fail_finish;
4addbd3b
HS
523 /* we may be pointing at the wrong string */
524 if (s && RX_MATCH_COPIED(prog))
3f7c398e 525 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
526 if (data)
527 *data->scream_olds = s;
528 }
1de06328
YO
529 else {
530 U8* start_point;
531 U8* end_point;
bbe252da 532 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
533 start_point= (U8*)(s + srch_start_shift);
534 end_point= (U8*)(strend - srch_end_shift);
535 } else {
536 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
537 end_point= HOP3(strend, -srch_end_shift, strbeg);
538 }
6bda09f9 539 DEBUG_OPTIMISE_MORE_r({
56570a2c 540 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 541 (int)(end_point - start_point),
fc8cd66c 542 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
543 start_point);
544 });
545
546 s = fbm_instr( start_point, end_point,
7fba1cd6 547 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
548 }
549 }
cad2e5aa
JH
550 /* Update the count-of-usability, remove useless subpatterns,
551 unshift s. */
2c2d71f5 552
ab3bbdeb
YO
553 DEBUG_EXECUTE_r({
554 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
555 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
556 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 557 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
558 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
559 ? "anchored" : "floating"),
560 quoted,
561 RE_SV_TAIL(check),
562 (s ? " at offset " : "...\n") );
563 });
2c2d71f5
JH
564
565 if (!s)
566 goto fail_finish;
2c2d71f5 567 /* Finish the diagnostic message */
a3621e74 568 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 569
1de06328
YO
570 /* XXX dmq: first branch is for positive lookbehind...
571 Our check string is offset from the beginning of the pattern.
572 So we need to do any stclass tests offset forward from that
573 point. I think. :-(
574 */
575
576
577
578 check_at=s;
579
580
2c2d71f5
JH
581 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
582 Start with the other substr.
583 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 584 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
585 *always* match. Probably should be marked during compile...
586 Probably it is right to do no SCREAM here...
587 */
588
1de06328
YO
589 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
590 : (prog->float_substr && prog->anchored_substr))
591 {
30944b6d 592 /* Take into account the "other" substring. */
2c2d71f5
JH
593 /* XXXX May be hopelessly wrong for UTF... */
594 if (!other_last)
6eb5f6b9 595 other_last = strpos;
33b8afdf 596 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
597 do_other_anchored:
598 {
890ce7af
AL
599 char * const last = HOP3c(s, -start_shift, strbeg);
600 char *last1, *last2;
be8e71aa 601 char * const saved_s = s;
33b8afdf 602 SV* must;
2c2d71f5 603
2c2d71f5
JH
604 t = s - prog->check_offset_max;
605 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 606 && (!do_utf8
0ce71af7 607 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 608 && t > strpos)))
6f207bd3 609 NOOP;
2c2d71f5
JH
610 else
611 t = strpos;
1aa99e6b 612 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
613 if (t < other_last) /* These positions already checked */
614 t = other_last;
1aa99e6b 615 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
616 if (last < last1)
617 last1 = last;
1de06328
YO
618 /* XXXX It is not documented what units *_offsets are in.
619 We assume bytes, but this is clearly wrong.
620 Meaning this code needs to be carefully reviewed for errors.
621 dmq.
622 */
623
2c2d71f5 624 /* On end-of-str: see comment below. */
33b8afdf
JH
625 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
626 if (must == &PL_sv_undef) {
627 s = (char*)NULL;
1de06328 628 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
629 }
630 else
631 s = fbm_instr(
632 (unsigned char*)t,
633 HOP3(HOP3(last1, prog->anchored_offset, strend)
634 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
635 must,
7fba1cd6 636 multiline ? FBMrf_MULTILINE : 0
33b8afdf 637 );
ab3bbdeb
YO
638 DEBUG_EXECUTE_r({
639 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
640 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
641 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 642 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
643 quoted, RE_SV_TAIL(must));
644 });
645
646
2c2d71f5
JH
647 if (!s) {
648 if (last1 >= last2) {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
650 ", giving up...\n"));
651 goto fail_finish;
652 }
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 654 ", trying floating at offset %ld...\n",
be8e71aa 655 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
656 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
657 s = HOP3c(last, 1, strend);
2c2d71f5
JH
658 goto restart;
659 }
660 else {
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 662 (long)(s - i_strpos)));
1aa99e6b
IH
663 t = HOP3c(s, -prog->anchored_offset, strbeg);
664 other_last = HOP3c(s, 1, strend);
be8e71aa 665 s = saved_s;
2c2d71f5
JH
666 if (t == strpos)
667 goto try_at_start;
2c2d71f5
JH
668 goto try_at_offset;
669 }
30944b6d 670 }
2c2d71f5
JH
671 }
672 else { /* Take into account the floating substring. */
33b8afdf 673 char *last, *last1;
be8e71aa 674 char * const saved_s = s;
33b8afdf
JH
675 SV* must;
676
677 t = HOP3c(s, -start_shift, strbeg);
678 last1 = last =
679 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
680 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
681 last = HOP3c(t, prog->float_max_offset, strend);
682 s = HOP3c(t, prog->float_min_offset, strend);
683 if (s < other_last)
684 s = other_last;
2c2d71f5 685 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
686 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
687 /* fbm_instr() takes into account exact value of end-of-str
688 if the check is SvTAIL(ed). Since false positives are OK,
689 and end-of-str is not later than strend we are OK. */
690 if (must == &PL_sv_undef) {
691 s = (char*)NULL;
1de06328 692 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
693 }
694 else
2c2d71f5 695 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
696 (unsigned char*)last + SvCUR(must)
697 - (SvTAIL(must)!=0),
7fba1cd6 698 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
699 DEBUG_EXECUTE_r({
700 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
701 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
702 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 703 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
704 quoted, RE_SV_TAIL(must));
705 });
33b8afdf
JH
706 if (!s) {
707 if (last1 == last) {
a3621e74 708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
709 ", giving up...\n"));
710 goto fail_finish;
2c2d71f5 711 }
a3621e74 712 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 713 ", trying anchored starting at offset %ld...\n",
be8e71aa 714 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
715 other_last = last;
716 s = HOP3c(t, 1, strend);
717 goto restart;
718 }
719 else {
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
721 (long)(s - i_strpos)));
722 other_last = s; /* Fix this later. --Hugo */
be8e71aa 723 s = saved_s;
33b8afdf
JH
724 if (t == strpos)
725 goto try_at_start;
726 goto try_at_offset;
727 }
2c2d71f5 728 }
cad2e5aa 729 }
2c2d71f5 730
1de06328 731
9ef43ace 732 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 733
6bda09f9 734 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
735 PerlIO_printf(Perl_debug_log,
736 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
737 (IV)prog->check_offset_min,
738 (IV)prog->check_offset_max,
739 (IV)(s-strpos),
740 (IV)(t-strpos),
741 (IV)(t-s),
742 (IV)(strend-strpos)
743 )
744 );
745
2c2d71f5 746 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 747 && (!do_utf8
9ef43ace 748 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
749 && t > strpos)))
750 {
2c2d71f5
JH
751 /* Fixed substring is found far enough so that the match
752 cannot start at strpos. */
753 try_at_offset:
cad2e5aa 754 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
755 /* Eventually fbm_*() should handle this, but often
756 anchored_offset is not 0, so this check will not be wasted. */
757 /* XXXX In the code below we prefer to look for "^" even in
758 presence of anchored substrings. And we search even
759 beyond the found float position. These pessimizations
760 are historical artefacts only. */
761 find_anchor:
2c2d71f5 762 while (t < strend - prog->minlen) {
cad2e5aa 763 if (*t == '\n') {
4ee3650e 764 if (t < check_at - prog->check_offset_min) {
33b8afdf 765 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
766 /* Since we moved from the found position,
767 we definitely contradict the found anchored
30944b6d
IZ
768 substr. Due to the above check we do not
769 contradict "check" substr.
770 Thus we can arrive here only if check substr
771 is float. Redo checking for "other"=="fixed".
772 */
9041c2e3 773 strpos = t + 1;
a3621e74 774 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 775 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
776 goto do_other_anchored;
777 }
4ee3650e
GS
778 /* We don't contradict the found floating substring. */
779 /* XXXX Why not check for STCLASS? */
cad2e5aa 780 s = t + 1;
a3621e74 781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 782 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
783 goto set_useful;
784 }
4ee3650e
GS
785 /* Position contradicts check-string */
786 /* XXXX probably better to look for check-string
787 than for "\n", so one should lower the limit for t? */
a3621e74 788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 789 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 790 other_last = strpos = s = t + 1;
cad2e5aa
JH
791 goto restart;
792 }
793 t++;
794 }
a3621e74 795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 796 PL_colors[0], PL_colors[1]));
2c2d71f5 797 goto fail_finish;
cad2e5aa 798 }
f5952150 799 else {
a3621e74 800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 801 PL_colors[0], PL_colors[1]));
f5952150 802 }
cad2e5aa
JH
803 s = t;
804 set_useful:
33b8afdf 805 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
806 }
807 else {
f5952150 808 /* The found string does not prohibit matching at strpos,
2c2d71f5 809 - no optimization of calling REx engine can be performed,
f5952150
GS
810 unless it was an MBOL and we are not after MBOL,
811 or a future STCLASS check will fail this. */
2c2d71f5
JH
812 try_at_start:
813 /* Even in this situation we may use MBOL flag if strpos is offset
814 wrt the start of the string. */
05b4157f 815 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 816 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 817 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 818 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 819 {
cad2e5aa
JH
820 t = strpos;
821 goto find_anchor;
822 }
a3621e74 823 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 824 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 825 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 826 );
2c2d71f5 827 success_at_start:
bbe252da 828 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
829 && (do_utf8 ? (
830 prog->check_utf8 /* Could be deleted already */
831 && --BmUSEFUL(prog->check_utf8) < 0
832 && (prog->check_utf8 == prog->float_utf8)
833 ) : (
834 prog->check_substr /* Could be deleted already */
835 && --BmUSEFUL(prog->check_substr) < 0
836 && (prog->check_substr == prog->float_substr)
837 )))
66e933ab 838 {
cad2e5aa 839 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 840 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
841 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
842 if (do_utf8 ? prog->check_substr : prog->check_utf8)
843 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
844 prog->check_substr = prog->check_utf8 = NULL; /* disable */
845 prog->float_substr = prog->float_utf8 = NULL; /* clear */
846 check = NULL; /* abort */
cad2e5aa 847 s = strpos;
3cf5c195
IZ
848 /* XXXX This is a remnant of the old implementation. It
849 looks wasteful, since now INTUIT can use many
6eb5f6b9 850 other heuristics. */
bbe252da 851 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
852 }
853 else
854 s = strpos;
855 }
856
6eb5f6b9
JH
857 /* Last resort... */
858 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
859 /* trie stclasses are too expensive to use here, we are better off to
860 leave it to regmatch itself */
f8fc2ecf 861 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
862 /* minlen == 0 is possible if regstclass is \b or \B,
863 and the fixed substr is ''$.
864 Since minlen is already taken into account, s+1 is before strend;
865 accidentally, minlen >= 1 guaranties no false positives at s + 1
866 even for \b or \B. But (minlen? 1 : 0) below assumes that
867 regstclass does not come from lookahead... */
868 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
869 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
870 const U8* const str = (U8*)STRING(progi->regstclass);
871 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
872 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 873 : 1);
1de06328
YO
874 char * endpos;
875 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
876 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
877 else if (prog->float_substr || prog->float_utf8)
878 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
879 else
880 endpos= strend;
881
56570a2c 882 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
b57a0404 883 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
1de06328 884
6eb5f6b9 885 t = s;
f8fc2ecf 886 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
887 if (!s) {
888#ifdef DEBUGGING
cbbf8932 889 const char *what = NULL;
6eb5f6b9
JH
890#endif
891 if (endpos == strend) {
a3621e74 892 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
893 "Could not match STCLASS...\n") );
894 goto fail;
895 }
a3621e74 896 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 897 "This position contradicts STCLASS...\n") );
bbe252da 898 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 899 goto fail;
6eb5f6b9 900 /* Contradict one of substrings */
33b8afdf
JH
901 if (prog->anchored_substr || prog->anchored_utf8) {
902 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 903 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 904 hop_and_restart:
1aa99e6b 905 s = HOP3c(t, 1, strend);
66e933ab
GS
906 if (s + start_shift + end_shift > strend) {
907 /* XXXX Should be taken into account earlier? */
a3621e74 908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
909 "Could not match STCLASS...\n") );
910 goto fail;
911 }
5e39e1e5
HS
912 if (!check)
913 goto giveup;
a3621e74 914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 915 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
916 what, (long)(s + start_shift - i_strpos)) );
917 goto restart;
918 }
66e933ab 919 /* Have both, check_string is floating */
6eb5f6b9
JH
920 if (t + start_shift >= check_at) /* Contradicts floating=check */
921 goto retry_floating_check;
922 /* Recheck anchored substring, but not floating... */
9041c2e3 923 s = check_at;
5e39e1e5
HS
924 if (!check)
925 goto giveup;
a3621e74 926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 927 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
928 (long)(other_last - i_strpos)) );
929 goto do_other_anchored;
930 }
60e71179
GS
931 /* Another way we could have checked stclass at the
932 current position only: */
933 if (ml_anch) {
934 s = t = t + 1;
5e39e1e5
HS
935 if (!check)
936 goto giveup;
a3621e74 937 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 938 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 939 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 940 goto try_at_offset;
66e933ab 941 }
33b8afdf 942 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 943 goto fail;
6eb5f6b9
JH
944 /* Check is floating subtring. */
945 retry_floating_check:
946 t = check_at - start_shift;
a3621e74 947 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
948 goto hop_and_restart;
949 }
b7953727 950 if (t != s) {
a3621e74 951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 952 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
953 (long)(t - i_strpos), (long)(s - i_strpos))
954 );
955 }
956 else {
a3621e74 957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
958 "Does not contradict STCLASS...\n");
959 );
960 }
6eb5f6b9 961 }
5e39e1e5 962 giveup:
a3621e74 963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
964 PL_colors[4], (check ? "Guessed" : "Giving up"),
965 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 966 return s;
2c2d71f5
JH
967
968 fail_finish: /* Substring not found */
33b8afdf
JH
969 if (prog->check_substr || prog->check_utf8) /* could be removed already */
970 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 971 fail:
a3621e74 972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 973 PL_colors[4], PL_colors[5]));
bd61b366 974 return NULL;
cad2e5aa 975}
9661b544 976
786e8c11 977
3b0527fe 978
4cadc6a9
YO
979#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
980foldlen, foldbuf, uniflags) STMT_START { \
981 switch (trie_type) { \
982 case trie_utf8_fold: \
983 if ( foldlen>0 ) { \
984 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
985 foldlen -= len; \
986 uscan += len; \
987 len=0; \
988 } else { \
989 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
990 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
991 foldlen -= UNISKIP( uvc ); \
992 uscan = foldbuf + UNISKIP( uvc ); \
993 } \
994 break; \
995 case trie_utf8: \
996 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
997 break; \
998 case trie_plain: \
999 uvc = (UV)*uc; \
1000 len = 1; \
1001 } \
1002 \
1003 if (uvc < 256) { \
1004 charid = trie->charmap[ uvc ]; \
1005 } \
1006 else { \
1007 charid = 0; \
1008 if (trie->widecharmap) { \
1009 SV** const svpp = hv_fetch(trie->widecharmap, \
1010 (char*)&uvc, sizeof(UV), 0); \
1011 if (svpp) \
1012 charid = (U16)SvIV(*svpp); \
1013 } \
1014 } \
1015} STMT_END
1016
1017#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1018 if ( (CoNd) \
1019 && (ln == len || \
1020 ibcmp_utf8(s, NULL, 0, do_utf8, \
1021 m, NULL, ln, (bool)UTF)) \
24b23f37 1022 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1023 goto got_it; \
1024 else { \
1025 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1026 uvchr_to_utf8(tmpbuf, c); \
1027 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1028 if ( f != c \
1029 && (f == c1 || f == c2) \
1030 && (ln == foldlen || \
1031 !ibcmp_utf8((char *) foldbuf, \
1032 NULL, foldlen, do_utf8, \
1033 m, \
1034 NULL, ln, (bool)UTF)) \
24b23f37 1035 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1036 goto got_it; \
1037 } \
1038 s += len
1039
1040#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1041STMT_START { \
1042 while (s <= e) { \
1043 if ( (CoNd) \
1044 && (ln == 1 || !(OP(c) == EXACTF \
1045 ? ibcmp(s, m, ln) \
1046 : ibcmp_locale(s, m, ln))) \
24b23f37 1047 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1048 goto got_it; \
1049 s++; \
1050 } \
1051} STMT_END
1052
1053#define REXEC_FBC_UTF8_SCAN(CoDe) \
1054STMT_START { \
1055 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1056 CoDe \
1057 s += uskip; \
1058 } \
1059} STMT_END
1060
1061#define REXEC_FBC_SCAN(CoDe) \
1062STMT_START { \
1063 while (s < strend) { \
1064 CoDe \
1065 s++; \
1066 } \
1067} STMT_END
1068
1069#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1070REXEC_FBC_UTF8_SCAN( \
1071 if (CoNd) { \
24b23f37 1072 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1073 goto got_it; \
1074 else \
1075 tmp = doevery; \
1076 } \
1077 else \
1078 tmp = 1; \
1079)
1080
1081#define REXEC_FBC_CLASS_SCAN(CoNd) \
1082REXEC_FBC_SCAN( \
1083 if (CoNd) { \
24b23f37 1084 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1085 goto got_it; \
1086 else \
1087 tmp = doevery; \
1088 } \
1089 else \
1090 tmp = 1; \
1091)
1092
1093#define REXEC_FBC_TRYIT \
24b23f37 1094if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1095 goto got_it
1096
1097#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1098 if (do_utf8) { \
1099 UtFpReLoAd; \
1100 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1101 } \
1102 else { \
1103 REXEC_FBC_CLASS_SCAN(CoNd); \
1104 } \
1105 break
1106
1107#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1108 PL_reg_flags |= RF_tainted; \
1109 if (do_utf8) { \
1110 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1111 } \
1112 else { \
1113 REXEC_FBC_CLASS_SCAN(CoNd); \
1114 } \
1115 break
1116
786e8c11
YO
1117#define DUMP_EXEC_POS(li,s,doutf8) \
1118 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1119
1120/* We know what class REx starts with. Try to find this position... */
1121/* if reginfo is NULL, its a dryrun */
1122/* annoyingly all the vars in this routine have different names from their counterparts
1123 in regmatch. /grrr */
1124
3c3eec57 1125STATIC char *
07be1b83 1126S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1127 const char *strend, regmatch_info *reginfo)
a687059c 1128{
27da23d5 1129 dVAR;
bbe252da 1130 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1131 char *m;
d8093b23 1132 STRLEN ln;
5dab1207 1133 STRLEN lnc;
078c425b 1134 register STRLEN uskip;
d8093b23
G
1135 unsigned int c1;
1136 unsigned int c2;
6eb5f6b9
JH
1137 char *e;
1138 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1139 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf
YO
1140 RXi_GET_DECL(prog,progi);
1141
6eb5f6b9
JH
1142 /* We know what class it must start with. */
1143 switch (OP(c)) {
6eb5f6b9 1144 case ANYOF:
388cc4de 1145 if (do_utf8) {
4cadc6a9 1146 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1147 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1148 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1149 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1150 }
1151 else {
1152 while (s < strend) {
1153 STRLEN skip = 1;
1154
32fc9b6a 1155 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1156 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1157 /* The assignment of 2 is intentional:
1158 * for the folded sharp s, the skip is 2. */
1159 (skip = SHARP_S_SKIP))) {
24b23f37 1160 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1161 goto got_it;
1162 else
1163 tmp = doevery;
1164 }
1165 else
1166 tmp = 1;
1167 s += skip;
1168 }
a0d0e21e 1169 }
6eb5f6b9 1170 break;
f33976b4 1171 case CANY:
4cadc6a9 1172 REXEC_FBC_SCAN(
24b23f37 1173 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1174 goto got_it;
1175 else
1176 tmp = doevery;
4cadc6a9 1177 );
f33976b4 1178 break;
6eb5f6b9 1179 case EXACTF:
5dab1207
NIS
1180 m = STRING(c);
1181 ln = STR_LEN(c); /* length to match in octets/bytes */
1182 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1183 if (UTF) {
a2a2844f 1184 STRLEN ulen1, ulen2;
5dab1207 1185 U8 *sm = (U8 *) m;
89ebb4a3
JH
1186 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1187 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1188 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1189
1190 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1191 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1192
89ebb4a3 1193 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1194 0, uniflags);
89ebb4a3 1195 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1196 0, uniflags);
5dab1207
NIS
1197 lnc = 0;
1198 while (sm < ((U8 *) m + ln)) {
1199 lnc++;
1200 sm += UTF8SKIP(sm);
1201 }
1aa99e6b
IH
1202 }
1203 else {
1204 c1 = *(U8*)m;
1205 c2 = PL_fold[c1];
1206 }
6eb5f6b9
JH
1207 goto do_exactf;
1208 case EXACTFL:
5dab1207
NIS
1209 m = STRING(c);
1210 ln = STR_LEN(c);
1211 lnc = (I32) ln;
d8093b23 1212 c1 = *(U8*)m;
6eb5f6b9
JH
1213 c2 = PL_fold_locale[c1];
1214 do_exactf:
db12adc6 1215 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1216
3b0527fe 1217 if (!reginfo && e < s)
6eb5f6b9 1218 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1219
60a8b682
JH
1220 /* The idea in the EXACTF* cases is to first find the
1221 * first character of the EXACTF* node and then, if
1222 * necessary, case-insensitively compare the full
1223 * text of the node. The c1 and c2 are the first
1224 * characters (though in Unicode it gets a bit
1225 * more complicated because there are more cases
7f16dd3d
JH
1226 * than just upper and lower: one needs to use
1227 * the so-called folding case for case-insensitive
1228 * matching (called "loose matching" in Unicode).
1229 * ibcmp_utf8() will do just that. */
60a8b682 1230
1aa99e6b 1231 if (do_utf8) {
575cac57 1232 UV c, f;
89ebb4a3 1233 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1234 STRLEN len, foldlen;
4ad0818d 1235 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1236 if (c1 == c2) {
5dab1207
NIS
1237 /* Upper and lower of 1st char are equal -
1238 * probably not a "letter". */
1aa99e6b 1239 while (s <= e) {
89ebb4a3 1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1241 uniflags);
4cadc6a9 1242 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1243 }
09091399
JH
1244 }
1245 else {
1aa99e6b 1246 while (s <= e) {
89ebb4a3 1247 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1248 uniflags);
80aecb99 1249
60a8b682 1250 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1251 * Note that not all the possible combinations
1252 * are handled here: some of them are handled
1253 * by the standard folding rules, and some of
1254 * them (the character class or ANYOF cases)
1255 * are handled during compiletime in
1256 * regexec.c:S_regclass(). */
880bd946
JH
1257 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1258 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1259 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1260
4cadc6a9 1261 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1262 }
09091399 1263 }
1aa99e6b
IH
1264 }
1265 else {
1266 if (c1 == c2)
4cadc6a9 1267 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1268 else
4cadc6a9 1269 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1270 }
1271 break;
bbce6d69 1272 case BOUNDL:
3280af22 1273 PL_reg_flags |= RF_tainted;
bbce6d69 1274 /* FALL THROUGH */
a0d0e21e 1275 case BOUND:
ffc61ed2 1276 if (do_utf8) {
12d33761 1277 if (s == PL_bostr)
ffc61ed2
JH
1278 tmp = '\n';
1279 else {
6136c704 1280 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1281 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1282 }
1283 tmp = ((OP(c) == BOUND ?
9041c2e3 1284 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1285 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1286 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1287 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1288 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1289 isALNUM_LC_utf8((U8*)s)))
1290 {
1291 tmp = !tmp;
4cadc6a9 1292 REXEC_FBC_TRYIT;
a687059c 1293 }
4cadc6a9 1294 );
a0d0e21e 1295 }
667bb95a 1296 else {
12d33761 1297 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1298 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1299 REXEC_FBC_SCAN(
ffc61ed2
JH
1300 if (tmp ==
1301 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1302 tmp = !tmp;
4cadc6a9 1303 REXEC_FBC_TRYIT;
a0ed51b3 1304 }
4cadc6a9 1305 );
a0ed51b3 1306 }
24b23f37 1307 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1308 goto got_it;
1309 break;
bbce6d69 1310 case NBOUNDL:
3280af22 1311 PL_reg_flags |= RF_tainted;
bbce6d69 1312 /* FALL THROUGH */
a0d0e21e 1313 case NBOUND:
ffc61ed2 1314 if (do_utf8) {
12d33761 1315 if (s == PL_bostr)
ffc61ed2
JH
1316 tmp = '\n';
1317 else {
6136c704 1318 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1319 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1320 }
1321 tmp = ((OP(c) == NBOUND ?
9041c2e3 1322 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1323 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1324 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1325 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1326 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1327 isALNUM_LC_utf8((U8*)s)))
1328 tmp = !tmp;
4cadc6a9
YO
1329 else REXEC_FBC_TRYIT;
1330 );
a0d0e21e 1331 }
667bb95a 1332 else {
12d33761 1333 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1334 tmp = ((OP(c) == NBOUND ?
1335 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1336 REXEC_FBC_SCAN(
ffc61ed2
JH
1337 if (tmp ==
1338 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1339 tmp = !tmp;
4cadc6a9
YO
1340 else REXEC_FBC_TRYIT;
1341 );
a0ed51b3 1342 }
24b23f37 1343 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1344 goto got_it;
1345 break;
a0d0e21e 1346 case ALNUM:
4cadc6a9
YO
1347 REXEC_FBC_CSCAN_PRELOAD(
1348 LOAD_UTF8_CHARCLASS_ALNUM(),
1349 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1350 isALNUM(*s)
1351 );
bbce6d69 1352 case ALNUML:
4cadc6a9
YO
1353 REXEC_FBC_CSCAN_TAINT(
1354 isALNUM_LC_utf8((U8*)s),
1355 isALNUM_LC(*s)
1356 );
a0d0e21e 1357 case NALNUM:
4cadc6a9
YO
1358 REXEC_FBC_CSCAN_PRELOAD(
1359 LOAD_UTF8_CHARCLASS_ALNUM(),
1360 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1361 !isALNUM(*s)
1362 );
bbce6d69 1363 case NALNUML:
4cadc6a9
YO
1364 REXEC_FBC_CSCAN_TAINT(
1365 !isALNUM_LC_utf8((U8*)s),
1366 !isALNUM_LC(*s)
1367 );
a0d0e21e 1368 case SPACE:
4cadc6a9
YO
1369 REXEC_FBC_CSCAN_PRELOAD(
1370 LOAD_UTF8_CHARCLASS_SPACE(),
1371 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1372 isSPACE(*s)
1373 );
bbce6d69 1374 case SPACEL:
4cadc6a9
YO
1375 REXEC_FBC_CSCAN_TAINT(
1376 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1377 isSPACE_LC(*s)
1378 );
a0d0e21e 1379 case NSPACE:
4cadc6a9
YO
1380 REXEC_FBC_CSCAN_PRELOAD(
1381 LOAD_UTF8_CHARCLASS_SPACE(),
1382 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1383 !isSPACE(*s)
1384 );
bbce6d69 1385 case NSPACEL:
4cadc6a9
YO
1386 REXEC_FBC_CSCAN_TAINT(
1387 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1388 !isSPACE_LC(*s)
1389 );
a0d0e21e 1390 case DIGIT:
4cadc6a9
YO
1391 REXEC_FBC_CSCAN_PRELOAD(
1392 LOAD_UTF8_CHARCLASS_DIGIT(),
1393 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1394 isDIGIT(*s)
1395 );
b8c5462f 1396 case DIGITL:
4cadc6a9
YO
1397 REXEC_FBC_CSCAN_TAINT(
1398 isDIGIT_LC_utf8((U8*)s),
1399 isDIGIT_LC(*s)
1400 );
a0d0e21e 1401 case NDIGIT:
4cadc6a9
YO
1402 REXEC_FBC_CSCAN_PRELOAD(
1403 LOAD_UTF8_CHARCLASS_DIGIT(),
1404 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1405 !isDIGIT(*s)
1406 );
b8c5462f 1407 case NDIGITL:
4cadc6a9
YO
1408 REXEC_FBC_CSCAN_TAINT(
1409 !isDIGIT_LC_utf8((U8*)s),
1410 !isDIGIT_LC(*s)
1411 );
1de06328
YO
1412 case AHOCORASICKC:
1413 case AHOCORASICK:
07be1b83
YO
1414 {
1415 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1416 trie_type = do_utf8 ?
1417 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1418 : trie_plain;
1419 /* what trie are we using right now */
1420 reg_ac_data *aho
f8fc2ecf 1421 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
07be1b83
YO
1422 reg_trie_data *trie=aho->trie;
1423
1424 const char *last_start = strend - trie->minlen;
6148ee25 1425#ifdef DEBUGGING
07be1b83 1426 const char *real_start = s;
6148ee25 1427#endif
07be1b83 1428 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1429 SV *sv_points;
1430 U8 **points; /* map of where we were in the input string
786e8c11 1431 when reading a given char. For ASCII this
be8e71aa
YO
1432 is unnecessary overhead as the relationship
1433 is always 1:1, but for unicode, especially
1434 case folded unicode this is not true. */
f9e705e8 1435 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1436 U8 *bitmap=NULL;
1437
07be1b83
YO
1438
1439 GET_RE_DEBUG_FLAGS_DECL;
1440
be8e71aa
YO
1441 /* We can't just allocate points here. We need to wrap it in
1442 * an SV so it gets freed properly if there is a croak while
1443 * running the match */
1444 ENTER;
1445 SAVETMPS;
1446 sv_points=newSV(maxlen * sizeof(U8 *));
1447 SvCUR_set(sv_points,
1448 maxlen * sizeof(U8 *));
1449 SvPOK_on(sv_points);
1450 sv_2mortal(sv_points);
1451 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1452 if ( trie_type != trie_utf8_fold
1453 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1454 {
786e8c11
YO
1455 if (trie->bitmap)
1456 bitmap=(U8*)trie->bitmap;
1457 else
1458 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1459 }
786e8c11
YO
1460 /* this is the Aho-Corasick algorithm modified a touch
1461 to include special handling for long "unknown char"
1462 sequences. The basic idea being that we use AC as long
1463 as we are dealing with a possible matching char, when
1464 we encounter an unknown char (and we have not encountered
1465 an accepting state) we scan forward until we find a legal
1466 starting char.
1467 AC matching is basically that of trie matching, except
1468 that when we encounter a failing transition, we fall back
1469 to the current states "fail state", and try the current char
1470 again, a process we repeat until we reach the root state,
1471 state 1, or a legal transition. If we fail on the root state
1472 then we can either terminate if we have reached an accepting
1473 state previously, or restart the entire process from the beginning
1474 if we have not.
1475
1476 */
07be1b83
YO
1477 while (s <= last_start) {
1478 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1479 U8 *uc = (U8*)s;
1480 U16 charid = 0;
1481 U32 base = 1;
1482 U32 state = 1;
1483 UV uvc = 0;
1484 STRLEN len = 0;
1485 STRLEN foldlen = 0;
1486 U8 *uscan = (U8*)NULL;
1487 U8 *leftmost = NULL;
786e8c11
YO
1488#ifdef DEBUGGING
1489 U32 accepted_word= 0;
1490#endif
07be1b83
YO
1491 U32 pointpos = 0;
1492
1493 while ( state && uc <= (U8*)strend ) {
1494 int failed=0;
786e8c11
YO
1495 U32 word = aho->states[ state ].wordnum;
1496
1de06328
YO
1497 if( state==1 ) {
1498 if ( bitmap ) {
1499 DEBUG_TRIE_EXECUTE_r(
1500 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1501 dump_exec_pos( (char *)uc, c, strend, real_start,
1502 (char *)uc, do_utf8 );
1503 PerlIO_printf( Perl_debug_log,
1504 " Scanning for legal start char...\n");
1505 }
1506 );
1507 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1508 uc++;
786e8c11 1509 }
1de06328 1510 s= (char *)uc;
786e8c11 1511 }
786e8c11
YO
1512 if (uc >(U8*)last_start) break;
1513 }
1514
1515 if ( word ) {
1516 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1517 if (!leftmost || lpos < leftmost) {
1518 DEBUG_r(accepted_word=word);
07be1b83 1519 leftmost= lpos;
786e8c11 1520 }
07be1b83 1521 if (base==0) break;
786e8c11 1522
07be1b83
YO
1523 }
1524 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1525 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1526 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1527 DEBUG_TRIE_EXECUTE_r({
1528 dump_exec_pos( (char *)uc, c, strend, real_start,
1529 s, do_utf8 );
07be1b83 1530 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1531 " Charid:%3u CP:%4"UVxf" ",
1532 charid, uvc);
1533 });
07be1b83
YO
1534
1535 do {
6148ee25 1536#ifdef DEBUGGING
786e8c11 1537 word = aho->states[ state ].wordnum;
6148ee25 1538#endif
07be1b83
YO
1539 base = aho->states[ state ].trans.base;
1540
786e8c11
YO
1541 DEBUG_TRIE_EXECUTE_r({
1542 if (failed)
1543 dump_exec_pos( (char *)uc, c, strend, real_start,
1544 s, do_utf8 );
07be1b83 1545 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1546 "%sState: %4"UVxf", word=%"UVxf,
1547 failed ? " Fail transition to " : "",
1548 (UV)state, (UV)word);
1549 });
07be1b83
YO
1550 if ( base ) {
1551 U32 tmp;
1552 if (charid &&
1553 (base + charid > trie->uniquecharcount )
1554 && (base + charid - 1 - trie->uniquecharcount
1555 < trie->lasttrans)
1556 && trie->trans[base + charid - 1 -
1557 trie->uniquecharcount].check == state
1558 && (tmp=trie->trans[base + charid - 1 -
1559 trie->uniquecharcount ].next))
1560 {
786e8c11
YO
1561 DEBUG_TRIE_EXECUTE_r(
1562 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1563 state = tmp;
1564 break;
1565 }
1566 else {
786e8c11
YO
1567 DEBUG_TRIE_EXECUTE_r(
1568 PerlIO_printf( Perl_debug_log," - fail\n"));
1569 failed = 1;
1570 state = aho->fail[state];
07be1b83
YO
1571 }
1572 }
1573 else {
1574 /* we must be accepting here */
786e8c11
YO
1575 DEBUG_TRIE_EXECUTE_r(
1576 PerlIO_printf( Perl_debug_log," - accepting\n"));
1577 failed = 1;
07be1b83
YO
1578 break;
1579 }
1580 } while(state);
786e8c11 1581 uc += len;
07be1b83
YO
1582 if (failed) {
1583 if (leftmost)
1584 break;
786e8c11 1585 if (!state) state = 1;
07be1b83
YO
1586 }
1587 }
1588 if ( aho->states[ state ].wordnum ) {
1589 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1590 if (!leftmost || lpos < leftmost) {
1591 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1592 leftmost = lpos;
786e8c11 1593 }
07be1b83 1594 }
07be1b83
YO
1595 if (leftmost) {
1596 s = (char*)leftmost;
786e8c11
YO
1597 DEBUG_TRIE_EXECUTE_r({
1598 PerlIO_printf(
1599 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1600 (UV)accepted_word, s - real_start
1601 );
1602 });
24b23f37 1603 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1604 FREETMPS;
1605 LEAVE;
07be1b83 1606 goto got_it;
be8e71aa 1607 }
07be1b83 1608 s = HOPc(s,1);
786e8c11
YO
1609 DEBUG_TRIE_EXECUTE_r({
1610 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1611 });
07be1b83 1612 } else {
786e8c11
YO
1613 DEBUG_TRIE_EXECUTE_r(
1614 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1615 break;
1616 }
1617 }
be8e71aa
YO
1618 FREETMPS;
1619 LEAVE;
07be1b83
YO
1620 }
1621 break;
b3c9acc1 1622 default:
3c3eec57
GS
1623 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1624 break;
d6a28714 1625 }
6eb5f6b9
JH
1626 return 0;
1627 got_it:
1628 return s;
1629}
1630
1631/*
1632 - regexec_flags - match a regexp against a string
1633 */
1634I32
1635Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1636 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1637/* strend: pointer to null at end of string */
1638/* strbeg: real beginning of string */
1639/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1640/* data: May be used for some additional optimizations.
1641 Currently its only used, with a U32 cast, for transmitting
1642 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1643/* nosave: For optimizations. */
1644{
97aff369 1645 dVAR;
24b23f37 1646 /*register*/ char *s;
6eb5f6b9 1647 register regnode *c;
24b23f37 1648 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1649 I32 minlen; /* must match at least this many chars */
1650 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1651 I32 end_shift = 0; /* Same for the end. */ /* CC */
1652 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1653 char *scream_olds = NULL;
3dab1dad 1654 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1655 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1656 I32 multiline;
f8fc2ecf 1657 RXi_GET_DECL(prog,progi);
3b0527fe 1658 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1659
1660 GET_RE_DEBUG_FLAGS_DECL;
1661
9d4ba2ae 1662 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1663
1664 /* Be paranoid... */
1665 if (prog == NULL || startpos == NULL) {
1666 Perl_croak(aTHX_ "NULL regexp parameter");
1667 return 0;
1668 }
1669
bbe252da 1670 multiline = prog->extflags & RXf_PMf_MULTILINE;
3b0527fe 1671 reginfo.prog = prog;
2757e526 1672
bac06658 1673 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1674 DEBUG_EXECUTE_r(
1675 debug_start_match(prog, do_utf8, startpos, strend,
1676 "Matching");
1677 );
bac06658 1678
6eb5f6b9 1679 minlen = prog->minlen;
1de06328
YO
1680
1681 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1683 "String too short [regexec_flags]...\n"));
1684 goto phooey;
1aa99e6b 1685 }
6eb5f6b9 1686
1de06328 1687
6eb5f6b9 1688 /* Check validity of program. */
f8fc2ecf 1689 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1690 Perl_croak(aTHX_ "corrupted regexp program");
1691 }
1692
1693 PL_reg_flags = 0;
1694 PL_reg_eval_set = 0;
1695 PL_reg_maxiter = 0;
1696
bbe252da 1697 if (prog->extflags & RXf_UTF8)
6eb5f6b9
JH
1698 PL_reg_flags |= RF_utf8;
1699
1700 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1701 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1702 PL_bostr = strbeg;
3b0527fe 1703 reginfo.sv = sv;
6eb5f6b9
JH
1704
1705 /* Mark end of line for $ (and such) */
1706 PL_regeol = strend;
1707
1708 /* see how far we have to get to not match where we matched before */
3b0527fe 1709 reginfo.till = startpos+minend;
6eb5f6b9 1710
6eb5f6b9
JH
1711 /* If there is a "must appear" string, look for it. */
1712 s = startpos;
1713
bbe252da 1714 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1715 MAGIC *mg;
1716
1717 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1718 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1719 else if (sv && SvTYPE(sv) >= SVt_PVMG
1720 && SvMAGIC(sv)
14befaf4
DM
1721 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1722 && mg->mg_len >= 0) {
3b0527fe 1723 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1724 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1725 if (s > reginfo.ganch)
6eb5f6b9 1726 goto phooey;
58e23c8d 1727 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1728 }
1729 }
58e23c8d
YO
1730 else if (data) {
1731 reginfo.ganch = strbeg + (UV)data;
1732 } else /* pos() not defined */
3b0527fe 1733 reginfo.ganch = strbeg;
6eb5f6b9 1734 }
c74340f9
YO
1735 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1736 I32 *t;
f8fc2ecf 1737 if (!progi->swap) {
c74340f9
YO
1738 /* We have to be careful. If the previous successful match
1739 was from this regex we don't want a subsequent paritally
1740 successful match to clobber the old results.
1741 So when we detect this possibility we add a swap buffer
1742 to the re, and switch the buffer each match. If we fail
1743 we switch it back, otherwise we leave it swapped.
1744 */
f8fc2ecf 1745 Newxz(progi->swap, 1, regexp_paren_ofs);
c74340f9 1746 /* no need to copy these */
f8fc2ecf
YO
1747 Newxz(progi->swap->startp, prog->nparens + 1, I32);
1748 Newxz(progi->swap->endp, prog->nparens + 1, I32);
c74340f9 1749 }
f8fc2ecf
YO
1750 t = progi->swap->startp;
1751 progi->swap->startp = prog->startp;
c74340f9 1752 prog->startp = t;
f8fc2ecf
YO
1753 t = progi->swap->endp;
1754 progi->swap->endp = prog->endp;
c74340f9
YO
1755 prog->endp = t;
1756 }
a0714e2c 1757 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1758 re_scream_pos_data d;
1759
1760 d.scream_olds = &scream_olds;
1761 d.scream_pos = &scream_pos;
1762 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1763 if (!s) {
a3621e74 1764 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1765 goto phooey; /* not present */
3fa9c3d7 1766 }
6eb5f6b9
JH
1767 }
1768
1de06328 1769
6eb5f6b9
JH
1770
1771 /* Simplest case: anchored match need be tried only once. */
1772 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1773 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1774 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1775 goto got_it;
bbe252da
YO
1776 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1777 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1778 {
1779 char *end;
1780
1781 if (minlen)
1782 dontbother = minlen - 1;
1aa99e6b 1783 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1784 /* for multiline we only have to try after newlines */
33b8afdf 1785 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1786 if (s == startpos)
1787 goto after_try;
1788 while (1) {
24b23f37 1789 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1790 goto got_it;
1791 after_try:
1792 if (s >= end)
1793 goto phooey;
bbe252da 1794 if (prog->extflags & RXf_USE_INTUIT) {
6eb5f6b9
JH
1795 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1796 if (!s)
1797 goto phooey;
1798 }
1799 else
1800 s++;
1801 }
1802 } else {
1803 if (s > startpos)
1804 s--;
1805 while (s < end) {
1806 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1807 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1808 goto got_it;
1809 }
1810 }
1811 }
1812 }
1813 goto phooey;
bbe252da 1814 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1815 {
1816 /* the warning about reginfo.ganch being used without intialization
bbe252da 1817 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1818 and we only enter this block when the same bit is set. */
58e23c8d
YO
1819 char *tmp_s = reginfo.ganch - prog->gofs;
1820 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1821 goto got_it;
1822 goto phooey;
1823 }
1824
1825 /* Messy cases: unanchored match. */
bbe252da 1826 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1827 /* we have /x+whatever/ */
1828 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1829 char ch;
bf93d4cc
GS
1830#ifdef DEBUGGING
1831 int did_match = 0;
1832#endif
33b8afdf
JH
1833 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1834 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1835 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1836
1aa99e6b 1837 if (do_utf8) {
4cadc6a9 1838 REXEC_FBC_SCAN(
6eb5f6b9 1839 if (*s == ch) {
a3621e74 1840 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1841 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1842 s += UTF8SKIP(s);
1843 while (s < strend && *s == ch)
1844 s += UTF8SKIP(s);
1845 }
4cadc6a9 1846 );
6eb5f6b9
JH
1847 }
1848 else {
4cadc6a9 1849 REXEC_FBC_SCAN(
6eb5f6b9 1850 if (*s == ch) {
a3621e74 1851 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1852 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1853 s++;
1854 while (s < strend && *s == ch)
1855 s++;
1856 }
4cadc6a9 1857 );
6eb5f6b9 1858 }
a3621e74 1859 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1860 PerlIO_printf(Perl_debug_log,
b7953727
JH
1861 "Did not find anchored character...\n")
1862 );
6eb5f6b9 1863 }
a0714e2c
SS
1864 else if (prog->anchored_substr != NULL
1865 || prog->anchored_utf8 != NULL
1866 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1867 && prog->float_max_offset < strend - s)) {
1868 SV *must;
1869 I32 back_max;
1870 I32 back_min;
1871 char *last;
6eb5f6b9 1872 char *last1; /* Last position checked before */
bf93d4cc
GS
1873#ifdef DEBUGGING
1874 int did_match = 0;
1875#endif
33b8afdf
JH
1876 if (prog->anchored_substr || prog->anchored_utf8) {
1877 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1878 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1879 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1880 back_max = back_min = prog->anchored_offset;
1881 } else {
1882 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1883 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1884 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1885 back_max = prog->float_max_offset;
1886 back_min = prog->float_min_offset;
1887 }
1de06328
YO
1888
1889
33b8afdf
JH
1890 if (must == &PL_sv_undef)
1891 /* could not downgrade utf8 check substring, so must fail */
1892 goto phooey;
1893
1de06328
YO
1894 if (back_min<0) {
1895 last = strend;
1896 } else {
1897 last = HOP3c(strend, /* Cannot start after this */
1898 -(I32)(CHR_SVLEN(must)
1899 - (SvTAIL(must) != 0) + back_min), strbeg);
1900 }
6eb5f6b9
JH
1901 if (s > PL_bostr)
1902 last1 = HOPc(s, -1);
1903 else
1904 last1 = s - 1; /* bogus */
1905
a0288114 1906 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1907 check_substr==must. */
1908 scream_pos = -1;
1909 dontbother = end_shift;
1910 strend = HOPc(strend, -dontbother);
1911 while ( (s <= last) &&
9041c2e3 1912 ((flags & REXEC_SCREAM)
1de06328 1913 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1914 end_shift, &scream_pos, 0))
1de06328 1915 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1916 (unsigned char*)strend, must,
7fba1cd6 1917 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1918 /* we may be pointing at the wrong string */
1919 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1920 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1921 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1922 if (HOPc(s, -back_max) > last1) {
1923 last1 = HOPc(s, -back_min);
1924 s = HOPc(s, -back_max);
1925 }
1926 else {
52657f30 1927 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1928
1929 last1 = HOPc(s, -back_min);
52657f30 1930 s = t;
6eb5f6b9 1931 }
1aa99e6b 1932 if (do_utf8) {
6eb5f6b9 1933 while (s <= last1) {
24b23f37 1934 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1935 goto got_it;
1936 s += UTF8SKIP(s);
1937 }
1938 }
1939 else {
1940 while (s <= last1) {
24b23f37 1941 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1942 goto got_it;
1943 s++;
1944 }
1945 }
1946 }
ab3bbdeb
YO
1947 DEBUG_EXECUTE_r(if (!did_match) {
1948 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1949 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1950 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1951 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1952 ? "anchored" : "floating"),
ab3bbdeb
YO
1953 quoted, RE_SV_TAIL(must));
1954 });
6eb5f6b9
JH
1955 goto phooey;
1956 }
f8fc2ecf 1957 else if ( (c = progi->regstclass) ) {
f14c76ed 1958 if (minlen) {
f8fc2ecf 1959 const OPCODE op = OP(progi->regstclass);
66e933ab 1960 /* don't bother with what can't match */
786e8c11 1961 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1962 strend = HOPc(strend, -(minlen - 1));
1963 }
a3621e74 1964 DEBUG_EXECUTE_r({
be8e71aa 1965 SV * const prop = sv_newmortal();
32fc9b6a 1966 regprop(prog, prop, c);
0df25f3d 1967 {
ab3bbdeb
YO
1968 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1969 s,strend-s,60);
0df25f3d 1970 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1971 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1972 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1973 quoted, (int)(strend - s));
0df25f3d 1974 }
ffc61ed2 1975 });
3b0527fe 1976 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1977 goto got_it;
07be1b83 1978 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1979 }
1980 else {
1981 dontbother = 0;
a0714e2c 1982 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1983 /* Trim the end. */
d6a28714 1984 char *last;
33b8afdf
JH
1985 SV* float_real;
1986
1987 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1988 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1989 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1990
1991 if (flags & REXEC_SCREAM) {
33b8afdf 1992 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1993 end_shift, &scream_pos, 1); /* last one */
1994 if (!last)
ffc61ed2 1995 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1996 /* we may be pointing at the wrong string */
1997 else if (RX_MATCH_COPIED(prog))
3f7c398e 1998 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1999 }
d6a28714
JH
2000 else {
2001 STRLEN len;
cfd0369c 2002 const char * const little = SvPV_const(float_real, len);
d6a28714 2003
33b8afdf 2004 if (SvTAIL(float_real)) {
d6a28714
JH
2005 if (memEQ(strend - len + 1, little, len - 1))
2006 last = strend - len + 1;
7fba1cd6 2007 else if (!multiline)
9041c2e3 2008 last = memEQ(strend - len, little, len)
bd61b366 2009 ? strend - len : NULL;
b8c5462f 2010 else
d6a28714
JH
2011 goto find_last;
2012 } else {
2013 find_last:
9041c2e3 2014 if (len)
d6a28714 2015 last = rninstr(s, strend, little, little + len);
b8c5462f 2016 else
a0288114 2017 last = strend; /* matching "$" */
b8c5462f 2018 }
b8c5462f 2019 }
bf93d4cc 2020 if (last == NULL) {
6bda09f9
YO
2021 DEBUG_EXECUTE_r(
2022 PerlIO_printf(Perl_debug_log,
2023 "%sCan't trim the tail, match fails (should not happen)%s\n",
2024 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2025 goto phooey; /* Should not happen! */
2026 }
d6a28714
JH
2027 dontbother = strend - last + prog->float_min_offset;
2028 }
2029 if (minlen && (dontbother < minlen))
2030 dontbother = minlen - 1;
2031 strend -= dontbother; /* this one's always in bytes! */
2032 /* We don't know much -- general case. */
1aa99e6b 2033 if (do_utf8) {
d6a28714 2034 for (;;) {
24b23f37 2035 if (regtry(&reginfo, &s))
d6a28714
JH
2036 goto got_it;
2037 if (s >= strend)
2038 break;
b8c5462f 2039 s += UTF8SKIP(s);
d6a28714
JH
2040 };
2041 }
2042 else {
2043 do {
24b23f37 2044 if (regtry(&reginfo, &s))
d6a28714
JH
2045 goto got_it;
2046 } while (s++ < strend);
2047 }
2048 }
2049
2050 /* Failure. */
2051 goto phooey;
2052
2053got_it:
2054 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2055
2056 if (PL_reg_eval_set) {
2057 /* Preserve the current value of $^R */
2058 if (oreplsv != GvSV(PL_replgv))
2059 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2060 restored, the value remains
2061 the same. */
4f639d21 2062 restore_pos(aTHX_ prog);
d6a28714 2063 }
81714fb9
YO
2064 if (prog->paren_names)
2065 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2066
2067 /* make sure $`, $&, $', and $digit will work later */
2068 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2069 RX_MATCH_COPY_FREE(prog);
d6a28714 2070 if (flags & REXEC_COPY_STR) {
be8e71aa 2071 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2072#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2073 if ((SvIsCOW(sv)
2074 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2075 if (DEBUG_C_TEST) {
2076 PerlIO_printf(Perl_debug_log,
2077 "Copy on write: regexp capture, type %d\n",
2078 (int) SvTYPE(sv));
2079 }
2080 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2081 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2082 assert (SvPOKp(prog->saved_copy));
2083 } else
2084#endif
2085 {
2086 RX_MATCH_COPIED_on(prog);
2087 s = savepvn(strbeg, i);
2088 prog->subbeg = s;
2089 }
d6a28714 2090 prog->sublen = i;
d6a28714
JH
2091 }
2092 else {
2093 prog->subbeg = strbeg;
2094 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2095 }
2096 }
9041c2e3 2097
d6a28714
JH
2098 return 1;
2099
2100phooey:
a3621e74 2101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2102 PL_colors[4], PL_colors[5]));
d6a28714 2103 if (PL_reg_eval_set)
4f639d21 2104 restore_pos(aTHX_ prog);
f8fc2ecf 2105 if (progi->swap) {
c74340f9
YO
2106 /* we failed :-( roll it back */
2107 I32 *t;
f8fc2ecf
YO
2108 t = progi->swap->startp;
2109 progi->swap->startp = prog->startp;
c74340f9 2110 prog->startp = t;
f8fc2ecf
YO
2111 t = progi->swap->endp;
2112 progi->swap->endp = prog->endp;
c74340f9
YO
2113 prog->endp = t;
2114 }
d6a28714
JH
2115 return 0;
2116}
2117
6bda09f9 2118
d6a28714
JH
2119/*
2120 - regtry - try match at specific point
2121 */
2122STATIC I32 /* 0 failure, 1 success */
24b23f37 2123S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2124{
97aff369 2125 dVAR;
d6a28714
JH
2126 register I32 *sp;
2127 register I32 *ep;
2128 CHECKPOINT lastcp;
3b0527fe 2129 regexp *prog = reginfo->prog;
f8fc2ecf 2130 RXi_GET_DECL(prog,progi);
a3621e74 2131 GET_RE_DEBUG_FLAGS_DECL;
24b23f37 2132 reginfo->cutpoint=NULL;
d6a28714 2133
bbe252da 2134 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2135 MAGIC *mg;
2136
2137 PL_reg_eval_set = RS_init;
a3621e74 2138 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2139 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2140 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2141 ));
ea8d6ae1 2142 SAVESTACK_CXPOS();
d6a28714
JH
2143 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2144 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2145 SAVETMPS;
2146 /* Apparently this is not needed, judging by wantarray. */
e8347627 2147 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2148 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2149
3b0527fe 2150 if (reginfo->sv) {
d6a28714 2151 /* Make $_ available to executed code. */
3b0527fe 2152 if (reginfo->sv != DEFSV) {
59f00321 2153 SAVE_DEFSV;
3b0527fe 2154 DEFSV = reginfo->sv;
b8c5462f 2155 }
d6a28714 2156
3b0527fe
DM
2157 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2158 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2159 /* prepare for quick setting of pos */
d300d9fa 2160#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2161 if (SvIsCOW(reginfo->sv))
2162 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2163#endif
3dab1dad 2164 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2165 &PL_vtbl_mglob, NULL, 0);
d6a28714 2166 mg->mg_len = -1;
b8c5462f 2167 }
d6a28714
JH
2168 PL_reg_magic = mg;
2169 PL_reg_oldpos = mg->mg_len;
4f639d21 2170 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2171 }
09687e5a 2172 if (!PL_reg_curpm) {
a02a5408 2173 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2174#ifdef USE_ITHREADS
2175 {
be8e71aa 2176 SV* const repointer = newSViv(0);
577e12cc 2177 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2178 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2179 av_push(PL_regex_padav,repointer);
2180 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2181 PL_regex_pad = AvARRAY(PL_regex_padav);
2182 }
2183#endif
2184 }
aaa362c4 2185 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2186 PL_reg_oldcurpm = PL_curpm;
2187 PL_curpm = PL_reg_curpm;
2188 if (RX_MATCH_COPIED(prog)) {
2189 /* Here is a serious problem: we cannot rewrite subbeg,
2190 since it may be needed if this match fails. Thus
2191 $` inside (?{}) could fail... */
2192 PL_reg_oldsaved = prog->subbeg;
2193 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2194#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2195 PL_nrs = prog->saved_copy;
2196#endif
d6a28714
JH
2197 RX_MATCH_COPIED_off(prog);
2198 }
2199 else
bd61b366 2200 PL_reg_oldsaved = NULL;
d6a28714
JH
2201 prog->subbeg = PL_bostr;
2202 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2203 }
24b23f37
YO
2204 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2205 prog->startp[0] = *startpos - PL_bostr;
2206 PL_reginput = *startpos;
d6a28714 2207 PL_reglastparen = &prog->lastparen;
a01268b5 2208 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2209 prog->lastparen = 0;
03994de8 2210 prog->lastcloseparen = 0;
d6a28714 2211 PL_regsize = 0;
6bda09f9
YO
2212 PL_regstartp = prog->startp;
2213 PL_regendp = prog->endp;
d6a28714
JH
2214 if (PL_reg_start_tmpl <= prog->nparens) {
2215 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2216 if(PL_reg_start_tmp)
2217 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2218 else
a02a5408 2219 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2220 }
2221
2222 /* XXXX What this code is doing here?!!! There should be no need
2223 to do this again and again, PL_reglastparen should take care of
3dd2943c 2224 this! --ilya*/
dafc8851
JH
2225
2226 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2227 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2228 * PL_reglastparen), is not needed at all by the test suite
2229 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2230 * enough, for building DynaLoader, or otherwise this
2231 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2232 * will happen. Meanwhile, this code *is* needed for the
2233 * above-mentioned test suite tests to succeed. The common theme
2234 * on those tests seems to be returning null fields from matches.
2235 * --jhi */
dafc8851 2236#if 1
c74340f9
YO
2237 sp = PL_regstartp;
2238 ep = PL_regendp;
d6a28714 2239 if (prog->nparens) {
097eb12c 2240 register I32 i;
eb160463 2241 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2242 *++sp = -1;
2243 *++ep = -1;
2244 }
2245 }
dafc8851 2246#endif
02db2b7b 2247 REGCP_SET(lastcp);
f8fc2ecf 2248 if (regmatch(reginfo, progi->program + 1)) {
c74340f9 2249 PL_regendp[0] = PL_reginput - PL_bostr;
d6a28714
JH
2250 return 1;
2251 }
24b23f37
YO
2252 if (reginfo->cutpoint)
2253 *startpos= reginfo->cutpoint;
02db2b7b 2254 REGCP_UNWIND(lastcp);
d6a28714
JH
2255 return 0;
2256}
2257
02db2b7b 2258
8ba1375e
MJD
2259#define sayYES goto yes
2260#define sayNO goto no
262b90c4 2261#define sayNO_SILENT goto no_silent
8ba1375e 2262
f9f4320a
YO
2263/* we dont use STMT_START/END here because it leads to
2264 "unreachable code" warnings, which are bogus, but distracting. */
2265#define CACHEsayNO \
c476f425
DM
2266 if (ST.cache_mask) \
2267 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2268 sayNO
3298f257 2269
a3621e74 2270/* this is used to determine how far from the left messages like
265c4333
YO
2271 'failed...' are printed. It should be set such that messages
2272 are inline with the regop output that created them.
a3621e74 2273*/
265c4333 2274#define REPORT_CODE_OFF 32
a3621e74
YO
2275
2276
2277/* Make sure there is a test for this +1 options in re_tests */
2278#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2279
40a82448
DM
2280#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2281#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2282
86545054
DM
2283#define SLAB_FIRST(s) (&(s)->states[0])
2284#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2285
5d9a96ca
DM
2286/* grab a new slab and return the first slot in it */
2287
2288STATIC regmatch_state *
2289S_push_slab(pTHX)
2290{
54df2634
NC
2291#if PERL_VERSION < 9
2292 dMY_CXT;
2293#endif
5d9a96ca
DM
2294 regmatch_slab *s = PL_regmatch_slab->next;
2295 if (!s) {
2296 Newx(s, 1, regmatch_slab);
2297 s->prev = PL_regmatch_slab;
2298 s->next = NULL;
2299 PL_regmatch_slab->next = s;
2300 }
2301 PL_regmatch_slab = s;
86545054 2302 return SLAB_FIRST(s);
5d9a96ca 2303}
5b47454d 2304
95b24440 2305
40a82448
DM
2306/* push a new state then goto it */
2307
2308#define PUSH_STATE_GOTO(state, node) \
2309 scan = node; \
2310 st->resume_state = state; \
2311 goto push_state;
2312
2313/* push a new state with success backtracking, then goto it */
2314
2315#define PUSH_YES_STATE_GOTO(state, node) \
2316 scan = node; \
2317 st->resume_state = state; \
2318 goto push_yes_state;
2319
aa283a38 2320
aa283a38 2321
d6a28714 2322/*
95b24440 2323
bf1f174e
DM
2324regmatch() - main matching routine
2325
2326This is basically one big switch statement in a loop. We execute an op,
2327set 'next' to point the next op, and continue. If we come to a point which
2328we may need to backtrack to on failure such as (A|B|C), we push a
2329backtrack state onto the backtrack stack. On failure, we pop the top
2330state, and re-enter the loop at the state indicated. If there are no more
2331states to pop, we return failure.
2332
2333Sometimes we also need to backtrack on success; for example /A+/, where
2334after successfully matching one A, we need to go back and try to
2335match another one; similarly for lookahead assertions: if the assertion
2336completes successfully, we backtrack to the state just before the assertion
2337and then carry on. In these cases, the pushed state is marked as
2338'backtrack on success too'. This marking is in fact done by a chain of
2339pointers, each pointing to the previous 'yes' state. On success, we pop to
2340the nearest yes state, discarding any intermediate failure-only states.
2341Sometimes a yes state is pushed just to force some cleanup code to be
2342called at the end of a successful match or submatch; e.g. (??{$re}) uses
2343it to free the inner regex.
2344
2345Note that failure backtracking rewinds the cursor position, while
2346success backtracking leaves it alone.
2347
2348A pattern is complete when the END op is executed, while a subpattern
2349such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2350ops trigger the "pop to last yes state if any, otherwise return true"
2351behaviour.
2352
2353A common convention in this function is to use A and B to refer to the two
2354subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2355the subpattern to be matched possibly multiple times, while B is the entire
2356rest of the pattern. Variable and state names reflect this convention.
2357
2358The states in the main switch are the union of ops and failure/success of
2359substates associated with with that op. For example, IFMATCH is the op
2360that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2361'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2362successfully matched A and IFMATCH_A_fail is a state saying that we have
2363just failed to match A. Resume states always come in pairs. The backtrack
2364state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2365at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2366on success or failure.
2367
2368The struct that holds a backtracking state is actually a big union, with
2369one variant for each major type of op. The variable st points to the
2370top-most backtrack struct. To make the code clearer, within each
2371block of code we #define ST to alias the relevant union.
2372
2373Here's a concrete example of a (vastly oversimplified) IFMATCH
2374implementation:
2375
2376 switch (state) {
2377 ....
2378
2379#define ST st->u.ifmatch
2380
2381 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2382 ST.foo = ...; // some state we wish to save
95b24440 2383 ...
bf1f174e
DM
2384 // push a yes backtrack state with a resume value of
2385 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2386 // first node of A:
2387 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2388 // NOTREACHED
2389
2390 case IFMATCH_A: // we have successfully executed A; now continue with B
2391 next = B;
2392 bar = ST.foo; // do something with the preserved value
2393 break;
2394
2395 case IFMATCH_A_fail: // A failed, so the assertion failed
2396 ...; // do some housekeeping, then ...
2397 sayNO; // propagate the failure
2398
2399#undef ST
95b24440 2400
bf1f174e
DM
2401 ...
2402 }
95b24440 2403
bf1f174e
DM
2404For any old-timers reading this who are familiar with the old recursive
2405approach, the code above is equivalent to:
95b24440 2406
bf1f174e
DM
2407 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2408 {
2409 int foo = ...
95b24440 2410 ...
bf1f174e
DM
2411 if (regmatch(A)) {
2412 next = B;
2413 bar = foo;
2414 break;
95b24440 2415 }
bf1f174e
DM
2416 ...; // do some housekeeping, then ...
2417 sayNO; // propagate the failure
95b24440 2418 }
bf1f174e
DM
2419
2420The topmost backtrack state, pointed to by st, is usually free. If you
2421want to claim it, populate any ST.foo fields in it with values you wish to
2422save, then do one of
2423
2424 PUSH_STATE_GOTO(resume_state, node);
2425 PUSH_YES_STATE_GOTO(resume_state, node);
2426
2427which sets that backtrack state's resume value to 'resume_state', pushes a
2428new free entry to the top of the backtrack stack, then goes to 'node'.
2429On backtracking, the free slot is popped, and the saved state becomes the
2430new free state. An ST.foo field in this new top state can be temporarily
2431accessed to retrieve values, but once the main loop is re-entered, it
2432becomes available for reuse.
2433
2434Note that the depth of the backtrack stack constantly increases during the
2435left-to-right execution of the pattern, rather than going up and down with
2436the pattern nesting. For example the stack is at its maximum at Z at the
2437end of the pattern, rather than at X in the following:
2438
2439 /(((X)+)+)+....(Y)+....Z/
2440
2441The only exceptions to this are lookahead/behind assertions and the cut,
2442(?>A), which pop all the backtrack states associated with A before
2443continuing.
2444
2445Bascktrack state structs are allocated in slabs of about 4K in size.
2446PL_regmatch_state and st always point to the currently active state,
2447and PL_regmatch_slab points to the slab currently containing
2448PL_regmatch_state. The first time regmatch() is called, the first slab is
2449allocated, and is never freed until interpreter destruction. When the slab
2450is full, a new one is allocated and chained to the end. At exit from
2451regmatch(), slabs allocated since entry are freed.
2452
2453*/
95b24440 2454
40a82448 2455
5bc10b2c 2456#define DEBUG_STATE_pp(pp) \
265c4333 2457 DEBUG_STATE_r({ \
5bc10b2c
DM
2458 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2459 PerlIO_printf(Perl_debug_log, \
5d458dd8 2460 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2461 depth*2, "", \
5d458dd8
YO
2462 reg_name[st->resume_state], \
2463 ((st==yes_state||st==mark_state) ? "[" : ""), \
2464 ((st==yes_state) ? "Y" : ""), \
2465 ((st==mark_state) ? "M" : ""), \
2466 ((st==yes_state||st==mark_state) ? "]" : "") \
2467 ); \
265c4333 2468 });
5bc10b2c 2469
40a82448 2470
3dab1dad 2471#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2472
3df15adc 2473#ifdef DEBUGGING
5bc10b2c 2474
ab3bbdeb
YO
2475STATIC void
2476S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2477 const char *start, const char *end, const char *blurb)
2478{
bbe252da 2479 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
ab3bbdeb
YO
2480 if (!PL_colorset)
2481 reginitcolors();
2482 {
2483 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2484 prog->precomp, prog->prelen, 60);
2485
2486 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2487 start, end - start, 60);
2488
2489 PerlIO_printf(Perl_debug_log,
2490 "%s%s REx%s %s against %s\n",
2491 PL_colors[4], blurb, PL_colors[5], s0, s1);
2492
2493 if (do_utf8||utf8_pat)
1de06328
YO
2494 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2495 utf8_pat ? "pattern" : "",
2496 utf8_pat && do_utf8 ? " and " : "",
2497 do_utf8 ? "string" : ""
ab3bbdeb
YO
2498 );
2499 }
2500}
3df15adc
YO
2501
2502STATIC void
786e8c11
YO
2503S_dump_exec_pos(pTHX_ const char *locinput,
2504 const regnode *scan,
2505 const char *loc_regeol,
2506 const char *loc_bostr,
2507 const char *loc_reg_starttry,
2508 const bool do_utf8)
07be1b83 2509{
786e8c11 2510 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2511 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2512 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2513 /* The part of the string before starttry has one color
2514 (pref0_len chars), between starttry and current
2515 position another one (pref_len - pref0_len chars),
2516 after the current position the third one.
2517 We assume that pref0_len <= pref_len, otherwise we
2518 decrease pref0_len. */
786e8c11
YO
2519 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2520 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2521 int pref0_len;
2522
2523 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2524 pref_len++;
786e8c11
YO
2525 pref0_len = pref_len - (locinput - loc_reg_starttry);
2526 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2527 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2528 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2529 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2530 l--;
2531 if (pref0_len < 0)
2532 pref0_len = 0;
2533 if (pref0_len > pref_len)
2534 pref0_len = pref_len;
2535 {
3df15adc 2536 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2537
ab3bbdeb 2538 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2539 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2540
ab3bbdeb 2541 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2542 (locinput - pref_len + pref0_len),
1de06328 2543 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2544
ab3bbdeb 2545 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2546 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2547
1de06328 2548 const STRLEN tlen=len0+len1+len2;
3df15adc 2549 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2550 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2551 (IV)(locinput - loc_bostr),
07be1b83 2552 len0, s0,
07be1b83 2553 len1, s1,
07be1b83 2554 (docolor ? "" : "> <"),
07be1b83 2555 len2, s2,
f9f4320a 2556 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2557 "");
2558 }
2559}
3df15adc 2560
07be1b83
YO
2561#endif
2562
0a4db386
YO
2563/* reg_check_named_buff_matched()
2564 * Checks to see if a named buffer has matched. The data array of
2565 * buffer numbers corresponding to the buffer is expected to reside
2566 * in the regexp->data->data array in the slot stored in the ARG() of
2567 * node involved. Note that this routine doesn't actually care about the
2568 * name, that information is not preserved from compilation to execution.
2569 * Returns the index of the leftmost defined buffer with the given name
2570 * or 0 if non of the buffers matched.
2571 */
2572STATIC I32
2573S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2574 I32 n;
f8fc2ecf
YO
2575 RXi_GET_DECL(rex,rexi);
2576 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
0a4db386
YO
2577 I32 *nums=(I32*)SvPVX(sv_dat);
2578 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2579 if ((I32)*PL_reglastparen >= nums[n] &&
2580 PL_regendp[nums[n]] != -1)
2581 {
2582 return nums[n];
2583 }
2584 }
2585 return 0;
2586}
2587
d6a28714 2588STATIC I32 /* 0 failure, 1 success */
24b23f37 2589S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2590{
54df2634
NC
2591#if PERL_VERSION < 9
2592 dMY_CXT;
2593#endif
27da23d5 2594 dVAR;
95b24440 2595 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2596 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2597
3b0527fe 2598 regexp *rex = reginfo->prog;
f8fc2ecf
YO
2599 RXi_GET_DECL(rex,rexi);
2600
5d9a96ca
DM
2601 regmatch_slab *orig_slab;
2602 regmatch_state *orig_state;
a3621e74 2603
5d9a96ca
DM
2604 /* the current state. This is a cached copy of PL_regmatch_state */
2605 register regmatch_state *st;
95b24440 2606
5d9a96ca
DM
2607 /* cache heavy used fields of st in registers */
2608 register regnode *scan;
2609 register regnode *next;
438e9bae 2610 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2611 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2612 register char *locinput = PL_reginput;
5d9a96ca 2613 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2614
b69b0499 2615 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2616 int depth = 0; /* depth of backtrack stack */
1a147d38 2617 int nochange_depth = 0; /* depth of GOSUB recursion with nochange*/
77cb431f
DM
2618 regmatch_state *yes_state = NULL; /* state to pop to on success of
2619 subpattern */
e2e6a0f1
YO
2620 /* mark_state piggy backs on the yes_state logic so that when we unwind
2621 the stack on success we can update the mark_state as we go */
2622 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5d458dd8 2623
faec1544 2624 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2625 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2626 U32 state_num;
5d458dd8
YO
2627 bool no_final = 0; /* prevent failure from backtracking? */
2628 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2629 char *startpoint = PL_reginput;
5d458dd8
YO
2630 SV *popmark = NULL; /* are we looking for a mark? */
2631 SV *sv_commit = NULL; /* last mark name seen in failure */
2632 SV *sv_yes_mark = NULL; /* last mark name we have seen
2633 during a successfull match */
2634 U32 lastopen = 0; /* last open we saw */
2635 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
58e23c8d 2636
5d458dd8 2637
24d3c4a9
DM
2638 /* these three flags are set by various ops to signal information to
2639 * the very next op. They have a useful lifetime of exactly one loop
2640 * iteration, and are not preserved or restored by state pushes/pops
2641 */
2642 bool sw = 0; /* the condition value in (?(cond)a|b) */
2643 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2644 int logical = 0; /* the following EVAL is:
2645 0: (?{...})
2646 1: (?(?{...})X|Y)
2647 2: (??{...})
2648 or the following IFMATCH/UNLESSM is:
2649 false: plain (?=foo)
2650 true: used as a condition: (?(?=foo))
2651 */
2652
95b24440 2653#ifdef DEBUGGING
e68ec53f 2654 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2655#endif
2656
58e23c8d 2657 DEBUG_OPTIMISE_r( {
24b23f37
YO
2658 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2659 });
5d9a96ca
DM
2660 /* on first ever call to regmatch, allocate first slab */
2661 if (!PL_regmatch_slab) {
2662 Newx(PL_regmatch_slab, 1, regmatch_slab);
2663 PL_regmatch_slab->prev = NULL;
2664 PL_regmatch_slab->next = NULL;
86545054 2665 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2666 }
2667
2668 /* remember current high-water mark for exit */
2669 /* XXX this should be done with SAVE* instead */
2670 orig_slab = PL_regmatch_slab;
2671 orig_state = PL_regmatch_state;
2672
2673 /* grab next free state slot */
2674 st = ++PL_regmatch_state;
86545054 2675 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2676 st = PL_regmatch_state = S_push_slab(aTHX);
2677
d6a28714
JH
2678 /* Note that nextchr is a byte even in UTF */
2679 nextchr = UCHARAT(locinput);
2680 scan = prog;
2681 while (scan != NULL) {
8ba1375e 2682
a3621e74 2683 DEBUG_EXECUTE_r( {
6136c704 2684 SV * const prop = sv_newmortal();
1de06328 2685 regnode *rnext=regnext(scan);
786e8c11 2686 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2687 regprop(rex, prop, scan);
07be1b83
YO
2688
2689 PerlIO_printf(Perl_debug_log,
2690 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2691 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2692 SvPVX_const(prop),
1de06328 2693 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2694 0 : (IV)(rnext - rexi->program));
2a782b5b 2695 });
d6a28714
JH
2696
2697 next = scan + NEXT_OFF(scan);
2698 if (next == scan)
2699 next = NULL;
40a82448 2700 state_num = OP(scan);
d6a28714 2701
40a82448
DM
2702 reenter_switch:
2703 switch (state_num) {
d6a28714 2704 case BOL:
7fba1cd6 2705 if (locinput == PL_bostr)
d6a28714 2706 {
3b0527fe 2707 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2708 break;
2709 }
d6a28714
JH
2710 sayNO;
2711 case MBOL:
12d33761
HS
2712 if (locinput == PL_bostr ||
2713 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2714 {
b8c5462f
JH
2715 break;
2716 }
d6a28714
JH
2717 sayNO;
2718 case SBOL:
c2a73568 2719 if (locinput == PL_bostr)
b8c5462f 2720 break;
d6a28714
JH
2721 sayNO;
2722 case GPOS:
3b0527fe 2723 if (locinput == reginfo->ganch)
d6a28714
JH
2724 break;
2725 sayNO;
2726 case EOL:
d6a28714
JH
2727 goto seol;
2728 case MEOL:
d6a28714 2729 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2730 sayNO;
b8c5462f 2731 break;
d6a28714
JH
2732 case SEOL:
2733 seol:
2734 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2735 sayNO;
d6a28714 2736 if (PL_regeol - locinput > 1)
b8c5462f 2737 sayNO;
b8c5462f 2738 break;
d6a28714
JH
2739 case EOS:
2740 if (PL_regeol != locinput)
b8c5462f 2741 sayNO;
d6a28714 2742 break;
ffc61ed2 2743 case SANY:
d6a28714 2744 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2745 sayNO;
f33976b4
DB
2746 if (do_utf8) {
2747 locinput += PL_utf8skip[nextchr];
2748 if (locinput > PL_regeol)
2749 sayNO;
2750 nextchr = UCHARAT(locinput);
2751 }
2752 else
2753 nextchr = UCHARAT(++locinput);
2754 break;
2755 case CANY:
2756 if (!nextchr && locinput >= PL_regeol)
2757 sayNO;
b8c5462f 2758 nextchr = UCHARAT(++locinput);
a0d0e21e 2759 break;
ffc61ed2 2760 case REG_ANY:
1aa99e6b
IH
2761 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2762 sayNO;
2763 if (do_utf8) {
b8c5462f 2764 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2765 if (locinput > PL_regeol)
2766 sayNO;
a0ed51b3 2767 nextchr = UCHARAT(locinput);
a0ed51b3 2768 }
1aa99e6b
IH
2769 else
2770 nextchr = UCHARAT(++locinput);
a0ed51b3 2771 break;
166ba7cd
DM
2772
2773#undef ST
2774#define ST st->u.trie
786e8c11
YO
2775 case TRIEC:
2776 /* In this case the charclass data is available inline so
2777 we can fail fast without a lot of extra overhead.
2778 */
2779 if (scan->flags == EXACT || !do_utf8) {
2780 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2781 DEBUG_EXECUTE_r(
2782 PerlIO_printf(Perl_debug_log,
2783 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2784 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2785 );
2786 sayNO_SILENT;
2787 /* NOTREACHED */
2788 }
2789 }
2790 /* FALL THROUGH */
5b47454d 2791 case TRIE:
3dab1dad 2792 {
07be1b83 2793 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2794 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2795 trie_type = do_utf8 ?
2796 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2797 : trie_plain;
2798
2799 /* what trie are we using right now */
be8e71aa 2800 reg_trie_data * const trie
f8fc2ecf 2801 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3dab1dad 2802 U32 state = trie->startstate;
166ba7cd 2803
3dab1dad
YO
2804 if (trie->bitmap && trie_type != trie_utf8_fold &&
2805 !TRIE_BITMAP_TEST(trie,*locinput)
2806 ) {
2807 if (trie->states[ state ].wordnum) {
2808 DEBUG_EXECUTE_r(
2809 PerlIO_printf(Perl_debug_log,
2810 "%*s %smatched empty string...%s\n",
5bc10b2c 2811 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2812 );
2813 break;
2814 } else {
2815 DEBUG_EXECUTE_r(
2816 PerlIO_printf(Perl_debug_log,
786e8c11 2817 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2818 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2819 );
2820 sayNO_SILENT;
2821 }
2822 }
166ba7cd 2823
786e8c11
YO
2824 {
2825 U8 *uc = ( U8* )locinput;
2826
2827 STRLEN len = 0;
2828 STRLEN foldlen = 0;
2829 U8 *uscan = (U8*)NULL;
2830 STRLEN bufflen=0;
2831 SV *sv_accept_buff = NULL;
2832 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2833
2834 ST.accepted = 0; /* how many accepting states we have seen */
2835 ST.B = next;
2836 ST.jump = trie->jump;
786e8c11 2837 ST.me = scan;
786e8c11 2838
07be1b83
YO
2839 /*
2840 traverse the TRIE keeping track of all accepting states
2841 we transition through until we get to a failing node.
2842 */
2843
a3621e74 2844 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2845 U32 base = trie->states[ state ].trans.base;
f9f4320a 2846 UV uvc = 0;
786e8c11
YO
2847 U16 charid;
2848 /* We use charid to hold the wordnum as we don't use it
2849 for charid until after we have done the wordnum logic.
2850 We define an alias just so that the wordnum logic reads
2851 more naturally. */
2852
2853#define got_wordnum charid
2854 got_wordnum = trie->states[ state ].wordnum;
2855
2856 if ( got_wordnum ) {
2857 if ( ! ST.accepted ) {
5b47454d
DM
2858 ENTER;
2859 SAVETMPS;
2860 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2861 sv_accept_buff=newSV(bufflen *
2862 sizeof(reg_trie_accepted) - 1);
786e8c11 2863 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2864 SvPOK_on(sv_accept_buff);
2865 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2866 SAVETMPS;
2867 ST.accept_buff =
5b47454d
DM
2868 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2869 }
786e8c11 2870 do {
166ba7cd 2871 if (ST.accepted >= bufflen) {
5b47454d 2872 bufflen *= 2;
166ba7cd 2873 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2874 SvGROW(sv_accept_buff,
2875 bufflen * sizeof(reg_trie_accepted));
2876 }
2877 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2878 + sizeof(reg_trie_accepted));
a3621e74 2879
786e8c11
YO
2880
2881 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2882 ST.accept_buff[ST.accepted].endpos = uc;
2883 ++ST.accepted;
2884 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2885 }
2886#undef got_wordnum
a3621e74 2887
07be1b83 2888 DEBUG_TRIE_EXECUTE_r({
786e8c11 2889 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2890 PerlIO_printf( Perl_debug_log,
786e8c11 2891 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2892 2+depth * 2, "", PL_colors[4],
786e8c11 2893 (UV)state, (UV)ST.accepted );
07be1b83 2894 });
a3621e74
YO
2895
2896 if ( base ) {
4cadc6a9
YO
2897 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2898 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2899
5b47454d
DM
2900 if (charid &&
2901 (base + charid > trie->uniquecharcount )
2902 && (base + charid - 1 - trie->uniquecharcount
2903 < trie->lasttrans)
2904 && trie->trans[base + charid - 1 -
2905 trie->uniquecharcount].check == state)
2906 {
2907 state = trie->trans[base + charid - 1 -
2908 trie->uniquecharcount ].next;
2909 }
2910 else {
2911 state = 0;
2912 }
2913 uc += len;
2914
2915 }
2916 else {
a3621e74
YO
2917 state = 0;
2918 }
2919 DEBUG_TRIE_EXECUTE_r(
e4584336 2920 PerlIO_printf( Perl_debug_log,
786e8c11 2921 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2922 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2923 );
2924 }
166ba7cd 2925 if (!ST.accepted )
a3621e74 2926 sayNO;
a3621e74 2927
166ba7cd
DM
2928 DEBUG_EXECUTE_r(
2929 PerlIO_printf( Perl_debug_log,
2930 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2931 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2932 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2933 );
786e8c11 2934 }}
166ba7cd
DM
2935
2936 /* FALL THROUGH */
166ba7cd 2937 case TRIE_next_fail: /* we failed - try next alterative */
5d458dd8
YO
2938 if (do_cutgroup) {
2939 do_cutgroup = 0;
2940 no_final = 0;
2941 }
166ba7cd
DM
2942 if ( ST.accepted == 1 ) {
2943 /* only one choice left - just continue */
2944 DEBUG_EXECUTE_r({
2945 reg_trie_data * const trie
f8fc2ecf 2946 = (reg_trie_data*)rexi->data->data[ ARG(ST.me) ];
5c9f2f80
YO
2947 SV ** const tmp = av_fetch( trie->words,
2948 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
2949 SV *sv= tmp ? sv_newmortal() : NULL;
2950
166ba7cd
DM
2951 PerlIO_printf( Perl_debug_log,
2952 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 2953 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 2954 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
2955 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
2956 PL_colors[0], PL_colors[1],
2957 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2958 )
2959 : "not compiled under -Dr",
166ba7cd
DM
2960 PL_colors[5] );
2961 });
2962 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2963 /* in this case we free tmps/leave before we call regmatch
2964 as we wont be using accept_buff again. */
5d458dd8 2965
166ba7cd
DM
2966 locinput = PL_reginput;
2967 nextchr = UCHARAT(locinput);
5d458dd8
YO
2968 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
2969 scan = ST.B;
2970 else
2971 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
2972 if (!has_cutgroup) {
2973 FREETMPS;
2974 LEAVE;
2975 } else {
2976 ST.accepted--;
2977 PUSH_YES_STATE_GOTO(TRIE_next, scan);
2978 }
786e8c11 2979
166ba7cd
DM
2980 continue; /* execute rest of RE */
2981 }
2982
2983 if (!ST.accepted-- ) {
5d458dd8
YO
2984 DEBUG_EXECUTE_r({
2985 PerlIO_printf( Perl_debug_log,
2986 "%*s %sTRIE failed...%s\n",
2987 REPORT_CODE_OFF+depth*2, "",
2988 PL_colors[4],
2989 PL_colors[5] );
2990 });
166ba7cd
DM
2991 FREETMPS;
2992 LEAVE;
5d458dd8 2993 sayNO_SILENT;
166ba7cd
DM
2994 }
2995
a3621e74 2996 /*
166ba7cd
DM
2997 There are at least two accepting states left. Presumably
2998 the number of accepting states is going to be low,
2999 typically two. So we simply scan through to find the one
3000 with lowest wordnum. Once we find it, we swap the last
3001 state into its place and decrement the size. We then try to
3002 match the rest of the pattern at the point where the word
3003 ends. If we succeed, control just continues along the
3004 regex; if we fail we return here to try the next accepting
3005 state
3006 */
a3621e74 3007
166ba7cd
DM
3008 {
3009 U32 best = 0;
3010 U32 cur;
3011 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3012 DEBUG_TRIE_EXECUTE_r(
f2278c82 3013 PerlIO_printf( Perl_debug_log,
166ba7cd 3014 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3015 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3016 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3017 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3018 );
3019
3020 if (ST.accept_buff[cur].wordnum <
3021 ST.accept_buff[best].wordnum)
3022 best = cur;
a3621e74 3023 }
166ba7cd
DM
3024
3025 DEBUG_EXECUTE_r({
3026 reg_trie_data * const trie
f8fc2ecf 3027 = (reg_trie_data*)rexi->data->data[ ARG(ST.me) ];
5c9f2f80
YO
3028 SV ** const tmp = av_fetch( trie->words,
3029 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3030 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3031 ST.B :
7f69552c 3032 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3033 SV *sv= tmp ? sv_newmortal() : NULL;
3034
265c4333
YO
3035 PerlIO_printf( Perl_debug_log,
3036 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3037 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3038 ST.accept_buff[best].wordnum,
de734bd5
A
3039 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3040 PL_colors[0], PL_colors[1],
3041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3042 ) : "not compiled under -Dr",
265c4333 3043 REG_NODE_NUM(nextop),
166ba7cd
DM
3044 PL_colors[5] );
3045 });
3046
3047 if ( best<ST.accepted ) {
3048 reg_trie_accepted tmp = ST.accept_buff[ best ];
3049 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3050 ST.accept_buff[ ST.accepted ] = tmp;
3051 best = ST.accepted;
a3621e74 3052 }
166ba7cd 3053 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3054 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3055 scan = ST.B;
786e8c11
YO
3056 /* NOTREACHED */
3057 } else {
5d458dd8 3058 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
786e8c11
YO
3059 /* NOTREACHED */
3060 }
5d458dd8
YO
3061 if (has_cutgroup) {
3062 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3063 /* NOTREACHED */
3064 } else {
3065 PUSH_STATE_GOTO(TRIE_next, scan);
3066 /* NOTREACHED */
3067 }
786e8c11 3068 /* NOTREACHED */
166ba7cd 3069 }
166ba7cd 3070 /* NOTREACHED */
5d458dd8
YO
3071 case TRIE_next:
3072 FREETMPS;
3073 LEAVE;
3074 sayYES;
166ba7cd
DM
3075#undef ST
3076
95b24440
DM
3077 case EXACT: {
3078 char *s = STRING(scan);
24d3c4a9 3079 ln = STR_LEN(scan);
eb160463 3080 if (do_utf8 != UTF) {
bc517b45 3081 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3082 char *l = locinput;
24d3c4a9 3083 const char * const e = s + ln;
a72c7584 3084
5ff6fc6d
JH
3085 if (do_utf8) {
3086 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3087 while (s < e) {
a3b680e6 3088 STRLEN ulen;
1aa99e6b 3089 if (l >= PL_regeol)
5ff6fc6d
JH
3090 sayNO;
3091 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3092 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3093 uniflags))
5ff6fc6d 3094 sayNO;
bc517b45 3095 l += ulen;
5ff6fc6d 3096 s ++;
1aa99e6b 3097 }
5ff6fc6d
JH
3098 }
3099 else {
3100 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3101 while (s < e) {
a3b680e6 3102 STRLEN ulen;
1aa99e6b
IH
3103 if (l >= PL_regeol)
3104 sayNO;
5ff6fc6d 3105 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3106 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3107 uniflags))
1aa99e6b 3108 sayNO;
bc517b45 3109 s += ulen;
a72c7584 3110 l ++;
1aa99e6b 3111 }
5ff6fc6d 3112 }
1aa99e6b
IH
3113 locinput = l;
3114 nextchr = UCHARAT(locinput);
3115 break;
3116 }
bc517b45 3117 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3118 /* Inline the first character, for speed. */
3119 if (UCHARAT(s) != nextchr)
3120 sayNO;
24d3c4a9 3121 if (PL_regeol - locinput < ln)
d6a28714 3122 sayNO;
24d3c4a9 3123 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3124 sayNO;
24d3c4a9 3125 locinput += ln;
d6a28714
JH
3126 nextchr = UCHARAT(locinput);
3127 break;
95b24440 3128 }
d6a28714 3129 case EXACTFL:
b8c5462f
JH
3130 PL_reg_flags |= RF_tainted;
3131 /* FALL THROUGH */
95b24440 3132 case EXACTF: {
be8e71aa 3133 char * const s = STRING(scan);
24d3c4a9 3134 ln = STR_LEN(scan);
d6a28714 3135
d07ddd77
JH
3136 if (do_utf8 || UTF) {
3137 /* Either target or the pattern are utf8. */
be8e71aa 3138 const char * const l = locinput;
d07ddd77 3139 char *e = PL_regeol;
bc517b45 3140
24d3c4a9 3141 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3142 l, &e, 0, do_utf8)) {
5486206c
JH
3143 /* One more case for the sharp s:
3144 * pack("U0U*", 0xDF) =~ /ss/i,
3145 * the 0xC3 0x9F are the UTF-8
3146 * byte sequence for the U+00DF. */
3147 if (!(do_utf8 &&
3148 toLOWER(s[0]) == 's' &&
24d3c4a9 3149 ln >= 2 &&
5486206c
JH
3150 toLOWER(s[1]) == 's' &&
3151 (U8)l[0] == 0xC3 &&
3152 e - l >= 2 &&
3153 (U8)l[1] == 0x9F))
3154 sayNO;
3155 }
d07ddd77
JH
3156 locinput = e;
3157 nextchr = UCHARAT(locinput);
3158 break;
a0ed51b3 3159 }
d6a28714 3160
bc517b45
JH
3161 /* Neither the target and the pattern are utf8. */
3162
d6a28714
JH
3163 /* Inline the first character, for speed. */
3164 if (UCHARAT(s) != nextchr &&
3165 UCHARAT(s) != ((OP(scan) == EXACTF)
3166 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3167 sayNO;
24d3c4a9 3168 if (PL_regeol - locinput < ln)
b8c5462f 3169 sayNO;
24d3c4a9
DM
3170 if (ln > 1 && (OP(scan) == EXACTF
3171 ? ibcmp(s, locinput, ln)
3172 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3173 sayNO;
24d3c4a9 3174 locinput += ln;
d6a28714 3175 nextchr = UCHARAT(locinput);
a0d0e21e 3176 break;
95b24440 3177 }
d6a28714 3178 case ANYOF:
ffc61ed2 3179 if (do_utf8) {
9e55ce06
JH
3180 STRLEN inclasslen = PL_regeol - locinput;
3181
32fc9b6a 3182 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3183 goto anyof_fail;
ffc61ed2
JH
3184 if (locinput >= PL_regeol)
3185 sayNO;
0f0076b4 3186 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3187 nextchr = UCHARAT(locinput);
e0f9d4a8 3188 break;
ffc61ed2
JH
3189 }
3190 else {
3191 if (nextchr < 0)
3192 nextchr = UCHARAT(locinput);
32fc9b6a 3193 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3194 goto anyof_fail;
ffc61ed2
JH
3195 if (!nextchr && locinput >= PL_regeol)
3196 sayNO;
3197 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3198 break;
3199 }
262b90c4 3200 anyof_fail:
e0f9d4a8
JH
3201 /* If we might have the case of the German sharp s
3202 * in a casefolding Unicode character class. */
3203
ebc501f0
JH
3204 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3205 locinput += SHARP_S_SKIP;
e0f9d4a8 3206 nextchr = UCHARAT(locinput);
ffc61ed2 3207 }
e0f9d4a8
JH
3208 else
3209 sayNO;
b8c5462f 3210 break;
d6a28714 3211 case ALNUML:
b8c5462f
JH
3212 PL_reg_flags |= RF_tainted;
3213 /* FALL THROUGH */
d6a28714 3214 case ALNUM:
b8c5462f 3215 if (!nextchr)
4633a7c4 3216 sayNO;
ffc61ed2 3217 if (do_utf8) {
1a4fad37 3218 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3219 if (!(OP(scan) == ALNUM
bb7a0f54 3220 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3221 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3222 {
3223 sayNO;
a0ed51b3 3224 }
b8c5462f 3225 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3226 nextchr = UCHARAT(locinput);
3227 break;
3228 }
ffc61ed2 3229 if (!(OP(scan) == ALNUM
d6a28714 3230 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3231 sayNO;
b8c5462f 3232 nextchr = UCHARAT(++locinput);
a0d0e21e 3233 break;
d6a28714 3234 case NALNUML:
b8c5462f
JH
3235 PL_reg_flags |= RF_tainted;
3236 /* FALL THROUGH */
d6a28714
JH
3237 case NALNUM:
3238 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3239 sayNO;
ffc61ed2 3240 if (do_utf8) {
1a4fad37 3241 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3242 if (OP(scan) == NALNUM
bb7a0f54 3243 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3244 : isALNUM_LC_utf8((U8*)locinput))
3245 {
b8c5462f 3246 sayNO;
d6a28714 3247 }
b8c5462f
JH
3248 locinput += PL_utf8skip[nextchr];
3249 nextchr = UCHARAT(locinput);
3250 break;
3251 }
ffc61ed2 3252 if (OP(scan) == NALNUM
d6a28714 3253 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3254 sayNO;
76e3520e 3255 nextchr = UCHARAT(++locinput);
a0d0e21e 3256 break;
d6a28714
JH
3257 case BOUNDL:
3258 case NBOUNDL:
3280af22 3259 PL_reg_flags |= RF_tainted;
bbce6d69 3260 /* FALL THROUGH */
d6a28714
JH
3261 case BOUND:
3262 case NBOUND:
3263 /* was last char in word? */
ffc61ed2 3264 if (do_utf8) {
12d33761 3265 if (locinput == PL_bostr)
24d3c4a9 3266 ln = '\n';
ffc61ed2 3267 else {
a3b680e6 3268 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3269
24d3c4a9 3270 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3271 }
3272 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3273 ln = isALNUM_uni(ln);
1a4fad37 3274 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3275 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3276 }
3277 else {
24d3c4a9 3278 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3279 n = isALNUM_LC_utf8((U8*)locinput);
3280 }
a0ed51b3 3281 }
d6a28714 3282 else {
24d3c4a9 3283 ln = (locinput != PL_bostr) ?
12d33761 3284 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3285 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3286 ln = isALNUM(ln);
ffc61ed2
JH
3287 n = isALNUM(nextchr);
3288 }
3289 else {
24d3c4a9 3290 ln = isALNUM_LC(ln);
ffc61ed2
JH
3291 n = isALNUM_LC(nextchr);
3292 }
d6a28714 3293 }
24d3c4a9 3294 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3295 OP(scan) == BOUNDL))
3296 sayNO;
a0ed51b3 3297 break;
d6a28714 3298 case SPACEL:
3280af22 3299 PL_reg_flags |= RF_tainted;
bbce6d69 3300 /* FALL THROUGH */
d6a28714 3301 case SPACE:
9442cb0e 3302 if (!nextchr)
4633a7c4 3303 sayNO;
1aa99e6b 3304 if (do_utf8) {
fd400ab9 3305 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3306 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3307 if (!(OP(scan) == SPACE
bb7a0f54 3308 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3309 : isSPACE_LC_utf8((U8*)locinput)))
3310 {
3311 sayNO;
3312 }
3313 locinput += PL_utf8skip[nextchr];
3314 nextchr = UCHARAT(locinput);
3315 break;
d6a28714 3316 }
ffc61ed2
JH
3317 if (!(OP(scan) == SPACE
3318 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3319 sayNO;
3320 nextchr = UCHARAT(++locinput);
3321 }
3322 else {
3323 if (!(OP(scan) == SPACE
3324 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3325 sayNO;
3326 nextchr = UCHARAT(++locinput);
a0ed51b3 3327 }
a0ed51b3 3328 break;
d6a28714 3329 case NSPACEL:
3280af22 3330 PL_reg_flags |= RF_tainted;
bbce6d69 3331 /* FALL THROUGH */
d6a28714 3332 case NSPACE:
9442cb0e 3333 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3334 sayNO;
1aa99e6b 3335 if (do_utf8) {
1a4fad37 3336 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3337 if (OP(scan) == NSPACE
bb7a0f54 3338 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3339 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3340 {
3341 sayNO;
3342 }
3343 locinput += PL_utf8skip[nextchr];
3344 nextchr = UCHARAT(locinput);
3345 break;
a0ed51b3 3346 }
ffc61ed2 3347 if (OP(scan) == NSPACE
d6a28714 3348 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3349 sayNO;
76e3520e 3350 nextchr = UCHARAT(++locinput);
a0d0e21e 3351 break;
d6a28714 3352 case DIGITL:
a0ed51b3
LW
3353 PL_reg_flags |= RF_tainted;
3354 /* FALL THROUGH */
d6a28714 3355 case DIGIT:
9442cb0e 3356 if (!nextchr)
a0ed51b3 3357 sayNO;
1aa99e6b 3358 if (do_utf8) {
1a4fad37 3359 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3360 if (!(OP(scan) == DIGIT
bb7a0f54 3361 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3362 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3363 {
a0ed51b3 3364 sayNO;
dfe13c55 3365 }
6f06b55f 3366 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3367 nextchr = UCHARAT(locinput);
3368 break;
3369 }
ffc61ed2 3370 if (!(OP(scan) == DIGIT
9442cb0e 3371 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3372 sayNO;
3373 nextchr = UCHARAT(++locinput);
3374 break;
d6a28714 3375 case NDIGITL:
b8c5462f
JH
3376 PL_reg_flags |= RF_tainted;
3377 /* FALL THROUGH */
d6a28714 3378 case NDIGIT:
9442cb0e 3379 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3380 sayNO;
1aa99e6b 3381 if (do_utf8) {
1a4fad37 3382 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3383 if (OP(scan) == NDIGIT
bb7a0f54 3384 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3385 : isDIGIT_LC_utf8((U8*)locinput))
3386 {
a0ed51b3 3387 sayNO;
9442cb0e 3388 }
6f06b55f 3389 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3390 nextchr = UCHARAT(locinput);
3391 break;
3392 }
ffc61ed2 3393 if (OP(scan) == NDIGIT
9442cb0e 3394 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3395 sayNO;
3396 nextchr = UCHARAT(++locinput);
3397