This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Signedness nit, found by Jarkko
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
125/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
129 OP(rn) == EVAL || \
cca55fe3
JP
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 134)
ee9b8eae 135#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 136
ee9b8eae
YO
137#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
138
139#if 0
140/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
145
146#else
147/* ... so we use this as its faster. */
148#define IS_TEXT(rn) ( OP(rn)==EXACT )
149#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
151
152#endif
e2d8ce26 153
a84d97b6
HS
154/*
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
157*/
cca55fe3 158#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 162 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 163 else if (type == PLUS) \
cca55fe3 164 rn = NEXTOPER(rn); \
3dab1dad 165 else if (type == IFMATCH) \
a84d97b6 166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 167 else rn += NEXT_OFF(rn); \
3dab1dad 168 } \
5f80c4cf 169} STMT_END
74750237 170
c476f425 171
acfe0abc 172static void restore_pos(pTHX_ void *arg);
51371543 173
76e3520e 174STATIC CHECKPOINT
cea2e8a9 175S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 176{
97aff369 177 dVAR;
a3b680e6 178 const int retval = PL_savestack_ix;
b1ce53c5 179#define REGCP_PAREN_ELEMS 4
a3b680e6 180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 181 int p;
40a82448 182 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 183
e49a9654
IH
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
186
f0ab9afb 187#define REGCP_OTHER_ELEMS 7
4b3c1a47 188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 189
3280af22 190 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 191/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
192 SSPUSHINT(PL_regoffs[p].end);
193 SSPUSHINT(PL_regoffs[p].start);
3280af22 194 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 195 SSPUSHINT(p);
e7707071 196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 198 (UV)p, (IV)PL_regoffs[p].start,
40a82448 199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 200 (IV)PL_regoffs[p].end
40a82448 201 ));
a0d0e21e 202 }
b1ce53c5 203/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 204 SSPUSHPTR(PL_regoffs);
3280af22
NIS
205 SSPUSHINT(PL_regsize);
206 SSPUSHINT(*PL_reglastparen);
a01268b5 207 SSPUSHINT(*PL_reglastcloseparen);
3280af22 208 SSPUSHPTR(PL_reginput);
41123dfd
JH
209#define REGCP_FRAME_ELEMS 2
210/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
211 * are needed for the regexp context stack bookkeeping. */
212 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 213 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 214
a0d0e21e
LW
215 return retval;
216}
217
c277df42 218/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
219#define REGCP_SET(cp) \
220 DEBUG_STATE_r( \
ab3bbdeb 221 PerlIO_printf(Perl_debug_log, \
e4f74956 222 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
223 (IV)PL_savestack_ix)); \
224 cp = PL_savestack_ix
c3464db5 225
ab3bbdeb 226#define REGCP_UNWIND(cp) \
e4f74956 227 DEBUG_STATE_r( \
ab3bbdeb 228 if (cp != PL_savestack_ix) \
e4f74956
YO
229 PerlIO_printf(Perl_debug_log, \
230 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
231 (IV)(cp), (IV)PL_savestack_ix)); \
232 regcpblow(cp)
c277df42 233
76e3520e 234STATIC char *
097eb12c 235S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 236{
97aff369 237 dVAR;
3b6647e0 238 U32 i;
a0d0e21e 239 char *input;
b1ce53c5 240
a3621e74
YO
241 GET_RE_DEBUG_FLAGS_DECL;
242
b1ce53c5 243 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 244 i = SSPOPINT;
b1ce53c5
JH
245 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
246 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 247 input = (char *) SSPOPPTR;
a01268b5 248 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
249 *PL_reglastparen = SSPOPINT;
250 PL_regsize = SSPOPINT;
f0ab9afb 251 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 252
6bda09f9 253
b1ce53c5 254 /* Now restore the parentheses context. */
41123dfd
JH
255 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
256 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 257 I32 tmps;
097eb12c 258 U32 paren = (U32)SSPOPINT;
3280af22 259 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 260 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 261 tmps = SSPOPINT;
3280af22 262 if (paren <= *PL_reglastparen)
f0ab9afb 263 PL_regoffs[paren].end = tmps;
e7707071 264 DEBUG_BUFFERS_r(
c3464db5 265 PerlIO_printf(Perl_debug_log,
b900a521 266 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 267 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 268 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 269 (IV)PL_regoffs[paren].end,
3280af22 270 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 271 );
a0d0e21e 272 }
e7707071 273 DEBUG_BUFFERS_r(
bb7a0f54 274 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 275 PerlIO_printf(Perl_debug_log,
faccc32b 276 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 277 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
278 }
279 );
daf18116 280#if 1
dafc8851
JH
281 /* It would seem that the similar code in regtry()
282 * already takes care of this, and in fact it is in
283 * a better location to since this code can #if 0-ed out
284 * but the code in regtry() is needed or otherwise tests
285 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
286 * (as of patchlevel 7877) will fail. Then again,
287 * this code seems to be necessary or otherwise
288 * building DynaLoader will fail:
289 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
290 * --jhi */
3b6647e0 291 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 292 if (i > PL_regsize)
f0ab9afb
NC
293 PL_regoffs[i].start = -1;
294 PL_regoffs[i].end = -1;
a0d0e21e 295 }
dafc8851 296#endif
a0d0e21e
LW
297 return input;
298}
299
02db2b7b 300#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 301
a687059c 302/*
e50aee73 303 * pregexec and friends
a687059c
LW
304 */
305
76234dfb 306#ifndef PERL_IN_XSUB_RE
a687059c 307/*
c277df42 308 - pregexec - match a regexp against a string
a687059c 309 */
c277df42 310I32
49d7dfbc 311Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 312 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
313/* strend: pointer to null at end of string */
314/* strbeg: real beginning of string */
315/* minend: end of match must be >=minend after stringarg. */
316/* nosave: For optimizations. */
317{
318 return
9041c2e3 319 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
320 nosave ? 0 : REXEC_COPY_STR);
321}
76234dfb 322#endif
22e551b9 323
9041c2e3 324/*
cad2e5aa
JH
325 * Need to implement the following flags for reg_anch:
326 *
327 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
328 * USE_INTUIT_ML
329 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
330 * INTUIT_AUTORITATIVE_ML
331 * INTUIT_ONCE_NOML - Intuit can match in one location only.
332 * INTUIT_ONCE_ML
333 *
334 * Another flag for this function: SECOND_TIME (so that float substrs
335 * with giant delta may be not rechecked).
336 */
337
338/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
339
3f7c398e 340/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
341 Otherwise, only SvCUR(sv) is used to get strbeg. */
342
343/* XXXX We assume that strpos is strbeg unless sv. */
344
6eb5f6b9
JH
345/* XXXX Some places assume that there is a fixed substring.
346 An update may be needed if optimizer marks as "INTUITable"
347 RExen without fixed substrings. Similarly, it is assumed that
348 lengths of all the strings are no more than minlen, thus they
349 cannot come from lookahead.
40d049e4
YO
350 (Or minlen should take into account lookahead.)
351 NOTE: Some of this comment is not correct. minlen does now take account
352 of lookahead/behind. Further research is required. -- demerphq
353
354*/
6eb5f6b9 355
2c2d71f5
JH
356/* A failure to find a constant substring means that there is no need to make
357 an expensive call to REx engine, thus we celebrate a failure. Similarly,
358 finding a substring too deep into the string means that less calls to
30944b6d
IZ
359 regtry() should be needed.
360
361 REx compiler's optimizer found 4 possible hints:
362 a) Anchored substring;
363 b) Fixed substring;
364 c) Whether we are anchored (beginning-of-line or \G);
365 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 366 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
367 string which does not contradict any of them.
368 */
2c2d71f5 369
6eb5f6b9
JH
370/* Most of decisions we do here should have been done at compile time.
371 The nodes of the REx which we used for the search should have been
372 deleted from the finite automaton. */
373
cad2e5aa 374char *
49d7dfbc 375Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
cad2e5aa
JH
376 char *strend, U32 flags, re_scream_pos_data *data)
377{
97aff369 378 dVAR;
b7953727 379 register I32 start_shift = 0;
cad2e5aa 380 /* Should be nonnegative! */
b7953727 381 register I32 end_shift = 0;
2c2d71f5
JH
382 register char *s;
383 register SV *check;
a1933d95 384 char *strbeg;
cad2e5aa 385 char *t;
1de06328 386 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 387 I32 ml_anch;
bd61b366
SS
388 register char *other_last = NULL; /* other substr checked before this */
389 char *check_at = NULL; /* check substr found at this pos */
bbe252da 390 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 391 RXi_GET_DECL(prog,progi);
30944b6d 392#ifdef DEBUGGING
890ce7af 393 const char * const i_strpos = strpos;
30944b6d 394#endif
a3621e74
YO
395
396 GET_RE_DEBUG_FLAGS_DECL;
397
a30b2f1f 398 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 399
bbe252da 400 if (prog->extflags & RXf_UTF8) {
b8d68ded
JH
401 PL_reg_flags |= RF_utf8;
402 }
ab3bbdeb
YO
403 DEBUG_EXECUTE_r(
404 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
405 sv ? "Guessing start of match in sv for"
406 : "Guessing start of match in string for");
2a782b5b 407 );
cad2e5aa 408
c344f387
JH
409 /* CHR_DIST() would be more correct here but it makes things slow. */
410 if (prog->minlen > strend - strpos) {
a3621e74 411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 412 "String too short... [re_intuit_start]\n"));
cad2e5aa 413 goto fail;
2c2d71f5 414 }
1de06328 415
a1933d95 416 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 417 PL_regeol = strend;
33b8afdf
JH
418 if (do_utf8) {
419 if (!prog->check_utf8 && prog->check_substr)
420 to_utf8_substr(prog);
421 check = prog->check_utf8;
422 } else {
423 if (!prog->check_substr && prog->check_utf8)
424 to_byte_substr(prog);
425 check = prog->check_substr;
426 }
1de06328 427 if (check == &PL_sv_undef) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 429 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
430 goto fail;
431 }
bbe252da
YO
432 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
433 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
434 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 435 && !multiline ) ); /* Check after \n? */
cad2e5aa 436
7e25d62c 437 if (!ml_anch) {
bbe252da
YO
438 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
439 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 440 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
441 && sv && !SvROK(sv)
442 && (strpos != strbeg)) {
a3621e74 443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
444 goto fail;
445 }
446 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 447 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 448 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
449 I32 slen;
450
1aa99e6b 451 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 452
653099ff
GS
453 if (SvTAIL(check)) {
454 slen = SvCUR(check); /* >= 1 */
cad2e5aa 455
9041c2e3 456 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 457 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 458 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 459 goto fail_finish;
cad2e5aa
JH
460 }
461 /* Now should match s[0..slen-2] */
462 slen--;
3f7c398e 463 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 464 || (slen > 1
3f7c398e 465 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 466 report_neq:
a3621e74 467 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
468 goto fail_finish;
469 }
cad2e5aa 470 }
3f7c398e 471 else if (*SvPVX_const(check) != *s
653099ff 472 || ((slen = SvCUR(check)) > 1
3f7c398e 473 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 474 goto report_neq;
c315bfe8 475 check_at = s;
2c2d71f5 476 goto success_at_start;
7e25d62c 477 }
cad2e5aa 478 }
2c2d71f5 479 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 480 s = strpos;
2c2d71f5 481 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
482 end_shift = prog->check_end_shift;
483
2c2d71f5 484 if (!ml_anch) {
a3b680e6 485 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 486 - (SvTAIL(check) != 0);
a3b680e6 487 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
488
489 if (end_shift < eshift)
490 end_shift = eshift;
491 }
cad2e5aa 492 }
2c2d71f5 493 else { /* Can match at random position */
cad2e5aa
JH
494 ml_anch = 0;
495 s = strpos;
1de06328
YO
496 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
497 end_shift = prog->check_end_shift;
498
499 /* end shift should be non negative here */
cad2e5aa
JH
500 }
501
bcdf7404 502#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 503 if (end_shift < 0)
1de06328 504 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 505 (IV)end_shift, prog->precomp);
2c2d71f5
JH
506#endif
507
2c2d71f5
JH
508 restart:
509 /* Find a possible match in the region s..strend by looking for
510 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
511
512 {
513 I32 srch_start_shift = start_shift;
514 I32 srch_end_shift = end_shift;
515 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
516 srch_end_shift -= ((strbeg - s) - srch_start_shift);
517 srch_start_shift = strbeg - s;
518 }
6bda09f9 519 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
520 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
521 (IV)prog->check_offset_min,
522 (IV)srch_start_shift,
523 (IV)srch_end_shift,
524 (IV)prog->check_end_shift);
525 });
526
cad2e5aa 527 if (flags & REXEC_SCREAM) {
cad2e5aa 528 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 529 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 530
2c2d71f5
JH
531 if (PL_screamfirst[BmRARE(check)] >= 0
532 || ( BmRARE(check) == '\n'
85c508c3 533 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 534 && SvTAIL(check) ))
9041c2e3 535 s = screaminstr(sv, check,
1de06328 536 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 537 else
2c2d71f5 538 goto fail_finish;
4addbd3b
HS
539 /* we may be pointing at the wrong string */
540 if (s && RX_MATCH_COPIED(prog))
3f7c398e 541 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
542 if (data)
543 *data->scream_olds = s;
544 }
1de06328
YO
545 else {
546 U8* start_point;
547 U8* end_point;
bbe252da 548 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
549 start_point= (U8*)(s + srch_start_shift);
550 end_point= (U8*)(strend - srch_end_shift);
551 } else {
552 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
553 end_point= HOP3(strend, -srch_end_shift, strbeg);
554 }
6bda09f9 555 DEBUG_OPTIMISE_MORE_r({
56570a2c 556 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 557 (int)(end_point - start_point),
fc8cd66c 558 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
559 start_point);
560 });
561
562 s = fbm_instr( start_point, end_point,
7fba1cd6 563 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
564 }
565 }
cad2e5aa
JH
566 /* Update the count-of-usability, remove useless subpatterns,
567 unshift s. */
2c2d71f5 568
ab3bbdeb
YO
569 DEBUG_EXECUTE_r({
570 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
571 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
572 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 573 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
574 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
575 ? "anchored" : "floating"),
576 quoted,
577 RE_SV_TAIL(check),
578 (s ? " at offset " : "...\n") );
579 });
2c2d71f5
JH
580
581 if (!s)
582 goto fail_finish;
2c2d71f5 583 /* Finish the diagnostic message */
a3621e74 584 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 585
1de06328
YO
586 /* XXX dmq: first branch is for positive lookbehind...
587 Our check string is offset from the beginning of the pattern.
588 So we need to do any stclass tests offset forward from that
589 point. I think. :-(
590 */
591
592
593
594 check_at=s;
595
596
2c2d71f5
JH
597 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
598 Start with the other substr.
599 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 600 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
601 *always* match. Probably should be marked during compile...
602 Probably it is right to do no SCREAM here...
603 */
604
1de06328
YO
605 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
606 : (prog->float_substr && prog->anchored_substr))
607 {
30944b6d 608 /* Take into account the "other" substring. */
2c2d71f5
JH
609 /* XXXX May be hopelessly wrong for UTF... */
610 if (!other_last)
6eb5f6b9 611 other_last = strpos;
33b8afdf 612 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
613 do_other_anchored:
614 {
890ce7af
AL
615 char * const last = HOP3c(s, -start_shift, strbeg);
616 char *last1, *last2;
be8e71aa 617 char * const saved_s = s;
33b8afdf 618 SV* must;
2c2d71f5 619
2c2d71f5
JH
620 t = s - prog->check_offset_max;
621 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 622 && (!do_utf8
0ce71af7 623 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 624 && t > strpos)))
6f207bd3 625 NOOP;
2c2d71f5
JH
626 else
627 t = strpos;
1aa99e6b 628 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
629 if (t < other_last) /* These positions already checked */
630 t = other_last;
1aa99e6b 631 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
632 if (last < last1)
633 last1 = last;
1de06328
YO
634 /* XXXX It is not documented what units *_offsets are in.
635 We assume bytes, but this is clearly wrong.
636 Meaning this code needs to be carefully reviewed for errors.
637 dmq.
638 */
639
2c2d71f5 640 /* On end-of-str: see comment below. */
33b8afdf
JH
641 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
642 if (must == &PL_sv_undef) {
643 s = (char*)NULL;
1de06328 644 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
645 }
646 else
647 s = fbm_instr(
648 (unsigned char*)t,
649 HOP3(HOP3(last1, prog->anchored_offset, strend)
650 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
651 must,
7fba1cd6 652 multiline ? FBMrf_MULTILINE : 0
33b8afdf 653 );
ab3bbdeb
YO
654 DEBUG_EXECUTE_r({
655 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
656 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
657 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 658 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
659 quoted, RE_SV_TAIL(must));
660 });
661
662
2c2d71f5
JH
663 if (!s) {
664 if (last1 >= last2) {
a3621e74 665 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
666 ", giving up...\n"));
667 goto fail_finish;
668 }
a3621e74 669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 670 ", trying floating at offset %ld...\n",
be8e71aa 671 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
672 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
673 s = HOP3c(last, 1, strend);
2c2d71f5
JH
674 goto restart;
675 }
676 else {
a3621e74 677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 678 (long)(s - i_strpos)));
1aa99e6b
IH
679 t = HOP3c(s, -prog->anchored_offset, strbeg);
680 other_last = HOP3c(s, 1, strend);
be8e71aa 681 s = saved_s;
2c2d71f5
JH
682 if (t == strpos)
683 goto try_at_start;
2c2d71f5
JH
684 goto try_at_offset;
685 }
30944b6d 686 }
2c2d71f5
JH
687 }
688 else { /* Take into account the floating substring. */
33b8afdf 689 char *last, *last1;
be8e71aa 690 char * const saved_s = s;
33b8afdf
JH
691 SV* must;
692
693 t = HOP3c(s, -start_shift, strbeg);
694 last1 = last =
695 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
696 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
697 last = HOP3c(t, prog->float_max_offset, strend);
698 s = HOP3c(t, prog->float_min_offset, strend);
699 if (s < other_last)
700 s = other_last;
2c2d71f5 701 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
702 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
703 /* fbm_instr() takes into account exact value of end-of-str
704 if the check is SvTAIL(ed). Since false positives are OK,
705 and end-of-str is not later than strend we are OK. */
706 if (must == &PL_sv_undef) {
707 s = (char*)NULL;
1de06328 708 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
709 }
710 else
2c2d71f5 711 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
712 (unsigned char*)last + SvCUR(must)
713 - (SvTAIL(must)!=0),
7fba1cd6 714 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
715 DEBUG_EXECUTE_r({
716 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
717 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
718 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 719 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
720 quoted, RE_SV_TAIL(must));
721 });
33b8afdf
JH
722 if (!s) {
723 if (last1 == last) {
a3621e74 724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
725 ", giving up...\n"));
726 goto fail_finish;
2c2d71f5 727 }
a3621e74 728 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 729 ", trying anchored starting at offset %ld...\n",
be8e71aa 730 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
731 other_last = last;
732 s = HOP3c(t, 1, strend);
733 goto restart;
734 }
735 else {
a3621e74 736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
737 (long)(s - i_strpos)));
738 other_last = s; /* Fix this later. --Hugo */
be8e71aa 739 s = saved_s;
33b8afdf
JH
740 if (t == strpos)
741 goto try_at_start;
742 goto try_at_offset;
743 }
2c2d71f5 744 }
cad2e5aa 745 }
2c2d71f5 746
1de06328 747
9ef43ace 748 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 749
6bda09f9 750 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
751 PerlIO_printf(Perl_debug_log,
752 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
753 (IV)prog->check_offset_min,
754 (IV)prog->check_offset_max,
755 (IV)(s-strpos),
756 (IV)(t-strpos),
757 (IV)(t-s),
758 (IV)(strend-strpos)
759 )
760 );
761
2c2d71f5 762 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 763 && (!do_utf8
9ef43ace 764 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
765 && t > strpos)))
766 {
2c2d71f5
JH
767 /* Fixed substring is found far enough so that the match
768 cannot start at strpos. */
769 try_at_offset:
cad2e5aa 770 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
771 /* Eventually fbm_*() should handle this, but often
772 anchored_offset is not 0, so this check will not be wasted. */
773 /* XXXX In the code below we prefer to look for "^" even in
774 presence of anchored substrings. And we search even
775 beyond the found float position. These pessimizations
776 are historical artefacts only. */
777 find_anchor:
2c2d71f5 778 while (t < strend - prog->minlen) {
cad2e5aa 779 if (*t == '\n') {
4ee3650e 780 if (t < check_at - prog->check_offset_min) {
33b8afdf 781 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
782 /* Since we moved from the found position,
783 we definitely contradict the found anchored
30944b6d
IZ
784 substr. Due to the above check we do not
785 contradict "check" substr.
786 Thus we can arrive here only if check substr
787 is float. Redo checking for "other"=="fixed".
788 */
9041c2e3 789 strpos = t + 1;
a3621e74 790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 791 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
792 goto do_other_anchored;
793 }
4ee3650e
GS
794 /* We don't contradict the found floating substring. */
795 /* XXXX Why not check for STCLASS? */
cad2e5aa 796 s = t + 1;
a3621e74 797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 798 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
799 goto set_useful;
800 }
4ee3650e
GS
801 /* Position contradicts check-string */
802 /* XXXX probably better to look for check-string
803 than for "\n", so one should lower the limit for t? */
a3621e74 804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 805 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 806 other_last = strpos = s = t + 1;
cad2e5aa
JH
807 goto restart;
808 }
809 t++;
810 }
a3621e74 811 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 812 PL_colors[0], PL_colors[1]));
2c2d71f5 813 goto fail_finish;
cad2e5aa 814 }
f5952150 815 else {
a3621e74 816 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 817 PL_colors[0], PL_colors[1]));
f5952150 818 }
cad2e5aa
JH
819 s = t;
820 set_useful:
33b8afdf 821 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
822 }
823 else {
f5952150 824 /* The found string does not prohibit matching at strpos,
2c2d71f5 825 - no optimization of calling REx engine can be performed,
f5952150
GS
826 unless it was an MBOL and we are not after MBOL,
827 or a future STCLASS check will fail this. */
2c2d71f5
JH
828 try_at_start:
829 /* Even in this situation we may use MBOL flag if strpos is offset
830 wrt the start of the string. */
05b4157f 831 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 832 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 833 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 834 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 835 {
cad2e5aa
JH
836 t = strpos;
837 goto find_anchor;
838 }
a3621e74 839 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 840 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 841 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 842 );
2c2d71f5 843 success_at_start:
bbe252da 844 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
845 && (do_utf8 ? (
846 prog->check_utf8 /* Could be deleted already */
847 && --BmUSEFUL(prog->check_utf8) < 0
848 && (prog->check_utf8 == prog->float_utf8)
849 ) : (
850 prog->check_substr /* Could be deleted already */
851 && --BmUSEFUL(prog->check_substr) < 0
852 && (prog->check_substr == prog->float_substr)
853 )))
66e933ab 854 {
cad2e5aa 855 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
857 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
858 if (do_utf8 ? prog->check_substr : prog->check_utf8)
859 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
860 prog->check_substr = prog->check_utf8 = NULL; /* disable */
861 prog->float_substr = prog->float_utf8 = NULL; /* clear */
862 check = NULL; /* abort */
cad2e5aa 863 s = strpos;
3cf5c195
IZ
864 /* XXXX This is a remnant of the old implementation. It
865 looks wasteful, since now INTUIT can use many
6eb5f6b9 866 other heuristics. */
bbe252da 867 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
868 }
869 else
870 s = strpos;
871 }
872
6eb5f6b9
JH
873 /* Last resort... */
874 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
875 /* trie stclasses are too expensive to use here, we are better off to
876 leave it to regmatch itself */
f8fc2ecf 877 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
878 /* minlen == 0 is possible if regstclass is \b or \B,
879 and the fixed substr is ''$.
880 Since minlen is already taken into account, s+1 is before strend;
881 accidentally, minlen >= 1 guaranties no false positives at s + 1
882 even for \b or \B. But (minlen? 1 : 0) below assumes that
883 regstclass does not come from lookahead... */
884 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
885 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
886 const U8* const str = (U8*)STRING(progi->regstclass);
887 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
888 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 889 : 1);
1de06328
YO
890 char * endpos;
891 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
892 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
893 else if (prog->float_substr || prog->float_utf8)
894 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
895 else
896 endpos= strend;
897
70685ca0
JH
898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
899 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 900
6eb5f6b9 901 t = s;
f8fc2ecf 902 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
903 if (!s) {
904#ifdef DEBUGGING
cbbf8932 905 const char *what = NULL;
6eb5f6b9
JH
906#endif
907 if (endpos == strend) {
a3621e74 908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
909 "Could not match STCLASS...\n") );
910 goto fail;
911 }
a3621e74 912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 913 "This position contradicts STCLASS...\n") );
bbe252da 914 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 915 goto fail;
6eb5f6b9 916 /* Contradict one of substrings */
33b8afdf
JH
917 if (prog->anchored_substr || prog->anchored_utf8) {
918 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 919 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 920 hop_and_restart:
1aa99e6b 921 s = HOP3c(t, 1, strend);
66e933ab
GS
922 if (s + start_shift + end_shift > strend) {
923 /* XXXX Should be taken into account earlier? */
a3621e74 924 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
925 "Could not match STCLASS...\n") );
926 goto fail;
927 }
5e39e1e5
HS
928 if (!check)
929 goto giveup;
a3621e74 930 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 931 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
932 what, (long)(s + start_shift - i_strpos)) );
933 goto restart;
934 }
66e933ab 935 /* Have both, check_string is floating */
6eb5f6b9
JH
936 if (t + start_shift >= check_at) /* Contradicts floating=check */
937 goto retry_floating_check;
938 /* Recheck anchored substring, but not floating... */
9041c2e3 939 s = check_at;
5e39e1e5
HS
940 if (!check)
941 goto giveup;
a3621e74 942 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 943 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
944 (long)(other_last - i_strpos)) );
945 goto do_other_anchored;
946 }
60e71179
GS
947 /* Another way we could have checked stclass at the
948 current position only: */
949 if (ml_anch) {
950 s = t = t + 1;
5e39e1e5
HS
951 if (!check)
952 goto giveup;
a3621e74 953 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 954 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 955 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 956 goto try_at_offset;
66e933ab 957 }
33b8afdf 958 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 959 goto fail;
6eb5f6b9
JH
960 /* Check is floating subtring. */
961 retry_floating_check:
962 t = check_at - start_shift;
a3621e74 963 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
964 goto hop_and_restart;
965 }
b7953727 966 if (t != s) {
a3621e74 967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 968 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
969 (long)(t - i_strpos), (long)(s - i_strpos))
970 );
971 }
972 else {
a3621e74 973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
974 "Does not contradict STCLASS...\n");
975 );
976 }
6eb5f6b9 977 }
5e39e1e5 978 giveup:
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
980 PL_colors[4], (check ? "Guessed" : "Giving up"),
981 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 982 return s;
2c2d71f5
JH
983
984 fail_finish: /* Substring not found */
33b8afdf
JH
985 if (prog->check_substr || prog->check_utf8) /* could be removed already */
986 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 987 fail:
a3621e74 988 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 989 PL_colors[4], PL_colors[5]));
bd61b366 990 return NULL;
cad2e5aa 991}
9661b544 992
786e8c11 993
3b0527fe 994
55eed653
NC
995#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
996uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
997 switch (trie_type) { \
998 case trie_utf8_fold: \
999 if ( foldlen>0 ) { \
1000 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1001 foldlen -= len; \
1002 uscan += len; \
1003 len=0; \
1004 } else { \
1005 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1006 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1007 foldlen -= UNISKIP( uvc ); \
1008 uscan = foldbuf + UNISKIP( uvc ); \
1009 } \
1010 break; \
1011 case trie_utf8: \
1012 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1013 break; \
1014 case trie_plain: \
1015 uvc = (UV)*uc; \
1016 len = 1; \
1017 } \
1018 \
1019 if (uvc < 256) { \
1020 charid = trie->charmap[ uvc ]; \
1021 } \
1022 else { \
1023 charid = 0; \
55eed653
NC
1024 if (widecharmap) { \
1025 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1026 (char*)&uvc, sizeof(UV), 0); \
1027 if (svpp) \
1028 charid = (U16)SvIV(*svpp); \
1029 } \
1030 } \
1031} STMT_END
1032
1033#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1034 if ( (CoNd) \
1035 && (ln == len || \
1036 ibcmp_utf8(s, NULL, 0, do_utf8, \
1037 m, NULL, ln, (bool)UTF)) \
24b23f37 1038 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1039 goto got_it; \
1040 else { \
1041 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1042 uvchr_to_utf8(tmpbuf, c); \
1043 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1044 if ( f != c \
1045 && (f == c1 || f == c2) \
1046 && (ln == foldlen || \
1047 !ibcmp_utf8((char *) foldbuf, \
1048 NULL, foldlen, do_utf8, \
1049 m, \
1050 NULL, ln, (bool)UTF)) \
24b23f37 1051 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1052 goto got_it; \
1053 } \
1054 s += len
1055
1056#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1057STMT_START { \
1058 while (s <= e) { \
1059 if ( (CoNd) \
1060 && (ln == 1 || !(OP(c) == EXACTF \
1061 ? ibcmp(s, m, ln) \
1062 : ibcmp_locale(s, m, ln))) \
24b23f37 1063 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1064 goto got_it; \
1065 s++; \
1066 } \
1067} STMT_END
1068
1069#define REXEC_FBC_UTF8_SCAN(CoDe) \
1070STMT_START { \
1071 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1072 CoDe \
1073 s += uskip; \
1074 } \
1075} STMT_END
1076
1077#define REXEC_FBC_SCAN(CoDe) \
1078STMT_START { \
1079 while (s < strend) { \
1080 CoDe \
1081 s++; \
1082 } \
1083} STMT_END
1084
1085#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1086REXEC_FBC_UTF8_SCAN( \
1087 if (CoNd) { \
24b23f37 1088 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1089 goto got_it; \
1090 else \
1091 tmp = doevery; \
1092 } \
1093 else \
1094 tmp = 1; \
1095)
1096
1097#define REXEC_FBC_CLASS_SCAN(CoNd) \
1098REXEC_FBC_SCAN( \
1099 if (CoNd) { \
24b23f37 1100 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1101 goto got_it; \
1102 else \
1103 tmp = doevery; \
1104 } \
1105 else \
1106 tmp = 1; \
1107)
1108
1109#define REXEC_FBC_TRYIT \
24b23f37 1110if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1111 goto got_it
1112
e1d1eefb
YO
1113#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1114 if (do_utf8) { \
1115 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1116 } \
1117 else { \
1118 REXEC_FBC_CLASS_SCAN(CoNd); \
1119 } \
1120 break
1121
4cadc6a9
YO
1122#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1123 if (do_utf8) { \
1124 UtFpReLoAd; \
1125 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1126 } \
1127 else { \
1128 REXEC_FBC_CLASS_SCAN(CoNd); \
1129 } \
1130 break
1131
1132#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1133 PL_reg_flags |= RF_tainted; \
1134 if (do_utf8) { \
1135 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1136 } \
1137 else { \
1138 REXEC_FBC_CLASS_SCAN(CoNd); \
1139 } \
1140 break
1141
786e8c11
YO
1142#define DUMP_EXEC_POS(li,s,doutf8) \
1143 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1144
1145/* We know what class REx starts with. Try to find this position... */
1146/* if reginfo is NULL, its a dryrun */
1147/* annoyingly all the vars in this routine have different names from their counterparts
1148 in regmatch. /grrr */
1149
3c3eec57 1150STATIC char *
07be1b83 1151S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1152 const char *strend, regmatch_info *reginfo)
a687059c 1153{
27da23d5 1154 dVAR;
bbe252da 1155 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1156 char *m;
d8093b23 1157 STRLEN ln;
5dab1207 1158 STRLEN lnc;
078c425b 1159 register STRLEN uskip;
d8093b23 1160 unsigned int c1;
1161 unsigned int c2;
6eb5f6b9
JH
1162 char *e;
1163 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1164 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf
YO
1165 RXi_GET_DECL(prog,progi);
1166
6eb5f6b9
JH
1167 /* We know what class it must start with. */
1168 switch (OP(c)) {
6eb5f6b9 1169 case ANYOF:
388cc4de 1170 if (do_utf8) {
4cadc6a9 1171 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1172 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1173 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1174 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1175 }
1176 else {
1177 while (s < strend) {
1178 STRLEN skip = 1;
1179
32fc9b6a 1180 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1181 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1182 /* The assignment of 2 is intentional:
1183 * for the folded sharp s, the skip is 2. */
1184 (skip = SHARP_S_SKIP))) {
24b23f37 1185 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1186 goto got_it;
1187 else
1188 tmp = doevery;
1189 }
1190 else
1191 tmp = 1;
1192 s += skip;
1193 }
a0d0e21e 1194 }
6eb5f6b9 1195 break;
f33976b4 1196 case CANY:
4cadc6a9 1197 REXEC_FBC_SCAN(
24b23f37 1198 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1199 goto got_it;
1200 else
1201 tmp = doevery;
4cadc6a9 1202 );
f33976b4 1203 break;
6eb5f6b9 1204 case EXACTF:
5dab1207
NIS
1205 m = STRING(c);
1206 ln = STR_LEN(c); /* length to match in octets/bytes */
1207 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1208 if (UTF) {
a2a2844f 1209 STRLEN ulen1, ulen2;
5dab1207 1210 U8 *sm = (U8 *) m;
89ebb4a3
JH
1211 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1212 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1213 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1214
1215 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1216 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1217
89ebb4a3 1218 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1219 0, uniflags);
89ebb4a3 1220 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1221 0, uniflags);
5dab1207
NIS
1222 lnc = 0;
1223 while (sm < ((U8 *) m + ln)) {
1224 lnc++;
1225 sm += UTF8SKIP(sm);
1226 }
1aa99e6b
IH
1227 }
1228 else {
1229 c1 = *(U8*)m;
1230 c2 = PL_fold[c1];
1231 }
6eb5f6b9
JH
1232 goto do_exactf;
1233 case EXACTFL:
5dab1207
NIS
1234 m = STRING(c);
1235 ln = STR_LEN(c);
1236 lnc = (I32) ln;
d8093b23 1237 c1 = *(U8*)m;
6eb5f6b9
JH
1238 c2 = PL_fold_locale[c1];
1239 do_exactf:
db12adc6 1240 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1241
3b0527fe 1242 if (!reginfo && e < s)
6eb5f6b9 1243 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1244
60a8b682
JH
1245 /* The idea in the EXACTF* cases is to first find the
1246 * first character of the EXACTF* node and then, if
1247 * necessary, case-insensitively compare the full
1248 * text of the node. The c1 and c2 are the first
1249 * characters (though in Unicode it gets a bit
1250 * more complicated because there are more cases
7f16dd3d
JH
1251 * than just upper and lower: one needs to use
1252 * the so-called folding case for case-insensitive
1253 * matching (called "loose matching" in Unicode).
1254 * ibcmp_utf8() will do just that. */
60a8b682 1255
1aa99e6b 1256 if (do_utf8) {
575cac57 1257 UV c, f;
89ebb4a3 1258 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1259 STRLEN len, foldlen;
4ad0818d 1260 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1261 if (c1 == c2) {
5dab1207
NIS
1262 /* Upper and lower of 1st char are equal -
1263 * probably not a "letter". */
1aa99e6b 1264 while (s <= e) {
89ebb4a3 1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1266 uniflags);
4cadc6a9 1267 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1268 }
09091399
JH
1269 }
1270 else {
1aa99e6b 1271 while (s <= e) {
89ebb4a3 1272 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1273 uniflags);
80aecb99 1274
60a8b682 1275 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1276 * Note that not all the possible combinations
1277 * are handled here: some of them are handled
1278 * by the standard folding rules, and some of
1279 * them (the character class or ANYOF cases)
1280 * are handled during compiletime in
1281 * regexec.c:S_regclass(). */
880bd946
JH
1282 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1283 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1284 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1285
4cadc6a9 1286 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1287 }
09091399 1288 }
1aa99e6b
IH
1289 }
1290 else {
1291 if (c1 == c2)
4cadc6a9 1292 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1293 else
4cadc6a9 1294 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1295 }
1296 break;
bbce6d69 1297 case BOUNDL:
3280af22 1298 PL_reg_flags |= RF_tainted;
bbce6d69 1299 /* FALL THROUGH */
a0d0e21e 1300 case BOUND:
ffc61ed2 1301 if (do_utf8) {
12d33761 1302 if (s == PL_bostr)
ffc61ed2
JH
1303 tmp = '\n';
1304 else {
6136c704 1305 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1306 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1307 }
1308 tmp = ((OP(c) == BOUND ?
9041c2e3 1309 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1310 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1311 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1312 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1313 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1314 isALNUM_LC_utf8((U8*)s)))
1315 {
1316 tmp = !tmp;
4cadc6a9 1317 REXEC_FBC_TRYIT;
a687059c 1318 }
4cadc6a9 1319 );
a0d0e21e 1320 }
667bb95a 1321 else {
12d33761 1322 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1323 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1324 REXEC_FBC_SCAN(
ffc61ed2
JH
1325 if (tmp ==
1326 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1327 tmp = !tmp;
4cadc6a9 1328 REXEC_FBC_TRYIT;
a0ed51b3 1329 }
4cadc6a9 1330 );
a0ed51b3 1331 }
24b23f37 1332 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1333 goto got_it;
1334 break;
bbce6d69 1335 case NBOUNDL:
3280af22 1336 PL_reg_flags |= RF_tainted;
bbce6d69 1337 /* FALL THROUGH */
a0d0e21e 1338 case NBOUND:
ffc61ed2 1339 if (do_utf8) {
12d33761 1340 if (s == PL_bostr)
ffc61ed2
JH
1341 tmp = '\n';
1342 else {
6136c704 1343 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1344 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1345 }
1346 tmp = ((OP(c) == NBOUND ?
9041c2e3 1347 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1348 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1349 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1350 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1351 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1352 isALNUM_LC_utf8((U8*)s)))
1353 tmp = !tmp;
4cadc6a9
YO
1354 else REXEC_FBC_TRYIT;
1355 );
a0d0e21e 1356 }
667bb95a 1357 else {
12d33761 1358 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1359 tmp = ((OP(c) == NBOUND ?
1360 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1361 REXEC_FBC_SCAN(
ffc61ed2
JH
1362 if (tmp ==
1363 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1364 tmp = !tmp;
4cadc6a9
YO
1365 else REXEC_FBC_TRYIT;
1366 );
a0ed51b3 1367 }
24b23f37 1368 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1369 goto got_it;
1370 break;
a0d0e21e 1371 case ALNUM:
4cadc6a9
YO
1372 REXEC_FBC_CSCAN_PRELOAD(
1373 LOAD_UTF8_CHARCLASS_ALNUM(),
1374 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1375 isALNUM(*s)
1376 );
bbce6d69 1377 case ALNUML:
4cadc6a9
YO
1378 REXEC_FBC_CSCAN_TAINT(
1379 isALNUM_LC_utf8((U8*)s),
1380 isALNUM_LC(*s)
1381 );
a0d0e21e 1382 case NALNUM:
4cadc6a9
YO
1383 REXEC_FBC_CSCAN_PRELOAD(
1384 LOAD_UTF8_CHARCLASS_ALNUM(),
1385 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1386 !isALNUM(*s)
1387 );
bbce6d69 1388 case NALNUML:
4cadc6a9
YO
1389 REXEC_FBC_CSCAN_TAINT(
1390 !isALNUM_LC_utf8((U8*)s),
1391 !isALNUM_LC(*s)
1392 );
a0d0e21e 1393 case SPACE:
4cadc6a9
YO
1394 REXEC_FBC_CSCAN_PRELOAD(
1395 LOAD_UTF8_CHARCLASS_SPACE(),
1396 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1397 isSPACE(*s)
1398 );
bbce6d69 1399 case SPACEL:
4cadc6a9
YO
1400 REXEC_FBC_CSCAN_TAINT(
1401 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1402 isSPACE_LC(*s)
1403 );
a0d0e21e 1404 case NSPACE:
4cadc6a9
YO
1405 REXEC_FBC_CSCAN_PRELOAD(
1406 LOAD_UTF8_CHARCLASS_SPACE(),
1407 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1408 !isSPACE(*s)
1409 );
bbce6d69 1410 case NSPACEL:
4cadc6a9
YO
1411 REXEC_FBC_CSCAN_TAINT(
1412 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1413 !isSPACE_LC(*s)
1414 );
a0d0e21e 1415 case DIGIT:
4cadc6a9
YO
1416 REXEC_FBC_CSCAN_PRELOAD(
1417 LOAD_UTF8_CHARCLASS_DIGIT(),
1418 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1419 isDIGIT(*s)
1420 );
b8c5462f 1421 case DIGITL:
4cadc6a9
YO
1422 REXEC_FBC_CSCAN_TAINT(
1423 isDIGIT_LC_utf8((U8*)s),
1424 isDIGIT_LC(*s)
1425 );
a0d0e21e 1426 case NDIGIT:
4cadc6a9
YO
1427 REXEC_FBC_CSCAN_PRELOAD(
1428 LOAD_UTF8_CHARCLASS_DIGIT(),
1429 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1430 !isDIGIT(*s)
1431 );
b8c5462f 1432 case NDIGITL:
4cadc6a9
YO
1433 REXEC_FBC_CSCAN_TAINT(
1434 !isDIGIT_LC_utf8((U8*)s),
1435 !isDIGIT_LC(*s)
1436 );
e1d1eefb
YO
1437 case LNBREAK:
1438 REXEC_FBC_CSCAN(
1439 is_LNBREAK_utf8(s),
1440 is_LNBREAK_latin1(s)
1441 );
1442 case VERTWS:
1443 REXEC_FBC_CSCAN(
1444 is_VERTWS_utf8(s),
1445 is_VERTWS_latin1(s)
1446 );
1447 case NVERTWS:
1448 REXEC_FBC_CSCAN(
1449 !is_VERTWS_utf8(s),
1450 !is_VERTWS_latin1(s)
1451 );
1452 case HORIZWS:
1453 REXEC_FBC_CSCAN(
1454 is_HORIZWS_utf8(s),
1455 is_HORIZWS_latin1(s)
1456 );
1457 case NHORIZWS:
1458 REXEC_FBC_CSCAN(
1459 !is_HORIZWS_utf8(s),
1460 !is_HORIZWS_latin1(s)
1461 );
1de06328
YO
1462 case AHOCORASICKC:
1463 case AHOCORASICK:
07be1b83
YO
1464 {
1465 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1466 trie_type = do_utf8 ?
1467 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1468 : trie_plain;
1469 /* what trie are we using right now */
1470 reg_ac_data *aho
f8fc2ecf 1471 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1472 reg_trie_data *trie
1473 = (reg_trie_data*)progi->data->data[ aho->trie ];
55eed653 1474 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
07be1b83
YO
1475
1476 const char *last_start = strend - trie->minlen;
6148ee25 1477#ifdef DEBUGGING
07be1b83 1478 const char *real_start = s;
6148ee25 1479#endif
07be1b83 1480 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1481 SV *sv_points;
1482 U8 **points; /* map of where we were in the input string
786e8c11 1483 when reading a given char. For ASCII this
be8e71aa
YO
1484 is unnecessary overhead as the relationship
1485 is always 1:1, but for unicode, especially
1486 case folded unicode this is not true. */
f9e705e8 1487 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1488 U8 *bitmap=NULL;
1489
07be1b83
YO
1490
1491 GET_RE_DEBUG_FLAGS_DECL;
1492
be8e71aa
YO
1493 /* We can't just allocate points here. We need to wrap it in
1494 * an SV so it gets freed properly if there is a croak while
1495 * running the match */
1496 ENTER;
1497 SAVETMPS;
1498 sv_points=newSV(maxlen * sizeof(U8 *));
1499 SvCUR_set(sv_points,
1500 maxlen * sizeof(U8 *));
1501 SvPOK_on(sv_points);
1502 sv_2mortal(sv_points);
1503 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1504 if ( trie_type != trie_utf8_fold
1505 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1506 {
786e8c11
YO
1507 if (trie->bitmap)
1508 bitmap=(U8*)trie->bitmap;
1509 else
1510 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1511 }
786e8c11
YO
1512 /* this is the Aho-Corasick algorithm modified a touch
1513 to include special handling for long "unknown char"
1514 sequences. The basic idea being that we use AC as long
1515 as we are dealing with a possible matching char, when
1516 we encounter an unknown char (and we have not encountered
1517 an accepting state) we scan forward until we find a legal
1518 starting char.
1519 AC matching is basically that of trie matching, except
1520 that when we encounter a failing transition, we fall back
1521 to the current states "fail state", and try the current char
1522 again, a process we repeat until we reach the root state,
1523 state 1, or a legal transition. If we fail on the root state
1524 then we can either terminate if we have reached an accepting
1525 state previously, or restart the entire process from the beginning
1526 if we have not.
1527
1528 */
07be1b83
YO
1529 while (s <= last_start) {
1530 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1531 U8 *uc = (U8*)s;
1532 U16 charid = 0;
1533 U32 base = 1;
1534 U32 state = 1;
1535 UV uvc = 0;
1536 STRLEN len = 0;
1537 STRLEN foldlen = 0;
1538 U8 *uscan = (U8*)NULL;
1539 U8 *leftmost = NULL;
786e8c11
YO
1540#ifdef DEBUGGING
1541 U32 accepted_word= 0;
1542#endif
07be1b83
YO
1543 U32 pointpos = 0;
1544
1545 while ( state && uc <= (U8*)strend ) {
1546 int failed=0;
786e8c11
YO
1547 U32 word = aho->states[ state ].wordnum;
1548
1de06328
YO
1549 if( state==1 ) {
1550 if ( bitmap ) {
1551 DEBUG_TRIE_EXECUTE_r(
1552 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1553 dump_exec_pos( (char *)uc, c, strend, real_start,
1554 (char *)uc, do_utf8 );
1555 PerlIO_printf( Perl_debug_log,
1556 " Scanning for legal start char...\n");
1557 }
1558 );
1559 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1560 uc++;
786e8c11 1561 }
1de06328 1562 s= (char *)uc;
786e8c11 1563 }
786e8c11
YO
1564 if (uc >(U8*)last_start) break;
1565 }
1566
1567 if ( word ) {
1568 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1569 if (!leftmost || lpos < leftmost) {
1570 DEBUG_r(accepted_word=word);
07be1b83 1571 leftmost= lpos;
786e8c11 1572 }
07be1b83 1573 if (base==0) break;
786e8c11 1574
07be1b83
YO
1575 }
1576 points[pointpos++ % maxlen]= uc;
55eed653
NC
1577 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1578 uscan, len, uvc, charid, foldlen,
1579 foldbuf, uniflags);
786e8c11
YO
1580 DEBUG_TRIE_EXECUTE_r({
1581 dump_exec_pos( (char *)uc, c, strend, real_start,
1582 s, do_utf8 );
07be1b83 1583 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1584 " Charid:%3u CP:%4"UVxf" ",
1585 charid, uvc);
1586 });
07be1b83
YO
1587
1588 do {
6148ee25 1589#ifdef DEBUGGING
786e8c11 1590 word = aho->states[ state ].wordnum;
6148ee25 1591#endif
07be1b83
YO
1592 base = aho->states[ state ].trans.base;
1593
786e8c11
YO
1594 DEBUG_TRIE_EXECUTE_r({
1595 if (failed)
1596 dump_exec_pos( (char *)uc, c, strend, real_start,
1597 s, do_utf8 );
07be1b83 1598 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1599 "%sState: %4"UVxf", word=%"UVxf,
1600 failed ? " Fail transition to " : "",
1601 (UV)state, (UV)word);
1602 });
07be1b83
YO
1603 if ( base ) {
1604 U32 tmp;
1605 if (charid &&
1606 (base + charid > trie->uniquecharcount )
1607 && (base + charid - 1 - trie->uniquecharcount
1608 < trie->lasttrans)
1609 && trie->trans[base + charid - 1 -
1610 trie->uniquecharcount].check == state
1611 && (tmp=trie->trans[base + charid - 1 -
1612 trie->uniquecharcount ].next))
1613 {
786e8c11
YO
1614 DEBUG_TRIE_EXECUTE_r(
1615 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1616 state = tmp;
1617 break;
1618 }
1619 else {
786e8c11
YO
1620 DEBUG_TRIE_EXECUTE_r(
1621 PerlIO_printf( Perl_debug_log," - fail\n"));
1622 failed = 1;
1623 state = aho->fail[state];
07be1b83
YO
1624 }
1625 }
1626 else {
1627 /* we must be accepting here */
786e8c11
YO
1628 DEBUG_TRIE_EXECUTE_r(
1629 PerlIO_printf( Perl_debug_log," - accepting\n"));
1630 failed = 1;
07be1b83
YO
1631 break;
1632 }
1633 } while(state);
786e8c11 1634 uc += len;
07be1b83
YO
1635 if (failed) {
1636 if (leftmost)
1637 break;
786e8c11 1638 if (!state) state = 1;
07be1b83
YO
1639 }
1640 }
1641 if ( aho->states[ state ].wordnum ) {
1642 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1643 if (!leftmost || lpos < leftmost) {
1644 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1645 leftmost = lpos;
786e8c11 1646 }
07be1b83 1647 }
07be1b83
YO
1648 if (leftmost) {
1649 s = (char*)leftmost;
786e8c11
YO
1650 DEBUG_TRIE_EXECUTE_r({
1651 PerlIO_printf(
70685ca0
JH
1652 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1653 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1654 );
1655 });
24b23f37 1656 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1657 FREETMPS;
1658 LEAVE;
07be1b83 1659 goto got_it;
be8e71aa 1660 }
07be1b83 1661 s = HOPc(s,1);
786e8c11
YO
1662 DEBUG_TRIE_EXECUTE_r({
1663 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1664 });
07be1b83 1665 } else {
786e8c11
YO
1666 DEBUG_TRIE_EXECUTE_r(
1667 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1668 break;
1669 }
1670 }
be8e71aa
YO
1671 FREETMPS;
1672 LEAVE;
07be1b83
YO
1673 }
1674 break;
b3c9acc1 1675 default:
3c3eec57
GS
1676 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1677 break;
d6a28714 1678 }
6eb5f6b9
JH
1679 return 0;
1680 got_it:
1681 return s;
1682}
1683
e6aad7ab 1684static void
fae667d5 1685S_swap_match_buff (pTHX_ regexp *prog) {
f0ab9afb 1686 regexp_paren_pair *t;
fae667d5 1687
28d8d7f4 1688 if (!prog->swap) {
fae667d5
YO
1689 /* We have to be careful. If the previous successful match
1690 was from this regex we don't want a subsequent paritally
1691 successful match to clobber the old results.
1692 So when we detect this possibility we add a swap buffer
1693 to the re, and switch the buffer each match. If we fail
1694 we switch it back, otherwise we leave it swapped.
1695 */
f0ab9afb 1696 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
fae667d5 1697 }
f0ab9afb
NC
1698 t = prog->swap;
1699 prog->swap = prog->offs;
1700 prog->offs = t;
fae667d5
YO
1701}
1702
1703
6eb5f6b9
JH
1704/*
1705 - regexec_flags - match a regexp against a string
1706 */
1707I32
49d7dfbc 1708Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
6eb5f6b9
JH
1709 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1710/* strend: pointer to null at end of string */
1711/* strbeg: real beginning of string */
1712/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1713/* data: May be used for some additional optimizations.
1714 Currently its only used, with a U32 cast, for transmitting
1715 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1716/* nosave: For optimizations. */
1717{
97aff369 1718 dVAR;
24b23f37 1719 /*register*/ char *s;
6eb5f6b9 1720 register regnode *c;
24b23f37 1721 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1722 I32 minlen; /* must match at least this many chars */
1723 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1724 I32 end_shift = 0; /* Same for the end. */ /* CC */
1725 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1726 char *scream_olds = NULL;
f9f4320a 1727 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1728 I32 multiline;
f8fc2ecf 1729 RXi_GET_DECL(prog,progi);
3b0527fe 1730 regmatch_info reginfo; /* create some info to pass to regtry etc */
fae667d5 1731 bool swap_on_fail = 0;
a3621e74
YO
1732
1733 GET_RE_DEBUG_FLAGS_DECL;
1734
9d4ba2ae 1735 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1736
1737 /* Be paranoid... */
1738 if (prog == NULL || startpos == NULL) {
1739 Perl_croak(aTHX_ "NULL regexp parameter");
1740 return 0;
1741 }
1742
bbe252da 1743 multiline = prog->extflags & RXf_PMf_MULTILINE;
3b0527fe 1744 reginfo.prog = prog;
2757e526 1745
bac06658 1746 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1747 DEBUG_EXECUTE_r(
1748 debug_start_match(prog, do_utf8, startpos, strend,
1749 "Matching");
1750 );
bac06658 1751
6eb5f6b9 1752 minlen = prog->minlen;
1de06328
YO
1753
1754 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1755 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1756 "String too short [regexec_flags]...\n"));
1757 goto phooey;
1aa99e6b 1758 }
6eb5f6b9 1759
1de06328 1760
6eb5f6b9 1761 /* Check validity of program. */
f8fc2ecf 1762 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1763 Perl_croak(aTHX_ "corrupted regexp program");
1764 }
1765
1766 PL_reg_flags = 0;
1767 PL_reg_eval_set = 0;
1768 PL_reg_maxiter = 0;
1769
bbe252da 1770 if (prog->extflags & RXf_UTF8)
6eb5f6b9
JH
1771 PL_reg_flags |= RF_utf8;
1772
1773 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1774 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1775 PL_bostr = strbeg;
3b0527fe 1776 reginfo.sv = sv;
6eb5f6b9
JH
1777
1778 /* Mark end of line for $ (and such) */
1779 PL_regeol = strend;
1780
1781 /* see how far we have to get to not match where we matched before */
3b0527fe 1782 reginfo.till = startpos+minend;
6eb5f6b9 1783
6eb5f6b9
JH
1784 /* If there is a "must appear" string, look for it. */
1785 s = startpos;
1786
bbe252da 1787 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1788 MAGIC *mg;
1789
1790 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1791 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1792 else if (sv && SvTYPE(sv) >= SVt_PVMG
1793 && SvMAGIC(sv)
14befaf4
DM
1794 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1795 && mg->mg_len >= 0) {
3b0527fe 1796 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1797 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1798 if (s > reginfo.ganch)
6eb5f6b9 1799 goto phooey;
58e23c8d 1800 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1801 }
1802 }
58e23c8d 1803 else if (data) {
70685ca0 1804 reginfo.ganch = strbeg + PTR2UV(data);
58e23c8d 1805 } else /* pos() not defined */
3b0527fe 1806 reginfo.ganch = strbeg;
6eb5f6b9 1807 }
c74340f9 1808 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
fae667d5
YO
1809 swap_on_fail = 1;
1810 swap_match_buff(prog); /* do we need a save destructor here for
1811 eval dies? */
c74340f9 1812 }
a0714e2c 1813 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1814 re_scream_pos_data d;
1815
1816 d.scream_olds = &scream_olds;
1817 d.scream_pos = &scream_pos;
1818 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1819 if (!s) {
a3621e74 1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1821 goto phooey; /* not present */
3fa9c3d7 1822 }
6eb5f6b9
JH
1823 }
1824
1de06328 1825
6eb5f6b9
JH
1826
1827 /* Simplest case: anchored match need be tried only once. */
1828 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1829 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1830 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1831 goto got_it;
bbe252da
YO
1832 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1833 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1834 {
1835 char *end;
1836
1837 if (minlen)
1838 dontbother = minlen - 1;
1aa99e6b 1839 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1840 /* for multiline we only have to try after newlines */
33b8afdf 1841 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1842 if (s == startpos)
1843 goto after_try;
1844 while (1) {
24b23f37 1845 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1846 goto got_it;
1847 after_try:
1848 if (s >= end)
1849 goto phooey;
bbe252da 1850 if (prog->extflags & RXf_USE_INTUIT) {
6eb5f6b9
JH
1851 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1852 if (!s)
1853 goto phooey;
1854 }
1855 else
1856 s++;
1857 }
1858 } else {
1859 if (s > startpos)
1860 s--;
1861 while (s < end) {
1862 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1863 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1864 goto got_it;
1865 }
1866 }
1867 }
1868 }
1869 goto phooey;
bbe252da 1870 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1871 {
1872 /* the warning about reginfo.ganch being used without intialization
bbe252da 1873 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1874 and we only enter this block when the same bit is set. */
58e23c8d
YO
1875 char *tmp_s = reginfo.ganch - prog->gofs;
1876 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1877 goto got_it;
1878 goto phooey;
1879 }
1880
1881 /* Messy cases: unanchored match. */
bbe252da 1882 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1883 /* we have /x+whatever/ */
1884 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1885 char ch;
bf93d4cc
GS
1886#ifdef DEBUGGING
1887 int did_match = 0;
1888#endif
33b8afdf
JH
1889 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1890 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1891 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1892
1aa99e6b 1893 if (do_utf8) {
4cadc6a9 1894 REXEC_FBC_SCAN(
6eb5f6b9 1895 if (*s == ch) {
a3621e74 1896 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1897 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1898 s += UTF8SKIP(s);
1899 while (s < strend && *s == ch)
1900 s += UTF8SKIP(s);
1901 }
4cadc6a9 1902 );
6eb5f6b9
JH
1903 }
1904 else {
4cadc6a9 1905 REXEC_FBC_SCAN(
6eb5f6b9 1906 if (*s == ch) {
a3621e74 1907 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1908 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1909 s++;
1910 while (s < strend && *s == ch)
1911 s++;
1912 }
4cadc6a9 1913 );
6eb5f6b9 1914 }
a3621e74 1915 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1916 PerlIO_printf(Perl_debug_log,
b7953727
JH
1917 "Did not find anchored character...\n")
1918 );
6eb5f6b9 1919 }
a0714e2c
SS
1920 else if (prog->anchored_substr != NULL
1921 || prog->anchored_utf8 != NULL
1922 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1923 && prog->float_max_offset < strend - s)) {
1924 SV *must;
1925 I32 back_max;
1926 I32 back_min;
1927 char *last;
6eb5f6b9 1928 char *last1; /* Last position checked before */
bf93d4cc
GS
1929#ifdef DEBUGGING
1930 int did_match = 0;
1931#endif
33b8afdf
JH
1932 if (prog->anchored_substr || prog->anchored_utf8) {
1933 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1934 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1935 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1936 back_max = back_min = prog->anchored_offset;
1937 } else {
1938 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1939 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1940 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1941 back_max = prog->float_max_offset;
1942 back_min = prog->float_min_offset;
1943 }
1de06328
YO
1944
1945
33b8afdf
JH
1946 if (must == &PL_sv_undef)
1947 /* could not downgrade utf8 check substring, so must fail */
1948 goto phooey;
1949
1de06328
YO
1950 if (back_min<0) {
1951 last = strend;
1952 } else {
1953 last = HOP3c(strend, /* Cannot start after this */
1954 -(I32)(CHR_SVLEN(must)
1955 - (SvTAIL(must) != 0) + back_min), strbeg);
1956 }
6eb5f6b9
JH
1957 if (s > PL_bostr)
1958 last1 = HOPc(s, -1);
1959 else
1960 last1 = s - 1; /* bogus */
1961
a0288114 1962 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1963 check_substr==must. */
1964 scream_pos = -1;
1965 dontbother = end_shift;
1966 strend = HOPc(strend, -dontbother);
1967 while ( (s <= last) &&
9041c2e3 1968 ((flags & REXEC_SCREAM)
1de06328 1969 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1970 end_shift, &scream_pos, 0))
1de06328 1971 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1972 (unsigned char*)strend, must,
7fba1cd6 1973 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1974 /* we may be pointing at the wrong string */
1975 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1976 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1977 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1978 if (HOPc(s, -back_max) > last1) {
1979 last1 = HOPc(s, -back_min);
1980 s = HOPc(s, -back_max);
1981 }
1982 else {
52657f30 1983 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1984
1985 last1 = HOPc(s, -back_min);
52657f30 1986 s = t;
6eb5f6b9 1987 }
1aa99e6b 1988 if (do_utf8) {
6eb5f6b9 1989 while (s <= last1) {
24b23f37 1990 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1991 goto got_it;
1992 s += UTF8SKIP(s);
1993 }
1994 }
1995 else {
1996 while (s <= last1) {
24b23f37 1997 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1998 goto got_it;
1999 s++;
2000 }
2001 }
2002 }
ab3bbdeb
YO
2003 DEBUG_EXECUTE_r(if (!did_match) {
2004 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2005 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2006 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2007 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2008 ? "anchored" : "floating"),
ab3bbdeb
YO
2009 quoted, RE_SV_TAIL(must));
2010 });
6eb5f6b9
JH
2011 goto phooey;
2012 }
f8fc2ecf 2013 else if ( (c = progi->regstclass) ) {
f14c76ed 2014 if (minlen) {
f8fc2ecf 2015 const OPCODE op = OP(progi->regstclass);
66e933ab 2016 /* don't bother with what can't match */
786e8c11 2017 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2018 strend = HOPc(strend, -(minlen - 1));
2019 }
a3621e74 2020 DEBUG_EXECUTE_r({
be8e71aa 2021 SV * const prop = sv_newmortal();
32fc9b6a 2022 regprop(prog, prop, c);
0df25f3d 2023 {
02daf0ab 2024 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2025 s,strend-s,60);
0df25f3d 2026 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2027 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2028 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2029 quoted, (int)(strend - s));
0df25f3d 2030 }
ffc61ed2 2031 });
3b0527fe 2032 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2033 goto got_it;
07be1b83 2034 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2035 }
2036 else {
2037 dontbother = 0;
a0714e2c 2038 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2039 /* Trim the end. */
d6a28714 2040 char *last;
33b8afdf
JH
2041 SV* float_real;
2042
2043 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2044 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2045 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2046
2047 if (flags & REXEC_SCREAM) {
33b8afdf 2048 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2049 end_shift, &scream_pos, 1); /* last one */
2050 if (!last)
ffc61ed2 2051 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2052 /* we may be pointing at the wrong string */
2053 else if (RX_MATCH_COPIED(prog))
3f7c398e 2054 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2055 }
d6a28714
JH
2056 else {
2057 STRLEN len;
cfd0369c 2058 const char * const little = SvPV_const(float_real, len);
d6a28714 2059
33b8afdf 2060 if (SvTAIL(float_real)) {
d6a28714
JH
2061 if (memEQ(strend - len + 1, little, len - 1))
2062 last = strend - len + 1;
7fba1cd6 2063 else if (!multiline)
9041c2e3 2064 last = memEQ(strend - len, little, len)
bd61b366 2065 ? strend - len : NULL;
b8c5462f 2066 else
d6a28714
JH
2067 goto find_last;
2068 } else {
2069 find_last:
9041c2e3 2070 if (len)
d6a28714 2071 last = rninstr(s, strend, little, little + len);
b8c5462f 2072 else
a0288114 2073 last = strend; /* matching "$" */
b8c5462f 2074 }
b8c5462f 2075 }
bf93d4cc 2076 if (last == NULL) {
6bda09f9
YO
2077 DEBUG_EXECUTE_r(
2078 PerlIO_printf(Perl_debug_log,
2079 "%sCan't trim the tail, match fails (should not happen)%s\n",
2080 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2081 goto phooey; /* Should not happen! */
2082 }
d6a28714
JH
2083 dontbother = strend - last + prog->float_min_offset;
2084 }
2085 if (minlen && (dontbother < minlen))
2086 dontbother = minlen - 1;
2087 strend -= dontbother; /* this one's always in bytes! */
2088 /* We don't know much -- general case. */
1aa99e6b 2089 if (do_utf8) {
d6a28714 2090 for (;;) {
24b23f37 2091 if (regtry(&reginfo, &s))
d6a28714
JH
2092 goto got_it;
2093 if (s >= strend)
2094 break;
b8c5462f 2095 s += UTF8SKIP(s);
d6a28714
JH
2096 };
2097 }
2098 else {
2099 do {
24b23f37 2100 if (regtry(&reginfo, &s))
d6a28714
JH
2101 goto got_it;
2102 } while (s++ < strend);
2103 }
2104 }
2105
2106 /* Failure. */
2107 goto phooey;
2108
2109got_it:
2110 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2111
19b95bf0 2112 if (PL_reg_eval_set)
4f639d21 2113 restore_pos(aTHX_ prog);
81714fb9
YO
2114 if (prog->paren_names)
2115 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2116
2117 /* make sure $`, $&, $', and $digit will work later */
2118 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2119 RX_MATCH_COPY_FREE(prog);
d6a28714 2120 if (flags & REXEC_COPY_STR) {
be8e71aa 2121 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2122#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2123 if ((SvIsCOW(sv)
2124 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2125 if (DEBUG_C_TEST) {
2126 PerlIO_printf(Perl_debug_log,
2127 "Copy on write: regexp capture, type %d\n",
2128 (int) SvTYPE(sv));
2129 }
2130 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2131 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2132 assert (SvPOKp(prog->saved_copy));
2133 } else
2134#endif
2135 {
2136 RX_MATCH_COPIED_on(prog);
2137 s = savepvn(strbeg, i);
2138 prog->subbeg = s;
2139 }
d6a28714 2140 prog->sublen = i;
d6a28714
JH
2141 }
2142 else {
2143 prog->subbeg = strbeg;
2144 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2145 }
2146 }
9041c2e3 2147
d6a28714
JH
2148 return 1;
2149
2150phooey:
a3621e74 2151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2152 PL_colors[4], PL_colors[5]));
d6a28714 2153 if (PL_reg_eval_set)
4f639d21 2154 restore_pos(aTHX_ prog);
fae667d5 2155 if (swap_on_fail)
c74340f9 2156 /* we failed :-( roll it back */
fae667d5
YO
2157 swap_match_buff(prog);
2158
d6a28714
JH
2159 return 0;
2160}
2161
6bda09f9 2162
d6a28714
JH
2163/*
2164 - regtry - try match at specific point
2165 */
2166STATIC I32 /* 0 failure, 1 success */
24b23f37 2167S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2168{
97aff369 2169 dVAR;
d6a28714 2170 CHECKPOINT lastcp;
3b0527fe 2171 regexp *prog = reginfo->prog;
f8fc2ecf 2172 RXi_GET_DECL(prog,progi);
a3621e74 2173 GET_RE_DEBUG_FLAGS_DECL;
24b23f37 2174 reginfo->cutpoint=NULL;
d6a28714 2175
bbe252da 2176 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2177 MAGIC *mg;
2178
2179 PL_reg_eval_set = RS_init;
a3621e74 2180 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2181 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2182 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2183 ));
ea8d6ae1 2184 SAVESTACK_CXPOS();
d6a28714
JH
2185 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2186 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2187 SAVETMPS;
2188 /* Apparently this is not needed, judging by wantarray. */
e8347627 2189 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2190 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2191
3b0527fe 2192 if (reginfo->sv) {
d6a28714 2193 /* Make $_ available to executed code. */
3b0527fe 2194 if (reginfo->sv != DEFSV) {
59f00321 2195 SAVE_DEFSV;
3b0527fe 2196 DEFSV = reginfo->sv;
b8c5462f 2197 }
d6a28714 2198
3b0527fe
DM
2199 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2200 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2201 /* prepare for quick setting of pos */
d300d9fa 2202#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2203 if (SvIsCOW(reginfo->sv))
2204 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2205#endif
3dab1dad 2206 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2207 &PL_vtbl_mglob, NULL, 0);
d6a28714 2208 mg->mg_len = -1;
b8c5462f 2209 }
d6a28714
JH
2210 PL_reg_magic = mg;
2211 PL_reg_oldpos = mg->mg_len;
4f639d21 2212 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2213 }
09687e5a 2214 if (!PL_reg_curpm) {
a02a5408 2215 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2216#ifdef USE_ITHREADS
2217 {
be8e71aa 2218 SV* const repointer = newSViv(0);
577e12cc 2219 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2220 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2221 av_push(PL_regex_padav,repointer);
2222 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2223 PL_regex_pad = AvARRAY(PL_regex_padav);
2224 }
2225#endif
2226 }
aaa362c4 2227 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2228 PL_reg_oldcurpm = PL_curpm;
2229 PL_curpm = PL_reg_curpm;
2230 if (RX_MATCH_COPIED(prog)) {
2231 /* Here is a serious problem: we cannot rewrite subbeg,
2232 since it may be needed if this match fails. Thus
2233 $` inside (?{}) could fail... */
2234 PL_reg_oldsaved = prog->subbeg;
2235 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2236#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2237 PL_nrs = prog->saved_copy;
2238#endif
d6a28714
JH
2239 RX_MATCH_COPIED_off(prog);
2240 }
2241 else
bd61b366 2242 PL_reg_oldsaved = NULL;
d6a28714
JH
2243 prog->subbeg = PL_bostr;
2244 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2245 }
24b23f37 2246 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2247 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2248 PL_reginput = *startpos;
d6a28714 2249 PL_reglastparen = &prog->lastparen;
a01268b5 2250 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2251 prog->lastparen = 0;
03994de8 2252 prog->lastcloseparen = 0;
d6a28714 2253 PL_regsize = 0;
f0ab9afb 2254 PL_regoffs = prog->offs;
d6a28714
JH
2255 if (PL_reg_start_tmpl <= prog->nparens) {
2256 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2257 if(PL_reg_start_tmp)
2258 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2259 else
a02a5408 2260 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2261 }
2262
2263 /* XXXX What this code is doing here?!!! There should be no need
2264 to do this again and again, PL_reglastparen should take care of
3dd2943c 2265 this! --ilya*/
dafc8851
JH
2266
2267 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2268 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2269 * PL_reglastparen), is not needed at all by the test suite
2270 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2271 * enough, for building DynaLoader, or otherwise this
2272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2273 * will happen. Meanwhile, this code *is* needed for the
2274 * above-mentioned test suite tests to succeed. The common theme
2275 * on those tests seems to be returning null fields from matches.
2276 * --jhi */
dafc8851 2277#if 1
d6a28714 2278 if (prog->nparens) {
f0ab9afb 2279 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2280 register I32 i;
eb160463 2281 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2282 ++pp;
2283 pp->start = -1;
2284 pp->end = -1;
d6a28714
JH
2285 }
2286 }
dafc8851 2287#endif
02db2b7b 2288 REGCP_SET(lastcp);
f8fc2ecf 2289 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2290 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2291 return 1;
2292 }
24b23f37
YO
2293 if (reginfo->cutpoint)
2294 *startpos= reginfo->cutpoint;
02db2b7b 2295 REGCP_UNWIND(lastcp);
d6a28714
JH
2296 return 0;
2297}
2298
02db2b7b 2299
8ba1375e
MJD
2300#define sayYES goto yes
2301#define sayNO goto no
262b90c4 2302#define sayNO_SILENT goto no_silent
8ba1375e 2303
f9f4320a
YO
2304/* we dont use STMT_START/END here because it leads to
2305 "unreachable code" warnings, which are bogus, but distracting. */
2306#define CACHEsayNO \
c476f425
DM
2307 if (ST.cache_mask) \
2308 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2309 sayNO
3298f257 2310
a3621e74 2311/* this is used to determine how far from the left messages like
265c4333
YO
2312 'failed...' are printed. It should be set such that messages
2313 are inline with the regop output that created them.
a3621e74 2314*/
265c4333 2315#define REPORT_CODE_OFF 32
a3621e74
YO
2316
2317
2318/* Make sure there is a test for this +1 options in re_tests */
2319#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2320
40a82448
DM
2321#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2322#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2323
86545054
DM
2324#define SLAB_FIRST(s) (&(s)->states[0])
2325#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2326
5d9a96ca
DM
2327/* grab a new slab and return the first slot in it */
2328
2329STATIC regmatch_state *
2330S_push_slab(pTHX)
2331{
a35a87e7 2332#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2333 dMY_CXT;
2334#endif
5d9a96ca
DM
2335 regmatch_slab *s = PL_regmatch_slab->next;
2336 if (!s) {
2337 Newx(s, 1, regmatch_slab);
2338 s->prev = PL_regmatch_slab;
2339 s->next = NULL;
2340 PL_regmatch_slab->next = s;
2341 }
2342 PL_regmatch_slab = s;
86545054 2343 return SLAB_FIRST(s);
5d9a96ca 2344}
5b47454d 2345
95b24440 2346
40a82448
DM
2347/* push a new state then goto it */
2348
2349#define PUSH_STATE_GOTO(state, node) \
2350 scan = node; \
2351 st->resume_state = state; \
2352 goto push_state;
2353
2354/* push a new state with success backtracking, then goto it */
2355
2356#define PUSH_YES_STATE_GOTO(state, node) \
2357 scan = node; \
2358 st->resume_state = state; \
2359 goto push_yes_state;
2360
aa283a38 2361
aa283a38 2362
d6a28714 2363/*
95b24440 2364
bf1f174e
DM
2365regmatch() - main matching routine
2366
2367This is basically one big switch statement in a loop. We execute an op,
2368set 'next' to point the next op, and continue. If we come to a point which
2369we may need to backtrack to on failure such as (A|B|C), we push a
2370backtrack state onto the backtrack stack. On failure, we pop the top
2371state, and re-enter the loop at the state indicated. If there are no more
2372states to pop, we return failure.
2373
2374Sometimes we also need to backtrack on success; for example /A+/, where
2375after successfully matching one A, we need to go back and try to
2376match another one; similarly for lookahead assertions: if the assertion
2377completes successfully, we backtrack to the state just before the assertion
2378and then carry on. In these cases, the pushed state is marked as
2379'backtrack on success too'. This marking is in fact done by a chain of
2380pointers, each pointing to the previous 'yes' state. On success, we pop to
2381the nearest yes state, discarding any intermediate failure-only states.
2382Sometimes a yes state is pushed just to force some cleanup code to be
2383called at the end of a successful match or submatch; e.g. (??{$re}) uses
2384it to free the inner regex.
2385
2386Note that failure backtracking rewinds the cursor position, while
2387success backtracking leaves it alone.
2388
2389A pattern is complete when the END op is executed, while a subpattern
2390such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2391ops trigger the "pop to last yes state if any, otherwise return true"
2392behaviour.
2393
2394A common convention in this function is to use A and B to refer to the two
2395subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2396the subpattern to be matched possibly multiple times, while B is the entire
2397rest of the pattern. Variable and state names reflect this convention.
2398
2399The states in the main switch are the union of ops and failure/success of
2400substates associated with with that op. For example, IFMATCH is the op
2401that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2402'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2403successfully matched A and IFMATCH_A_fail is a state saying that we have
2404just failed to match A. Resume states always come in pairs. The backtrack
2405state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2406at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2407on success or failure.
2408
2409The struct that holds a backtracking state is actually a big union, with
2410one variant for each major type of op. The variable st points to the
2411top-most backtrack struct. To make the code clearer, within each
2412block of code we #define ST to alias the relevant union.
2413
2414Here's a concrete example of a (vastly oversimplified) IFMATCH
2415implementation:
2416
2417 switch (state) {
2418 ....
2419
2420#define ST st->u.ifmatch
2421
2422 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2423 ST.foo = ...; // some state we wish to save
95b24440 2424 ...
bf1f174e
DM
2425 // push a yes backtrack state with a resume value of
2426 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2427 // first node of A:
2428 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2429 // NOTREACHED
2430
2431 case IFMATCH_A: // we have successfully executed A; now continue with B
2432 next = B;
2433 bar = ST.foo; // do something with the preserved value
2434 break;
2435
2436 case IFMATCH_A_fail: // A failed, so the assertion failed
2437 ...; // do some housekeeping, then ...
2438 sayNO; // propagate the failure
2439
2440#undef ST
95b24440 2441
bf1f174e
DM
2442 ...
2443 }
95b24440 2444
bf1f174e
DM
2445For any old-timers reading this who are familiar with the old recursive
2446approach, the code above is equivalent to:
95b24440 2447
bf1f174e
DM
2448 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2449 {
2450 int foo = ...
95b24440 2451 ...
bf1f174e
DM
2452 if (regmatch(A)) {
2453 next = B;
2454 bar = foo;
2455 break;
95b24440 2456 }
bf1f174e
DM
2457 ...; // do some housekeeping, then ...
2458 sayNO; // propagate the failure
95b24440 2459 }
bf1f174e
DM
2460
2461The topmost backtrack state, pointed to by st, is usually free. If you
2462want to claim it, populate any ST.foo fields in it with values you wish to
2463save, then do one of
2464
2465 PUSH_STATE_GOTO(resume_state, node);
2466 PUSH_YES_STATE_GOTO(resume_state, node);
2467
2468which sets that backtrack state's resume value to 'resume_state', pushes a
2469new free entry to the top of the backtrack stack, then goes to 'node'.
2470On backtracking, the free slot is popped, and the saved state becomes the
2471new free state. An ST.foo field in this new top state can be temporarily
2472accessed to retrieve values, but once the main loop is re-entered, it
2473becomes available for reuse.
2474
2475Note that the depth of the backtrack stack constantly increases during the
2476left-to-right execution of the pattern, rather than going up and down with
2477the pattern nesting. For example the stack is at its maximum at Z at the
2478end of the pattern, rather than at X in the following:
2479
2480 /(((X)+)+)+....(Y)+....Z/
2481
2482The only exceptions to this are lookahead/behind assertions and the cut,
2483(?>A), which pop all the backtrack states associated with A before
2484continuing.
2485
2486Bascktrack state structs are allocated in slabs of about 4K in size.
2487PL_regmatch_state and st always point to the currently active state,
2488and PL_regmatch_slab points to the slab currently containing
2489PL_regmatch_state. The first time regmatch() is called, the first slab is
2490allocated, and is never freed until interpreter destruction. When the slab
2491is full, a new one is allocated and chained to the end. At exit from
2492regmatch(), slabs allocated since entry are freed.
2493
2494*/
95b24440 2495
40a82448 2496
5bc10b2c 2497#define DEBUG_STATE_pp(pp) \
265c4333 2498 DEBUG_STATE_r({ \
5bc10b2c
DM
2499 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2500 PerlIO_printf(Perl_debug_log, \
5d458dd8 2501 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2502 depth*2, "", \
13d6edb4 2503 PL_reg_name[st->resume_state], \
5d458dd8
YO
2504 ((st==yes_state||st==mark_state) ? "[" : ""), \
2505 ((st==yes_state) ? "Y" : ""), \
2506 ((st==mark_state) ? "M" : ""), \
2507 ((st==yes_state||st==mark_state) ? "]" : "") \
2508 ); \
265c4333 2509 });
5bc10b2c 2510
40a82448 2511
3dab1dad 2512#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2513
3df15adc 2514#ifdef DEBUGGING
5bc10b2c 2515
ab3bbdeb
YO
2516STATIC void
2517S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2518 const char *start, const char *end, const char *blurb)
2519{
bbe252da 2520 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
ab3bbdeb
YO
2521 if (!PL_colorset)
2522 reginitcolors();
2523 {
2524 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2525 prog->precomp, prog->prelen, 60);
2526
2527 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2528 start, end - start, 60);
2529
2530 PerlIO_printf(Perl_debug_log,
2531 "%s%s REx%s %s against %s\n",
2532 PL_colors[4], blurb, PL_colors[5], s0, s1);
2533
2534 if (do_utf8||utf8_pat)
1de06328
YO
2535 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2536 utf8_pat ? "pattern" : "",
2537 utf8_pat && do_utf8 ? " and " : "",
2538 do_utf8 ? "string" : ""
ab3bbdeb
YO
2539 );
2540 }
2541}
3df15adc
YO
2542
2543STATIC void
786e8c11
YO
2544S_dump_exec_pos(pTHX_ const char *locinput,
2545 const regnode *scan,
2546 const char *loc_regeol,
2547 const char *loc_bostr,
2548 const char *loc_reg_starttry,
2549 const bool do_utf8)
07be1b83 2550{
786e8c11 2551 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2552 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2553 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2554 /* The part of the string before starttry has one color
2555 (pref0_len chars), between starttry and current
2556 position another one (pref_len - pref0_len chars),
2557 after the current position the third one.
2558 We assume that pref0_len <= pref_len, otherwise we
2559 decrease pref0_len. */
786e8c11
YO
2560 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2561 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2562 int pref0_len;
2563
2564 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2565 pref_len++;
786e8c11
YO
2566 pref0_len = pref_len - (locinput - loc_reg_starttry);
2567 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2568 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2569 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2570 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2571 l--;
2572 if (pref0_len < 0)
2573 pref0_len = 0;
2574 if (pref0_len > pref_len)
2575 pref0_len = pref_len;
2576 {
3df15adc 2577 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2578
ab3bbdeb 2579 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2580 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2581
ab3bbdeb 2582 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2583 (locinput - pref_len + pref0_len),
1de06328 2584 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2585
ab3bbdeb 2586 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2587 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2588
1de06328 2589 const STRLEN tlen=len0+len1+len2;
3df15adc 2590 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2591 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2592 (IV)(locinput - loc_bostr),
07be1b83 2593 len0, s0,
07be1b83 2594 len1, s1,
07be1b83 2595 (docolor ? "" : "> <"),
07be1b83 2596 len2, s2,
f9f4320a 2597 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2598 "");
2599 }
2600}
3df15adc 2601
07be1b83
YO
2602#endif
2603
0a4db386
YO
2604/* reg_check_named_buff_matched()
2605 * Checks to see if a named buffer has matched. The data array of
2606 * buffer numbers corresponding to the buffer is expected to reside
2607 * in the regexp->data->data array in the slot stored in the ARG() of
2608 * node involved. Note that this routine doesn't actually care about the
2609 * name, that information is not preserved from compilation to execution.
2610 * Returns the index of the leftmost defined buffer with the given name
2611 * or 0 if non of the buffers matched.
2612 */
2613STATIC I32
2614S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2615 I32 n;
f8fc2ecf
YO
2616 RXi_GET_DECL(rex,rexi);
2617 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
0a4db386
YO
2618 I32 *nums=(I32*)SvPVX(sv_dat);
2619 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2620 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2621 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2622 {
2623 return nums[n];
2624 }
2625 }
2626 return 0;
2627}
2628
2f554ef7
DM
2629
2630/* free all slabs above current one - called during LEAVE_SCOPE */
2631
2632STATIC void
2633S_clear_backtrack_stack(pTHX_ void *p)
2634{
2635 regmatch_slab *s = PL_regmatch_slab->next;
2636 PERL_UNUSED_ARG(p);
2637
2638 if (!s)
2639 return;
2640 PL_regmatch_slab->next = NULL;
2641 while (s) {
2642 regmatch_slab * const osl = s;
2643 s = s->next;
2644 Safefree(osl);
2645 }
2646}
2647
2648
28d8d7f4
YO
2649#define SETREX(Re1,Re2) \
2650 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2651 Re1 = (Re2)
2652
d6a28714 2653STATIC I32 /* 0 failure, 1 success */
24b23f37 2654S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2655{
a35a87e7 2656#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2657 dMY_CXT;
2658#endif
27da23d5 2659 dVAR;
95b24440 2660 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2661 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2662
3b0527fe 2663 regexp *rex = reginfo->prog;
f8fc2ecf
YO
2664 RXi_GET_DECL(rex,rexi);
2665
2f554ef7 2666 I32 oldsave;
a3621e74 2667
5d9a96ca
DM
2668 /* the current state. This is a cached copy of PL_regmatch_state */
2669 register regmatch_state *st;
95b24440 2670
5d9a96ca
DM
2671 /* cache heavy used fields of st in registers */
2672 register regnode *scan;
2673 register regnode *next;
438e9bae 2674 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2675 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2676 register char *locinput = PL_reginput;
5d9a96ca 2677 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2678
b69b0499 2679 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2680 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2681 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2682 const U32 max_nochange_depth =
2683 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2684 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2685
77cb431f
DM
2686 regmatch_state *yes_state = NULL; /* state to pop to on success of
2687 subpattern */
e2e6a0f1
YO
2688 /* mark_state piggy backs on the yes_state logic so that when we unwind
2689 the stack on success we can update the mark_state as we go */
2690 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5d458dd8 2691
faec1544 2692 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2693 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2694 U32 state_num;
5d458dd8
YO
2695 bool no_final = 0; /* prevent failure from backtracking? */
2696 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2697 char *startpoint = PL_reginput;
5d458dd8
YO
2698 SV *popmark = NULL; /* are we looking for a mark? */
2699 SV *sv_commit = NULL; /* last mark name seen in failure */
2700 SV *sv_yes_mark = NULL; /* last mark name we have seen
2701 during a successfull match */
2702 U32 lastopen = 0; /* last open we saw */
2703 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0
DM
2704
2705 SV* const oreplsv = GvSV(PL_replgv);
58e23c8d 2706
5d458dd8 2707
24d3c4a9
DM
2708 /* these three flags are set by various ops to signal information to
2709 * the very next op. They have a useful lifetime of exactly one loop
2710 * iteration, and are not preserved or restored by state pushes/pops
2711 */
2712 bool sw = 0; /* the condition value in (?(cond)a|b) */
2713 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2714 int logical = 0; /* the following EVAL is:
2715 0: (?{...})
2716 1: (?(?{...})X|Y)
2717 2: (??{...})
2718 or the following IFMATCH/UNLESSM is:
2719 false: plain (?=foo)
2720 true: used as a condition: (?(?=foo))
2721 */
2722
95b24440 2723#ifdef DEBUGGING
e68ec53f 2724 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2725#endif
2726
3b57cd43 2727 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2728 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2729 }));
5d9a96ca
DM
2730 /* on first ever call to regmatch, allocate first slab */
2731 if (!PL_regmatch_slab) {
2732 Newx(PL_regmatch_slab, 1, regmatch_slab);
2733 PL_regmatch_slab->prev = NULL;
2734 PL_regmatch_slab->next = NULL;
86545054 2735 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2736 }
2737
2f554ef7
DM
2738 oldsave = PL_savestack_ix;
2739 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2740 SAVEVPTR(PL_regmatch_slab);
2741 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2742
2743 /* grab next free state slot */
2744 st = ++PL_regmatch_state;
86545054 2745 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2746 st = PL_regmatch_state = S_push_slab(aTHX);
2747
d6a28714
JH
2748 /* Note that nextchr is a byte even in UTF */
2749 nextchr = UCHARAT(locinput);
2750 scan = prog;
2751 while (scan != NULL) {
8ba1375e 2752
a3621e74 2753 DEBUG_EXECUTE_r( {
6136c704 2754 SV * const prop = sv_newmortal();
1de06328 2755 regnode *rnext=regnext(scan);
786e8c11 2756 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2757 regprop(rex, prop, scan);
07be1b83
YO
2758
2759 PerlIO_printf(Perl_debug_log,
2760 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2761 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2762 SvPVX_const(prop),
1de06328 2763 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2764 0 : (IV)(rnext - rexi->program));
2a782b5b 2765 });
d6a28714
JH
2766
2767 next = scan + NEXT_OFF(scan);
2768 if (next == scan)
2769 next = NULL;
40a82448 2770 state_num = OP(scan);
d6a28714 2771
40a82448
DM
2772 reenter_switch:
2773 switch (state_num) {
d6a28714 2774 case BOL:
7fba1cd6 2775 if (locinput == PL_bostr)
d6a28714 2776 {
3b0527fe 2777 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2778 break;
2779 }
d6a28714
JH
2780 sayNO;
2781 case MBOL:
12d33761
HS
2782 if (locinput == PL_bostr ||
2783 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2784 {
b8c5462f
JH
2785 break;
2786 }
d6a28714
JH
2787 sayNO;
2788 case SBOL:
c2a73568 2789 if (locinput == PL_bostr)
b8c5462f 2790 break;
d6a28714
JH
2791 sayNO;
2792 case GPOS:
3b0527fe 2793 if (locinput == reginfo->ganch)
d6a28714
JH
2794 break;
2795 sayNO;
ee9b8eae
YO
2796
2797 case KEEPS:
2798 /* update the startpoint */
f0ab9afb 2799 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2800 PL_reginput = locinput;
f0ab9afb 2801 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2802 PUSH_STATE_GOTO(KEEPS_next, next);
2803 /*NOT-REACHED*/
2804 case KEEPS_next_fail:
2805 /* rollback the start point change */
f0ab9afb 2806 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2807 sayNO_SILENT;
2808 /*NOT-REACHED*/
d6a28714 2809 case EOL:
d6a28714
JH
2810 goto seol;
2811 case MEOL:
d6a28714 2812 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2813 sayNO;
b8c5462f 2814 break;
d6a28714
JH
2815 case SEOL:
2816 seol:
2817 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2818 sayNO;
d6a28714 2819 if (PL_regeol - locinput > 1)
b8c5462f 2820 sayNO;
b8c5462f 2821 break;
d6a28714
JH
2822 case EOS:
2823 if (PL_regeol != locinput)
b8c5462f 2824 sayNO;
d6a28714 2825 break;
ffc61ed2 2826 case SANY:
d6a28714 2827 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2828 sayNO;
f33976b4
DB
2829 if (do_utf8) {
2830 locinput += PL_utf8skip[nextchr];
2831 if (locinput > PL_regeol)
2832 sayNO;
2833 nextchr = UCHARAT(locinput);
2834 }
2835 else
2836 nextchr = UCHARAT(++locinput);
2837 break;
2838 case CANY:
2839 if (!nextchr && locinput >= PL_regeol)
2840 sayNO;
b8c5462f 2841 nextchr = UCHARAT(++locinput);
a0d0e21e 2842 break;
ffc61ed2 2843 case REG_ANY:
1aa99e6b
IH
2844 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2845 sayNO;
2846 if (do_utf8) {
b8c5462f 2847 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2848 if (locinput > PL_regeol)
2849 sayNO;
a0ed51b3 2850 nextchr = UCHARAT(locinput);
a0ed51b3 2851 }
1aa99e6b
IH
2852 else
2853 nextchr = UCHARAT(++locinput);
a0ed51b3 2854 break;
166ba7cd
DM
2855
2856#undef ST
2857#define ST st->u.trie
786e8c11
YO
2858 case TRIEC:
2859 /* In this case the charclass data is available inline so
2860 we can fail fast without a lot of extra overhead.
2861 */
2862 if (scan->flags == EXACT || !do_utf8) {
2863 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2864 DEBUG_EXECUTE_r(
2865 PerlIO_printf(Perl_debug_log,
2866 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2867 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2868 );
2869 sayNO_SILENT;
2870 /* NOTREACHED */
2871 }
2872 }
2873 /* FALL THROUGH */
5b47454d 2874 case TRIE:
3dab1dad 2875 {
07be1b83 2876 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2877 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2878 trie_type = do_utf8 ?
2879 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2880 : trie_plain;
2881
2882 /* what trie are we using right now */
be8e71aa 2883 reg_trie_data * const trie
f8fc2ecf 2884 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
55eed653 2885 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
3dab1dad 2886 U32 state = trie->startstate;
166ba7cd 2887
3dab1dad
YO
2888 if (trie->bitmap && trie_type != trie_utf8_fold &&
2889 !TRIE_BITMAP_TEST(trie,*locinput)
2890 ) {
2891 if (trie->states[ state ].wordnum) {
2892 DEBUG_EXECUTE_r(
2893 PerlIO_printf(Perl_debug_log,
2894 "%*s %smatched empty string...%s\n",
5bc10b2c 2895 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2896 );
2897 break;
2898 } else {
2899 DEBUG_EXECUTE_r(
2900 PerlIO_printf(Perl_debug_log,
786e8c11 2901 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2902 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2903 );
2904 sayNO_SILENT;
2905 }
2906 }
166ba7cd 2907
786e8c11
YO
2908 {
2909 U8 *uc = ( U8* )locinput;
2910
2911 STRLEN len = 0;
2912 STRLEN foldlen = 0;
2913 U8 *uscan = (U8*)NULL;
2914 STRLEN bufflen=0;
2915 SV *sv_accept_buff = NULL;
2916 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2917
2918 ST.accepted = 0; /* how many accepting states we have seen */
2919 ST.B = next;
2920 ST.jump = trie->jump;
786e8c11 2921 ST.me = scan;
07be1b83
YO
2922 /*
2923 traverse the TRIE keeping track of all accepting states
2924 we transition through until we get to a failing node.
2925 */
2926
a3621e74 2927 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2928 U32 base = trie->states[ state ].trans.base;
f9f4320a 2929 UV uvc = 0;
786e8c11
YO
2930 U16 charid;
2931 /* We use charid to hold the wordnum as we don't use it
2932 for charid until after we have done the wordnum logic.
2933 We define an alias just so that the wordnum logic reads
2934 more naturally. */
2935
2936#define got_wordnum charid
2937 got_wordnum = trie->states[ state ].wordnum;
2938
2939 if ( got_wordnum ) {
2940 if ( ! ST.accepted ) {
5b47454d
DM
2941 ENTER;
2942 SAVETMPS;
2943 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2944 sv_accept_buff=newSV(bufflen *
2945 sizeof(reg_trie_accepted) - 1);
786e8c11 2946 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2947 SvPOK_on(sv_accept_buff);
2948 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2949 SAVETMPS;
2950 ST.accept_buff =
5b47454d
DM
2951 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2952 }
786e8c11 2953 do {
166ba7cd 2954 if (ST.accepted >= bufflen) {
5b47454d 2955 bufflen *= 2;
166ba7cd 2956 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2957 SvGROW(sv_accept_buff,
2958 bufflen * sizeof(reg_trie_accepted));
2959 }
2960 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2961 + sizeof(reg_trie_accepted));
a3621e74 2962
786e8c11
YO
2963
2964 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2965 ST.accept_buff[ST.accepted].endpos = uc;
2966 ++ST.accepted;
2967 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2968 }
2969#undef got_wordnum
a3621e74 2970
07be1b83 2971 DEBUG_TRIE_EXECUTE_r({
786e8c11 2972 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2973 PerlIO_printf( Perl_debug_log,
786e8c11 2974 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2975 2+depth * 2, "", PL_colors[4],
786e8c11 2976 (UV)state, (UV)ST.accepted );
07be1b83 2977 });
a3621e74
YO
2978
2979 if ( base ) {
55eed653
NC
2980 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2981 uscan, len, uvc, charid, foldlen,
2982 foldbuf, uniflags);
a3621e74 2983
5b47454d
DM
2984 if (charid &&
2985 (base + charid > trie->uniquecharcount )
2986 && (base + charid - 1 - trie->uniquecharcount
2987 < trie->lasttrans)
2988 && trie->trans[base + charid - 1 -
2989 trie->uniquecharcount].check == state)
2990 {
2991 state = trie->trans[base + charid - 1 -
2992 trie->uniquecharcount ].next;
2993 }
2994 else {
2995 state = 0;
2996 }
2997 uc += len;
2998
2999 }
3000 else {
a3621e74
YO
3001 state = 0;
3002 }
3003 DEBUG_TRIE_EXECUTE_r(
e4584336 3004 PerlIO_printf( Perl_debug_log,
786e8c11 3005 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3006 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3007 );
3008 }
166ba7cd 3009 if (!ST.accepted )
a3621e74 3010 sayNO;
a3621e74 3011
166ba7cd
DM
3012 DEBUG_EXECUTE_r(
3013 PerlIO_printf( Perl_debug_log,
3014 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3015 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3016 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3017 );
786e8c11 3018 }}
fae667d5
YO
3019 goto trie_first_try; /* jump into the fail handler */
3020 /* NOTREACHED */
166ba7cd 3021 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3022 if ( ST.jump) {
3023 REGCP_UNWIND(ST.cp);
3024 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3025 PL_regoffs[n].end = -1;
fae667d5
YO
3026 *PL_reglastparen = n;
3027 }
3028 trie_first_try:
5d458dd8
YO
3029 if (do_cutgroup) {
3030 do_cutgroup = 0;
3031 no_final = 0;
3032 }
fae667d5
YO
3033
3034 if ( ST.jump) {
3035 ST.lastparen = *PL_reglastparen;
3036 REGCP_SET(ST.cp);
3037 }
166ba7cd
DM
3038 if ( ST.accepted == 1 ) {
3039 /* only one choice left - just continue */
3040 DEBUG_EXECUTE_r({
2b8b4781
NC
3041 AV *const trie_words
3042 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3043 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3044 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3045 SV *sv= tmp ? sv_newmortal() : NULL;
3046
166ba7cd
DM
3047 PerlIO_printf( Perl_debug_log,
3048 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3049 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3050 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3051 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3052 PL_colors[0], PL_colors[1],
3053 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3054 )
3055 : "not compiled under -Dr",
166ba7cd
DM
3056 PL_colors[5] );
3057 });
3058 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3059 /* in this case we free tmps/leave before we call regmatch
3060 as we wont be using accept_buff again. */
5d458dd8 3061
166ba7cd
DM
3062 locinput = PL_reginput;
3063 nextchr = UCHARAT(locinput);
5d458dd8
YO
3064 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3065 scan = ST.B;
3066 else
3067 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3068 if (!has_cutgroup) {
3069 FREETMPS;
3070 LEAVE;
3071 } else {
3072 ST.accepted--;
3073 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3074 }
786e8c11 3075
166ba7cd
DM
3076 continue; /* execute rest of RE */
3077 }
fae667d5
YO
3078
3079 if ( !ST.accepted-- ) {
5d458dd8
YO
3080 DEBUG_EXECUTE_r({
3081 PerlIO_printf( Perl_debug_log,
3082 "%*s %sTRIE failed...%s\n",
3083 REPORT_CODE_OFF+depth*2, "",
3084 PL_colors[4],
3085 PL_colors[5] );
3086 });
166ba7cd
DM
3087 FREETMPS;
3088 LEAVE;
5d458dd8 3089 sayNO_SILENT;
fae667d5
YO
3090 /*NOTREACHED*/
3091 }
166ba7cd 3092
a3621e74 3093 /*
166ba7cd
DM
3094 There are at least two accepting states left. Presumably
3095 the number of accepting states is going to be low,
3096 typically two. So we simply scan through to find the one
3097 with lowest wordnum. Once we find it, we swap the last
3098 state into its place and decrement the size. We then try to
3099 match the rest of the pattern at the point where the word
3100 ends. If we succeed, control just continues along the
3101 regex; if we fail we return here to try the next accepting
3102 state
3103 */
a3621e74 3104
166ba7cd
DM
3105 {
3106 U32 best = 0;
3107 U32 cur;
3108 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3109 DEBUG_TRIE_EXECUTE_r(
f2278c82 3110 PerlIO_printf( Perl_debug_log,
166ba7cd 3111 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3112 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3113 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3114 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3115 );
3116
3117 if (ST.accept_buff[cur].wordnum <
3118 ST.accept_buff[best].wordnum)
3119 best = cur;
a3621e74 3120 }
166ba7cd
DM
3121
3122 DEBUG_EXECUTE_r({
2b8b4781
NC
3123 AV *const trie_words
3124 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3125 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3126 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3127 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3128 ST.B :
7f69552c 3129 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3130 SV *sv= tmp ? sv_newmortal() : NULL;
3131
265c4333
YO
3132 PerlIO_printf( Perl_debug_log,
3133 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3134 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3135 ST.accept_buff[best].wordnum,
de734bd5
A
3136 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3137 PL_colors[0], PL_colors[1],
3138 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3139 ) : "not compiled under -Dr",
265c4333 3140 REG_NODE_NUM(nextop),
166ba7cd
DM
3141 PL_colors[5] );
3142 });
3143
3144 if ( best<ST.accepted ) {
3145 reg_trie_accepted tmp = ST.accept_buff[ best ];
3146 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3147 ST.accept_buff[ ST.accepted ] = tmp;
3148 best = ST.accepted;
a3621e74 3149 }
166ba7cd 3150 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3151 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3152 scan = ST.B;
786e8c11
YO
3153 /* NOTREACHED */
3154 } else {
5d458dd8 3155 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
786e8c11
YO
3156 /* NOTREACHED */
3157 }
5d458dd8
YO
3158 if (has_cutgroup) {
3159 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3160 /* NOTREACHED */
3161 } else {
3162 PUSH_STATE_GOTO(TRIE_next, scan);
3163 /* NOTREACHED */
3164 }
786e8c11 3165 /* NOTREACHED */
166ba7cd 3166 }
166ba7cd 3167 /* NOTREACHED */
5d458dd8
YO
3168 case TRIE_next:
3169 FREETMPS;
3170 LEAVE;
3171 sayYES;
166ba7cd
DM
3172#undef ST
3173
95b24440
DM
3174 case EXACT: {
3175 char *s = STRING(scan);
24d3c4a9 3176 ln = STR_LEN(scan);
eb160463 3177 if (do_utf8 != UTF) {
bc517b45 3178 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3179 char *l = locinput;
24d3c4a9 3180 const char * const e = s + ln;
a72c7584 3181
5ff6fc6d
JH
3182 if (do_utf8) {
3183 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3184 while (s < e) {
a3b680e6 3185 STRLEN ulen;
1aa99e6b 3186 if (l >= PL_regeol)
5ff6fc6d
JH
3187 sayNO;
3188 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3189 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3190 uniflags))
5ff6fc6d 3191 sayNO;
bc517b45 3192 l += ulen;
5ff6fc6d 3193 s ++;
1aa99e6b 3194 }
5ff6fc6d
JH
3195 }
3196 else {
3197 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3198 while (s < e) {
a3b680e6 3199 STRLEN ulen;
1aa99e6b
IH
3200 if (l >= PL_regeol)
3201 sayNO;
5ff6fc6d 3202 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3203 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3204 uniflags))
1aa99e6b 3205 sayNO;
bc517b45 3206 s += ulen;
a72c7584 3207 l ++;
1aa99e6b 3208 }
5ff6fc6d 3209 }
1aa99e6b
IH
3210 locinput = l;
3211 nextchr = UCHARAT(locinput);
3212 break;
3213 }
bc517b45 3214 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3215 /* Inline the first character, for speed. */
3216 if (UCHARAT(s) != nextchr)
3217 sayNO;
24d3c4a9 3218 if (PL_regeol - locinput < ln)
d6a28714 3219 sayNO;
24d3c4a9 3220 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3221 sayNO;
24d3c4a9 3222 locinput += ln;
d6a28714
JH
3223 nextchr = UCHARAT(locinput);
3224 break;
95b24440 3225 }
d6a28714 3226 case EXACTFL:
b8c5462f
JH
3227 PL_reg_flags |= RF_tainted;
3228 /* FALL THROUGH */
95b24440 3229 case EXACTF: {
be8e71aa 3230 char * const s = STRING(scan);
24d3c4a9 3231 ln = STR_LEN(scan);
d6a28714 3232
d07ddd77
JH
3233 if (do_utf8 || UTF) {
3234 /* Either target or the pattern are utf8. */
be8e71aa 3235 const char * const l = locinput;
d07ddd77 3236 char *e = PL_regeol;
bc517b45 3237
24d3c4a9 3238 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3239 l, &e, 0, do_utf8)) {
5486206c
JH
3240 /* One more case for the sharp s:
3241 * pack("U0U*", 0xDF) =~ /ss/i,
3242 * the 0xC3 0x9F are the UTF-8
3243 * byte sequence for the U+00DF. */
e1d1eefb 3244
5486206c 3245 if (!(do_utf8 &&
e1d1eefb 3246 toLOWER(s[0]) == 's' &&
24d3c4a9 3247 ln >= 2 &&
5486206c
JH
3248 toLOWER(s[1]) == 's' &&
3249 (U8)l[0] == 0xC3 &&
3250 e - l >= 2 &&
3251 (U8)l[1] == 0x9F))
3252 sayNO;
3253 }
d07ddd77
JH
3254 locinput = e;
3255 nextchr = UCHARAT(locinput);
3256 break;
a0ed51b3 3257 }
d6a28714 3258
bc517b45
JH
3259 /* Neither the target and the pattern are utf8. */
3260
d6a28714
JH
3261 /* Inline the first character, for speed. */
3262 if (UCHARAT(s) != nextchr &&
3263 UCHARAT(s) != ((OP(scan) == EXACTF)
3264 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3265 sayNO;
24d3c4a9 3266 if (PL_regeol - locinput < ln)
b8c5462f 3267 sayNO;
24d3c4a9
DM
3268 if (ln > 1 && (OP(scan) == EXACTF
3269 ? ibcmp(s, locinput, ln)
3270 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3271 sayNO;
24d3c4a9 3272 locinput += ln;
d6a28714 3273 nextchr = UCHARAT(locinput);
a0d0e21e 3274 break;
95b24440 3275 }
d6a28714 3276 case ANYOF:
ffc61ed2 3277 if (do_utf8) {
9e55ce06
JH
3278 STRLEN inclasslen = PL_regeol - locinput;
3279
32fc9b6a 3280 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3281 goto anyof_fail;
ffc61ed2
JH
3282 if (locinput >= PL_regeol)
3283 sayNO;
0f0076b4 3284 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3285 nextchr = UCHARAT(locinput);
e0f9d4a8 3286 break;
ffc61ed2
JH
3287 }
3288 else {
3289 if (nextchr < 0)
3290 nextchr = UCHARAT(locinput);
32fc9b6a 3291 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3292 goto anyof_fail;
ffc61ed2
JH
3293 if (!nextchr && locinput >= PL_regeol)
3294 sayNO;
3295 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3296 break;
3297 }
262b90c4 3298 anyof_fail:
e0f9d4a8
JH
3299 /* If we might have the case of the German sharp s
3300 * in a casefolding Unicode character class. */
3301
ebc501f0
JH
3302 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3303 locinput += SHARP_S_SKIP;
e0f9d4a8 3304 nextchr = UCHARAT(locinput);
ffc61ed2 3305 }
e0f9d4a8
JH
3306 else
3307 sayNO;
b8c5462f 3308 break;
d6a28714 3309 case ALNUML:
b8c5462f
JH
3310 PL_reg_flags |= RF_tainted;
3311 /* FALL THROUGH */
d6a28714 3312 case ALNUM:
b8c5462f 3313 if (!nextchr)
4633a7c4 3314 sayNO;
ffc61ed2 3315 if (do_utf8) {
1a4fad37 3316 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3317 if (!(OP(scan) == ALNUM
bb7a0f54 3318 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3319 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3320 {
3321 sayNO;
a0ed51b3 3322 }
b8c5462f 3323 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3324 nextchr = UCHARAT(locinput);
3325 break;
3326 }
ffc61ed2 3327 if (!(OP(scan) == ALNUM
d6a28714 3328 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3329 sayNO;
b8c5462f 3330 nextchr = UCHARAT(++locinput);
a0d0e21e 3331 break;
d6a28714 3332 case NALNUML:
b8c5462f
JH
3333 PL_reg_flags |= RF_tainted;
3334 /* FALL THROUGH */
d6a28714
JH
3335 case NALNUM:
3336 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3337 sayNO;
ffc61ed2 3338 if (do_utf8) {
1a4fad37 3339 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3340 if (OP(scan) == NALNUM
bb7a0f54 3341 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3342 : isALNUM_LC_utf8((U8*)locinput))
3343 {
b8c5462f 3344 sayNO;
d6a28714 3345 }
b8c5462f
JH
3346 locinput += PL_utf8skip[nextchr];
3347 nextchr = UCHARAT(locinput);
3348 break;
3349 }
ffc61ed2 3350 if (OP(scan) == NALNUM
d6a28714 3351 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3352 sayNO;
76e3520e 3353 nextchr = UCHARAT(++locinput);
a0d0e21e 3354 break;
d6a28714
JH
3355 case BOUNDL:
3356 case NBOUNDL:
3280af22 3357 PL_reg_flags |= RF_tainted;
bbce6d69 3358 /* FALL THROUGH */
d6a28714
JH
3359 case BOUND:
3360 case NBOUND:
3361 /* was last char in word? */
ffc61ed2 3362 if (do_utf8) {
12d33761 3363 if (locinput == PL_bostr)
24d3c4a9 3364 ln = '\n';
ffc61ed2 3365 else {
a3b680e6 3366 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3367
24d3c4a9 3368 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3369 }
3370 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3371 ln = isALNUM_uni(ln);
1a4fad37 3372 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3373 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3374 }
3375 else {
24d3c4a9 3376 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3377 n = isALNUM_LC_utf8((U8*)locinput);
3378 }
a0ed51b3 3379 }
d6a28714 3380 else {
24d3c4a9 3381 ln = (locinput != PL_bostr) ?
12d33761 3382 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3383 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3384 ln = isALNUM(ln);
ffc61ed2
JH
3385 n = isALNUM(nextchr);
3386 }
3387 else {
24d3c4a9 3388 ln = isALNUM_LC(ln);
ffc61ed2
JH
3389 n = isALNUM_LC(nextchr);
3390 }
d6a28714 3391 }
24d3c4a9 3392 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3393 OP(scan) == BOUNDL))
3394 sayNO;
a0ed51b3 3395 break;
d6a28714 3396 case SPACEL:
3280af22 3397 PL_reg_flags |= RF_tainted;
bbce6d69 3398 /* FALL THROUGH */
d6a28714 3399 case SPACE:
9442cb0e