This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
fixed Fcntl::S_IFMT() breakage introduced by change 30674 (blead 26701)
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
125/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
129 OP(rn) == EVAL || \
cca55fe3
JP
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 134)
ee9b8eae 135#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 136
ee9b8eae
YO
137#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
138
139#if 0
140/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
145
146#else
147/* ... so we use this as its faster. */
148#define IS_TEXT(rn) ( OP(rn)==EXACT )
149#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
151
152#endif
e2d8ce26 153
a84d97b6
HS
154/*
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
157*/
cca55fe3 158#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 162 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 163 else if (type == PLUS) \
cca55fe3 164 rn = NEXTOPER(rn); \
3dab1dad 165 else if (type == IFMATCH) \
a84d97b6 166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 167 else rn += NEXT_OFF(rn); \
3dab1dad 168 } \
5f80c4cf 169} STMT_END
74750237 170
c476f425 171
acfe0abc 172static void restore_pos(pTHX_ void *arg);
51371543 173
76e3520e 174STATIC CHECKPOINT
cea2e8a9 175S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 176{
97aff369 177 dVAR;
a3b680e6 178 const int retval = PL_savestack_ix;
b1ce53c5 179#define REGCP_PAREN_ELEMS 4
a3b680e6 180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 181 int p;
40a82448 182 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 183
e49a9654
IH
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
186
f0ab9afb 187#define REGCP_OTHER_ELEMS 7
4b3c1a47 188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 189
3280af22 190 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 191/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
192 SSPUSHINT(PL_regoffs[p].end);
193 SSPUSHINT(PL_regoffs[p].start);
3280af22 194 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 195 SSPUSHINT(p);
e7707071 196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 198 (UV)p, (IV)PL_regoffs[p].start,
40a82448 199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 200 (IV)PL_regoffs[p].end
40a82448 201 ));
a0d0e21e 202 }
b1ce53c5 203/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 204 SSPUSHPTR(PL_regoffs);
3280af22
NIS
205 SSPUSHINT(PL_regsize);
206 SSPUSHINT(*PL_reglastparen);
a01268b5 207 SSPUSHINT(*PL_reglastcloseparen);
3280af22 208 SSPUSHPTR(PL_reginput);
41123dfd
JH
209#define REGCP_FRAME_ELEMS 2
210/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
211 * are needed for the regexp context stack bookkeeping. */
212 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 213 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 214
a0d0e21e
LW
215 return retval;
216}
217
c277df42 218/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
219#define REGCP_SET(cp) \
220 DEBUG_STATE_r( \
ab3bbdeb 221 PerlIO_printf(Perl_debug_log, \
e4f74956 222 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
223 (IV)PL_savestack_ix)); \
224 cp = PL_savestack_ix
c3464db5 225
ab3bbdeb 226#define REGCP_UNWIND(cp) \
e4f74956 227 DEBUG_STATE_r( \
ab3bbdeb 228 if (cp != PL_savestack_ix) \
e4f74956
YO
229 PerlIO_printf(Perl_debug_log, \
230 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
231 (IV)(cp), (IV)PL_savestack_ix)); \
232 regcpblow(cp)
c277df42 233
76e3520e 234STATIC char *
097eb12c 235S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 236{
97aff369 237 dVAR;
3b6647e0 238 U32 i;
a0d0e21e 239 char *input;
b1ce53c5 240
a3621e74
YO
241 GET_RE_DEBUG_FLAGS_DECL;
242
b1ce53c5 243 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 244 i = SSPOPINT;
b1ce53c5
JH
245 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
246 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 247 input = (char *) SSPOPPTR;
a01268b5 248 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
249 *PL_reglastparen = SSPOPINT;
250 PL_regsize = SSPOPINT;
f0ab9afb 251 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 252
6bda09f9 253
b1ce53c5 254 /* Now restore the parentheses context. */
41123dfd
JH
255 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
256 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 257 I32 tmps;
097eb12c 258 U32 paren = (U32)SSPOPINT;
3280af22 259 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 260 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 261 tmps = SSPOPINT;
3280af22 262 if (paren <= *PL_reglastparen)
f0ab9afb 263 PL_regoffs[paren].end = tmps;
e7707071 264 DEBUG_BUFFERS_r(
c3464db5 265 PerlIO_printf(Perl_debug_log,
b900a521 266 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 267 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 268 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 269 (IV)PL_regoffs[paren].end,
3280af22 270 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 271 );
a0d0e21e 272 }
e7707071 273 DEBUG_BUFFERS_r(
bb7a0f54 274 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 275 PerlIO_printf(Perl_debug_log,
faccc32b 276 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 277 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
278 }
279 );
daf18116 280#if 1
dafc8851
JH
281 /* It would seem that the similar code in regtry()
282 * already takes care of this, and in fact it is in
283 * a better location to since this code can #if 0-ed out
284 * but the code in regtry() is needed or otherwise tests
285 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
286 * (as of patchlevel 7877) will fail. Then again,
287 * this code seems to be necessary or otherwise
288 * building DynaLoader will fail:
289 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
290 * --jhi */
3b6647e0 291 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 292 if (i > PL_regsize)
f0ab9afb
NC
293 PL_regoffs[i].start = -1;
294 PL_regoffs[i].end = -1;
a0d0e21e 295 }
dafc8851 296#endif
a0d0e21e
LW
297 return input;
298}
299
02db2b7b 300#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 301
a687059c 302/*
e50aee73 303 * pregexec and friends
a687059c
LW
304 */
305
76234dfb 306#ifndef PERL_IN_XSUB_RE
a687059c 307/*
c277df42 308 - pregexec - match a regexp against a string
a687059c 309 */
c277df42 310I32
864dbfa3 311Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 312 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
313/* strend: pointer to null at end of string */
314/* strbeg: real beginning of string */
315/* minend: end of match must be >=minend after stringarg. */
316/* nosave: For optimizations. */
317{
318 return
9041c2e3 319 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
320 nosave ? 0 : REXEC_COPY_STR);
321}
76234dfb 322#endif
22e551b9 323
9041c2e3 324/*
cad2e5aa
JH
325 * Need to implement the following flags for reg_anch:
326 *
327 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
328 * USE_INTUIT_ML
329 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
330 * INTUIT_AUTORITATIVE_ML
331 * INTUIT_ONCE_NOML - Intuit can match in one location only.
332 * INTUIT_ONCE_ML
333 *
334 * Another flag for this function: SECOND_TIME (so that float substrs
335 * with giant delta may be not rechecked).
336 */
337
338/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
339
3f7c398e 340/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
341 Otherwise, only SvCUR(sv) is used to get strbeg. */
342
343/* XXXX We assume that strpos is strbeg unless sv. */
344
6eb5f6b9
JH
345/* XXXX Some places assume that there is a fixed substring.
346 An update may be needed if optimizer marks as "INTUITable"
347 RExen without fixed substrings. Similarly, it is assumed that
348 lengths of all the strings are no more than minlen, thus they
349 cannot come from lookahead.
40d049e4
YO
350 (Or minlen should take into account lookahead.)
351 NOTE: Some of this comment is not correct. minlen does now take account
352 of lookahead/behind. Further research is required. -- demerphq
353
354*/
6eb5f6b9 355
2c2d71f5
JH
356/* A failure to find a constant substring means that there is no need to make
357 an expensive call to REx engine, thus we celebrate a failure. Similarly,
358 finding a substring too deep into the string means that less calls to
30944b6d
IZ
359 regtry() should be needed.
360
361 REx compiler's optimizer found 4 possible hints:
362 a) Anchored substring;
363 b) Fixed substring;
364 c) Whether we are anchored (beginning-of-line or \G);
365 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 366 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
367 string which does not contradict any of them.
368 */
2c2d71f5 369
6eb5f6b9
JH
370/* Most of decisions we do here should have been done at compile time.
371 The nodes of the REx which we used for the search should have been
372 deleted from the finite automaton. */
373
cad2e5aa
JH
374char *
375Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
376 char *strend, U32 flags, re_scream_pos_data *data)
377{
97aff369 378 dVAR;
b7953727 379 register I32 start_shift = 0;
cad2e5aa 380 /* Should be nonnegative! */
b7953727 381 register I32 end_shift = 0;
2c2d71f5
JH
382 register char *s;
383 register SV *check;
a1933d95 384 char *strbeg;
cad2e5aa 385 char *t;
1de06328 386 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 387 I32 ml_anch;
bd61b366
SS
388 register char *other_last = NULL; /* other substr checked before this */
389 char *check_at = NULL; /* check substr found at this pos */
bbe252da 390 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 391 RXi_GET_DECL(prog,progi);
30944b6d 392#ifdef DEBUGGING
890ce7af 393 const char * const i_strpos = strpos;
30944b6d 394#endif
a3621e74
YO
395
396 GET_RE_DEBUG_FLAGS_DECL;
397
a30b2f1f 398 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 399
bbe252da 400 if (prog->extflags & RXf_UTF8) {
b8d68ded
JH
401 PL_reg_flags |= RF_utf8;
402 }
ab3bbdeb
YO
403 DEBUG_EXECUTE_r(
404 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
405 sv ? "Guessing start of match in sv for"
406 : "Guessing start of match in string for");
2a782b5b 407 );
cad2e5aa 408
c344f387
JH
409 /* CHR_DIST() would be more correct here but it makes things slow. */
410 if (prog->minlen > strend - strpos) {
a3621e74 411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 412 "String too short... [re_intuit_start]\n"));
cad2e5aa 413 goto fail;
2c2d71f5 414 }
1de06328 415
a1933d95 416 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 417 PL_regeol = strend;
33b8afdf
JH
418 if (do_utf8) {
419 if (!prog->check_utf8 && prog->check_substr)
420 to_utf8_substr(prog);
421 check = prog->check_utf8;
422 } else {
423 if (!prog->check_substr && prog->check_utf8)
424 to_byte_substr(prog);
425 check = prog->check_substr;
426 }
1de06328 427 if (check == &PL_sv_undef) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 429 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
430 goto fail;
431 }
bbe252da
YO
432 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
433 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
434 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 435 && !multiline ) ); /* Check after \n? */
cad2e5aa 436
7e25d62c 437 if (!ml_anch) {
bbe252da
YO
438 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
439 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 440 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
441 && sv && !SvROK(sv)
442 && (strpos != strbeg)) {
a3621e74 443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
444 goto fail;
445 }
446 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 447 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 448 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
449 I32 slen;
450
1aa99e6b 451 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 452
653099ff
GS
453 if (SvTAIL(check)) {
454 slen = SvCUR(check); /* >= 1 */
cad2e5aa 455
9041c2e3 456 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 457 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 458 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 459 goto fail_finish;
cad2e5aa
JH
460 }
461 /* Now should match s[0..slen-2] */
462 slen--;
3f7c398e 463 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 464 || (slen > 1
3f7c398e 465 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 466 report_neq:
a3621e74 467 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
468 goto fail_finish;
469 }
cad2e5aa 470 }
3f7c398e 471 else if (*SvPVX_const(check) != *s
653099ff 472 || ((slen = SvCUR(check)) > 1
3f7c398e 473 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 474 goto report_neq;
c315bfe8 475 check_at = s;
2c2d71f5 476 goto success_at_start;
7e25d62c 477 }
cad2e5aa 478 }
2c2d71f5 479 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 480 s = strpos;
2c2d71f5 481 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
482 end_shift = prog->check_end_shift;
483
2c2d71f5 484 if (!ml_anch) {
a3b680e6 485 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 486 - (SvTAIL(check) != 0);
a3b680e6 487 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
488
489 if (end_shift < eshift)
490 end_shift = eshift;
491 }
cad2e5aa 492 }
2c2d71f5 493 else { /* Can match at random position */
cad2e5aa
JH
494 ml_anch = 0;
495 s = strpos;
1de06328
YO
496 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
497 end_shift = prog->check_end_shift;
498
499 /* end shift should be non negative here */
cad2e5aa
JH
500 }
501
bcdf7404 502#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 503 if (end_shift < 0)
1de06328 504 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 505 (IV)end_shift, prog->precomp);
2c2d71f5
JH
506#endif
507
2c2d71f5
JH
508 restart:
509 /* Find a possible match in the region s..strend by looking for
510 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
511
512 {
513 I32 srch_start_shift = start_shift;
514 I32 srch_end_shift = end_shift;
515 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
516 srch_end_shift -= ((strbeg - s) - srch_start_shift);
517 srch_start_shift = strbeg - s;
518 }
6bda09f9 519 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
520 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
521 (IV)prog->check_offset_min,
522 (IV)srch_start_shift,
523 (IV)srch_end_shift,
524 (IV)prog->check_end_shift);
525 });
526
cad2e5aa 527 if (flags & REXEC_SCREAM) {
cad2e5aa 528 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 529 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 530
2c2d71f5
JH
531 if (PL_screamfirst[BmRARE(check)] >= 0
532 || ( BmRARE(check) == '\n'
85c508c3 533 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 534 && SvTAIL(check) ))
9041c2e3 535 s = screaminstr(sv, check,
1de06328 536 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 537 else
2c2d71f5 538 goto fail_finish;
4addbd3b
HS
539 /* we may be pointing at the wrong string */
540 if (s && RX_MATCH_COPIED(prog))
3f7c398e 541 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
542 if (data)
543 *data->scream_olds = s;
544 }
1de06328
YO
545 else {
546 U8* start_point;
547 U8* end_point;
bbe252da 548 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
549 start_point= (U8*)(s + srch_start_shift);
550 end_point= (U8*)(strend - srch_end_shift);
551 } else {
552 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
553 end_point= HOP3(strend, -srch_end_shift, strbeg);
554 }
6bda09f9 555 DEBUG_OPTIMISE_MORE_r({
56570a2c 556 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 557 (int)(end_point - start_point),
fc8cd66c 558 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
559 start_point);
560 });
561
562 s = fbm_instr( start_point, end_point,
7fba1cd6 563 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
564 }
565 }
cad2e5aa
JH
566 /* Update the count-of-usability, remove useless subpatterns,
567 unshift s. */
2c2d71f5 568
ab3bbdeb
YO
569 DEBUG_EXECUTE_r({
570 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
571 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
572 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 573 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
574 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
575 ? "anchored" : "floating"),
576 quoted,
577 RE_SV_TAIL(check),
578 (s ? " at offset " : "...\n") );
579 });
2c2d71f5
JH
580
581 if (!s)
582 goto fail_finish;
2c2d71f5 583 /* Finish the diagnostic message */
a3621e74 584 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 585
1de06328
YO
586 /* XXX dmq: first branch is for positive lookbehind...
587 Our check string is offset from the beginning of the pattern.
588 So we need to do any stclass tests offset forward from that
589 point. I think. :-(
590 */
591
592
593
594 check_at=s;
595
596
2c2d71f5
JH
597 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
598 Start with the other substr.
599 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 600 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
601 *always* match. Probably should be marked during compile...
602 Probably it is right to do no SCREAM here...
603 */
604
1de06328
YO
605 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
606 : (prog->float_substr && prog->anchored_substr))
607 {
30944b6d 608 /* Take into account the "other" substring. */
2c2d71f5
JH
609 /* XXXX May be hopelessly wrong for UTF... */
610 if (!other_last)
6eb5f6b9 611 other_last = strpos;
33b8afdf 612 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
613 do_other_anchored:
614 {
890ce7af
AL
615 char * const last = HOP3c(s, -start_shift, strbeg);
616 char *last1, *last2;
be8e71aa 617 char * const saved_s = s;
33b8afdf 618 SV* must;
2c2d71f5 619
2c2d71f5
JH
620 t = s - prog->check_offset_max;
621 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 622 && (!do_utf8
0ce71af7 623 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 624 && t > strpos)))
6f207bd3 625 NOOP;
2c2d71f5
JH
626 else
627 t = strpos;
1aa99e6b 628 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
629 if (t < other_last) /* These positions already checked */
630 t = other_last;
1aa99e6b 631 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
632 if (last < last1)
633 last1 = last;
1de06328
YO
634 /* XXXX It is not documented what units *_offsets are in.
635 We assume bytes, but this is clearly wrong.
636 Meaning this code needs to be carefully reviewed for errors.
637 dmq.
638 */
639
2c2d71f5 640 /* On end-of-str: see comment below. */
33b8afdf
JH
641 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
642 if (must == &PL_sv_undef) {
643 s = (char*)NULL;
1de06328 644 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
645 }
646 else
647 s = fbm_instr(
648 (unsigned char*)t,
649 HOP3(HOP3(last1, prog->anchored_offset, strend)
650 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
651 must,
7fba1cd6 652 multiline ? FBMrf_MULTILINE : 0
33b8afdf 653 );
ab3bbdeb
YO
654 DEBUG_EXECUTE_r({
655 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
656 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
657 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 658 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
659 quoted, RE_SV_TAIL(must));
660 });
661
662
2c2d71f5
JH
663 if (!s) {
664 if (last1 >= last2) {
a3621e74 665 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
666 ", giving up...\n"));
667 goto fail_finish;
668 }
a3621e74 669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 670 ", trying floating at offset %ld...\n",
be8e71aa 671 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
672 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
673 s = HOP3c(last, 1, strend);
2c2d71f5
JH
674 goto restart;
675 }
676 else {
a3621e74 677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 678 (long)(s - i_strpos)));
1aa99e6b
IH
679 t = HOP3c(s, -prog->anchored_offset, strbeg);
680 other_last = HOP3c(s, 1, strend);
be8e71aa 681 s = saved_s;
2c2d71f5
JH
682 if (t == strpos)
683 goto try_at_start;
2c2d71f5
JH
684 goto try_at_offset;
685 }
30944b6d 686 }
2c2d71f5
JH
687 }
688 else { /* Take into account the floating substring. */
33b8afdf 689 char *last, *last1;
be8e71aa 690 char * const saved_s = s;
33b8afdf
JH
691 SV* must;
692
693 t = HOP3c(s, -start_shift, strbeg);
694 last1 = last =
695 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
696 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
697 last = HOP3c(t, prog->float_max_offset, strend);
698 s = HOP3c(t, prog->float_min_offset, strend);
699 if (s < other_last)
700 s = other_last;
2c2d71f5 701 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
702 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
703 /* fbm_instr() takes into account exact value of end-of-str
704 if the check is SvTAIL(ed). Since false positives are OK,
705 and end-of-str is not later than strend we are OK. */
706 if (must == &PL_sv_undef) {
707 s = (char*)NULL;
1de06328 708 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
709 }
710 else
2c2d71f5 711 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
712 (unsigned char*)last + SvCUR(must)
713 - (SvTAIL(must)!=0),
7fba1cd6 714 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
715 DEBUG_EXECUTE_r({
716 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
717 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
718 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 719 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
720 quoted, RE_SV_TAIL(must));
721 });
33b8afdf
JH
722 if (!s) {
723 if (last1 == last) {
a3621e74 724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
725 ", giving up...\n"));
726 goto fail_finish;
2c2d71f5 727 }
a3621e74 728 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 729 ", trying anchored starting at offset %ld...\n",
be8e71aa 730 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
731 other_last = last;
732 s = HOP3c(t, 1, strend);
733 goto restart;
734 }
735 else {
a3621e74 736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
737 (long)(s - i_strpos)));
738 other_last = s; /* Fix this later. --Hugo */
be8e71aa 739 s = saved_s;
33b8afdf
JH
740 if (t == strpos)
741 goto try_at_start;
742 goto try_at_offset;
743 }
2c2d71f5 744 }
cad2e5aa 745 }
2c2d71f5 746
1de06328 747
9ef43ace 748 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 749
6bda09f9 750 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
751 PerlIO_printf(Perl_debug_log,
752 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
753 (IV)prog->check_offset_min,
754 (IV)prog->check_offset_max,
755 (IV)(s-strpos),
756 (IV)(t-strpos),
757 (IV)(t-s),
758 (IV)(strend-strpos)
759 )
760 );
761
2c2d71f5 762 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 763 && (!do_utf8
9ef43ace 764 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
765 && t > strpos)))
766 {
2c2d71f5
JH
767 /* Fixed substring is found far enough so that the match
768 cannot start at strpos. */
769 try_at_offset:
cad2e5aa 770 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
771 /* Eventually fbm_*() should handle this, but often
772 anchored_offset is not 0, so this check will not be wasted. */
773 /* XXXX In the code below we prefer to look for "^" even in
774 presence of anchored substrings. And we search even
775 beyond the found float position. These pessimizations
776 are historical artefacts only. */
777 find_anchor:
2c2d71f5 778 while (t < strend - prog->minlen) {
cad2e5aa 779 if (*t == '\n') {
4ee3650e 780 if (t < check_at - prog->check_offset_min) {
33b8afdf 781 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
782 /* Since we moved from the found position,
783 we definitely contradict the found anchored
30944b6d
IZ
784 substr. Due to the above check we do not
785 contradict "check" substr.
786 Thus we can arrive here only if check substr
787 is float. Redo checking for "other"=="fixed".
788 */
9041c2e3 789 strpos = t + 1;
a3621e74 790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 791 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
792 goto do_other_anchored;
793 }
4ee3650e
GS
794 /* We don't contradict the found floating substring. */
795 /* XXXX Why not check for STCLASS? */
cad2e5aa 796 s = t + 1;
a3621e74 797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 798 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
799 goto set_useful;
800 }
4ee3650e
GS
801 /* Position contradicts check-string */
802 /* XXXX probably better to look for check-string
803 than for "\n", so one should lower the limit for t? */
a3621e74 804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 805 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 806 other_last = strpos = s = t + 1;
cad2e5aa
JH
807 goto restart;
808 }
809 t++;
810 }
a3621e74 811 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 812 PL_colors[0], PL_colors[1]));
2c2d71f5 813 goto fail_finish;
cad2e5aa 814 }
f5952150 815 else {
a3621e74 816 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 817 PL_colors[0], PL_colors[1]));
f5952150 818 }
cad2e5aa
JH
819 s = t;
820 set_useful:
33b8afdf 821 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
822 }
823 else {
f5952150 824 /* The found string does not prohibit matching at strpos,
2c2d71f5 825 - no optimization of calling REx engine can be performed,
f5952150
GS
826 unless it was an MBOL and we are not after MBOL,
827 or a future STCLASS check will fail this. */
2c2d71f5
JH
828 try_at_start:
829 /* Even in this situation we may use MBOL flag if strpos is offset
830 wrt the start of the string. */
05b4157f 831 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 832 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 833 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 834 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 835 {
cad2e5aa
JH
836 t = strpos;
837 goto find_anchor;
838 }
a3621e74 839 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 840 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 841 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 842 );
2c2d71f5 843 success_at_start:
bbe252da 844 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
845 && (do_utf8 ? (
846 prog->check_utf8 /* Could be deleted already */
847 && --BmUSEFUL(prog->check_utf8) < 0
848 && (prog->check_utf8 == prog->float_utf8)
849 ) : (
850 prog->check_substr /* Could be deleted already */
851 && --BmUSEFUL(prog->check_substr) < 0
852 && (prog->check_substr == prog->float_substr)
853 )))
66e933ab 854 {
cad2e5aa 855 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
857 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
858 if (do_utf8 ? prog->check_substr : prog->check_utf8)
859 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
860 prog->check_substr = prog->check_utf8 = NULL; /* disable */
861 prog->float_substr = prog->float_utf8 = NULL; /* clear */
862 check = NULL; /* abort */
cad2e5aa 863 s = strpos;
3cf5c195
IZ
864 /* XXXX This is a remnant of the old implementation. It
865 looks wasteful, since now INTUIT can use many
6eb5f6b9 866 other heuristics. */
bbe252da 867 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
868 }
869 else
870 s = strpos;
871 }
872
6eb5f6b9
JH
873 /* Last resort... */
874 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
875 /* trie stclasses are too expensive to use here, we are better off to
876 leave it to regmatch itself */
f8fc2ecf 877 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
878 /* minlen == 0 is possible if regstclass is \b or \B,
879 and the fixed substr is ''$.
880 Since minlen is already taken into account, s+1 is before strend;
881 accidentally, minlen >= 1 guaranties no false positives at s + 1
882 even for \b or \B. But (minlen? 1 : 0) below assumes that
883 regstclass does not come from lookahead... */
884 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
885 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
886 const U8* const str = (U8*)STRING(progi->regstclass);
887 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
888 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 889 : 1);
1de06328
YO
890 char * endpos;
891 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
892 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
893 else if (prog->float_substr || prog->float_utf8)
894 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
895 else
896 endpos= strend;
897
70685ca0
JH
898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
899 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 900
6eb5f6b9 901 t = s;
f8fc2ecf 902 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
903 if (!s) {
904#ifdef DEBUGGING
cbbf8932 905 const char *what = NULL;
6eb5f6b9
JH
906#endif
907 if (endpos == strend) {
a3621e74 908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
909 "Could not match STCLASS...\n") );
910 goto fail;
911 }
a3621e74 912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 913 "This position contradicts STCLASS...\n") );
bbe252da 914 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 915 goto fail;
6eb5f6b9 916 /* Contradict one of substrings */
33b8afdf
JH
917 if (prog->anchored_substr || prog->anchored_utf8) {
918 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 919 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 920 hop_and_restart:
1aa99e6b 921 s = HOP3c(t, 1, strend);
66e933ab
GS
922 if (s + start_shift + end_shift > strend) {
923 /* XXXX Should be taken into account earlier? */
a3621e74 924 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
925 "Could not match STCLASS...\n") );
926 goto fail;
927 }
5e39e1e5
HS
928 if (!check)
929 goto giveup;
a3621e74 930 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 931 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
932 what, (long)(s + start_shift - i_strpos)) );
933 goto restart;
934 }
66e933ab 935 /* Have both, check_string is floating */
6eb5f6b9
JH
936 if (t + start_shift >= check_at) /* Contradicts floating=check */
937 goto retry_floating_check;
938 /* Recheck anchored substring, but not floating... */
9041c2e3 939 s = check_at;
5e39e1e5
HS
940 if (!check)
941 goto giveup;
a3621e74 942 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 943 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
944 (long)(other_last - i_strpos)) );
945 goto do_other_anchored;
946 }
60e71179
GS
947 /* Another way we could have checked stclass at the
948 current position only: */
949 if (ml_anch) {
950 s = t = t + 1;
5e39e1e5
HS
951 if (!check)
952 goto giveup;
a3621e74 953 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 954 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 955 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 956 goto try_at_offset;
66e933ab 957 }
33b8afdf 958 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 959 goto fail;
6eb5f6b9
JH
960 /* Check is floating subtring. */
961 retry_floating_check:
962 t = check_at - start_shift;
a3621e74 963 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
964 goto hop_and_restart;
965 }
b7953727 966 if (t != s) {
a3621e74 967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 968 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
969 (long)(t - i_strpos), (long)(s - i_strpos))
970 );
971 }
972 else {
a3621e74 973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
974 "Does not contradict STCLASS...\n");
975 );
976 }
6eb5f6b9 977 }
5e39e1e5 978 giveup:
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
980 PL_colors[4], (check ? "Guessed" : "Giving up"),
981 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 982 return s;
2c2d71f5
JH
983
984 fail_finish: /* Substring not found */
33b8afdf
JH
985 if (prog->check_substr || prog->check_utf8) /* could be removed already */
986 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 987 fail:
a3621e74 988 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 989 PL_colors[4], PL_colors[5]));
bd61b366 990 return NULL;
cad2e5aa 991}
9661b544 992
786e8c11 993
3b0527fe 994
55eed653
NC
995#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
996uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
997 switch (trie_type) { \
998 case trie_utf8_fold: \
999 if ( foldlen>0 ) { \
1000 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1001 foldlen -= len; \
1002 uscan += len; \
1003 len=0; \
1004 } else { \
1005 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1006 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1007 foldlen -= UNISKIP( uvc ); \
1008 uscan = foldbuf + UNISKIP( uvc ); \
1009 } \
1010 break; \
1011 case trie_utf8: \
1012 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1013 break; \
1014 case trie_plain: \
1015 uvc = (UV)*uc; \
1016 len = 1; \
1017 } \
1018 \
1019 if (uvc < 256) { \
1020 charid = trie->charmap[ uvc ]; \
1021 } \
1022 else { \
1023 charid = 0; \
55eed653
NC
1024 if (widecharmap) { \
1025 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1026 (char*)&uvc, sizeof(UV), 0); \
1027 if (svpp) \
1028 charid = (U16)SvIV(*svpp); \
1029 } \
1030 } \
1031} STMT_END
1032
1033#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1034 if ( (CoNd) \
1035 && (ln == len || \
1036 ibcmp_utf8(s, NULL, 0, do_utf8, \
1037 m, NULL, ln, (bool)UTF)) \
24b23f37 1038 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1039 goto got_it; \
1040 else { \
1041 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1042 uvchr_to_utf8(tmpbuf, c); \
1043 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1044 if ( f != c \
1045 && (f == c1 || f == c2) \
1046 && (ln == foldlen || \
1047 !ibcmp_utf8((char *) foldbuf, \
1048 NULL, foldlen, do_utf8, \
1049 m, \
1050 NULL, ln, (bool)UTF)) \
24b23f37 1051 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1052 goto got_it; \
1053 } \
1054 s += len
1055
1056#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1057STMT_START { \
1058 while (s <= e) { \
1059 if ( (CoNd) \
1060 && (ln == 1 || !(OP(c) == EXACTF \
1061 ? ibcmp(s, m, ln) \
1062 : ibcmp_locale(s, m, ln))) \
24b23f37 1063 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1064 goto got_it; \
1065 s++; \
1066 } \
1067} STMT_END
1068
1069#define REXEC_FBC_UTF8_SCAN(CoDe) \
1070STMT_START { \
1071 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1072 CoDe \
1073 s += uskip; \
1074 } \
1075} STMT_END
1076
1077#define REXEC_FBC_SCAN(CoDe) \
1078STMT_START { \
1079 while (s < strend) { \
1080 CoDe \
1081 s++; \
1082 } \
1083} STMT_END
1084
1085#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1086REXEC_FBC_UTF8_SCAN( \
1087 if (CoNd) { \
24b23f37 1088 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1089 goto got_it; \
1090 else \
1091 tmp = doevery; \
1092 } \
1093 else \
1094 tmp = 1; \
1095)
1096
1097#define REXEC_FBC_CLASS_SCAN(CoNd) \
1098REXEC_FBC_SCAN( \
1099 if (CoNd) { \
24b23f37 1100 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1101 goto got_it; \
1102 else \
1103 tmp = doevery; \
1104 } \
1105 else \
1106 tmp = 1; \
1107)
1108
1109#define REXEC_FBC_TRYIT \
24b23f37 1110if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1111 goto got_it
1112
e1d1eefb
YO
1113#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1114 if (do_utf8) { \
1115 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1116 } \
1117 else { \
1118 REXEC_FBC_CLASS_SCAN(CoNd); \
1119 } \
1120 break
1121
4cadc6a9
YO
1122#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1123 if (do_utf8) { \
1124 UtFpReLoAd; \
1125 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1126 } \
1127 else { \
1128 REXEC_FBC_CLASS_SCAN(CoNd); \
1129 } \
1130 break
1131
1132#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1133 PL_reg_flags |= RF_tainted; \
1134 if (do_utf8) { \
1135 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1136 } \
1137 else { \
1138 REXEC_FBC_CLASS_SCAN(CoNd); \
1139 } \
1140 break
1141
786e8c11
YO
1142#define DUMP_EXEC_POS(li,s,doutf8) \
1143 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1144
1145/* We know what class REx starts with. Try to find this position... */
1146/* if reginfo is NULL, its a dryrun */
1147/* annoyingly all the vars in this routine have different names from their counterparts
1148 in regmatch. /grrr */
1149
3c3eec57 1150STATIC char *
07be1b83 1151S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1152 const char *strend, regmatch_info *reginfo)
a687059c 1153{
27da23d5 1154 dVAR;
bbe252da 1155 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1156 char *m;
d8093b23 1157 STRLEN ln;
5dab1207 1158 STRLEN lnc;
078c425b 1159 register STRLEN uskip;
d8093b23
G
1160 unsigned int c1;
1161 unsigned int c2;
6eb5f6b9
JH
1162 char *e;
1163 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1164 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf
YO
1165 RXi_GET_DECL(prog,progi);
1166
6eb5f6b9
JH
1167 /* We know what class it must start with. */
1168 switch (OP(c)) {
6eb5f6b9 1169 case ANYOF:
388cc4de 1170 if (do_utf8) {
4cadc6a9 1171 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1172 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1173 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1174 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1175 }
1176 else {
1177 while (s < strend) {
1178 STRLEN skip = 1;
1179
32fc9b6a 1180 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1181 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1182 /* The assignment of 2 is intentional:
1183 * for the folded sharp s, the skip is 2. */
1184 (skip = SHARP_S_SKIP))) {
24b23f37 1185 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1186 goto got_it;
1187 else
1188 tmp = doevery;
1189 }
1190 else
1191 tmp = 1;
1192 s += skip;
1193 }
a0d0e21e 1194 }
6eb5f6b9 1195 break;
f33976b4 1196 case CANY:
4cadc6a9 1197 REXEC_FBC_SCAN(
24b23f37 1198 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1199 goto got_it;
1200 else
1201 tmp = doevery;
4cadc6a9 1202 );
f33976b4 1203 break;
6eb5f6b9 1204 case EXACTF:
5dab1207
NIS
1205 m = STRING(c);
1206 ln = STR_LEN(c); /* length to match in octets/bytes */
1207 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1208 if (UTF) {
a2a2844f 1209 STRLEN ulen1, ulen2;
5dab1207 1210 U8 *sm = (U8 *) m;
89ebb4a3
JH
1211 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1212 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1213 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1214
1215 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1216 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1217
89ebb4a3 1218 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1219 0, uniflags);
89ebb4a3 1220 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1221 0, uniflags);
5dab1207
NIS
1222 lnc = 0;
1223 while (sm < ((U8 *) m + ln)) {
1224 lnc++;
1225 sm += UTF8SKIP(sm);
1226 }
1aa99e6b
IH
1227 }
1228 else {
1229 c1 = *(U8*)m;
1230 c2 = PL_fold[c1];
1231 }
6eb5f6b9
JH
1232 goto do_exactf;
1233 case EXACTFL:
5dab1207
NIS
1234 m = STRING(c);
1235 ln = STR_LEN(c);
1236 lnc = (I32) ln;
d8093b23 1237 c1 = *(U8*)m;
6eb5f6b9
JH
1238 c2 = PL_fold_locale[c1];
1239 do_exactf:
db12adc6 1240 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1241
3b0527fe 1242 if (!reginfo && e < s)
6eb5f6b9 1243 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1244
60a8b682
JH
1245 /* The idea in the EXACTF* cases is to first find the
1246 * first character of the EXACTF* node and then, if
1247 * necessary, case-insensitively compare the full
1248 * text of the node. The c1 and c2 are the first
1249 * characters (though in Unicode it gets a bit
1250 * more complicated because there are more cases
7f16dd3d
JH
1251 * than just upper and lower: one needs to use
1252 * the so-called folding case for case-insensitive
1253 * matching (called "loose matching" in Unicode).
1254 * ibcmp_utf8() will do just that. */
60a8b682 1255
1aa99e6b 1256 if (do_utf8) {
575cac57 1257 UV c, f;
89ebb4a3 1258 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1259 STRLEN len, foldlen;
4ad0818d 1260 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1261 if (c1 == c2) {
5dab1207
NIS
1262 /* Upper and lower of 1st char are equal -
1263 * probably not a "letter". */
1aa99e6b 1264 while (s <= e) {
89ebb4a3 1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1266 uniflags);
4cadc6a9 1267 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1268 }
09091399
JH
1269 }
1270 else {
1aa99e6b 1271 while (s <= e) {
89ebb4a3 1272 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1273 uniflags);
80aecb99 1274
60a8b682 1275 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1276 * Note that not all the possible combinations
1277 * are handled here: some of them are handled
1278 * by the standard folding rules, and some of
1279 * them (the character class or ANYOF cases)
1280 * are handled during compiletime in
1281 * regexec.c:S_regclass(). */
880bd946
JH
1282 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1283 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1284 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1285
4cadc6a9 1286 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1287 }
09091399 1288 }
1aa99e6b
IH
1289 }
1290 else {
1291 if (c1 == c2)
4cadc6a9 1292 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1293 else
4cadc6a9 1294 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1295 }
1296 break;
bbce6d69 1297 case BOUNDL:
3280af22 1298 PL_reg_flags |= RF_tainted;
bbce6d69 1299 /* FALL THROUGH */
a0d0e21e 1300 case BOUND:
ffc61ed2 1301 if (do_utf8) {
12d33761 1302 if (s == PL_bostr)
ffc61ed2
JH
1303 tmp = '\n';
1304 else {
6136c704 1305 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1306 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1307 }
1308 tmp = ((OP(c) == BOUND ?
9041c2e3 1309 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1310 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1311 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1312 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1313 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1314 isALNUM_LC_utf8((U8*)s)))
1315 {
1316 tmp = !tmp;
4cadc6a9 1317 REXEC_FBC_TRYIT;
a687059c 1318 }
4cadc6a9 1319 );
a0d0e21e 1320 }
667bb95a 1321 else {
12d33761 1322 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1323 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1324 REXEC_FBC_SCAN(
ffc61ed2
JH
1325 if (tmp ==
1326 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1327 tmp = !tmp;
4cadc6a9 1328 REXEC_FBC_TRYIT;
a0ed51b3 1329 }
4cadc6a9 1330 );
a0ed51b3 1331 }
24b23f37 1332 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1333 goto got_it;
1334 break;
bbce6d69 1335 case NBOUNDL:
3280af22 1336 PL_reg_flags |= RF_tainted;
bbce6d69 1337 /* FALL THROUGH */
a0d0e21e 1338 case NBOUND:
ffc61ed2 1339 if (do_utf8) {
12d33761 1340 if (s == PL_bostr)
ffc61ed2
JH
1341 tmp = '\n';
1342 else {
6136c704 1343 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1344 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1345 }
1346 tmp = ((OP(c) == NBOUND ?
9041c2e3 1347 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1348 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1349 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1350 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1351 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1352 isALNUM_LC_utf8((U8*)s)))
1353 tmp = !tmp;
4cadc6a9
YO
1354 else REXEC_FBC_TRYIT;
1355 );
a0d0e21e 1356 }
667bb95a 1357 else {
12d33761 1358 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1359 tmp = ((OP(c) == NBOUND ?
1360 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1361 REXEC_FBC_SCAN(
ffc61ed2
JH
1362 if (tmp ==
1363 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1364 tmp = !tmp;
4cadc6a9
YO
1365 else REXEC_FBC_TRYIT;
1366 );
a0ed51b3 1367 }
24b23f37 1368 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1369 goto got_it;
1370 break;
a0d0e21e 1371 case ALNUM:
4cadc6a9
YO
1372 REXEC_FBC_CSCAN_PRELOAD(
1373 LOAD_UTF8_CHARCLASS_ALNUM(),
1374 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1375 isALNUM(*s)
1376 );
bbce6d69 1377 case ALNUML:
4cadc6a9
YO
1378 REXEC_FBC_CSCAN_TAINT(
1379 isALNUM_LC_utf8((U8*)s),
1380 isALNUM_LC(*s)
1381 );
a0d0e21e 1382 case NALNUM:
4cadc6a9
YO
1383 REXEC_FBC_CSCAN_PRELOAD(
1384 LOAD_UTF8_CHARCLASS_ALNUM(),
1385 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1386 !isALNUM(*s)
1387 );
bbce6d69 1388 case NALNUML:
4cadc6a9
YO
1389 REXEC_FBC_CSCAN_TAINT(
1390 !isALNUM_LC_utf8((U8*)s),
1391 !isALNUM_LC(*s)
1392 );
a0d0e21e 1393 case SPACE:
4cadc6a9
YO
1394 REXEC_FBC_CSCAN_PRELOAD(
1395 LOAD_UTF8_CHARCLASS_SPACE(),
1396 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1397 isSPACE(*s)
1398 );
bbce6d69 1399 case SPACEL:
4cadc6a9
YO
1400 REXEC_FBC_CSCAN_TAINT(
1401 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1402 isSPACE_LC(*s)
1403 );
a0d0e21e 1404 case NSPACE:
4cadc6a9
YO
1405 REXEC_FBC_CSCAN_PRELOAD(
1406 LOAD_UTF8_CHARCLASS_SPACE(),
1407 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1408 !isSPACE(*s)
1409 );
bbce6d69 1410 case NSPACEL:
4cadc6a9
YO
1411 REXEC_FBC_CSCAN_TAINT(
1412 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1413 !isSPACE_LC(*s)
1414 );
a0d0e21e 1415 case DIGIT:
4cadc6a9
YO
1416 REXEC_FBC_CSCAN_PRELOAD(
1417 LOAD_UTF8_CHARCLASS_DIGIT(),
1418 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1419 isDIGIT(*s)
1420 );
b8c5462f 1421 case DIGITL:
4cadc6a9
YO
1422 REXEC_FBC_CSCAN_TAINT(
1423 isDIGIT_LC_utf8((U8*)s),
1424 isDIGIT_LC(*s)
1425 );
a0d0e21e 1426 case NDIGIT:
4cadc6a9
YO
1427 REXEC_FBC_CSCAN_PRELOAD(
1428 LOAD_UTF8_CHARCLASS_DIGIT(),
1429 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1430 !isDIGIT(*s)
1431 );
b8c5462f 1432 case NDIGITL:
4cadc6a9
YO
1433 REXEC_FBC_CSCAN_TAINT(
1434 !isDIGIT_LC_utf8((U8*)s),
1435 !isDIGIT_LC(*s)
1436 );
e1d1eefb
YO
1437 case LNBREAK:
1438 REXEC_FBC_CSCAN(
1439 is_LNBREAK_utf8(s),
1440 is_LNBREAK_latin1(s)
1441 );
1442 case VERTWS:
1443 REXEC_FBC_CSCAN(
1444 is_VERTWS_utf8(s),
1445 is_VERTWS_latin1(s)
1446 );
1447 case NVERTWS:
1448 REXEC_FBC_CSCAN(
1449 !is_VERTWS_utf8(s),
1450 !is_VERTWS_latin1(s)
1451 );
1452 case HORIZWS:
1453 REXEC_FBC_CSCAN(
1454 is_HORIZWS_utf8(s),
1455 is_HORIZWS_latin1(s)
1456 );
1457 case NHORIZWS:
1458 REXEC_FBC_CSCAN(
1459 !is_HORIZWS_utf8(s),
1460 !is_HORIZWS_latin1(s)
1461 );
1de06328
YO
1462 case AHOCORASICKC:
1463 case AHOCORASICK:
07be1b83
YO
1464 {
1465 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1466 trie_type = do_utf8 ?
1467 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1468 : trie_plain;
1469 /* what trie are we using right now */
1470 reg_ac_data *aho
f8fc2ecf 1471 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1472 reg_trie_data *trie
1473 = (reg_trie_data*)progi->data->data[ aho->trie ];
55eed653 1474 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
07be1b83
YO
1475
1476 const char *last_start = strend - trie->minlen;
6148ee25 1477#ifdef DEBUGGING
07be1b83 1478 const char *real_start = s;
6148ee25 1479#endif
07be1b83 1480 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1481 SV *sv_points;
1482 U8 **points; /* map of where we were in the input string
786e8c11 1483 when reading a given char. For ASCII this
be8e71aa
YO
1484 is unnecessary overhead as the relationship
1485 is always 1:1, but for unicode, especially
1486 case folded unicode this is not true. */
f9e705e8 1487 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1488 U8 *bitmap=NULL;
1489
07be1b83
YO
1490
1491 GET_RE_DEBUG_FLAGS_DECL;
1492
be8e71aa
YO
1493 /* We can't just allocate points here. We need to wrap it in
1494 * an SV so it gets freed properly if there is a croak while
1495 * running the match */
1496 ENTER;
1497 SAVETMPS;
1498 sv_points=newSV(maxlen * sizeof(U8 *));
1499 SvCUR_set(sv_points,
1500 maxlen * sizeof(U8 *));
1501 SvPOK_on(sv_points);
1502 sv_2mortal(sv_points);
1503 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1504 if ( trie_type != trie_utf8_fold
1505 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1506 {
786e8c11
YO
1507 if (trie->bitmap)
1508 bitmap=(U8*)trie->bitmap;
1509 else
1510 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1511 }
786e8c11
YO
1512 /* this is the Aho-Corasick algorithm modified a touch
1513 to include special handling for long "unknown char"
1514 sequences. The basic idea being that we use AC as long
1515 as we are dealing with a possible matching char, when
1516 we encounter an unknown char (and we have not encountered
1517 an accepting state) we scan forward until we find a legal
1518 starting char.
1519 AC matching is basically that of trie matching, except
1520 that when we encounter a failing transition, we fall back
1521 to the current states "fail state", and try the current char
1522 again, a process we repeat until we reach the root state,
1523 state 1, or a legal transition. If we fail on the root state
1524 then we can either terminate if we have reached an accepting
1525 state previously, or restart the entire process from the beginning
1526 if we have not.
1527
1528 */
07be1b83
YO
1529 while (s <= last_start) {
1530 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1531 U8 *uc = (U8*)s;
1532 U16 charid = 0;
1533 U32 base = 1;
1534 U32 state = 1;
1535 UV uvc = 0;
1536 STRLEN len = 0;
1537 STRLEN foldlen = 0;
1538 U8 *uscan = (U8*)NULL;
1539 U8 *leftmost = NULL;
786e8c11
YO
1540#ifdef DEBUGGING
1541 U32 accepted_word= 0;
1542#endif
07be1b83
YO
1543 U32 pointpos = 0;
1544
1545 while ( state && uc <= (U8*)strend ) {
1546 int failed=0;
786e8c11
YO
1547 U32 word = aho->states[ state ].wordnum;
1548
1de06328
YO
1549 if( state==1 ) {
1550 if ( bitmap ) {
1551 DEBUG_TRIE_EXECUTE_r(
1552 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1553 dump_exec_pos( (char *)uc, c, strend, real_start,
1554 (char *)uc, do_utf8 );
1555 PerlIO_printf( Perl_debug_log,
1556 " Scanning for legal start char...\n");
1557 }
1558 );
1559 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1560 uc++;
786e8c11 1561 }
1de06328 1562 s= (char *)uc;
786e8c11 1563 }
786e8c11
YO
1564 if (uc >(U8*)last_start) break;
1565 }
1566
1567 if ( word ) {
1568 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1569 if (!leftmost || lpos < leftmost) {
1570 DEBUG_r(accepted_word=word);
07be1b83 1571 leftmost= lpos;
786e8c11 1572 }
07be1b83 1573 if (base==0) break;
786e8c11 1574
07be1b83
YO
1575 }
1576 points[pointpos++ % maxlen]= uc;
55eed653
NC
1577 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1578 uscan, len, uvc, charid, foldlen,
1579 foldbuf, uniflags);
786e8c11
YO
1580 DEBUG_TRIE_EXECUTE_r({
1581 dump_exec_pos( (char *)uc, c, strend, real_start,
1582 s, do_utf8 );
07be1b83 1583 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1584 " Charid:%3u CP:%4"UVxf" ",
1585 charid, uvc);
1586 });
07be1b83
YO
1587
1588 do {
6148ee25 1589#ifdef DEBUGGING
786e8c11 1590 word = aho->states[ state ].wordnum;
6148ee25 1591#endif
07be1b83
YO
1592 base = aho->states[ state ].trans.base;
1593
786e8c11
YO
1594 DEBUG_TRIE_EXECUTE_r({
1595 if (failed)
1596 dump_exec_pos( (char *)uc, c, strend, real_start,
1597 s, do_utf8 );
07be1b83 1598 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1599 "%sState: %4"UVxf", word=%"UVxf,
1600 failed ? " Fail transition to " : "",
1601 (UV)state, (UV)word);
1602 });
07be1b83
YO
1603 if ( base ) {
1604 U32 tmp;
1605 if (charid &&
1606 (base + charid > trie->uniquecharcount )
1607 && (base + charid - 1 - trie->uniquecharcount
1608 < trie->lasttrans)
1609 && trie->trans[base + charid - 1 -
1610 trie->uniquecharcount].check == state
1611 && (tmp=trie->trans[base + charid - 1 -
1612 trie->uniquecharcount ].next))
1613 {
786e8c11
YO
1614 DEBUG_TRIE_EXECUTE_r(
1615 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1616 state = tmp;
1617 break;
1618 }
1619 else {
786e8c11
YO
1620 DEBUG_TRIE_EXECUTE_r(
1621 PerlIO_printf( Perl_debug_log," - fail\n"));
1622 failed = 1;
1623 state = aho->fail[state];
07be1b83
YO
1624 }
1625 }
1626 else {
1627 /* we must be accepting here */
786e8c11
YO
1628 DEBUG_TRIE_EXECUTE_r(
1629 PerlIO_printf( Perl_debug_log," - accepting\n"));
1630 failed = 1;
07be1b83
YO
1631 break;
1632 }
1633 } while(state);
786e8c11 1634 uc += len;
07be1b83
YO
1635 if (failed) {
1636 if (leftmost)
1637 break;
786e8c11 1638 if (!state) state = 1;
07be1b83
YO
1639 }
1640 }
1641 if ( aho->states[ state ].wordnum ) {
1642 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1643 if (!leftmost || lpos < leftmost) {
1644 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1645 leftmost = lpos;
786e8c11 1646 }
07be1b83 1647 }
07be1b83
YO
1648 if (leftmost) {
1649 s = (char*)leftmost;
786e8c11
YO
1650 DEBUG_TRIE_EXECUTE_r({
1651 PerlIO_printf(
70685ca0
JH
1652 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1653 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1654 );
1655 });
24b23f37 1656 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1657 FREETMPS;
1658 LEAVE;
07be1b83 1659 goto got_it;
be8e71aa 1660 }
07be1b83 1661 s = HOPc(s,1);
786e8c11
YO
1662 DEBUG_TRIE_EXECUTE_r({
1663 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1664 });
07be1b83 1665 } else {
786e8c11
YO
1666 DEBUG_TRIE_EXECUTE_r(
1667 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1668 break;
1669 }
1670 }
be8e71aa
YO
1671 FREETMPS;
1672 LEAVE;
07be1b83
YO
1673 }
1674 break;
b3c9acc1 1675 default:
3c3eec57
GS
1676 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1677 break;
d6a28714 1678 }
6eb5f6b9
JH
1679 return 0;
1680 got_it:
1681 return s;
1682}
1683
e6aad7ab 1684static void
fae667d5 1685S_swap_match_buff (pTHX_ regexp *prog) {
f0ab9afb 1686 regexp_paren_pair *t;
fae667d5 1687
28d8d7f4 1688 if (!prog->swap) {
fae667d5
YO
1689 /* We have to be careful. If the previous successful match
1690 was from this regex we don't want a subsequent paritally
1691 successful match to clobber the old results.
1692 So when we detect this possibility we add a swap buffer
1693 to the re, and switch the buffer each match. If we fail
1694 we switch it back, otherwise we leave it swapped.
1695 */
f0ab9afb 1696 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
fae667d5 1697 }
f0ab9afb
NC
1698 t = prog->swap;
1699 prog->swap = prog->offs;
1700 prog->offs = t;
fae667d5
YO
1701}
1702
1703
6eb5f6b9
JH
1704/*
1705 - regexec_flags - match a regexp against a string
1706 */
1707I32
1708Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1709 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1710/* strend: pointer to null at end of string */
1711/* strbeg: real beginning of string */
1712/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1713/* data: May be used for some additional optimizations.
1714 Currently its only used, with a U32 cast, for transmitting
1715 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1716/* nosave: For optimizations. */
1717{
97aff369 1718 dVAR;
24b23f37 1719 /*register*/ char *s;
6eb5f6b9 1720 register regnode *c;
24b23f37 1721 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1722 I32 minlen; /* must match at least this many chars */
1723 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1724 I32 end_shift = 0; /* Same for the end. */ /* CC */
1725 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1726 char *scream_olds = NULL;
f9f4320a 1727 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1728 I32 multiline;
f8fc2ecf 1729 RXi_GET_DECL(prog,progi);
3b0527fe 1730 regmatch_info reginfo; /* create some info to pass to regtry etc */
fae667d5 1731 bool swap_on_fail = 0;
a3621e74
YO
1732
1733 GET_RE_DEBUG_FLAGS_DECL;
1734
9d4ba2ae 1735 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1736
1737 /* Be paranoid... */
1738 if (prog == NULL || startpos == NULL) {
1739 Perl_croak(aTHX_ "NULL regexp parameter");
1740 return 0;
1741 }
1742
bbe252da 1743 multiline = prog->extflags & RXf_PMf_MULTILINE;
3b0527fe 1744 reginfo.prog = prog;
2757e526 1745
bac06658 1746 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1747 DEBUG_EXECUTE_r(
1748 debug_start_match(prog, do_utf8, startpos, strend,
1749 "Matching");
1750 );
bac06658 1751
6eb5f6b9 1752 minlen = prog->minlen;
1de06328
YO
1753
1754 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1755 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1756 "String too short [regexec_flags]...\n"));
1757 goto phooey;
1aa99e6b 1758 }
6eb5f6b9 1759
1de06328 1760
6eb5f6b9 1761 /* Check validity of program. */
f8fc2ecf 1762 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1763 Perl_croak(aTHX_ "corrupted regexp program");
1764 }
1765
1766 PL_reg_flags = 0;
1767 PL_reg_eval_set = 0;
1768 PL_reg_maxiter = 0;
1769
bbe252da 1770 if (prog->extflags & RXf_UTF8)
6eb5f6b9
JH
1771 PL_reg_flags |= RF_utf8;
1772
1773 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1774 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1775 PL_bostr = strbeg;
3b0527fe 1776 reginfo.sv = sv;
6eb5f6b9
JH
1777
1778 /* Mark end of line for $ (and such) */
1779 PL_regeol = strend;
1780
1781 /* see how far we have to get to not match where we matched before */
3b0527fe 1782 reginfo.till = startpos+minend;
6eb5f6b9 1783
6eb5f6b9
JH
1784 /* If there is a "must appear" string, look for it. */
1785 s = startpos;
1786
bbe252da 1787 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1788 MAGIC *mg;
1789
1790 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1791 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1792 else if (sv && SvTYPE(sv) >= SVt_PVMG
1793 && SvMAGIC(sv)
14befaf4
DM
1794 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1795 && mg->mg_len >= 0) {
3b0527fe 1796 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1797 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1798 if (s > reginfo.ganch)
6eb5f6b9 1799 goto phooey;
58e23c8d 1800 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1801 }
1802 }
58e23c8d 1803 else if (data) {
70685ca0 1804 reginfo.ganch = strbeg + PTR2UV(data);
58e23c8d 1805 } else /* pos() not defined */
3b0527fe 1806 reginfo.ganch = strbeg;
6eb5f6b9 1807 }
c74340f9 1808 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
fae667d5
YO
1809 swap_on_fail = 1;
1810 swap_match_buff(prog); /* do we need a save destructor here for
1811 eval dies? */
c74340f9 1812 }
a0714e2c 1813 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1814 re_scream_pos_data d;
1815
1816 d.scream_olds = &scream_olds;
1817 d.scream_pos = &scream_pos;
1818 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1819 if (!s) {
a3621e74 1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1821 goto phooey; /* not present */
3fa9c3d7 1822 }
6eb5f6b9
JH
1823 }
1824
1de06328 1825
6eb5f6b9
JH
1826
1827 /* Simplest case: anchored match need be tried only once. */
1828 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1829 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1830 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1831 goto got_it;
bbe252da
YO
1832 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1833 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1834 {
1835 char *end;
1836
1837 if (minlen)
1838 dontbother = minlen - 1;
1aa99e6b 1839 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1840 /* for multiline we only have to try after newlines */
33b8afdf 1841 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1842 if (s == startpos)
1843 goto after_try;
1844 while (1) {
24b23f37 1845 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1846 goto got_it;
1847 after_try:
1848 if (s >= end)
1849 goto phooey;
bbe252da 1850 if (prog->extflags & RXf_USE_INTUIT) {
6eb5f6b9
JH
1851 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1852 if (!s)
1853 goto phooey;
1854 }
1855 else
1856 s++;
1857 }
1858 } else {
1859 if (s > startpos)
1860 s--;
1861 while (s < end) {
1862 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1863 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1864 goto got_it;
1865 }
1866 }
1867 }
1868 }
1869 goto phooey;
bbe252da 1870 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1871 {
1872 /* the warning about reginfo.ganch being used without intialization
bbe252da 1873 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1874 and we only enter this block when the same bit is set. */
58e23c8d
YO
1875 char *tmp_s = reginfo.ganch - prog->gofs;
1876 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1877 goto got_it;
1878 goto phooey;
1879 }
1880
1881 /* Messy cases: unanchored match. */
bbe252da 1882 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1883 /* we have /x+whatever/ */
1884 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1885 char ch;
bf93d4cc
GS
1886#ifdef DEBUGGING
1887 int did_match = 0;
1888#endif
33b8afdf
JH
1889 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1890 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1891 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1892
1aa99e6b 1893 if (do_utf8) {
4cadc6a9 1894 REXEC_FBC_SCAN(
6eb5f6b9 1895 if (*s == ch) {
a3621e74 1896 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1897 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1898 s += UTF8SKIP(s);
1899 while (s < strend && *s == ch)
1900 s += UTF8SKIP(s);
1901 }
4cadc6a9 1902 );
6eb5f6b9
JH
1903 }
1904 else {
4cadc6a9 1905 REXEC_FBC_SCAN(
6eb5f6b9 1906 if (*s == ch) {
a3621e74 1907 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1908 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1909 s++;
1910 while (s < strend && *s == ch)
1911 s++;
1912 }
4cadc6a9 1913 );
6eb5f6b9 1914 }
a3621e74 1915 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1916 PerlIO_printf(Perl_debug_log,
b7953727
JH
1917 "Did not find anchored character...\n")
1918 );
6eb5f6b9 1919 }
a0714e2c
SS
1920 else if (prog->anchored_substr != NULL
1921 || prog->anchored_utf8 != NULL
1922 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1923 && prog->float_max_offset < strend - s)) {
1924 SV *must;
1925 I32 back_max;
1926 I32 back_min;
1927 char *last;
6eb5f6b9 1928 char *last1; /* Last position checked before */
bf93d4cc
GS
1929#ifdef DEBUGGING
1930 int did_match = 0;
1931#endif
33b8afdf
JH
1932 if (prog->anchored_substr || prog->anchored_utf8) {
1933 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1934 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1935 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1936 back_max = back_min = prog->anchored_offset;
1937 } else {
1938 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1939 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1940 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1941 back_max = prog->float_max_offset;
1942 back_min = prog->float_min_offset;
1943 }
1de06328
YO
1944
1945
33b8afdf
JH
1946 if (must == &PL_sv_undef)
1947 /* could not downgrade utf8 check substring, so must fail */
1948 goto phooey;
1949
1de06328
YO
1950 if (back_min<0) {
1951 last = strend;
1952 } else {
1953 last = HOP3c(strend, /* Cannot start after this */
1954 -(I32)(CHR_SVLEN(must)
1955 - (SvTAIL(must) != 0) + back_min), strbeg);
1956 }
6eb5f6b9
JH
1957 if (s > PL_bostr)
1958 last1 = HOPc(s, -1);
1959 else
1960 last1 = s - 1; /* bogus */
1961
a0288114 1962 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1963 check_substr==must. */
1964 scream_pos = -1;
1965 dontbother = end_shift;
1966 strend = HOPc(strend, -dontbother);
1967 while ( (s <= last) &&
9041c2e3 1968 ((flags & REXEC_SCREAM)
1de06328 1969 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1970 end_shift, &scream_pos, 0))
1de06328 1971 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1972 (unsigned char*)strend, must,
7fba1cd6 1973 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1974 /* we may be pointing at the wrong string */
1975 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1976 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1977 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1978 if (HOPc(s, -back_max) > last1) {
1979 last1 = HOPc(s, -back_min);
1980 s = HOPc(s, -back_max);
1981 }
1982 else {
52657f30 1983 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1984
1985 last1 = HOPc(s, -back_min);
52657f30 1986 s = t;
6eb5f6b9 1987 }
1aa99e6b 1988 if (do_utf8) {
6eb5f6b9 1989 while (s <= last1) {
24b23f37 1990 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1991 goto got_it;
1992 s += UTF8SKIP(s);
1993 }
1994 }
1995 else {
1996 while (s <= last1) {
24b23f37 1997 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1998 goto got_it;
1999 s++;
2000 }
2001 }
2002 }
ab3bbdeb
YO
2003 DEBUG_EXECUTE_r(if (!did_match) {
2004 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2005 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2006 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2007 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2008 ? "anchored" : "floating"),
ab3bbdeb
YO
2009 quoted, RE_SV_TAIL(must));
2010 });
6eb5f6b9
JH
2011 goto phooey;
2012 }
f8fc2ecf 2013 else if ( (c = progi->regstclass) ) {
f14c76ed 2014 if (minlen) {
f8fc2ecf 2015 const OPCODE op = OP(progi->regstclass);
66e933ab 2016 /* don't bother with what can't match */
786e8c11 2017 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2018 strend = HOPc(strend, -(minlen - 1));
2019 }
a3621e74 2020 DEBUG_EXECUTE_r({
be8e71aa 2021 SV * const prop = sv_newmortal();
32fc9b6a 2022 regprop(prog, prop, c);
0df25f3d 2023 {
02daf0ab 2024 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2025 s,strend-s,60);
0df25f3d 2026 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2027 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2028 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2029 quoted, (int)(strend - s));
0df25f3d 2030 }
ffc61ed2 2031 });
3b0527fe 2032 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2033 goto got_it;
07be1b83 2034 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2035 }
2036 else {
2037 dontbother = 0;
a0714e2c 2038 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2039 /* Trim the end. */
d6a28714 2040 char *last;
33b8afdf
JH
2041 SV* float_real;
2042
2043 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2044 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2045 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2046
2047 if (flags & REXEC_SCREAM) {
33b8afdf 2048 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2049 end_shift, &scream_pos, 1); /* last one */
2050 if (!last)
ffc61ed2 2051 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2052 /* we may be pointing at the wrong string */
2053 else if (RX_MATCH_COPIED(prog))
3f7c398e 2054 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2055 }
d6a28714
JH
2056 else {
2057 STRLEN len;
cfd0369c 2058 const char * const little = SvPV_const(float_real, len);
d6a28714 2059
33b8afdf 2060 if (SvTAIL(float_real)) {
d6a28714
JH
2061 if (memEQ(strend - len + 1, little, len - 1))
2062 last = strend - len + 1;
7fba1cd6 2063 else if (!multiline)
9041c2e3 2064 last = memEQ(strend - len, little, len)
bd61b366 2065 ? strend - len : NULL;
b8c5462f 2066 else
d6a28714
JH
2067 goto find_last;
2068 } else {
2069 find_last:
9041c2e3 2070 if (len)
d6a28714 2071 last = rninstr(s, strend, little, little + len);
b8c5462f 2072 else
a0288114 2073 last = strend; /* matching "$" */
b8c5462f 2074 }
b8c5462f 2075 }
bf93d4cc 2076 if (last == NULL) {
6bda09f9
YO
2077 DEBUG_EXECUTE_r(
2078 PerlIO_printf(Perl_debug_log,
2079 "%sCan't trim the tail, match fails (should not happen)%s\n",
2080 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2081 goto phooey; /* Should not happen! */
2082 }
d6a28714
JH
2083 dontbother = strend - last + prog->float_min_offset;
2084 }
2085 if (minlen && (dontbother < minlen))
2086 dontbother = minlen - 1;
2087 strend -= dontbother; /* this one's always in bytes! */
2088 /* We don't know much -- general case. */
1aa99e6b 2089 if (do_utf8) {
d6a28714 2090 for (;;) {
24b23f37 2091 if (regtry(&reginfo, &s))
d6a28714
JH
2092 goto got_it;
2093 if (s >= strend)
2094 break;
b8c5462f 2095 s += UTF8SKIP(s);
d6a28714
JH
2096 };
2097 }
2098 else {
2099 do {
24b23f37 2100 if (regtry(&reginfo, &s))
d6a28714
JH
2101 goto got_it;
2102 } while (s++ < strend);
2103 }
2104 }
2105
2106 /* Failure. */
2107 goto phooey;
2108
2109got_it:
2110 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2111
19b95bf0 2112 if (PL_reg_eval_set)
4f639d21 2113 restore_pos(aTHX_ prog);
81714fb9
YO
2114 if (prog->paren_names)
2115 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2116
2117 /* make sure $`, $&, $', and $digit will work later */
2118 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2119 RX_MATCH_COPY_FREE(prog);
d6a28714 2120 if (flags & REXEC_COPY_STR) {
be8e71aa 2121 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2122#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2123 if ((SvIsCOW(sv)
2124 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2125 if (DEBUG_C_TEST) {
2126 PerlIO_printf(Perl_debug_log,
2127 "Copy on write: regexp capture, type %d\n",
2128 (int) SvTYPE(sv));
2129 }
2130 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2131 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2132 assert (SvPOKp(prog->saved_copy));
2133 } else
2134#endif
2135 {
2136 RX_MATCH_COPIED_on(prog);
2137 s = savepvn(strbeg, i);
2138 prog->subbeg = s;
2139 }
d6a28714 2140 prog->sublen = i;
d6a28714
JH
2141 }
2142 else {
2143 prog->subbeg = strbeg;
2144 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2145 }
2146 }
9041c2e3 2147
d6a28714
JH
2148 return 1;
2149
2150phooey:
a3621e74 2151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2152 PL_colors[4], PL_colors[5]));
d6a28714 2153 if (PL_reg_eval_set)
4f639d21 2154 restore_pos(aTHX_ prog);
fae667d5 2155 if (swap_on_fail)
c74340f9 2156 /* we failed :-( roll it back */
fae667d5
YO
2157 swap_match_buff(prog);
2158
d6a28714
JH
2159 return 0;
2160}
2161
6bda09f9 2162
d6a28714
JH
2163/*
2164 - regtry - try match at specific point
2165 */
2166STATIC I32 /* 0 failure, 1 success */
24b23f37 2167S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2168{
97aff369 2169 dVAR;
d6a28714 2170 CHECKPOINT lastcp;
3b0527fe 2171 regexp *prog = reginfo->prog;
f8fc2ecf 2172 RXi_GET_DECL(prog,progi);
a3621e74 2173 GET_RE_DEBUG_FLAGS_DECL;
24b23f37 2174 reginfo->cutpoint=NULL;
d6a28714 2175
bbe252da 2176 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2177 MAGIC *mg;
2178
2179 PL_reg_eval_set = RS_init;
a3621e74 2180 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2181 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2182 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2183 ));
ea8d6ae1 2184 SAVESTACK_CXPOS();
d6a28714
JH
2185 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2186 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2187 SAVETMPS;
2188 /* Apparently this is not needed, judging by wantarray. */
e8347627 2189 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2190 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2191
3b0527fe 2192 if (reginfo->sv) {
d6a28714 2193 /* Make $_ available to executed code. */
3b0527fe 2194 if (reginfo->sv != DEFSV) {
59f00321 2195 SAVE_DEFSV;
3b0527fe 2196 DEFSV = reginfo->sv;
b8c5462f 2197 }
d6a28714 2198
3b0527fe
DM
2199 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2200 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2201 /* prepare for quick setting of pos */
d300d9fa 2202#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2203 if (SvIsCOW(reginfo->sv))
2204 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2205#endif
3dab1dad 2206 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2207 &PL_vtbl_mglob, NULL, 0);
d6a28714 2208 mg->mg_len = -1;
b8c5462f 2209 }
d6a28714
JH
2210 PL_reg_magic = mg;
2211 PL_reg_oldpos = mg->mg_len;
4f639d21 2212 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2213 }
09687e5a 2214 if (!PL_reg_curpm) {
a02a5408 2215 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2216#ifdef USE_ITHREADS
2217 {
be8e71aa 2218 SV* const repointer = newSViv(0);
577e12cc 2219 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2220 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2221 av_push(PL_regex_padav,repointer);
2222 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2223 PL_regex_pad = AvARRAY(PL_regex_padav);
2224 }
2225#endif
2226 }
aaa362c4 2227 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2228 PL_reg_oldcurpm = PL_curpm;
2229 PL_curpm = PL_reg_curpm;
2230 if (RX_MATCH_COPIED(prog)) {
2231 /* Here is a serious problem: we cannot rewrite subbeg,
2232 since it may be needed if this match fails. Thus
2233 $` inside (?{}) could fail... */
2234 PL_reg_oldsaved = prog->subbeg;
2235 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2236#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2237 PL_nrs = prog->saved_copy;
2238#endif
d6a28714
JH
2239 RX_MATCH_COPIED_off(prog);
2240 }
2241 else
bd61b366 2242 PL_reg_oldsaved = NULL;
d6a28714
JH
2243 prog->subbeg = PL_bostr;
2244 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2245 }
24b23f37 2246 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2247 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2248 PL_reginput = *startpos;
d6a28714 2249 PL_reglastparen = &prog->lastparen;
a01268b5 2250 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2251 prog->lastparen = 0;
03994de8 2252 prog->lastcloseparen = 0;
d6a28714 2253 PL_regsize = 0;
f0ab9afb 2254 PL_regoffs = prog->offs;
d6a28714
JH
2255 if (PL_reg_start_tmpl <= prog->nparens) {
2256 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2257 if(PL_reg_start_tmp)
2258 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2259 else
a02a5408 2260 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2261 }
2262
2263 /* XXXX What this code is doing here?!!! There should be no need
2264 to do this again and again, PL_reglastparen should take care of
3dd2943c 2265 this! --ilya*/
dafc8851
JH
2266
2267 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2268 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2269 * PL_reglastparen), is not needed at all by the test suite
2270 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2271 * enough, for building DynaLoader, or otherwise this
2272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2273 * will happen. Meanwhile, this code *is* needed for the
2274 * above-mentioned test suite tests to succeed. The common theme
2275 * on those tests seems to be returning null fields from matches.
2276 * --jhi */
dafc8851 2277#if 1
d6a28714 2278 if (prog->nparens) {
f0ab9afb 2279 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2280 register I32 i;
eb160463 2281 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2282 ++pp;
2283 pp->start = -1;
2284 pp->end = -1;
d6a28714
JH
2285 }
2286 }
dafc8851 2287#endif
02db2b7b 2288 REGCP_SET(lastcp);
f8fc2ecf 2289 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2290 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2291 return 1;
2292 }
24b23f37
YO
2293 if (reginfo->cutpoint)
2294 *startpos= reginfo->cutpoint;
02db2b7b 2295 REGCP_UNWIND(lastcp);
d6a28714
JH
2296 return 0;
2297}
2298
02db2b7b 2299
8ba1375e
MJD
2300#define sayYES goto yes
2301#define sayNO goto no
262b90c4 2302#define sayNO_SILENT goto no_silent
8ba1375e 2303
f9f4320a
YO
2304/* we dont use STMT_START/END here because it leads to
2305 "unreachable code" warnings, which are bogus, but distracting. */
2306#define CACHEsayNO \
c476f425
DM
2307 if (ST.cache_mask) \
2308 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2309 sayNO
3298f257 2310
a3621e74 2311/* this is used to determine how far from the left messages like
265c4333
YO
2312 'failed...' are printed. It should be set such that messages
2313 are inline with the regop output that created them.
a3621e74 2314*/
265c4333 2315#define REPORT_CODE_OFF 32
a3621e74
YO
2316
2317
2318/* Make sure there is a test for this +1 options in re_tests */
2319#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2320
40a82448
DM
2321#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2322#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2323
86545054
DM
2324#define SLAB_FIRST(s) (&(s)->states[0])
2325#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2326
5d9a96ca
DM
2327/* grab a new slab and return the first slot in it */
2328
2329STATIC regmatch_state *
2330S_push_slab(pTHX)
2331{
a35a87e7 2332#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2333 dMY_CXT;
2334#endif
5d9a96ca
DM
2335 regmatch_slab *s = PL_regmatch_slab->next;
2336 if (!s) {
2337 Newx(s, 1, regmatch_slab);
2338 s->prev = PL_regmatch_slab;
2339 s->next = NULL;
2340 PL_regmatch_slab->next = s;
2341 }
2342 PL_regmatch_slab = s;
86545054 2343 return SLAB_FIRST(s);
5d9a96ca 2344}
5b47454d 2345
95b24440 2346
40a82448
DM
2347/* push a new state then goto it */
2348
2349#define PUSH_STATE_GOTO(state, node) \
2350 scan = node; \
2351 st->resume_state = state; \
2352 goto push_state;
2353
2354/* push a new state with success backtracking, then goto it */
2355
2356#define PUSH_YES_STATE_GOTO(state, node) \
2357 scan = node; \
2358 st->resume_state = state; \
2359 goto push_yes_state;
2360
aa283a38 2361
aa283a38 2362
d6a28714 2363/*
95b24440 2364
bf1f174e
DM
2365regmatch() - main matching routine
2366
2367This is basically one big switch statement in a loop. We execute an op,
2368set 'next' to point the next op, and continue. If we come to a point which
2369we may need to backtrack to on failure such as (A|B|C), we push a
2370backtrack state onto the backtrack stack. On failure, we pop the top
2371state, and re-enter the loop at the state indicated. If there are no more
2372states to pop, we return failure.
2373
2374Sometimes we also need to backtrack on success; for example /A+/, where
2375after successfully matching one A, we need to go back and try to
2376match another one; similarly for lookahead assertions: if the assertion
2377completes successfully, we backtrack to the state just before the assertion
2378and then carry on. In these cases, the pushed state is marked as
2379'backtrack on success too'. This marking is in fact done by a chain of
2380pointers, each pointing to the previous 'yes' state. On success, we pop to
2381the nearest yes state, discarding any intermediate failure-only states.
2382Sometimes a yes state is pushed just to force some cleanup code to be
2383called at the end of a successful match or submatch; e.g. (??{$re}) uses
2384it to free the inner regex.
2385
2386Note that failure backtracking rewinds the cursor position, while
2387success backtracking leaves it alone.
2388
2389A pattern is complete when the END op is executed, while a subpattern
2390such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2391ops trigger the "pop to last yes state if any, otherwise return true"
2392behaviour.
2393
2394A common convention in this function is to use A and B to refer to the two
2395subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2396the subpattern to be matched possibly multiple times, while B is the entire
2397rest of the pattern. Variable and state names reflect this convention.
2398
2399The states in the main switch are the union of ops and failure/success of
2400substates associated with with that op. For example, IFMATCH is the op
2401that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2402'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2403successfully matched A and IFMATCH_A_fail is a state saying that we have
2404just failed to match A. Resume states always come in pairs. The backtrack
2405state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2406at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2407on success or failure.
2408
2409The struct that holds a backtracking state is actually a big union, with
2410one variant for each major type of op. The variable st points to the
2411top-most backtrack struct. To make the code clearer, within each
2412block of code we #define ST to alias the relevant union.
2413
2414Here's a concrete example of a (vastly oversimplified) IFMATCH
2415implementation:
2416
2417 switch (state) {
2418 ....
2419
2420#define ST st->u.ifmatch
2421
2422 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2423 ST.foo = ...; // some state we wish to save
95b24440 2424 ...
bf1f174e
DM
2425 // push a yes backtrack state with a resume value of
2426 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2427 // first node of A:
2428 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2429 // NOTREACHED
2430
2431 case IFMATCH_A: // we have successfully executed A; now continue with B
2432 next = B;
2433 bar = ST.foo; // do something with the preserved value
2434 break;
2435
2436 case IFMATCH_A_fail: // A failed, so the assertion failed
2437 ...; // do some housekeeping, then ...
2438 sayNO; // propagate the failure
2439
2440#undef ST
95b24440 2441
bf1f174e
DM
2442 ...
2443 }
95b24440 2444
bf1f174e
DM
2445For any old-timers reading this who are familiar with the old recursive
2446approach, the code above is equivalent to:
95b24440 2447
bf1f174e
DM
2448 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2449 {
2450 int foo = ...
95b24440 2451 ...
bf1f174e
DM
2452 if (regmatch(A)) {
2453 next = B;
2454 bar = foo;
2455 break;
95b24440 2456 }
bf1f174e
DM
2457 ...; // do some housekeeping, then ...
2458 sayNO; // propagate the failure
95b24440 2459 }
bf1f174e
DM
2460
2461The topmost backtrack state, pointed to by st, is usually free. If you
2462want to claim it, populate any ST.foo fields in it with values you wish to
2463save, then do one of
2464
2465 PUSH_STATE_GOTO(resume_state, node);
2466 PUSH_YES_STATE_GOTO(resume_state, node);
2467
2468which sets that backtrack state's resume value to 'resume_state', pushes a
2469new free entry to the top of the backtrack stack, then goes to 'node'.
2470On backtracking, the free slot is popped, and the saved state becomes the
2471new free state. An ST.foo field in this new top state can be temporarily
2472accessed to retrieve values, but once the main loop is re-entered, it
2473becomes available for reuse.
2474
2475Note that the depth of the backtrack stack constantly increases during the
2476left-to-right execution of the pattern, rather than going up and down with
2477the pattern nesting. For example the stack is at its maximum at Z at the
2478end of the pattern, rather than at X in the following:
2479
2480 /(((X)+)+)+....(Y)+....Z/
2481
2482The only exceptions to this are lookahead/behind assertions and the cut,
2483(?>A), which pop all the backtrack states associated with A before
2484continuing.
2485
2486Bascktrack state structs are allocated in slabs of about 4K in size.
2487PL_regmatch_state and st always point to the currently active state,
2488and PL_regmatch_slab points to the slab currently containing
2489PL_regmatch_state. The first time regmatch() is called, the first slab is
2490allocated, and is never freed until interpreter destruction. When the slab
2491is full, a new one is allocated and chained to the end. At exit from
2492regmatch(), slabs allocated since entry are freed.
2493
2494*/
95b24440 2495
40a82448 2496
5bc10b2c 2497#define DEBUG_STATE_pp(pp) \
265c4333 2498 DEBUG_STATE_r({ \
5bc10b2c
DM
2499 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2500 PerlIO_printf(Perl_debug_log, \
5d458dd8 2501 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2502 depth*2, "", \
13d6edb4 2503 PL_reg_name[st->resume_state], \
5d458dd8
YO
2504 ((st==yes_state||st==mark_state) ? "[" : ""), \
2505 ((st==yes_state) ? "Y" : ""), \
2506 ((st==mark_state) ? "M" : ""), \
2507 ((st==yes_state||st==mark_state) ? "]" : "") \
2508 ); \
265c4333 2509 });
5bc10b2c 2510
40a82448 2511
3dab1dad 2512#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2513
3df15adc 2514#ifdef DEBUGGING
5bc10b2c 2515
ab3bbdeb
YO
2516STATIC void
2517S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2518 const char *start, const char *end, const char *blurb)
2519{
bbe252da 2520 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
ab3bbdeb
YO
2521 if (!PL_colorset)
2522 reginitcolors();
2523 {
2524 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2525 prog->precomp, prog->prelen, 60);
2526
2527 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2528 start, end - start, 60);
2529
2530 PerlIO_printf(Perl_debug_log,
2531 "%s%s REx%s %s against %s\n",
2532 PL_colors[4], blurb, PL_colors[5], s0, s1);
2533
2534 if (do_utf8||utf8_pat)
1de06328
YO
2535 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2536 utf8_pat ? "pattern" : "",
2537 utf8_pat && do_utf8 ? " and " : "",
2538 do_utf8 ? "string" : ""
ab3bbdeb
YO
2539 );
2540 }
2541}
3df15adc
YO
2542
2543STATIC void
786e8c11
YO
2544S_dump_exec_pos(pTHX_ const char *locinput,
2545 const regnode *scan,
2546 const char *loc_regeol,
2547 const char *loc_bostr,
2548 const char *loc_reg_starttry,
2549 const bool do_utf8)
07be1b83 2550{
786e8c11 2551 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2552 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2553 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2554 /* The part of the string before starttry has one color
2555 (pref0_len chars), between starttry and current
2556 position another one (pref_len - pref0_len chars),
2557 after the current position the third one.
2558 We assume that pref0_len <= pref_len, otherwise we
2559 decrease pref0_len. */
786e8c11
YO
2560 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2561 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2562 int pref0_len;
2563
2564 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2565 pref_len++;
786e8c11
YO
2566 pref0_len = pref_len - (locinput - loc_reg_starttry);
2567 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2568 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2569 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2570 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2571 l--;
2572 if (pref0_len < 0)
2573 pref0_len = 0;
2574 if (pref0_len > pref_len)
2575 pref0_len = pref_len;
2576 {
3df15adc 2577 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2578
ab3bbdeb 2579 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2580 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2581
ab3bbdeb 2582 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2583 (locinput - pref_len + pref0_len),
1de06328 2584 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2585
ab3bbdeb 2586 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2587 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2588
1de06328 2589 const STRLEN tlen=len0+len1+len2;
3df15adc 2590 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2591 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2592 (IV)(locinput - loc_bostr),
07be1b83 2593 len0, s0,
07be1b83 2594 len1, s1,
07be1b83 2595 (docolor ? "" : "> <"),
07be1b83 2596 len2, s2,
f9f4320a 2597 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2598 "");
2599 }
2600}
3df15adc 2601
07be1b83
YO
2602#endif
2603
0a4db386
YO
2604/* reg_check_named_buff_matched()
2605 * Checks to see if a named buffer has matched. The data array of
2606 * buffer numbers corresponding to the buffer is expected to reside
2607 * in the regexp->data->data array in the slot stored in the ARG() of
2608 * node involved. Note that this routine doesn't actually care about the
2609 * name, that information is not preserved from compilation to execution.
2610 * Returns the index of the leftmost defined buffer with the given name
2611 * or 0 if non of the buffers matched.
2612 */
2613STATIC I32
2614S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2615 I32 n;
f8fc2ecf
YO
2616 RXi_GET_DECL(rex,rexi);
2617 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
0a4db386
YO
2618 I32 *nums=(I32*)SvPVX(sv_dat);
2619 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2620 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2621 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2622 {
2623 return nums[n];
2624 }
2625 }
2626 return 0;
2627}
2628
2f554ef7
DM
2629
2630/* free all slabs above current one - called during LEAVE_SCOPE */
2631
2632STATIC void
2633S_clear_backtrack_stack(pTHX_ void *p)
2634{
2635 regmatch_slab *s = PL_regmatch_slab->next;
2636 PERL_UNUSED_ARG(p);
2637
2638 if (!s)
2639 return;
2640 PL_regmatch_slab->next = NULL;
2641 while (s) {
2642 regmatch_slab * const osl = s;
2643 s = s->next;
2644 Safefree(osl);
2645 }
2646}
2647
2648
28d8d7f4
YO
2649#define SETREX(Re1,Re2) \
2650 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2651 Re1 = (Re2)
2652
d6a28714 2653STATIC I32 /* 0 failure, 1 success */
24b23f37 2654S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2655{
a35a87e7 2656#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2657 dMY_CXT;
2658#endif
27da23d5 2659 dVAR;
95b24440 2660 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2661 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2662
3b0527fe 2663 regexp *rex = reginfo->prog;
f8fc2ecf
YO
2664 RXi_GET_DECL(rex,rexi);
2665
2f554ef7 2666 I32 oldsave;
a3621e74 2667
5d9a96ca
DM
2668 /* the current state. This is a cached copy of PL_regmatch_state */
2669 register regmatch_state *st;
95b24440 2670
5d9a96ca
DM
2671 /* cache heavy used fields of st in registers */
2672 register regnode *scan;
2673 register regnode *next;
438e9bae 2674 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2675 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2676 register char *locinput = PL_reginput;
5d9a96ca 2677 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2678
b69b0499 2679 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2680 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2681 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2682 const U32 max_nochange_depth =
2683 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2684 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2685
77cb431f
DM
2686 regmatch_state *yes_state = NULL; /* state to pop to on success of
2687 subpattern */
e2e6a0f1
YO
2688 /* mark_state piggy backs on the yes_state logic so that when we unwind
2689 the stack on success we can update the mark_state as we go */
2690 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5d458dd8 2691
faec1544 2692 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2693 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2694 U32 state_num;
5d458dd8
YO
2695 bool no_final = 0; /* prevent failure from backtracking? */
2696 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2697 char *startpoint = PL_reginput;
5d458dd8
YO
2698 SV *popmark = NULL; /* are we looking for a mark? */
2699 SV *sv_commit = NULL; /* last mark name seen in failure */
2700 SV *sv_yes_mark = NULL; /* last mark name we have seen
2701 during a successfull match */
2702 U32 lastopen = 0; /* last open we saw */
2703 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0
DM
2704
2705 SV* const oreplsv = GvSV(PL_replgv);
58e23c8d 2706
5d458dd8 2707
24d3c4a9
DM
2708 /* these three flags are set by various ops to signal information to
2709 * the very next op. They have a useful lifetime of exactly one loop
2710 * iteration, and are not preserved or restored by state pushes/pops
2711 */
2712 bool sw = 0; /* the condition value in (?(cond)a|b) */
2713 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2714 int logical = 0; /* the following EVAL is:
2715 0: (?{...})
2716 1: (?(?{...})X|Y)
2717 2: (??{...})
2718 or the following IFMATCH/UNLESSM is:
2719 false: plain (?=foo)
2720 true: used as a condition: (?(?=foo))
2721 */
2722
95b24440 2723#ifdef DEBUGGING
e68ec53f 2724 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2725#endif
2726
3b57cd43 2727 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2728 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2729 }));
5d9a96ca
DM
2730 /* on first ever call to regmatch, allocate first slab */
2731 if (!PL_regmatch_slab) {
2732 Newx(PL_regmatch_slab, 1, regmatch_slab);
2733 PL_regmatch_slab->prev = NULL;
2734 PL_regmatch_slab->next = NULL;
86545054 2735 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2736 }
2737
2f554ef7
DM
2738 oldsave = PL_savestack_ix;
2739 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2740 SAVEVPTR(PL_regmatch_slab);
2741 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2742
2743 /* grab next free state slot */
2744 st = ++PL_regmatch_state;
86545054 2745 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2746 st = PL_regmatch_state = S_push_slab(aTHX);
2747
d6a28714
JH
2748 /* Note that nextchr is a byte even in UTF */
2749 nextchr = UCHARAT(locinput);
2750 scan = prog;
2751 while (scan != NULL) {
8ba1375e 2752
a3621e74 2753 DEBUG_EXECUTE_r( {
6136c704 2754 SV * const prop = sv_newmortal();
1de06328 2755 regnode *rnext=regnext(scan);
786e8c11 2756 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2757 regprop(rex, prop, scan);
07be1b83
YO
2758
2759 PerlIO_printf(Perl_debug_log,
2760 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2761 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2762 SvPVX_const(prop),
1de06328 2763 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2764 0 : (IV)(rnext - rexi->program));
2a782b5b 2765 });
d6a28714
JH
2766
2767 next = scan + NEXT_OFF(scan);
2768 if (next == scan)
2769 next = NULL;
40a82448 2770 state_num = OP(scan);
d6a28714 2771
40a82448
DM
2772 reenter_switch:
2773 switch (state_num) {
d6a28714 2774 case BOL:
7fba1cd6 2775 if (locinput == PL_bostr)
d6a28714 2776 {
3b0527fe 2777 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2778 break;
2779 }
d6a28714
JH
2780 sayNO;
2781 case MBOL:
12d33761
HS
2782 if (locinput == PL_bostr ||
2783 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2784 {
b8c5462f
JH
2785 break;
2786 }
d6a28714
JH
2787 sayNO;
2788 case SBOL:
c2a73568 2789 if (locinput == PL_bostr)
b8c5462f 2790 break;
d6a28714
JH
2791 sayNO;
2792 case GPOS:
3b0527fe 2793 if (locinput == reginfo->ganch)
d6a28714
JH
2794 break;
2795 sayNO;
ee9b8eae
YO
2796
2797 case KEEPS:
2798 /* update the startpoint */
f0ab9afb 2799 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2800 PL_reginput = locinput;
f0ab9afb 2801 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2802 PUSH_STATE_GOTO(KEEPS_next, next);
2803 /*NOT-REACHED*/
2804 case KEEPS_next_fail:
2805 /* rollback the start point change */
f0ab9afb 2806 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2807 sayNO_SILENT;
2808 /*NOT-REACHED*/
d6a28714 2809 case EOL:
d6a28714
JH
2810 goto seol;
2811 case MEOL:
d6a28714 2812 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2813 sayNO;
b8c5462f 2814 break;
d6a28714
JH
2815 case SEOL:
2816 seol:
2817 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2818 sayNO;
d6a28714 2819 if (PL_regeol - locinput > 1)
b8c5462f 2820 sayNO;
b8c5462f 2821 break;
d6a28714
JH
2822 case EOS:
2823 if (PL_regeol != locinput)
b8c5462f 2824 sayNO;
d6a28714 2825 break;
ffc61ed2 2826 case SANY:
d6a28714 2827 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2828 sayNO;
f33976b4
DB
2829 if (do_utf8) {
2830 locinput += PL_utf8skip[nextchr];
2831 if (locinput > PL_regeol)
2832 sayNO;
2833 nextchr = UCHARAT(locinput);
2834 }
2835 else
2836 nextchr = UCHARAT(++locinput);
2837 break;
2838 case CANY:
2839 if (!nextchr && locinput >= PL_regeol)
2840 sayNO;
b8c5462f 2841 nextchr = UCHARAT(++locinput);
a0d0e21e 2842 break;
ffc61ed2 2843 case REG_ANY:
1aa99e6b
IH
2844 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2845 sayNO;
2846 if (do_utf8) {
b8c5462f 2847 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2848 if (locinput > PL_regeol)
2849 sayNO;
a0ed51b3 2850 nextchr = UCHARAT(locinput);
a0ed51b3 2851 }
1aa99e6b
IH
2852 else
2853 nextchr = UCHARAT(++locinput);
a0ed51b3 2854 break;
166ba7cd
DM
2855
2856#undef ST
2857#define ST st->u.trie
786e8c11
YO
2858 case TRIEC:
2859 /* In this case the charclass data is available inline so
2860 we can fail fast without a lot of extra overhead.
2861 */
2862 if (scan->flags == EXACT || !do_utf8) {
2863 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2864 DEBUG_EXECUTE_r(
2865 PerlIO_printf(Perl_debug_log,
2866 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2867 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2868 );
2869 sayNO_SILENT;
2870 /* NOTREACHED */
2871 }
2872 }
2873 /* FALL THROUGH */
5b47454d 2874 case TRIE:
3dab1dad 2875 {
07be1b83 2876 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2877 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2878 trie_type = do_utf8 ?
2879 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2880 : trie_plain;
2881
2882 /* what trie are we using right now */
be8e71aa 2883 reg_trie_data * const trie
f8fc2ecf 2884 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
55eed653 2885 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
3dab1dad 2886 U32 state = trie->startstate;
166ba7cd 2887
3dab1dad
YO
2888 if (trie->bitmap && trie_type != trie_utf8_fold &&
2889 !TRIE_BITMAP_TEST(trie,*locinput)
2890 ) {
2891 if (trie->states[ state ].wordnum) {
2892 DEBUG_EXECUTE_r(
2893 PerlIO_printf(Perl_debug_log,
2894 "%*s %smatched empty string...%s\n",
5bc10b2c 2895 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2896 );
2897 break;
2898 } else {
2899 DEBUG_EXECUTE_r(
2900 PerlIO_printf(Perl_debug_log,
786e8c11 2901 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2902 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2903 );
2904 sayNO_SILENT;
2905 }
2906 }
166ba7cd 2907
786e8c11
YO
2908 {
2909 U8 *uc = ( U8* )locinput;
2910
2911 STRLEN len = 0;
2912 STRLEN foldlen = 0;
2913 U8 *uscan = (U8*)NULL;
2914 STRLEN bufflen=0;
2915 SV *sv_accept_buff = NULL;
2916 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2917
2918 ST.accepted = 0; /* how many accepting states we have seen */
2919 ST.B = next;
2920 ST.jump = trie->jump;
786e8c11 2921 ST.me = scan;
07be1b83
YO
2922 /*
2923 traverse the TRIE keeping track of all accepting states
2924 we transition through until we get to a failing node.
2925 */
2926
a3621e74 2927 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2928 U32 base = trie->states[ state ].trans.base;
f9f4320a 2929 UV uvc = 0;
786e8c11
YO
2930 U16 charid;
2931 /* We use charid to hold the wordnum as we don't use it
2932 for charid until after we have done the wordnum logic.
2933 We define an alias just so that the wordnum logic reads
2934 more naturally. */
2935
2936#define got_wordnum charid
2937 got_wordnum = trie->states[ state ].wordnum;
2938
2939 if ( got_wordnum ) {
2940 if ( ! ST.accepted ) {
5b47454d
DM
2941 ENTER;
2942 SAVETMPS;
2943 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2944 sv_accept_buff=newSV(bufflen *
2945 sizeof(reg_trie_accepted) - 1);
786e8c11 2946 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2947 SvPOK_on(sv_accept_buff);
2948 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2949 SAVETMPS;
2950 ST.accept_buff =
5b47454d
DM
2951 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2952 }
786e8c11 2953 do {
166ba7cd 2954 if (ST.accepted >= bufflen) {
5b47454d 2955 bufflen *= 2;
166ba7cd 2956 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2957 SvGROW(sv_accept_buff,
2958 bufflen * sizeof(reg_trie_accepted));
2959 }
2960 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2961 + sizeof(reg_trie_accepted));
a3621e74 2962
786e8c11
YO
2963
2964 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2965 ST.accept_buff[ST.accepted].endpos = uc;
2966 ++ST.accepted;
2967 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2968 }
2969#undef got_wordnum
a3621e74 2970
07be1b83 2971 DEBUG_TRIE_EXECUTE_r({
786e8c11 2972 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2973 PerlIO_printf( Perl_debug_log,
786e8c11 2974 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2975 2+depth * 2, "", PL_colors[4],
786e8c11 2976 (UV)state, (UV)ST.accepted );
07be1b83 2977 });
a3621e74
YO
2978
2979 if ( base ) {
55eed653
NC
2980 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2981 uscan, len, uvc, charid, foldlen,
2982 foldbuf, uniflags);
a3621e74 2983
5b47454d
DM
2984 if (charid &&
2985 (base + charid > trie->uniquecharcount )
2986 && (base + charid - 1 - trie->uniquecharcount
2987 < trie->lasttrans)
2988 && trie->trans[base + charid - 1 -
2989 trie->uniquecharcount].check == state)
2990 {
2991 state = trie->trans[base + charid - 1 -
2992 trie->uniquecharcount ].next;
2993 }
2994 else {
2995 state = 0;
2996 }
2997 uc += len;
2998
2999 }
3000 else {
a3621e74
YO
3001 state = 0;
3002 }
3003 DEBUG_TRIE_EXECUTE_r(
e4584336 3004 PerlIO_printf( Perl_debug_log,
786e8c11 3005 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3006 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3007 );
3008 }
166ba7cd 3009 if (!ST.accepted )
a3621e74 3010 sayNO;
a3621e74 3011
166ba7cd
DM
3012 DEBUG_EXECUTE_r(
3013 PerlIO_printf( Perl_debug_log,
3014 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3015 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3016 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3017 );
786e8c11 3018 }}
fae667d5
YO
3019 goto trie_first_try; /* jump into the fail handler */
3020 /* NOTREACHED */
166ba7cd 3021 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3022 if ( ST.jump) {
3023 REGCP_UNWIND(ST.cp);
3024 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3025 PL_regoffs[n].end = -1;
fae667d5
YO
3026 *PL_reglastparen = n;
3027 }
3028 trie_first_try:
5d458dd8
YO
3029 if (do_cutgroup) {
3030 do_cutgroup = 0;
3031 no_final = 0;
3032 }
fae667d5
YO
3033
3034 if ( ST.jump) {
3035 ST.lastparen = *PL_reglastparen;
3036 REGCP_SET(ST.cp);
3037 }
166ba7cd
DM
3038 if ( ST.accepted == 1 ) {
3039 /* only one choice left - just continue */
3040 DEBUG_EXECUTE_r({
2b8b4781
NC
3041 AV *const trie_words
3042 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3043 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3044 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3045 SV *sv= tmp ? sv_newmortal() : NULL;
3046
166ba7cd
DM
3047 PerlIO_printf( Perl_debug_log,
3048 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3049 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3050 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3051 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3052 PL_colors[0], PL_colors[1],
3053 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3054 )
3055 : "not compiled under -Dr",
166ba7cd
DM
3056 PL_colors[5] );
3057 });
3058 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3059 /* in this case we free tmps/leave before we call regmatch
3060 as we wont be using accept_buff again. */
5d458dd8 3061
166ba7cd
DM
3062 locinput = PL_reginput;
3063 nextchr = UCHARAT(locinput);
5d458dd8
YO
3064 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3065 scan = ST.B;
3066 else
3067 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3068 if (!has_cutgroup) {
3069 FREETMPS;
3070 LEAVE;
3071 } else {
3072 ST.accepted--;
3073 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3074 }
786e8c11 3075
166ba7cd
DM
3076 continue; /* execute rest of RE */
3077 }
fae667d5
YO
3078
3079 if ( !ST.accepted-- ) {
5d458dd8
YO
3080 DEBUG_EXECUTE_r({
3081 PerlIO_printf( Perl_debug_log,
3082 "%*s %sTRIE failed...%s\n",
3083 REPORT_CODE_OFF+depth*2, "",
3084 PL_colors[4],
3085 PL_colors[5] );
3086 });
166ba7cd
DM
3087 FREETMPS;
3088 LEAVE;
5d458dd8 3089 sayNO_SILENT;
fae667d5
YO
3090 /*NOTREACHED*/
3091 }
166ba7cd 3092
a3621e74 3093 /*
166ba7cd
DM
3094 There are at least two accepting states left. Presumably
3095 the number of accepting states is going to be low,
3096 typically two. So we simply scan through to find the one
3097 with lowest wordnum. Once we find it, we swap the last
3098 state into its place and decrement the size. We then try to
3099 match the rest of the pattern at the point where the word
3100 ends. If we succeed, control just continues along the
3101 regex; if we fail we return here to try the next accepting
3102 state
3103 */
a3621e74 3104
166ba7cd
DM
3105 {
3106 U32 best = 0;
3107 U32 cur;
3108 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3109 DEBUG_TRIE_EXECUTE_r(
f2278c82 3110 PerlIO_printf( Perl_debug_log,
166ba7cd 3111 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3112 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3113 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3114 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3115 );
3116
3117 if (ST.accept_buff[cur].wordnum <
3118 ST.accept_buff[best].wordnum)
3119 best = cur;
a3621e74 3120 }
166ba7cd
DM
3121
3122 DEBUG_EXECUTE_r({
2b8b4781
NC
3123 AV *const trie_words
3124 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3125 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3126 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3127 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3128 ST.B :
7f69552c 3129 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3130 SV *sv= tmp ? sv_newmortal() : NULL;
3131
265c4333
YO
3132 PerlIO_printf( Perl_debug_log,
3133 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3134 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3135 ST.accept_buff[best].wordnum,
de734bd5
A
3136 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3137 PL_colors[0], PL_colors[1],
3138 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3139 ) : "not compiled under -Dr",
265c4333 3140 REG_NODE_NUM(nextop),
166ba7cd
DM
3141 PL_colors[5] );
3142 });
3143
3144 if ( best<ST.accepted ) {
3145 reg_trie_accepted tmp = ST.accept_buff[ best ];
3146 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3147 ST.accept_buff[ ST.accepted ] = tmp;
3148 best = ST.accepted;
a3621e74 3149 }
166ba7cd 3150 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3151 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3152 scan = ST.B;
786e8c11
YO
3153 /* NOTREACHED */
3154 } else {
5d458dd8 3155 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
786e8c11
YO
3156 /* NOTREACHED */
3157 }
5d458dd8
YO
3158 if (has_cutgroup) {
3159 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3160 /* NOTREACHED */
3161 } else {
3162 PUSH_STATE_GOTO(TRIE_next, scan);
3163 /* NOTREACHED */
3164 }
786e8c11 3165 /* NOTREACHED */
166ba7cd 3166 }
166ba7cd 3167 /* NOTREACHED */
5d458dd8
YO
3168 case TRIE_next:
3169 FREETMPS;
3170 LEAVE;
3171 sayYES;
166ba7cd
DM
3172#undef ST
3173
95b24440
DM
3174 case EXACT: {
3175 char *s = STRING(scan);
24d3c4a9 3176 ln = STR_LEN(scan);
eb160463 3177 if (do_utf8 != UTF) {
bc517b45 3178 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3179 char *l = locinput;
24d3c4a9 3180 const char * const e = s + ln;
a72c7584 3181
5ff6fc6d
JH
3182 if (do_utf8) {
3183 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3184 while (s < e) {
a3b680e6 3185 STRLEN ulen;
1aa99e6b 3186 if (l >= PL_regeol)
5ff6fc6d
JH
3187 sayNO;
3188 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3189 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3190 uniflags))
5ff6fc6d 3191 sayNO;
bc517b45 3192 l += ulen;
5ff6fc6d 3193 s ++;
1aa99e6b 3194 }
5ff6fc6d
JH
3195 }
3196 else {
3197 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3198 while (s < e) {
a3b680e6 3199 STRLEN ulen;
1aa99e6b
IH
3200 if (l >= PL_regeol)
3201 sayNO;
5ff6fc6d 3202 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3203 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3204 uniflags))
1aa99e6b 3205 sayNO;
bc517b45 3206 s += ulen;
a72c7584 3207 l ++;
1aa99e6b 3208 }
5ff6fc6d 3209 }
1aa99e6b
IH
3210 locinput = l;
3211 nextchr = UCHARAT(locinput);
3212 break;
3213 }
bc517b45 3214 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3215 /* Inline the first character, for speed. */
3216 if (UCHARAT(s) != nextchr)
3217 sayNO;
24d3c4a9 3218 if (PL_regeol - locinput < ln)
d6a28714 3219 sayNO;
24d3c4a9 3220 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3221 sayNO;
24d3c4a9 3222 locinput += ln;
d6a28714
JH
3223 nextchr = UCHARAT(locinput);
3224 break;
95b24440 3225 }
d6a28714 3226 case EXACTFL:
b8c5462f
JH
3227 PL_reg_flags |= RF_tainted;
3228 /* FALL THROUGH */
95b24440 3229 case EXACTF: {
be8e71aa 3230 char * const s = STRING(scan);
24d3c4a9 3231 ln = STR_LEN(scan);
d6a28714 3232
d07ddd77
JH
3233 if (do_utf8 || UTF) {
3234 /* Either target or the pattern are utf8. */
be8e71aa 3235 const char * const l = locinput;
d07ddd77 3236 char *e = PL_regeol;
bc517b45 3237
24d3c4a9 3238 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3239 l, &e, 0, do_utf8)) {
5486206c
JH
3240 /* One more case for the sharp s:
3241 * pack("U0U*", 0xDF) =~ /ss/i,
3242 * the 0xC3 0x9F are the UTF-8
3243 * byte sequence for the U+00DF. */
e1d1eefb 3244
5486206c 3245 if (!(do_utf8 &&
e1d1eefb 3246 toLOWER(s[0]) == 's' &&
24d3c4a9 3247 ln >= 2 &&
5486206c
JH
3248 toLOWER(s[1]) == 's' &&
3249 (U8)l[0] == 0xC3 &&
3250 e - l >= 2 &&
3251 (U8)l[1] == 0x9F))
3252 sayNO;
3253 }
d07ddd77
JH
3254 locinput = e;
3255 nextchr = UCHARAT(locinput);
3256 break;
a0ed51b3 3257 }
d6a28714 3258
bc517b45
JH
3259 /* Neither the target and the pattern are utf8. */
3260
d6a28714
JH
3261 /* Inline the first character, for speed. */
3262 if (UCHARAT(s) != nextchr &&
3263 UCHARAT(s) != ((OP(scan) == EXACTF)
3264 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3265 sayNO;
24d3c4a9 3266 if (PL_regeol - locinput < ln)
b8c5462f 3267 sayNO;
24d3c4a9
DM
3268 if (ln > 1 && (OP(scan) == EXACTF
3269 ? ibcmp(s, locinput, ln)
3270 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3271 sayNO;
24d3c4a9 3272 locinput += ln;
d6a28714 3273 nextchr = UCHARAT(locinput);
a0d0e21e 3274 break;
95b24440 3275 }
d6a28714 3276 case ANYOF:
ffc61ed2 3277 if (do_utf8) {
9e55ce06
JH
3278 STRLEN inclasslen = PL_regeol - locinput;
3279
32fc9b6a 3280 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3281 goto anyof_fail;
ffc61ed2
JH
3282 if (locinput >= PL_regeol)
3283 sayNO;
0f0076b4 3284 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3285 nextchr = UCHARAT(locinput);
e0f9d4a8 3286 break;
ffc61ed2
JH
3287 }
3288 else {
3289 if (nextchr < 0)
3290 nextchr = UCHARAT(locinput);
32fc9b6a 3291 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3292 goto anyof_fail;
ffc61ed2
JH
3293 if (!nextchr && locinput >= PL_regeol)
3294 sayNO;
3295 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3296 break;
3297 }
262b90c4 3298 anyof_fail:
e0f9d4a8
JH
3299 /* If we might have the case of the German sharp s
3300 * in a casefolding Unicode character class. */
3301
ebc501f0
JH
3302 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3303 locinput += SHARP_S_SKIP;
e0f9d4a8 3304 nextchr = UCHARAT(locinput);
ffc61ed2 3305 }
e0f9d4a8
JH
3306 else
3307 sayNO;
b8c5462f 3308 break;
d6a28714 3309 case ALNUML:
b8c5462f
JH
3310 PL_reg_flags |= RF_tainted;
3311 /* FALL THROUGH */
d6a28714 3312 case ALNUM:
b8c5462f 3313 if (!nextchr)
4633a7c4 3314 sayNO;
ffc61ed2 3315 if (do_utf8) {
1a4fad37 3316 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3317 if (!(OP(scan) == ALNUM
bb7a0f54 3318 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3319 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3320 {
3321 sayNO;
a0ed51b3 3322 }
b8c5462f 3323 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3324 nextchr = UCHARAT(locinput);
3325 break;
3326 }
ffc61ed2 3327 if (!(OP(scan) == ALNUM
d6a28714 3328 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3329 sayNO;
b8c5462f 3330 nextchr = UCHARAT(++locinput);
a0d0e21e 3331 break;
d6a28714 3332 case NALNUML:
b8c5462f
JH
3333 PL_reg_flags |= RF_tainted;
3334 /* FALL THROUGH */
d6a28714
JH
3335 case NALNUM:
3336 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3337 sayNO;
ffc61ed2 3338 if (do_utf8) {
1a4fad37 3339 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3340 if (OP(scan) == NALNUM
bb7a0f54 3341 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3342 : isALNUM_LC_utf8((U8*)locinput))
3343 {
b8c5462f 3344 sayNO;
d6a28714 3345 }
b8c5462f
JH
3346 locinput += PL_utf8skip[nextchr];
3347 nextchr = UCHARAT(locinput);
3348 break;
3349 }
ffc61ed2 3350 if (OP(scan) == NALNUM
d6a28714 3351 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3352 sayNO;
76e3520e 3353 nextchr = UCHARAT(++locinput);
a0d0e21e 3354 break;
d6a28714
JH
3355 case BOUNDL:
3356 case NBOUNDL:
3280af22 3357 PL_reg_flags |= RF_tainted;
bbce6d69 3358 /* FALL THROUGH */
d6a28714
JH
3359 case BOUND:
3360 case NBOUND:
3361 /* was last char in word? */
ffc61ed2 3362 if (do_utf8) {
12d33761 3363 if (locinput == PL_bostr)
24d3c4a9 3364 ln = '\n';
ffc61ed2 3365 else {
a3b680e6 3366 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3367
24d3c4a9 3368 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3369 }
3370 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3371 ln = isALNUM_uni(ln);
1a4fad37 3372 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3373 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3374 }
3375 else {
24d3c4a9 3376 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3377 n = isALNUM_LC_utf8((U8*)locinput);
3378 }
a0ed51b3 3379 }
d6a28714 3380 else {
24d3c4a9 3381 ln = (locinput != PL_bostr) ?
12d33761 3382 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3383 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3384 ln = isALNUM(ln);
ffc61ed2
JH
3385 n = isALNUM(nextchr);
3386 }
3387 else {
24d3c4a9 3388 ln = isALNUM_LC(ln);
ffc61ed2
JH
3389 n = isALNUM_LC(nextchr);
3390 }
d6a28714 3391 }
24d3c4a9 3392 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3393 OP(scan) == BOUNDL))
3394 sayNO;
a0ed51b3 3395 break;
d6a28714 3396 case SPACEL:
3280af22 3397 PL_reg_flags |= RF_tainted;
bbce6d69 3398 /* FALL THROUGH */
d6a28714 3399 case SPACE:
9442cb0e 3400 if (!nextchr)
4633a7c4 3401 sayNO;
1aa99e6b 3402 if (do_utf8) {
fd400ab9 3403 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3404 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3405 if (!(OP(scan) == SPACE
bb7a0f54 3406 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3407 : isSPACE_LC_utf8((U8*)locinput)))
3408 {
3409 sayNO;
3410 }
3411 locinput += PL_utf8skip[nextchr];
3412 nextchr = UCHARAT(locinput);
3413 break;
d6a28714 3414 }
ffc61ed2
JH
3415 if (!(OP(scan) == SPACE
3416 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3417 sayNO;
3418 nextchr = UCHARAT(++locinput);
3419 }
3420 else {
3421 if (!(OP(scan) == SPACE
3422 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3423 sayNO;
3424 nextchr = UCHARAT(++locinput);
a0ed51b3 3425 }
a0ed51b3 3426 break;
d6a28714 3427 case NSPACEL:
3280af22 3428 PL_reg_flags |= RF_tainted;
bbce6d69 3429 /* FALL THROUGH */
d6a28714 3430 case NSPACE:
9442cb0e 3431 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3432 sayNO;
1aa99e6b 3433 if (do_utf8) {
1a4fad37 3434 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3435 if (OP(scan) == NSPACE
bb7a0f54 3436 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3437 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3438 {
3439 sayNO;
3440 }
3441 locinput += PL_utf8skip[nextchr];
3442 nextchr = UCHARAT(locinput);
3443 break;
a0ed51b3 3444 }
ffc61ed2 3445 if (OP(scan) == NSPACE
d6a28714 3446 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3447 sayNO;
76e3520e 3448 nextchr = UCHARAT(++locinput);
a0d0e21e 3449 break;
d6a28714 3450 case DIGITL:
a0ed51b3
LW
3451 PL_reg_flags |= RF_tainted;
3452 /* FALL THROUGH */
d6a28714 3453 case DIGIT:
9442cb0e 3454 if (!nextchr)
a0ed51b3 3455 sayNO;
1aa99e6b 3456 if (do_utf8) {
1a4fad37 3457 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3458 if (!(OP(scan) == DIGIT
bb7a0f54 3459 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3460 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3461 {
a0ed51b3 3462 sayNO;
dfe13c55 3463 }
6f06b55f 3464 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3465 nextchr = UCHARAT(locinput);
3466 break;
3467 }
ffc61ed2 3468 if (!(OP(scan) == DIGIT
9442cb0e 3469 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3470 sayNO;
3471 nextchr = UCHARAT(++locinput);
3472 break;
d6a28714 3473 case NDIGITL:
b8c5462f
JH
3474 PL_reg_flags |= RF_tainted;
3475 /* FALL THROUGH */
d6a28714 3476 case NDIGIT:
9442cb0e 3477 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3478 sayNO;
1aa99e6b 3479 if (do_utf8) {
1a4fad37 3480 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3481 if (OP(scan) == NDIGIT
bb7a0f54 3482 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3483 : isDIGIT_LC_utf8((U8*)locinput))
3484 {
a0ed51b3 3485 sayNO;
9442cb0e 3486 }
6f06b55f 3487 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3488 nextchr = UCHARAT(locinput);
3489 break;
3490 }
ffc61ed2 3491 if (OP(scan) == NDIGIT
9442cb0e 3492 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3493 sayNO;
3494 nextchr = UCHARAT(++locinput);
3495 break;
3496 case CLUMP:
b7c83a7e 3497 if (locinput >= PL_regeol)
a0ed51b3 3498 sayNO;
b7c83a7e 3499 if (do_utf8) {
1a4fad37 3500 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3501 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3502 sayNO;
3503 locinput += PL_utf8skip[nextchr];
3504 while (locinput < PL_regeol &&
3505 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3506 locinput += UTF8SKIP(locinput);
3507 if (locinput > PL_regeol)
3508 sayNO;
eb08e2da
JH
3509 }
3510 else
3511 locinput++;
a0ed51b3
LW
3512 nextchr = UCHARAT(locinput);
3513 break;
81714fb9
YO
3514
3515 case NREFFL:
3516 {
3517 char *s;
ff1157ca 3518 char type;
81714fb9
YO
3519 PL_reg_flags |= RF_tainted;
3520 /* FALL THROUGH */
3521 case NREF:
3522 case NREFF:
ff1157ca 3523 type = OP(scan);
0a4db386
YO
3524 n = reg_check_named_buff_matched(rex,scan);
3525
3526 if ( n ) {
3527 type = REF + ( type - NREF );
3528 goto do_ref;
3529 } else {
81714fb9 3530 sayNO;
0a4db386
YO
3531 }
3532 /* unreached */
c8756f30 3533 case REFFL:
3280af22 3534 PL_reg_flags |= RF_tainted;
c8756f30 3535 /* FALL THROUGH */
c277df42 3536 case REF:
81714fb9 3537 case REFF:
c277df42 3538 n = ARG(scan); /* which paren pair */
81714fb9
YO
3539 type = OP(scan);
3540 do_ref:
f0ab9afb 3541 ln = PL_regoffs[n].start;
2c2d71f5 3542 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3543 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3544 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 3545 if (ln == PL_regoffs[n].end)
a0d0e21e 3546 break;
a0ed51b3 3547
24d3c4a9 3548 s = PL_bostr + ln;
81714fb9 3549 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3550 char *l = locinput;
f0ab9afb 3551 const char *e = PL_bostr + PL_regoffs[n].end;
a0ed51b3
LW
3552 /*
3553 * Note that we can't do the "other character" lookup trick as
3554 * in the 8-bit case (no pun intended) because in Unicode we
3555 * have to map both upper and title case to lower case.
3556 */
81714fb9 3557 if (type == REFF) {
a0ed51b3 3558 while (s < e) {
a3b680e6
AL
3559 STRLEN ulen1, ulen2;
3560 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3561 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3562
a0ed51b3
LW
3563 if (l >= PL_regeol)
3564 sayNO;
a2a2844f
JH
3565 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3566 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3567 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3568 sayNO;
a2a2844f
JH
3569 s += ulen1;
3570 l += ulen2;
a0ed51b3
LW
3571 }
3572 }
3573 locinput = l;
3574 nextchr = UCHARAT(locinput);
3575 break;
3576 }
3577
a0d0e21e 3578 /* Inline the first character, for speed. */
76e3520e 3579 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3580 (type == REF ||
3581 (UCHARAT(s) != (type == REFF
3582 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3583 sayNO;
f0ab9afb 3584 ln = PL_regoffs[n].end - ln;
24d3c4a9 3585 if (locinput + ln > PL_regeol)
4633a7c4 3586 sayNO;
81714fb9 3587 if (ln > 1 && (type == REF
24d3c4a9 3588 ? memNE(s, locinput, ln)
81714fb9 3589 : (type == REFF
24d3c4a9
DM
3590 ? ibcmp(s, locinput, ln)
3591 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3592 sayNO;
24d3c4a9 3593 locinput += ln;
76e3520e 3594 nextchr = UCHARAT(locinput);
a0d0e21e 3595 break;
81714fb9 3596 }
a0d0e21e 3597 case NOTHING:
c277df42 3598 case TAIL:
a0d0e21e
LW
3599 break;
3600 case BACK:
3601 break;
40a82448
DM
3602
3603#undef ST
3604#define ST st->u.eval
c277df42 3605 {
c277df42 3606 SV *ret;
6bda09f9 3607 regexp *re;
f8fc2ecf 3608 regexp_internal *rei;
1a147d38
YO
3609 regnode *startpoint;
3610
3611 case GOSTART:
e7707071
YO
3612 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3613 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3614 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3615 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3616 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3617 Perl_croak(aTHX_
3618 "Pattern subroutine nesting without pos change"
3619 " exceeded limit in regex");
6bda09f9
YO
3620 } else {
3621 nochange_depth = 0;
1a147d38 3622 }
6bda09f9 3623 re = rex;
f8fc2ecf 3624 rei = rexi;
6bda09f9 3625 (void)ReREFCNT_inc(rex);
1a147d38 3626 if (OP(scan)==GOSUB) {
6bda09f9
YO
3627 startpoint = scan + ARG2L(scan);
3628 ST.close_paren = ARG(scan);
3629 } else {
f8fc2ecf 3630 startpoint = rei->program+1;
6bda09f9
YO
3631 ST.close_paren = 0;
3632 }
3633 goto eval_recurse_doit;
3634 /* NOTREACHED */
3635 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3636 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3637 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3638 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3639 } else {
3640 nochange_depth = 0;
3641 }
8e5e9ebe 3642 {
4aabdb9b
DM
3643 /* execute the code in the {...} */
3644 dSP;
6136c704 3645 SV ** const before = SP;
4aabdb9b
DM
3646 OP_4tree * const oop = PL_op;
3647 COP * const ocurcop = PL_curcop;
3648 PAD *old_comppad;
4aabdb9b
DM
3649
3650 n = ARG(scan);
f8fc2ecf 3651 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3652 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3653 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3654 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 3655 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 3656
2bf803e2
YO
3657 if (sv_yes_mark) {
3658 SV *sv_mrk = get_sv("REGMARK", 1);
3659 sv_setsv(sv_mrk, sv_yes_mark);
3660 }
3661
8e5e9ebe
RGS
3662 CALLRUNOPS(aTHX); /* Scalar context. */
3663 SPAGAIN;
3664 if (SP == before)
075aa684 3665 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3666 else {
3667 ret = POPs;
3668 PUTBACK;
3669 }
4aabdb9b
DM
3670
3671 PL_op = oop;
3672 PAD_RESTORE_LOCAL(old_comppad);
3673 PL_curcop = ocurcop;
24d3c4a9 3674 if (!logical) {
4aabdb9b
DM
3675 /* /(?{...})/ */
3676 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3677 break;
3678 }
8e5e9ebe 3679 }
24d3c4a9
DM
3680 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3681 logical = 0;
4aabdb9b 3682 {
4f639d21
DM
3683 /* extract RE object from returned value; compiling if
3684 * necessary */
3685
6136c704 3686 MAGIC *mg = NULL;
be8e71aa 3687 const SV *sv;
faf82a0b
AE
3688 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3689 mg = mg_find(sv, PERL_MAGIC_qr);
3690 else if (SvSMAGICAL(ret)) {
3691 if (SvGMAGICAL(ret))
3692 sv_unmagic(ret, PERL_MAGIC_qr);
3693 else
3694 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3695 }
faf82a0b 3696
0f5d15d6 3697 if (mg) {
28d8d7f4 3698 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
0f5d15d6
IZ
3699 }
3700 else {
c737faaf 3701 U32 pm_flags = 0;
a3b680e6 3702 const I32 osize = PL_regsize;
0f5d15d6 3703
c737faaf 3704 if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
3ab4a224 3705 re = CALLREGCOMP(ret, pm_flags);
9041c2e3 3706 if (!(SvFLAGS(ret)
faf82a0b
AE
3707 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3708 | SVs_GMG)))
14befaf4
DM
3709 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3710 PERL_MAGIC_qr,0,0);
0f5d15d6 3711 PL_regsize = osize;
0f5d15d6 3712 }
4aabdb9b 3713 }
28d8d7f4
YO
3714 RX_MATCH_COPIED_off(re);
3715 re->subbeg = rex->subbeg;
3716 re->sublen = rex->sublen;
f8fc2ecf 3717 rei = RXi_GET(re);
6bda09f9
YO
3718 DEBUG_EXECUTE_r(
3719 debug_start_match(re, do_utf8, locinput, PL_regeol,
3720 "Matching embedded");
3721 );
f8fc2ecf 3722 startpoint = rei->program + 1;
1a147d38 3723 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
3724 /* borrowed from regtry */
3725 if (PL_reg_start_tmpl <= re->nparens) {
3726 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3727 if(PL_reg_start_tmp)
3728 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3729 else
3730 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 3731 }
aa283a38 3732
1a147d38 3733 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 3734 /* run the pattern returned from (??{...}) */
40a82448
DM
3735 ST.cp = regcppush(0); /* Save *all* the positions. */
3736 REGCP_SET(ST.lastcp);
6bda09f9 3737
f0ab9afb 3738 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 3739
4aabdb9b
DM
3740 *PL_reglastparen = 0;
3741 *PL_reglastcloseparen = 0;
4aabdb9b 3742 PL_reginput = locinput;
ae0beba1 3743 PL_regsize = 0;
4aabdb9b
DM
3744
3745 /* XXXX This is too dramatic a measure... */
3746 PL_reg_maxiter = 0;
3747
faec1544 3748 ST.toggle_reg_flags = PL_reg_flags;
bbe252da 3749 if (re->extflags & RXf_UTF8)
faec1544
DM
3750 PL_reg_flags |= RF_utf8;
3751 else
3752 PL_reg_flags &= ~RF_utf8;
3753 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3754
40a82448 3755 ST.prev_rex = rex;
faec1544 3756 ST.prev_curlyx = cur_curlyx;
28d8d7f4 3757 SETREX(rex,re);
f8fc2ecf 3758 rexi = rei;
faec1544 3759 cur_curlyx = NULL;
40a82448 3760 ST.B = next;
faec1544
DM
3761 ST.prev_eval = cur_eval;
3762 cur_eval = st;
faec1544 3763 /* now continue from first node in postoned RE */
6bda09f9 3764 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3765 /* NOTREACHED */
a0ed51b3 3766 }
24d3c4a9
DM
3767 /* logical is 1, /(?(?{...})X|Y)/ */
3768 sw = (bool)SvTRUE(ret);
3769 logical = 0;
c277df42
IZ
3770 break;
3771 }
40a82448 3772
faec1544
DM
3773 case EVAL_AB: /* cleanup after a successful (??{A})B */
3774 /* note: this is called twice; first after popping B, then A */
3775 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448 3776 ReREFCNT_dec(rex);
28d8d7f4 3777 SETREX(rex,ST.prev_rex);
f8fc2ecf 3778 rexi = RXi_GET(rex);
faec1544
DM
3779 regcpblow(ST.cp);
3780 cur_eval = ST.prev_eval;
3781 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3782 /* XXXX This is too dramatic a measure... */
3783 PL_reg_maxiter = 0;
e7707071 3784 if ( nochange_depth )
4b196cd4 3785 nochange_depth--;
262b90c4 3786 sayYES;
40a82448 3787
40a82448 3788
faec1544
DM
3789 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3790 /* note: this is called twice; first after popping B, then A */
3791 PL_reg_flags ^= ST.toggle_reg_flags;
3792 ReREFCNT_dec(rex);
28d8d7f4 3793 SETREX(rex,ST.prev_rex);
f8fc2ecf 3794 rexi = RXi_GET(rex);
40a82448
DM
3795 PL_reginput = locinput;
3796 REGCP_UNWIND(ST.lastcp);
3797 regcppop(rex);
faec1544
DM
3798 cur_eval = ST.prev_eval;
3799 cur_curlyx = ST.prev_curlyx;
3800 /* XXXX This is too dramatic a measure... */
3801 PL_reg_maxiter = 0;
e7707071 3802 if ( nochange_depth )
4b196cd4 3803 nochange_depth--;
40a82448 3804 sayNO_SILENT;
40a82448
DM
3805#undef ST
3806
a0d0e21e 3807 case OPEN:
c277df42 3808 n = ARG(scan); /* which paren pair */
3280af22
NIS
3809 PL_reg_start_tmp[n] = locinput;
3810 if (n > PL_regsize)
3811 PL_regsize = n;
e2e6a0f1 3812 lastopen = n;
a0d0e21e
LW
3813 break;
3814 case CLOSE:
c277df42 3815 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
3816 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3817 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
3818 /*if (n > PL_regsize)
3819 PL_regsize = n;*/
3b6647e0 3820 if (n > *PL_reglastparen)
3280af22 3821 *PL_reglastparen = n;
a01268b5 3822 *PL_reglastcloseparen = n;
3b6647e0 3823 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
3824 goto fake_end;
3825 }
a0d0e21e 3826 break;
e2e6a0f1
YO
3827 case ACCEPT:
3828 if (ARG(scan)){
3829 regnode *cursor;
3830 for (cursor=scan;
3831 cursor && OP(cursor)!=END;
3832 cursor=regnext(cursor))
3833 {
3834 if ( OP(cursor)==CLOSE ){
3835 n = ARG(cursor);
3836 if ( n <= lastopen ) {
f0ab9afb
NC
3837 PL_regoffs[n].start
3838 = PL_reg_start_tmp[n] - PL_bostr;
3839 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
3840 /*if (n > PL_regsize)
3841 PL_regsize = n;*/
3b6647e0 3842 if (n > *PL_reglastparen)
e2e6a0f1
YO
3843 *PL_reglastparen = n;
3844 *PL_reglastcloseparen = n;
3b6647e0
RB
3845 if ( n == ARG(scan) || (cur_eval &&
3846 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
3847 break;
3848 }
3849 }
3850 }
3851 }
3852 goto fake_end;
3853 /*NOTREACHED*/
c277df42
IZ
3854 case GROUPP:
3855 n = ARG(scan); /* which paren pair */
f0ab9afb 3856 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 3857 break;
0a4db386
YO
3858 case NGROUPP:
3859 /* reg_check_named_buff_matched returns 0 for no match */
3860 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3861 break;
1a147d38 3862 case INSUBP:
0a4db386 3863 n = ARG(scan);
3b6647e0 3864 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
3865 break;
3866 case DEFINEP:
3867 sw = 0;
3868 break;
c277df42 3869 case IFTHEN:
2c2d71f5 3870 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3871 if (sw)
c277df42
IZ
3872 next = NEXTOPER(NEXTOPER(scan));
3873 else {
3874 next = scan + ARG(scan);
3875 if (OP(next) == IFTHEN) /* Fake one. */
3876 next = NEXTOPER(NEXTOPER(next));
3877 }
3878 break;
3879 case LOGICAL:
24d3c4a9 3880 logical = scan->flags;
c277df42 3881 break;
c476f425 3882
2ab05381 3883/*******************************************************************
2ab05381 3884
c476f425
DM
3885The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3886pattern, where A and B are subpatterns. (For simple A, CURLYM or
3887STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3888
c476f425 3889A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3890
c476f425
DM
3891On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3892state, which contains the current count, initialised to -1. It also sets
3893cur_curlyx to point to this state, with any previous value saved in the
3894state block.
2ab05381 3895
c476f425
DM
3896CURLYX then jumps straight to the WHILEM op, rather than executing A,
3897since the pattern may possibly match zero times (i.e. it's a while {} loop
3898rather than a do {} while loop).
2ab05381 3899
c476f425
DM
3900Each entry to WHILEM represents a successful match of A. The count in the
3901CURLYX block is incremented, another WHILEM state is pushed, and execution
3902passes to A or B depending on greediness and the current count.
2ab05381 3903
c476f425
DM
3904For example, if matching against the string a1a2a3b (where the aN are
3905substrings that match /A/), then the match progresses as follows: (the
3906pushed states are interspersed with the bits of strings matched so far):
2ab05381 3907
c476f425
DM
3908 <CURLYX cnt=-1>
3909 <CURLYX cnt=0><WHILEM>
3910 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3911 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3912 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3913 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 3914
c476f425
DM
3915(Contrast this with something like CURLYM, which maintains only a single
3916backtrack state:
2ab05381 3917
c476f425
DM
3918 <CURLYM cnt=0> a1
3919 a1 <CURLYM cnt=1> a2
3920 a1 a2 <CURLYM cnt=2> a3
3921 a1 a2 a3 <CURLYM cnt=3> b
3922)
2ab05381 3923
c476f425
DM
3924Each WHILEM state block marks a point to backtrack to upon partial failure
3925of A or B, and also contains some minor state data related to that
3926iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3927overall state, such as the count, and pointers to the A and B ops.
2ab05381 3928
c476f425
DM
3929This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3930must always point to the *current* CURLYX block, the rules are:
2ab05381 3931
c476f425
DM
3932When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3933and set cur_curlyx to point the new block.
2ab05381 3934
c476f425
DM
3935When popping the CURLYX block after a successful or unsuccessful match,
3936restore the previous cur_curlyx.
2ab05381 3937
c476f425
DM
3938When WHILEM is about to execute B, save the current cur_curlyx, and set it
3939to the outer one saved in the CURLYX block.
2ab05381 3940
c476f425
DM
3941When popping the WHILEM block after a successful or unsuccessful B match,
3942restore the previous cur_curlyx.
2ab05381 3943
c476f425
DM
3944Here's an example for the pattern (AI* BI)*BO
3945I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 3946
c476f425
DM
3947cur_
3948curlyx backtrack stack
3949------ ---------------
3950NULL
3951CO <CO prev=NULL> <WO>
3952CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3953CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3954NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 3955
c476f425
DM
3956At this point the pattern succeeds, and we work back down the stack to
3957clean up, restoring as we go:
95b24440 3958
c476f425
DM
3959CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3960CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3961CO <CO prev=NULL> <WO>
3962NULL
a0374537 3963
c476f425
DM
3964*******************************************************************/
3965
3966#define ST st->u.curlyx
3967
3968 case CURLYX: /* start of /A*B/ (for complex A) */
3969 {
3970 /* No need to save/restore up to this paren */
3971 I32 parenfloor = scan->flags;
3972
3973 assert(next); /* keep Coverity happy */
3974 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3975 next += ARG(next);
3976
3977 /* XXXX Probably it is better to teach regpush to support
3978 parenfloor > PL_regsize... */
3979 if (parenfloor > (I32)*PL_reglastparen)
3980 parenfloor = *PL_reglastparen; /* Pessimization... */
3981
3982 ST.prev_curlyx= cur_curlyx;
3983 cur_curlyx = st;
3984 ST.cp = PL_savestack_ix;
3985
3986 /* these fields contain the state of the current curly.
3987 * they are accessed by subsequent WHILEMs */
3988 ST.parenfloor = parenfloor;
3989 ST.min = ARG1(scan);
3990 ST.max = ARG2(scan);
3991 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3992 ST.B = next;
24d3c4a9
DM
3993 ST.minmod = minmod;
3994 minmod = 0;
c476f425
DM
3995 ST.count = -1; /* this will be updated by WHILEM */
3996 ST.lastloc = NULL; /* this will be updated by WHILEM */
3997
3998 PL_reginput = locinput;
3999 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4000 /* NOTREACHED */
c476f425 4001 }
a0d0e21e 4002
c476f425 4003 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4004 cur_curlyx = ST.prev_curlyx;
4005 sayYES;
4006 /* NOTREACHED */
a0d0e21e 4007
c476f425
DM
4008 case CURLYX_end_fail: /* just failed to match all of A*B */
4009 regcpblow(ST.cp);
4010 cur_curlyx = ST.prev_curlyx;
4011 sayNO;
4012 /* NOTREACHED */
4633a7c4 4013
a0d0e21e 4014
c476f425
DM
4015#undef ST
4016#define ST st->u.whilem
4017
4018 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4019 {
4020 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
4021 I32 n;
4022 assert(cur_curlyx); /* keep Coverity happy */
4023 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4024 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4025 ST.cache_offset = 0;
4026 ST.cache_mask = 0;
4027
4028 PL_reginput = locinput;
4029
4030 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4031 "%*s whilem: matched %ld out of %ld..%ld\n",
4032 REPORT_CODE_OFF+depth*2, "", (long)n,
4033 (long)cur_curlyx->u.curlyx.min,
4034 (long)cur_curlyx->u.curlyx.max)
4035 );
a0d0e21e 4036
c476f425 4037 /* First just match a string of min A's. */
a0d0e21e 4038
c476f425
DM
4039 if (n < cur_curlyx->u.curlyx.min) {
4040 cur_curlyx->u.curlyx.lastloc = locinput;
4041 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4042 /* NOTREACHED */
4043 }
4044
4045 /* If degenerate A matches "", assume A done. */
4046
4047 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4048 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4049 "%*s whilem: empty match detected, trying continuation...\n",
4050 REPORT_CODE_OFF+depth*2, "")
4051 );
4052 goto do_whilem_B_max;
4053 }
4054
4055 /* super-linear cache processing */
4056
4057 if (scan->flags) {
a0d0e21e 4058
2c2d71f5 4059 if (!PL_reg_maxiter) {
c476f425
DM
4060 /* start the countdown: Postpone detection until we
4061 * know the match is not *that* much linear. */
2c2d71f5 4062 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4063 /* possible overflow for long strings and many CURLYX's */
4064 if (PL_reg_maxiter < 0)
4065 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4066 PL_reg_leftiter = PL_reg_maxiter;
4067 }
c476f425 4068
2c2d71f5 4069 if (PL_reg_leftiter-- == 0) {
c476f425 4070 /* initialise cache */
3298f257 4071 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4072 if (PL_reg_poscache) {
eb160463 4073 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4074 Renew(PL_reg_poscache, size, char);
4075 PL_reg_poscache_size = size;
4076 }
4077 Zero(PL_reg_poscache, size, char);
4078 }
4079 else {
4080 PL_reg_poscache_size = size;
a02a5408 4081 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4082 }
c476f425
DM
4083 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4084 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4085 PL_colors[4], PL_colors[5])
4086 );
2c2d71f5 4087 }
c476f425 4088
2c2d71f5 4089 if (PL_reg_leftiter < 0) {
c476f425
DM
4090 /* have we already failed at this position? */
4091 I32 offset, mask;
4092 offset = (scan->flags & 0xf) - 1
4093 + (locinput - PL_bostr) * (scan->flags>>4);
4094 mask = 1 << (offset % 8);
4095 offset /= 8;
4096 if (PL_reg_poscache[offset] & mask) {
4097 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4098 "%*s whilem: (cache) already tried at this position...\n",
4099 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4100 );
3298f257 4101 sayNO; /* cache records failure */
2c2d71f5 4102 }
c476f425
DM
4103 ST.cache_offset = offset;
4104 ST.cache_mask = mask;
2c2d71f5 4105 }
c476f425 4106 }
2c2d71f5 4107
c476f425 4108 /* Prefer B over A for minimal matching. */
a687059c 4109
c476f425
DM
4110 if (cur_curlyx->u.curlyx.minmod) {
4111 ST.save_curlyx = cur_curlyx;
4112 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4113 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4114 REGCP_SET(ST.lastcp);
4115 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4116 /* NOTREACHED */
4117 }
a0d0e21e 4118
c476f425
DM
4119 /* Prefer A over B for maximal matching. */
4120
4121 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4122 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4123 cur_curlyx->u.curlyx.lastloc = locinput;
4124 REGCP_SET(ST.lastcp);
4125 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4126 /* NOTREACHED */
4127 }
4128 goto do_whilem_B_max;
4129 }
4130 /* NOTREACHED */
4131
4132 case WHILEM_B_min: /* just matched B in a minimal match */
4133 case WHILEM_B_max: /* just matched B in a maximal match */
4134 cur_curlyx = ST.save_curlyx;
4135 sayYES;
4136 /* NOTREACHED */
4137
4138 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4139 cur_curlyx = ST.save_curlyx;
4140 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4141 cur_curlyx->u.curlyx.count--;
4142 CACHEsayNO;
4143 /* NOTREACHED */
4144
4145 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4146 REGCP_UNWIND(ST.lastcp);
4147 regcppop(rex);
4148 /* FALL THROUGH */
4149 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4150 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4151 cur_curlyx->u.curlyx.count--;
4152 CACHEsayNO;
4153 /* NOTREACHED */
4154
4155 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4156 REGCP_UNWIND(ST.lastcp);
4157 regcppop(rex); /* Restore some previous $<digit>s? */
4158 PL_reginput = locinput;
4159 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4160 "%*s whilem: failed, trying continuation...\n",
4161 REPORT_CODE_OFF+depth*2, "")
4162 );
4163 do_whilem_B_max:
4164 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4165 && ckWARN(WARN_REGEXP)
4166 && !(PL_reg_flags & RF_warned))
4167 {
4168 PL_reg_flags |= RF_warned;
4169 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4170 "Complex regular subexpression recursion",
4171 REG_INFTY - 1);
4172 }
4173
4174 /* now try B */
4175 ST.save_curlyx = cur_curlyx;
4176 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4177 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4178 /* NOTREACHED */
4179
4180 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4181 cur_curlyx = ST.save_curlyx;
4182 REGCP_UNWIND(ST.lastcp);
4183 regcppop(rex);
4184
4185 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4186 /* Maximum greed exceeded */
4187 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4188 && ckWARN(WARN_REGEXP)
4189 && !(PL_reg_flags & RF_warned))
4190 {
3280af22 4191 PL_reg_flags |= RF_warned;
c476f425
DM
4192 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4193 "%s limit (%d) exceeded",
4194 "Complex regular subexpression recursion",
4195 REG_INFTY - 1);
a0d0e21e 4196 }
c476f425 4197 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4198 CACHEsayNO;
a0d0e21e 4199 }
c476f425
DM
4200
4201 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4202 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4203 );
4204 /* Try grabbing another A and see if it helps. */
4205 PL_reginput = locinput;
4206 cur_curlyx->u.curlyx.lastloc = locinput;
4207 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4208 REGCP_SET(ST.lastcp);
4209 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4210 /* NOTREACHED */
40a82448
DM
4211
4212#undef ST
4213#define ST st->u.branch
4214
4215 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4216 next = scan + ARG(scan);
4217 if (next == scan)
4218 next = NULL;
40a82448
DM
4219 scan = NEXTOPER(scan);
4220 /* FALL THROUGH */
c277df42 4221
40a82448
DM
4222 case BRANCH: /* /(...|A|...)/ */
4223 scan = NEXTOPER(scan); /* scan now points to inner node */
5d458dd8
YO
4224 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4225 && !has_cutgroup)
4226 {
40a82448
DM
4227 /* last branch; skip state push and jump direct to node */
4228 continue;
5d458dd8 4229 }
40a82448
DM
4230 ST.lastparen = *PL_reglastparen;
4231 ST.next_branch = next;
4232 REGCP_SET(ST.cp);
4233 PL_reginput = locinput;
02db2b7b 4234
40a82448 4235 /* Now go into the branch */
5d458dd8
YO
4236 if (has_cutgroup) {
4237 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4238 } else {
4239 PUSH_STATE_GOTO(BRANCH_next, scan);
4240 }
40a82448 4241 /* NOTREACHED */
5d458dd8
YO
4242 case CUTGROUP:
4243 PL_reginput = locinput;
4244 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
f8fc2ecf 4245 (SV*)rexi->data->data[ ARG( scan ) ];
5d458dd8
YO
4246 PUSH_STATE_GOTO(CUTGROUP_next,next);
4247 /* NOTREACHED */
4248 case CUTGROUP_next_fail:
4249 do_cutgroup = 1;
4250 no_final = 1;
4251 if (st->u.mark.mark_name)
4252 sv_commit = st->u.mark.mark_name;
4253 sayNO;
4254 /* NOTREACHED */
4255 case BRANCH_next:
4256 sayYES;
4257 /* NOTREACHED */
40a82448 4258 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4259 if (do_cutgroup) {
4260 do_cutgroup = 0;
4261 no_final = 0;
4262 }
40a82448
DM
4263 REGCP_UNWIND(ST.cp);
4264 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4265 PL_regoffs[n].end = -1;
40a82448 4266 *PL_reglastparen = n;
0a4db386 4267 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4268 scan = ST.next_branch;
4269 /* no more branches? */
5d458dd8
YO
4270 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4271 DEBUG_EXECUTE_r({
4272 PerlIO_printf( Perl_debug_log,
4273 "%*s %sBRANCH failed...%s\n",
4274 REPORT_CODE_OFF+depth*2, "",
4275 PL_colors[4],
4276 PL_colors[5] );
4277 });
4278 sayNO_SILENT;
4279 }
40a82448
DM
4280 continue; /* execute next BRANCH[J] op */
4281 /* NOTREACHED */
4282
a0d0e21e 4283 case MINMOD:
24d3c4a9 4284 minmod = 1;
a0d0e21e 4285 break;
40a82448
DM
4286
4287#undef ST
4288#define ST st->u.curlym
4289
4290 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4291
4292 /* This is an optimisation of CURLYX that enables us to push
4293 * only a single backtracking state, no matter now many matches
4294 * there are in {m,n}. It relies on the pattern being constant
4295 * length, with no parens to influence future backrefs
4296 */
4297
4298 ST.me = scan;
dc45a647 4299 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4300
4301 /* if paren positive, emulate an OPEN/CLOSE around A */
4302 if (ST.me->flags) {
3b6647e0 4303 U32 paren = ST.me->flags;
40a82448
DM
4304 if (paren > PL_regsize)
4305 PL_regsize = paren;
3b6647e0 4306 if (paren > *PL_reglastparen)
40a82448 4307 *PL_reglastparen = paren;
c277df42 4308 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4309 }
40a82448
DM
4310 ST.A = scan;
4311 ST.B = next;
4312 ST.alen = 0;
4313 ST.count = 0;
24d3c4a9
DM
4314 ST.minmod = minmod;
4315 minmod = 0;
40a82448
DM
4316 ST.c1 = CHRTEST_UNINIT;
4317 REGCP_SET(ST.cp);
6407bf3b 4318
40a82448
DM
4319 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4320 goto curlym_do_B;
4321
4322 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4323 PL_reginput = locinput;
40a82448
DM
4324 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4325 /* NOTREACHED */
5f80c4cf 4326
40a82448
DM
4327 case CURLYM_A: /* we've just matched an A */
4328 locinput = st->locinput;
4329 nextchr = UCHARAT(locinput);
4330
4331 ST.count++;
4332 /* after first match, determine A's length: u.curlym.alen */
4333 if (ST.count == 1) {
4334 if (PL_reg_match_utf8) {
4335 char *s = locinput;
4336 while (s < PL_reginput) {
4337 ST.alen++;
4338 s += UTF8SKIP(s);
4339 }
4340 }
4341 else {
4342 ST.alen = PL_reginput - locinput;
4343 }
4344 if (ST.alen == 0)
4345 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4346 }
0cadcf80
DM
4347 DEBUG_EXECUTE_r(
4348 PerlIO_printf(Perl_debug_log,
40a82448 4349 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4350 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4351 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4352 );
4353
40a82448 4354 locinput = PL_reginput;
0a4db386
YO
4355
4356 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4357 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4358 goto fake_end;
4359
4360 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
40a82448
DM
4361 goto curlym_do_A; /* try to match another A */
4362 goto curlym_do_B; /* try to match B */
5f80c4cf 4363
40a82448
DM
4364 case CURLYM_A_fail: /* just failed to match an A */
4365 REGCP_UNWIND(ST.cp);
0a4db386
YO
4366
4367 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4368 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4369 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4370 sayNO;
0cadcf80 4371
40a82448
DM
4372 curlym_do_B: /* execute the B in /A{m,n}B/ */
4373 PL_reginput = locinput;
4374 if (ST.c1 == CHRTEST_UNINIT) {
4375 /* calculate c1 and c2 for possible match of 1st char
4376 * following curly */
4377 ST.c1 = ST.c2 = CHRTEST_VOID;
4378 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4379 regnode *text_node = ST.B;
4380 if (! HAS_TEXT(text_node))
4381 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4382 /* this used to be
4383
4384 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4385
4386 But the former is redundant in light of the latter.
4387
4388 if this changes back then the macro for
4389 IS_TEXT and friends need to change.
4390 */
4391 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4392 {
ee9b8eae 4393
40a82448
DM
4394 ST.c1 = (U8)*STRING(text_node);
4395 ST.c2 =
ee9b8eae 4396 (IS_TEXTF(text_node))
40a82448 4397 ? PL_fold[ST.c1]
ee9b8eae 4398 : (IS_TEXTFL(text_node))
40a82448
DM
4399 ? PL_fold_locale[ST.c1]
4400 : ST.c1;
c277df42 4401 }
c277df42 4402 }
40a82448
DM
4403 }
4404
4405 DEBUG_EXECUTE_r(
4406 PerlIO_printf(Perl_debug_log,
4407 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4408 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4409 "", (IV)ST.count)
4410 );
4411 if (ST.c1 != CHRTEST_VOID
4412 && UCHARAT(PL_reginput) != ST.c1
4413 && UCHARAT(PL_reginput) != ST.c2)
4414 {
4415 /* simulate B failing */
3e901dc0
YO
4416 DEBUG_OPTIMISE_r(
4417 PerlIO_printf(Perl_debug_log,
4418 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4419 (int)(REPORT_CODE_OFF+(depth*2)),"",
4420 (IV)ST.c1,(IV)ST.c2
4421 ));
40a82448
DM
4422 state_num = CURLYM_B_fail;
4423 goto reenter_switch;
4424 }
4425
4426 if (ST.me->flags) {
4427 /* mark current A as captured */
4428 I32 paren = ST.me->flags;
4429 if (ST.count) {
f0ab9afb 4430 PL_regoffs[paren].start
40a82448 4431 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 4432 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 4433 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4434 }
40a82448 4435 else
f0ab9afb 4436 PL_regoffs[paren].end = -1;
0a4db386 4437 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4438 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4439 {
4440 if (ST.count)
4441 goto fake_end;
4442 else
4443 sayNO;
4444 }
c277df42 4445 }
0a4db386 4446
40a82448 4447 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4448 /* NOTREACHED */
40a82448
DM
4449
4450 case CURLYM_B_fail: /* just failed to match a B */
4451 REGCP_UNWIND(ST.cp);
4452 if (ST.minmod) {
4453 if (ST.count == ARG2(ST.me) /* max */)
4454 sayNO;
4455 goto curlym_do_A; /* try to match a further A */
4456 }
4457 /* backtrack one A */
4458 if (ST.count == ARG1(ST.me) /* min */)
4459 sayNO;
4460 ST.count--;
4461 locinput = HOPc(locinput, -ST.alen);
4462 goto curlym_do_B; /* try to match B */
4463
c255a977
DM
4464#undef ST
4465#define ST st->u.curly
40a82448 4466
c255a977
DM
4467#define CURLY_SETPAREN(paren, success) \
4468 if (paren) { \
4469 if (success) { \
f0ab9afb
NC
4470 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4471 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 4472 *PL_reglastcloseparen = paren; \
c255a977
DM
4473 } \
4474 else \
f0ab9afb 4475 PL_regoffs[paren].end = -1; \
c255a977
DM
4476 }
4477
4478 case STAR: /* /A*B/ where A is width 1 */
4479 ST.paren = 0;
4480 ST.min = 0;
4481 ST.max = REG_INFTY;
a0d0e21e
LW
4482 scan = NEXTOPER(scan);
4483 goto repeat;
c255a977
DM
4484 case PLUS: /* /A+B/ where A is width 1 */
4485 ST.paren = 0;
4486 ST.min = 1;
4487 ST.max = REG_INFTY;
c277df42 4488 scan = NEXTOPER(scan);
c255a977
DM
4489 goto repeat;
4490 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4491 ST.paren = scan->flags; /* Which paren to set */
4492 if (ST.paren > PL_regsize)
4493 PL_regsize = ST.paren;
3b6647e0 4494 if (ST.paren > *PL_reglastparen)
c255a977
DM
4495 *PL_reglastparen = ST.paren;
4496 ST.min = ARG1(scan); /* min to match */
4497 ST.max = ARG2(scan); /* max to match */
0a4db386 4498 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4499 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4500 ST.min=1;
4501 ST.max=1;
4502 }
c255a977
DM
4503 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4504 goto repeat;
4505 case CURLY: /* /A{m,n}B/ where A is width 1 */
4506 ST.paren = 0;
4507 ST.min = ARG1(scan); /* min to match */
4508 ST.max = ARG2(scan); /* max to match */
4509 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4510 repeat:
a0d0e21e
LW
4511 /*
4512 * Lookahead to avoid useless match attempts
4513 * when we know what character comes next.
c255a977 4514 *
5f80c4cf
JP
4515 * Used to only do .*x and .*?x, but now it allows
4516 * for )'s, ('s and (?{ ... })'s to be in the way
4517 * of the quantifier and the EXACT-like node. -- japhy
4518 */
4519
c255a977
DM
4520 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4521 sayNO;
cca55fe3 4522 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4523 U8 *s;
4524 regnode *text_node = next;
4525
3dab1dad
YO
4526 if (! HAS_TEXT(text_node))
4527 FIND_NEXT_IMPT(text_node);
5f80c4cf 4528
9e137952 4529 if (! HAS_TEXT(text_node))
c255a977 4530 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4531 else {
ee9b8eae 4532 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4533 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4534 goto assume_ok_easy;
cca55fe3 4535 }
be8e71aa
YO
4536 else
4537 s = (U8*)STRING(text_node);
ee9b8eae
YO
4538
4539 /* Currently we only get here when
4540
4541 PL_rekind[OP(text_node)] == EXACT
4542
4543 if this changes back then the macro for IS_TEXT and
4544 friends need to change. */
5f80c4cf 4545 if (!UTF) {
c255a977 4546 ST.c2 = ST.c1 = *s;
ee9b8eae 4547 if (IS_TEXTF(text_node))
c255a977 4548 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4549 else if (IS_TEXTFL(text_node))
c255a977 4550 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4551 }
5f80c4cf 4552 else { /* UTF */
ee9b8eae 4553 if (IS_TEXTF(text_node)) {
a2a2844f 4554 STRLEN ulen1, ulen2;
89ebb4a3
JH
4555 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4556 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4557
4558 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4559 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4560#ifdef EBCDIC
4561 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4562 ckWARN(WARN_UTF8) ?
4563 0 : UTF8_ALLOW_ANY);
4564 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4565 ckWARN(WARN_UTF8) ?
4566 0 : UTF8_ALLOW_ANY);
4567#else
c255a977 4568 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4569 uniflags);
c255a977 4570 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4571 uniflags);
4572#endif
5f80c4cf
JP
4573 }
4574 else {
c255a977 4575 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4576 uniflags);
5f80c4cf 4577 }
1aa99e6b
IH
4578 }
4579 }
bbce6d69 4580 }
a0d0e21e 4581 else
c255a977 4582 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4583 assume_ok_easy:
c255a977
DM
4584
4585 ST.A = scan;
4586 ST.B = next;
3280af22 4587 PL_reginput = locinput;
24d3c4a9
DM
4588 if (minmod) {
4589 minmod = 0;
e2e6a0f1 4590 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4591 sayNO;
c255a977 4592 ST.count = ST.min;
a0ed51b3 4593 locinput = PL_reginput;
c255a977
DM
4594 REGCP_SET(ST.cp);
4595 if (ST.c1 == CHRTEST_VOID)
4596 goto curly_try_B_min;
4597
4598 ST.oldloc = locinput;
4599
4600 /* set ST.maxpos to the furthest point along the
4601 * string that could possibly match */
4602 if (ST.max == REG_INFTY) {
4603 ST.maxpos = PL_regeol - 1;
4604 if (do_utf8)
4605 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4606 ST.maxpos--;
4607 }
4608 else if (do_utf8) {
4609 int m = ST.max - ST.min;
4610 for (ST.maxpos = locinput;
4611 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4612 ST.maxpos += UTF8SKIP(ST.maxpos);
4613 }
4614 else {
4615 ST.maxpos = locinput + ST.max - ST.min;
4616 if (ST.maxpos >= PL_regeol)
4617 ST.maxpos = PL_regeol - 1;
4618 }
4619 goto curly_try_B_min_known;
4620
4621 }
4622 else {
e2e6a0f1 4623 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4624 locinput = PL_reginput;
4625 if (ST.count < ST.min)
4626 sayNO;
4627 if ((ST.count > ST.min)
4628 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4629 {
4630 /* A{m,n} must come at the end of the string, there's
4631 * no point in backing off ... */
4632 ST.min = ST.count;
4633 /* ...except that $ and \Z can match before *and* after
4634 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4635 We may back off by one in this case. */
4636 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4637 ST.min--;
4638 }
4639 REGCP_SET(ST.cp);
4640 goto curly_try_B_max;
4641 }
4642 /* NOTREACHED */
4643
4644
4645 case CURLY_B_min_known_fail:
4646 /* failed to find B in a non-greedy match where c1,c2 valid */
4647 if (ST.paren && ST.count)
f0ab9afb 4648 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4649
4650 PL_reginput = locinput; /* Could be reset... */
4651 REGCP_UNWIND(ST.cp);
4652 /* Couldn't or didn't -- move forward. */
4653 ST.oldloc = locinput;
4654 if (do_utf8)
4655 locinput += UTF8SKIP(locinput);
4656 else
4657 locinput++;
4658 ST.count++;
4659 curly_try_B_min_known:
4660 /* find the next place where 'B' could work, then call B */
4661 {
4662 int n;
4663 if (do_utf8) {
4664 n = (ST.oldloc == locinput) ? 0 : 1;
4665 if (ST.c1 == ST.c2) {
4666 STRLEN len;
4667 /* set n to utf8_distance(oldloc, locinput) */
4668 while (locinput <= ST.maxpos &&
4669 utf8n_to_uvchr((U8*)locinput,
4670 UTF8_MAXBYTES, &len,
4671 uniflags) != (UV)ST.c1) {
4672 locinput += len;
4673 n++;
4674 }
1aa99e6b
IH
4675 }
4676 else {
c255a977
DM
4677 /* set n to utf8_distance(oldloc, locinput) */
4678 while (locinput <= ST.maxpos) {
4679 STRLEN len;
4680 const UV c = utf8n_to_uvchr((U8*)locinput,
4681 UTF8_MAXBYTES, &len,
4682 uniflags);
4683 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4684 break;
4685 locinput += len;
4686 n++;
1aa99e6b 4687 }
0fe9bf95
IZ
4688 }
4689 }
c255a977
DM
4690 else {
4691 if (ST.c1 == ST.c2) {
4692 while (locinput <= ST.maxpos &&
4693 UCHARAT(locinput) != ST.c1)
4694 locinput++;
bbce6d69 4695 }
c255a977
DM
4696 else {
4697 while (locinput <= ST.maxpos
4698 && UCHARAT(locinput) != ST.c1
4699 && UCHARAT(locinput) != ST.c2)
4700 locinput++;
a0ed51b3 4701 }
c255a977
DM
4702 n = locinput - ST.oldloc;
4703 }
4704 if (locinput > ST.maxpos)
4705 sayNO;
4706 /* PL_reginput == oldloc now */
4707 if (n) {
4708 ST.count += n;
e2e6a0f1 4709 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 4710 sayNO;
a0d0e21e 4711 }
c255a977
DM
4712 PL_reginput = locinput;
4713 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4714 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4715 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4716 goto fake_end;
4717 }
c255a977 4718 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4719 }
c255a977
DM
4720 /* NOTREACHED */
4721
4722
4723 case CURLY_B_min_fail:
4724 /* failed to find B in a non-greedy match where c1,c2 invalid */
4725 if (ST.paren && ST.count)
f0ab9afb 4726 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4727
4728 REGCP_UNWIND(ST.cp);
4729 /* failed -- move forward one */
4730 PL_reginput = locinput;
e2e6a0f1 4731 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 4732 ST.count++;
a0ed51b3 4733 locinput = PL_reginput;
c255a977
DM
4734 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4735 ST.count > 0)) /* count overflow ? */
15272685 4736 {
c255a977
DM
4737 curly_try_B_min:
4738 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4739 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4740 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4741 goto fake_end;
4742 }
c255a977 4743 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4744 }
4745 }
4633a7c4 4746 sayNO;
c255a977
DM
4747 /* NOTREACHED */
4748
4749
4750 curly_try_B_max:
4751 /* a successful greedy match: now try to match B */
40d049e4 4752 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4753 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
4754 goto fake_end;
4755 }
c255a977
DM
4756 {
4757 UV c = 0;
4758 if (ST.c1 != CHRTEST_VOID)
4759 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4760 UTF8_MAXBYTES, 0, uniflags)
466787eb 4761 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4762 /* If it could work, try it. */
4763 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4764 CURLY_SETPAREN(ST.paren, ST.count);
4765 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4766 /* NOTREACHED */
4767 }
4768 }
4769 /* FALL THROUGH */
4770 case CURLY_B_max_fail:
4771 /* failed to find B in a greedy match */
4772 if (ST.paren && ST.count)
f0ab9afb 4773 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4774
4775 REGCP_UNWIND(ST.cp);
4776 /* back up. */
4777 if (--ST.count < ST.min)
4778 sayNO;
4779 PL_reginput = locinput = HOPc(locinput, -1);
4780 goto curly_try_B_max;
4781
4782#undef ST
4783
a0d0e21e 4784 case END:
6bda09f9 4785 fake_end:
faec1544
DM
4786 if (cur_eval) {
4787 /* we've just finished A in /(??{A})B/; now continue with B */
4788 I32 tmpix;
faec1544
DM
4789 st->u.eval.toggle_reg_flags
4790 = cur_eval->u.eval.toggle_reg_flags;
4791 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4792
4793 st->u.eval.prev_rex = rex; /* inner */
28d8d7f4 4794 SETREX(rex,cur_eval->u.eval.prev_rex);
f8fc2ecf 4795 rexi = RXi_GET(rex);
faec1544
DM
4796 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4797 ReREFCNT_inc(rex);
4798 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4799 REGCP_SET(st->u.eval.lastcp);
4800 PL_reginput = locinput;
4801
4802 /* Restore parens of the outer rex without popping the
4803 * savestack */
4804 tmpix = PL_savestack_ix;
4805 PL_savestack_ix = cur_eval->u.eval.lastcp;
4806 regcppop(rex);
4807 PL_savestack_ix = tmpix;
4808
4809 st->u.eval.prev_eval = cur_eval;
4810 cur_eval = cur_eval->u.eval.prev_eval;
4811 DEBUG_EXECUTE_r(
2a49f0f5
JH
4812 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4813 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
4814 if ( nochange_depth )
4815 nochange_depth--;
4816
4817 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
4818 st->u.eval.prev_eval->u.eval.B); /* match B */
4819 }
4820
3b0527fe 4821 if (locinput < reginfo->till) {
a3621e74 4822 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4823 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4824 PL_colors[4],
4825 (long)(locinput - PL_reg_starttry),
3b0527fe 4826 (long)(reginfo->till - PL_reg_starttry),
7821416a 4827 PL_colors[5]));
58e23c8d 4828
262b90c4 4829 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4830 }
4831 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4832 sayYES; /* Success! */
dad79028
DM
4833
4834 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4835 DEBUG_EXECUTE_r(
4836 PerlIO_printf(Perl_debug_log,
4837 "%*s %ssubpattern success...%s\n",
5bc10b2c 4838 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4839 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4840 sayYES; /* Success! */
dad79028 4841
40a82448
DM
4842#undef ST
4843#define ST st->u.ifmatch
4844
4845 case SUSPEND: /* (?>A) */
4846 ST.wanted = 1;
9fe1d20c 4847 PL_reginput = locinput;
9041c2e3 4848 goto do_ifmatch;
dad79028 4849
40a82448
DM
4850 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4851 ST.wanted = 0;
dad79028
DM
4852 goto ifmatch_trivial_fail_test;
4853
40a82448
DM
4854 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4855 ST.wanted = 1;
dad79028 4856 ifmatch_trivial_fail_test:
a0ed51b3 4857 if (scan->flags) {
52657f30 4858 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4859 if (!s) {
4860 /* trivial fail */
24d3c4a9
DM
4861 if (logical) {
4862 logical = 0;
4863 sw = 1 - (bool)ST.wanted;
dad79028 4864 }
40a82448 4865 else if (ST.wanted)
dad79028
DM
4866 sayNO;
4867 next = scan + ARG(scan);
4868 if (next == scan)
4869 next = NULL;
4870 break;
4871 }
efb30f32 4872 PL_reginput = s;
a0ed51b3
LW
4873 }
4874 else
4875 PL_reginput = locinput;
4876
c277df42 4877 do_ifmatch:
40a82448 4878 ST.me = scan;
24d3c4a9 4879 ST.logical = logical;
40a82448
DM
4880 /* execute body of (?...A) */
4881 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4882 /* NOTREACHED */
4883
4884 case IFMATCH_A_fail: /* body of (?...A) failed */
4885 ST.wanted = !ST.wanted;
4886 /* FALL THROUGH */
4887
4888 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4889 if (ST.logical) {
4890 sw = (bool)ST.wanted;
40a82448
DM
4891 }
4892 else if (!ST.wanted)
4893 sayNO;
4894
4895 if (OP(ST.me) == SUSPEND)
4896 locinput = PL_reginput;
4897 else {
4898 locinput = PL_reginput = st->locinput;
4899 nextchr = UCHARAT(locinput);
4900 }
4901 scan = ST.me + ARG(ST.me);
4902 if (scan == ST.me)
4903 scan = NULL;
4904 continue; /* execute B */
4905
4906#undef ST
dad79028 4907
c277df42 4908 case LONGJMP:
c277df42
IZ
4909 next = scan + ARG(scan);
4910 if (next == scan)
4911 next = NULL;
a0d0e21e 4912 break;
54612592 4913 case COMMIT:
e2e6a0f1
YO
4914 reginfo->cutpoint = PL_regeol;
4915 /* FALLTHROUGH */
5d458dd8 4916 case PRUNE:
24b23f37 4917 PL_reginput = locinput;
e2e6a0f1 4918 if (!scan->flags)
f8fc2ecf 4919 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
54612592
YO
4920 PUSH_STATE_GOTO(COMMIT_next,next);
4921 /* NOTREACHED */
4922 case COMMIT_next_fail:
4923 no_final = 1;
4924 /* FALLTHROUGH */
7f69552c
YO
4925 case OPFAIL:
4926 sayNO;
e2e6a0f1
YO
4927 /* NOTREACHED */
4928
4929#define ST st->u.mark
4930 case MARKPOINT:
4931 ST.prev_mark = mark_state;
5d458dd8 4932 ST.mark_name = sv_commit = sv_yes_mark
f8fc2ecf 4933 = (SV*)rexi->data->data[ ARG( scan ) ];
e2e6a0f1
YO
4934 mark_state = st;
4935 ST.mark_loc = PL_reginput = locinput;
4936 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4937 /* NOTREACHED */
4938 case MARKPOINT_next:
4939 mark_state = ST.prev_mark;
4940 sayYES;
4941 /* NOTREACHED */
4942 case MARKPOINT_next_fail:
5d458dd8 4943 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
4944 {
4945 if (ST.mark_loc > startpoint)
4946 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4947 popmark = NULL; /* we found our mark */
4948 sv_commit = ST.mark_name;
4949
4950 DEBUG_EXECUTE_r({
5d458dd8 4951 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
4952 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4953 REPORT_CODE_OFF+depth*2, "",
be2597df 4954 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
4955 });
4956 }
4957 mark_state = ST.prev_mark;
5d458dd8
YO
4958 sv_yes_mark = mark_state ?
4959 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
4960 sayNO;
4961 /* NOTREACHED */
5d458dd8
YO
4962 case SKIP:
4963 PL_reginput = locinput;
4964 if (scan->flags) {
2bf803e2 4965 /* (*SKIP) : if we fail we cut here*/
5d458dd8 4966 ST.mark_name = NULL;
e2e6a0f1 4967 ST.mark_loc = locinput;
5d458dd8
YO
4968 PUSH_STATE_GOTO(SKIP_next,next);
4969 } else {
2bf803e2 4970 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
4971 otherwise do nothing. Meaning we need to scan
4972 */
4973 regmatch_state *cur = mark_state;
f8fc2ecf 4974 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
5d458dd8
YO
4975
4976 while (cur) {
4977 if ( sv_eq( cur->u.mark.mark_name,
4978 find ) )
4979 {
4980 ST.mark_name = find;
4981 PUSH_STATE_GOTO( SKIP_next, next );
4982 }
4983 cur = cur->u.mark.prev_mark;
4984 }
e2e6a0f1 4985 }
2bf803e2 4986 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
4987 break;
4988 case SKIP_next_fail:
4989 if (ST.mark_name) {
4990 /* (*CUT:NAME) - Set up to search for the name as we
4991 collapse the stack*/
4992 popmark = ST.mark_name;
4993 } else {
4994 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
4995 if (ST.mark_loc > startpoint)
4996 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
4997 /* but we set sv_commit to latest mark_name if there
4998 is one so they can test to see how things lead to this
4999 cut */
5000 if (mark_state)
5001 sv_commit=mark_state->u.mark.mark_name;
5002 }
e2e6a0f1
YO
5003 no_final = 1;
5004 sayNO;
5005 /* NOTREACHED */
5006#undef ST
5007
e1d1eefb
YO
5008 case LNBREAK:
5009 if ((n=is_LNBREAK(locinput,do_utf8))) {
5010 locinput += n;
5011 nextchr = UCHARAT(locinput);
5012 } else
5013 sayNO;
5014 break;
5015
5016#define CASE_CLASS(nAmE) \
5017 case nAmE: \
5018 if ((n=is_##nAmE(locinput,do_utf8))) { \
5019 locinput += n; \
5020 nextchr = UCHARAT(locinput); \
5021 } else \
5022 sayNO; \
5023 break; \
5024 case N##nAmE: \
5025 if ((n=is_##nAmE(locinput,do_utf8))) { \
5026 sayNO; \
5027 } else { \
5028 locinput += UTF8SKIP(locinput); \
5029 nextchr = UCHARAT(locinput); \
5030 } \
5031 break
5032
5033 CASE_CLASS(VERTWS);
5034 CASE_CLASS(HORIZWS);
5035#undef CASE_CLASS
5036
a0d0e21e 5037 default:
b900a521 5038 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5039 PTR2UV(scan), OP(scan));
cea2e8a9 5040 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5041
5042 } /* end switch */
95b24440 5043
5d458dd8
YO
5044 /* switch break jumps here */
5045 scan = next; /* prepare to execute the next op and ... */
5046 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5047 /* NOTREACHED */
5048
40a82448
DM
5049 push_yes_state:
5050 /* push a state that backtracks on success */
5051 st->u.yes.prev_yes_state = yes_state;
5052 yes_state = st;
5053 /* FALL THROUGH */
5054 push_state:
5055 /* push a new regex state, then continue at scan */
5056 {
5057 regmatch_state *newst;
5058
24b23f37
YO
5059 DEBUG_STACK_r({
5060 regmatch_state *cur = st;
5061 regmatch_state *curyes = yes_state;
5062 int curd = depth;
5063 regmatch_slab *slab = PL_regmatch_slab;
5064 for (;curd > -1;cur--,curd--) {
5065 if (cur < SLAB_FIRST(slab)) {
5066 slab = slab->prev;
5067 cur = SLAB_LAST(slab);
5068 }
5069 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5070 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5071 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5072 (curyes == cur) ? "yes" : ""
5073 );
5074 if (curyes == cur)
5075 curyes = cur->u.yes.prev_yes_state;
5076 }
5077 } else
5078 DEBUG_STATE_pp("push")
5079 );
40a82448 5080 depth++;
40a82448
DM
5081 st->locinput = locinput;
5082 newst = st+1;
5083 if (newst > SLAB_LAST(PL_regmatch_slab))
5084 newst = S_push_slab(aTHX);
5085 PL_regmatch_state = newst;
786e8c11 5086
40a82448
DM
5087 locinput = PL_reginput;
5088 nextchr = UCHARAT(locinput);
5089 st = newst;
5090 continue;
5091 /* NOTREACHED */
5092 }
a0d0e21e 5093 }
a687059c 5094
a0d0e21e
LW
5095 /*
5096 * We get here only if there's trouble -- normally "case END" is
5097 * the terminating point.
5098 */
cea2e8a9 5099 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5100 /*NOTREACHED*/
4633a7c4
LW
5101 sayNO;
5102
262b90c4 5103yes:
77cb431f
DM
5104 if (yes_state) {
5105 /* we have successfully completed a subexpression, but we must now
5106 * pop to the state marked by yes_state and continue from there */
77cb431f 5107 assert(st != yes_state);
5bc10b2c
DM
5108#ifdef DEBUGGING
5109 while (st != yes_state) {
5110 st--;
5111 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5112 PL_regmatch_slab = PL_regmatch_slab->prev;
5113 st = SLAB_LAST(PL_regmatch_slab);
5114 }
e2e6a0f1 5115 DEBUG_STATE_r({
54612592
YO
5116 if (no_final) {
5117 DEBUG_STATE_pp("pop (no final)");
5118 } else {
5119 DEBUG_STATE_pp("pop (yes)");
5120 }
e2e6a0f1 5121 });
5bc10b2c
DM
5122 depth--;
5123 }
5124#else
77cb431f
DM
5125 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5126 || yes_state > SLAB_LAST(PL_regmatch_slab))
5127 {
5128 /* not in this slab, pop slab */
5129 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5130 PL_regmatch_slab = PL_regmatch_slab->prev;
5131 st = SLAB_LAST(PL_regmatch_slab);
5132 }
5133 depth -= (st - yes_state);
5bc10b2c 5134#endif
77cb431f
DM
5135 st = yes_state;
5136 yes_state = st->u.yes.prev_yes_state;
5137 PL_regmatch_state = st;
24b23f37 5138
5d458dd8
YO
5139 if (no_final) {
5140 locinput= st->locinput;
5141 nextchr = UCHARAT(locinput);
5142 }
54612592 5143 state_num = st->resume_state + no_final;
24d3c4a9 5144 goto reenter_switch;
77cb431f
DM
5145 }
5146
a3621e74 5147 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5148 PL_colors[4], PL_colors[5]));
02db2b7b 5149
19b95bf0
DM
5150 if (PL_reg_eval_set) {
5151 /* each successfully executed (?{...}) block does the equivalent of
5152 * local $^R = do {...}
5153 * When popping the save stack, all these locals would be undone;
5154 * bypass this by setting the outermost saved $^R to the latest
5155 * value */
5156 if (oreplsv != GvSV(PL_replgv))
5157 sv_setsv(oreplsv, GvSV(PL_replgv));
5158 }
95b24440 5159 result = 1;
aa283a38 5160 goto final_exit;
4633a7c4
LW
5161
5162no:
a3621e74 5163 DEBUG_EXECUTE_r(
7821416a 5164 PerlIO_printf(Perl_debug_log,
786e8c11 5165 "%*s %sfailed...%s\n",
5bc10b2c 5166 REPORT_CODE_OFF+depth*2, "",
786e8c11 5167 PL_colors[4], PL_colors[5])
7821416a 5168 );
aa283a38 5169
262b90c4 5170no_silent:
54612592
YO
5171 if (no_final) {
5172 if (yes_state) {
5173 goto yes;
5174 } else {
5175 goto final_exit;
5176 }
5177 }
aa283a38
DM
5178 if (depth) {
5179 /* there's a previous state to backtrack to */
40a82448
DM
5180 st--;
5181 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5182 PL_regmatch_slab = PL_regmatch_slab->prev;
5183 st = SLAB_LAST(PL_regmatch_slab);
5184 }
5185 PL_regmatch_state = st;
40a82448
DM
5186 locinput= st->locinput;
5187 nextchr = UCHARAT(locinput);
5188
5bc10b2c
DM
5189 DEBUG_STATE_pp("pop");
5190 depth--;
262b90c4
DM
5191 if (yes_state == st)
5192 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5193
24d3c4a9
DM
5194 state_num = st->resume_state + 1; /* failure = success + 1 */
5195 goto reenter_switch;
95b24440 5196 }
24d3c4a9 5197 result = 0;
aa283a38 5198
262b90c4 5199 final_exit:
bbe252da 5200 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5201 SV *sv_err = get_sv("REGERROR", 1);
5202 SV *sv_mrk = get_sv("REGMARK", 1);
5203 if (result) {
e2e6a0f1 5204 sv_commit = &PL_sv_no;
5d458dd8
YO
5205 if (!sv_yes_mark)
5206 sv_yes_mark = &PL_sv_yes;
5207 } else {
5208 if (!sv_commit)
5209 sv_commit = &PL_sv_yes;
5210 sv_yes_mark = &PL_sv_no;
5211 }
5212 sv_setsv(sv_err, sv_commit);
5213 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5214 }
19b95bf0 5215
2f554ef7
DM
5216 /* clean up; in particular, free all slabs above current one */
5217 LEAVE_SCOPE(oldsave);
5d9a96ca 5218
95b24440 5219 return result;
a687059c
LW
5220}
5221
5222/*
5223 - regrepeat - repeatedly match something simple, report how many
5224 */
5225/*
5226 * [This routine now assumes that it will only match on things of length 1.
5227 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5228 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5229 */
76e3520e 5230STATIC I32
e2e6a0f1 5231S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5232{
27da23d5 5233 dVAR;
a0d0e21e 5234 register char *scan;
a0d0e21e 5235 register I32 c;
3280af22 5236 register char *loceol = PL_regeol;
a0ed51b3 5237 register I32 hardcount = 0;
53c4c00c 5238 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5239#ifndef DEBUGGING
5240 PERL_UNUSED_ARG(depth);
5241#endif
a0d0e21e 5242
3280af22 5243 scan = PL_reginput;
faf11cac
HS
5244 if (max == REG_INFTY)
5245 max = I32_MAX;
5246 else if (max < loceol - scan)
7f596f4c 5247 loceol = scan + max;
a0d0e21e 5248 switch (OP(p)) {
22c35a8c 5249 case REG_ANY:
1aa99e6b 5250 if (do_utf8) {
ffc61ed2 5251 loceol = PL_regeol;
1aa99e6b 5252 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5253 scan += UTF8SKIP(scan);
5254 hardcount++;
5255 }
5256 } else {
5257 while (scan < loceol && *scan != '\n')
5258 scan++;
a0ed51b3
LW
5259 }
5260 break;
ffc61ed2 5261 case SANY:
def8e4ea
JH
5262 if (do_utf8) {
5263 loceol = PL_regeol;
a0804c9e 5264 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5265 scan += UTF8SKIP(scan);
5266 hardcount++;
5267 }
5268 }
5269 else
5270 scan = loceol;
a0ed51b3 5271 break;
f33976b4
DB
5272 case CANY:
5273 scan = loceol;
5274 break;
090f7165
JH
5275 case EXACT: /* length of string is 1 */
5276 c = (U8)*STRING(p);
5277 while (scan < loceol && UCHARAT(scan) == c)
5278 scan++;
bbce6d69 5279 break;
5280 case EXACTF: /* length of string is 1 */
cd439c50 5281 c = (U8)*STRING(p);
bbce6d69 5282 while (scan < loceol &&
22c35a8c 5283 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5284 scan++;
5285 break;
5286 case EXACTFL: /* length of string is 1 */
3280af22 5287 PL_reg_flags |= RF_tainted;
cd439c50 5288 c = (U8)*STRING(p);
bbce6d69 5289 while (scan < loceol &&
22c35a8c 5290 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5291 scan++;
5292 break;
5293 case ANYOF:
ffc61ed2
JH
5294 if (do_utf8) {
5295 loceol = PL_regeol;
cfc92286 5296 while (hardcount < max && scan < loceol &&
32fc9b6a 5297 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5298 scan += UTF8SKIP(scan);
5299 hardcount++;
5300 }
5301 } else {
32fc9b6a 5302 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5303 scan++;
5304 }
a0d0e21e
LW
5305 break;
5306 case ALNUM:
1aa99e6b 5307 if (do_utf8) {
ffc61ed2 5308 loceol = PL_regeol;
1a4fad37 5309 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5310 while (hardcount < max && scan < loceol &&
3568d838 5311 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5312 scan += UTF8SKIP(scan);
5313 hardcount++;
5314 }
5315 } else {
5316 while (scan < loceol && isALNUM(*scan))
5317 scan++;
a0ed51b3
LW
5318 }
5319 break;
bbce6d69 5320 case ALNUML:
3280af22 5321 PL_reg_flags |= RF_tainted;
1aa99e6b 5322 if (do_utf8) {
ffc61ed2 5323 loceol = PL_regeol;
1aa99e6b
IH
5324 while (hardcount < max && scan < loceol &&
5325 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5326 scan += UTF8SKIP(scan);
5327 hardcount++;
5328 }
5329 } else {
5330 while (scan < loceol && isALNUM_LC(*scan))
5331 scan++;
a0ed51b3
LW
5332 }
5333 break;
a0d0e21e 5334 case NALNUM:
1aa99e6b 5335 if (do_utf8) {
ffc61ed2 5336 loceol = PL_regeol;
1a4fad37 5337 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5338 while (hardcount < max && scan < loceol &&
3568d838 5339 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5340 scan += UTF8SKIP(scan);
5341 hardcount++;
5342 }
5343 } else {
5344 while (scan < loceol && !isALNUM(*scan))
5345 scan++;
a0ed51b3
LW
5346 }
5347 break;
bbce6d69 5348 case NALNUML:
3280af22 5349 PL_reg_flags |= RF_tainted;
1aa99e6b 5350 if (do_utf8) {
ffc61ed2 5351 loceol = PL_regeol;
1aa99e6b
IH
5352 while (hardcount < max && scan < loceol &&
5353 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5354 scan += UTF8SKIP(scan);
5355 hardcount++;
5356 }
5357 } else {
5358 while (scan < loceol && !isALNUM_LC(*scan))
5359 scan++;
a0ed51b3
LW
5360 }
5361 break;
a0d0e21e 5362 case SPACE:
1aa99e6b 5363 if (do_utf8) {
ffc61ed2 5364 loceol = PL_regeol;
1a4fad37 5365 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5366 while (hardcount < max && scan < loceol &&
3568d838
JH
5367 (*scan == ' ' ||
5368 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5369 scan += UTF8SKIP(scan);
5370 hardcount++;
5371 }
5372 } else {
5373 while (scan < loceol && isSPACE(*scan))
5374 scan++;
a0ed51b3
LW
5375 }
5376 break;
bbce6d69 5377 case SPACEL:
3280af22 5378 PL_reg_flags |= RF_tainted;
1aa99e6b 5379 if (do_utf8) {
ffc61ed2 5380 loceol = PL_regeol;
1aa99e6b 5381 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5382 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5383 scan += UTF8SKIP(scan);
5384 hardcount++;
5385 }
5386 } else {
5387 while (scan < loceol && isSPACE_LC(*scan))
5388 scan++;
a0ed51b3
LW
5389 }
5390 break;
a0d0e21e 5391 case NSPACE:
1aa99e6b 5392 if (do_utf8) {
ffc61ed2 5393 loceol = PL_regeol;
1a4fad37 5394 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5395 while (hardcount < max && scan < loceol &&
3568d838
JH
5396 !(*scan == ' ' ||
5397 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5398 scan += UTF8SKIP(scan);
5399 hardcount++;
5400 }
5401 } else {
5402 while (scan < loceol && !isSPACE(*scan))
5403 scan++;
5404 break;
a0ed51b3 5405 }
bbce6d69 5406 case NSPACEL:
3280af22 5407 PL_reg_flags |= RF_tainted;
1aa99e6b 5408 if (do_utf8) {
ffc61ed2 5409 loceol = PL_regeol;
1aa99e6b 5410 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5411 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5412 scan += UTF8SKIP(scan);
5413 hardcount++;
5414 }
5415 } else {
5416 while (scan < loceol && !isSPACE_LC(*scan))
5417 scan++;
a0ed51b3
LW
5418 }
5419 break;
a0d0e21e 5420 case DIGIT:
1aa99e6b 5421 if (do_utf8) {
ffc61ed2 5422 loceol = PL_regeol;
1a4fad37 5423 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5424 while (hardcount < max && scan < loceol &&
3568d838 5425 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5426 scan += UTF8SKIP(scan);
5427 hardcount++;
5428 }
5429 } else {
5430 while (scan < loceol && isDIGIT(*scan))
5431 scan++;
a0ed51b3
LW
5432 }
5433 break;
a0d0e21e 5434 case NDIGIT:
1aa99e6b 5435 if (do_utf8) {
ffc61ed2 5436 loceol = PL_regeol;
1a4fad37 5437 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5438 while (hardcount < max && scan < loceol &&
3568d838 5439 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5440 scan += UTF8SKIP(scan);
5441 hardcount++;
5442 }
5443 } else {
5444 while (scan < loceol && !isDIGIT(*scan))
5445 scan++;
a0ed51b3 5446 }
e1d1eefb
YO
5447 case LNBREAK:
5448 if (do_utf8) {
5449 loceol = PL_regeol;
5450 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5451 scan += c;
5452 hardcount++;
5453 }
5454 } else {
5455 /*
5456 LNBREAK can match two latin chars, which is ok,
5457 because we have a null terminated string, but we
5458 have to use hardcount in this situation
5459 */
5460 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5461 scan+=c;
5462 hardcount++;
5463 }
5464 }
5465 break;
5466 case HORIZWS:
5467 if (do_utf8) {
5468 loceol = PL_regeol;
5469 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5470 scan += c;
5471 hardcount++;
5472 }
5473 } else {
5474 while (scan < loceol && is_HORIZWS_latin1(scan))
5475 scan++;
5476 }
a0ed51b3 5477 break;
e1d1eefb
YO
5478 case NHORIZWS:
5479 if (do_utf8) {
5480 loceol = PL_regeol;
5481 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5482 scan += UTF8SKIP(scan);
5483 hardcount++;
5484 }
5485 } else {
5486 while (scan < loceol && !is_HORIZWS_latin1(scan))
5487 scan++;
5488
5489 }
5490 break;
5491 case VERTWS:
5492 if (do_utf8) {
5493 loceol = PL_regeol;
5494 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5495 scan += c;
5496 hardcount++;
5497 }
5498 } else {
5499 while (scan < loceol && is_VERTWS_latin1(scan))
5500 scan++;
5501
5502 }
5503 break;
5504 case NVERTWS:
5505 if (do_utf8) {
5506 loceol = PL_regeol;
5507 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5508 scan += UTF8SKIP(scan);
5509 hardcount++;
5510 }
5511 } else {
5512 while (scan < loceol && !is_VERTWS_latin1(scan))
5513 scan++;
5514
5515 }
5516 break;
5517
a0d0e21e
LW
5518 default: /* Called on something of 0 width. */
5519 break; /* So match right here or not at all. */
5520 }
a687059c 5521
a0ed51b3
LW
5522 if (hardcount)
5523 c = hardcount;
5524 else
5525 c = scan - PL_reginput;
3280af22 5526 PL_reginput = scan;
a687059c 5527
a3621e74 5528 DEBUG_r({
e68ec53f 5529 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5530 DEBUG_EXECUTE_r({
e68ec53f
YO
5531 SV * const prop = sv_newmortal();
5532 regprop(prog, prop, p);
5533 PerlIO_printf(Perl_debug_log,
be8e71aa 5534 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5535 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5536 });
be8e71aa 5537 });
9041c2e3 5538
a0d0e21e 5539 return(c);
a687059c
LW
5540}
5541
c277df42 5542
be8e71aa 5543#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5544/*
ffc61ed2
JH
5545- regclass_swash - prepare the utf8 swash
5546*/
5547
5548SV *
32fc9b6a 5549Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5550{
97aff369 5551 dVAR;
9e55ce06
JH
5552 SV *sw = NULL;
5553 SV *si = NULL;
5554 SV *alt = NULL;
f8fc2ecf
YO
5555 RXi_GET_DECL(prog,progi);
5556 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5557
4f639d21 5558 if (data && data->count) {
a3b680e6 5559 const U32 n = ARG(node);
ffc61ed2 5560
4f639d21
DM
5561 if (data->what[n] == 's') {
5562 SV * const rv = (SV*)data->data[n];
890ce7af 5563 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5564 SV **const ary = AvARRAY(av);
9e55ce06 5565 SV **a, **b;
9041c2e3 5566
711a919c 5567 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5568 * documentation of these array elements. */
5569
b11f357e 5570 si = *ary;
8f7f7219 5571 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5572 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5573
ffc61ed2
JH
5574 if (a)
5575 sw = *a;
5576 else if (si && doinit) {
5577 sw = swash_init("utf8", "", si, 1, 0);
5578 (void)av_store(av, 1, sw);
5579 }
9e55ce06
JH
5580 if (b)
5581 alt = *b;
ffc61ed2
JH
5582 }
5583 }
5584
9e55ce06
JH
5585 if (listsvp)
5586 *listsvp = si;
5587 if (altsvp)
5588 *altsvp = alt;
ffc61ed2
JH
5589
5590 return sw;
5591}
76234dfb 5592#endif
ffc61ed2
JH
5593
5594/*
ba7b4546 5595 - reginclass - determine if a character falls into a character class
832705d4
JH
5596
5597 The n is the ANYOF regnode, the p is the target string, lenp
5598 is pointer to the maximum length of how far to go in the p
5599 (if the lenp is zero, UTF8SKIP(p) is used),
5600 do_utf8 tells whether the target string is in UTF-8.
5601
bbce6d69 5602 */
5603
76e3520e 5604STATIC bool
32fc9b6a 5605S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5606{
27da23d5 5607 dVAR;
a3b680e6 5608 const char flags = ANYOF_FLAGS(n);
bbce6d69 5609 bool match = FALSE;
cc07378b 5610 UV c = *p;
ae9ddab8 5611 STRLEN len = 0;
9e55ce06 5612 STRLEN plen;
1aa99e6b 5613
19f67299
TS
5614 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5615 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5616 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5617 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5618 if (len == (STRLEN)-1)
5619 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5620 }
bbce6d69 5621
0f0076b4 5622 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5623 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5624 if (lenp)
5625 *lenp = 0;
ffc61ed2 5626 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5627 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5628 match = TRUE;
bbce6d69 5629 }
3568d838 5630 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5631 match = TRUE;
ffc61ed2 5632 if (!match) {
9e55ce06 5633 AV *av;
32fc9b6a 5634 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5635
5636 if (sw) {
3568d838 5637 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5638 match = TRUE;
5639 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5640 if (!match && lenp && av) {
5641 I32 i;
9e55ce06 5642 for (i = 0; i <= av_len(av); i++) {
890ce7af 5643 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5644 STRLEN len;
890ce7af 5645 const char * const s = SvPV_const(sv, len);
9e55ce06 5646
061b10df 5647 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5648 *lenp = len;
5649 match = TRUE;
5650 break;
5651 }
5652 }
5653 }
5654 if (!match) {
89ebb4a3 5655 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5656 STRLEN tmplen;
5657
9e55ce06
JH
5658 to_utf8_fold(p, tmpbuf, &tmplen);
5659 if (swash_fetch(sw, tmpbuf, do_utf8))
5660 match = TRUE;
5661 }
ffc61ed2
JH
5662 }
5663 }
bbce6d69 5664 }
9e55ce06 5665 if (match && lenp && *lenp == 0)
0f0076b4 5666 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5667 }
1aa99e6b 5668 if (!match && c < 256) {
ffc61ed2
JH
5669 if (ANYOF_BITMAP_TEST(n, c))
5670 match = TRUE;
5671 else if (flags & ANYOF_FOLD) {
eb160463 5672 U8 f;
a0ed51b3 5673
ffc61ed2
JH
5674 if (flags & ANYOF_LOCALE) {
5675 PL_reg_flags |= RF_tainted;
5676 f = PL_fold_locale[c];
5677 }
5678 else
5679 f = PL_fold[c];
5680 if (f != c && ANYOF_BITMAP_TEST(n, f))
5681 match = TRUE;
5682 }
5683
5684 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5685 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5686 if (
5687 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5688 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5689 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5690 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5691 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5692 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5693 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5694 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5695 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5696 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5697 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5698 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5699 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5700 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5701 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5702 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5703 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5704 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5705 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5706 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5707 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5708 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5709 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5710 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5711 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5712 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5713 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5714 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5715 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5716 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5717 ) /* How's that for a conditional? */
5718 {
5719 match = TRUE;
5720 }
a0ed51b3 5721 }
a0ed51b3
LW
5722 }
5723
a0ed51b3
LW
5724 return (flags & ANYOF_INVERT) ? !match : match;
5725}
161b471a 5726
dfe13c55 5727STATIC U8 *
0ce71af7 5728S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5729{
97aff369 5730 dVAR;
a0ed51b3 5731 if (off >= 0) {
1aa99e6b 5732 while (off-- && s < lim) {
ffc61ed2 5733 /* XXX could check well-formedness here */
a0ed51b3 5734 s += UTF8SKIP(s);
ffc61ed2 5735 }
a0ed51b3
LW
5736 }
5737 else {
1de06328
YO
5738 while (off++ && s > lim) {
5739 s--;
5740 if (UTF8_IS_CONTINUED(*s)) {
5741 while (s > lim && UTF8_IS_CONTINUATION(*s))
5742 s--;
a0ed51b3 5743 }
1de06328 5744 /* XXX could check well-formedness here */
a0ed51b3
LW
5745 }
5746 }
5747 return s;
5748}
161b471a 5749
f9f4320a
YO
5750#ifdef XXX_dmq
5751/* there are a bunch of places where we use two reghop3's that should
5752 be replaced with this routine. but since thats not done yet
5753 we ifdef it out - dmq
5754*/
dfe13c55 5755STATIC U8 *
1de06328
YO
5756S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5757{
5758 dVAR;
5759 if (off >= 0) {
5760 while (off-- && s < rlim) {
5761 /* XXX could check well-formedness here */
5762 s += UTF8SKIP(s);
5763 }
5764 }
5765 else {
5766 while (off++ && s > llim) {
5767 s--;
5768 if (UTF8_IS_CONTINUED(*s)) {
5769 while (s > llim && UTF8_IS_CONTINUATION(*s))
5770 s--;
5771 }
5772 /* XXX could check well-formedness here */
5773 }
5774 }
5775 return s;
5776}
f9f4320a 5777#endif
1de06328
YO
5778
5779STATIC U8 *
0ce71af7 5780S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5781{
97aff369 5782 dVAR;
a0ed51b3 5783 if (off >= 0) {
1aa99e6b 5784 while (off-- && s < lim) {
ffc61ed2 5785 /* XXX could check well-formedness here */
a0ed51b3 5786 s += UTF8SKIP(s);
ffc61ed2 5787 }
a0ed51b3 5788 if (off >= 0)
3dab1dad 5789 return NULL;
a0ed51b3
LW
5790 }
5791 else {
1de06328
YO
5792 while (off++ && s > lim) {
5793 s--;
5794 if (UTF8_IS_CONTINUED(*s)) {
5795 while (s > lim && UTF8_IS_CONTINUATION(*s))
5796 s--;
a0ed51b3 5797 }
1de06328 5798 /* XXX could check well-formedness here */
a0ed51b3
LW
5799 }
5800 if (off <= 0)
3dab1dad 5801 return NULL;
a0ed51b3
LW
5802 }
5803 return s;
5804}
51371543 5805
51371543 5806static void
acfe0abc 5807restore_pos(pTHX_ void *arg)
51371543 5808{
97aff369 5809 dVAR;
097eb12c 5810 regexp * const rex = (regexp *)arg;
51371543
GS
5811 if (PL_reg_eval_set) {
5812 if (PL_reg_oldsaved) {
4f639d21
DM
5813 rex->subbeg = PL_reg_oldsaved;
5814 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5815#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5816 rex->saved_copy = PL_nrs;
ed252734 5817#endif
4f639d21 5818 RX_MATCH_COPIED_on(rex);
51371543
GS
5819 }
5820 PL_reg_magic->mg_len = PL_reg_oldpos;
5821 PL_reg_eval_set = 0;
5822 PL_curpm = PL_reg_oldcurpm;
5823 }
5824}
33b8afdf
JH
5825
5826STATIC void
5827S_to_utf8_substr(pTHX_ register regexp *prog)
5828{
a1cac82e
NC
5829 int i = 1;
5830 do {
5831 if (prog->substrs->data[i].substr
5832 && !prog->substrs->data[i].utf8_substr) {
5833 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5834 prog->substrs->data[i].utf8_substr = sv;
5835 sv_utf8_upgrade(sv);
610460f9
NC
5836 if (SvVALID(prog->substrs->data[i].substr)) {
5837 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5838 if (flags & FBMcf_TAIL) {
5839 /* Trim the trailing \n that fbm_compile added last
5840 time. */
5841 SvCUR_set(sv, SvCUR(sv) - 1);
5842 /* Whilst this makes the SV technically "invalid" (as its
5843 buffer is no longer followed by "\0") when fbm_compile()
5844 adds the "\n" back, a "\0" is restored. */
5845 }
5846 fbm_compile(sv, flags);
5847 }
a1cac82e
NC
5848 if (prog->substrs->data[i].substr == prog->check_substr)
5849 prog->check_utf8 = sv;
5850 }
5851 } while (i--);
33b8afdf
JH
5852}
5853
5854STATIC void
5855S_to_byte_substr(pTHX_ register regexp *prog)
5856{
97aff369 5857 dVAR;
a1cac82e
NC
5858 int i = 1;
5859 do {
5860 if (prog->substrs->data[i].utf8_substr
5861 && !prog->substrs->data[i].substr) {
5862 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5863 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
5864 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5865 const U8 flags
5866 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5867 if (flags & FBMcf_TAIL) {
5868 /* Trim the trailing \n that fbm_compile added last
5869 time. */
5870 SvCUR_set(sv, SvCUR(sv) - 1);
5871 }
5872 fbm_compile(sv, flags);
5873 }
a1cac82e
NC
5874 } else {
5875 SvREFCNT_dec(sv);
5876 sv = &PL_sv_undef;
5877 }
5878 prog->substrs->data[i].substr = sv;
5879 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5880 prog->check_substr = sv;
33b8afdf 5881 }
a1cac82e 5882 } while (i--);
33b8afdf 5883}
66610fdd
RGS
5884
5885/*
5886 * Local variables:
5887 * c-indentation-style: bsd
5888 * c-basic-offset: 4
5889 * indent-tabs-mode: t
5890 * End:
5891 *
37442d52
RGS
5892 * ex: set ts=8 sts=4 sw=4 noet:
5893 */